Diffstat (limited to 'kernel')
41 files changed, 928 insertions(+), 617 deletions(-)
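Before the raw diff: several hunks below (kernel/cgroup.c, kernel/ns_cgroup.c) introduce a per-cgroup "clone_children" flag, exposed both as a mount option and as a cgroup.clone_children control file, inherited by newly created child groups, where it also triggers a subsystem's post_clone() callback. As a hedged orientation sketch only, not part of the commit, the snippet below shows how that file might be driven from user space; the mount point /cgroup and the group name "mygroup" are assumptions for illustration, everything else follows the read_u64/write_u64 handlers added in the diff.

	/* Illustrative sketch, not part of the commit: flip the new
	 * cgroup.clone_children knob from user space.  Assumes a cgroup
	 * hierarchy mounted at /cgroup and an existing group "mygroup". */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* cgroup_clone_children_write() accepts 0 or 1 via write_u64 */
		int fd = open("/cgroup/mygroup/cgroup.clone_children", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);

		/* child groups created under "mygroup" now start with the flag
		 * set, and subsystems providing post_clone() have it run at
		 * creation time, as added in cgroup_create() below */
		return 0;
	}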
| diff --git a/kernel/audit.c b/kernel/audit.c index d96045789b54..77770a034d59 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)  	struct task_struct *tsk;  	int err; -	read_lock(&tasklist_lock); +	rcu_read_lock();  	tsk = find_task_by_vpid(pid); -	err = -ESRCH; -	if (!tsk) -		goto out; -	err = 0; - -	spin_lock_irq(&tsk->sighand->siglock); -	if (!tsk->signal->audit_tty) -		err = -EPERM; -	spin_unlock_irq(&tsk->sighand->siglock); -	if (err) -		goto out; - -	tty_audit_push_task(tsk, loginuid, sessionid); -out: -	read_unlock(&tasklist_lock); +	if (!tsk) { +		rcu_read_unlock(); +		return -ESRCH; +	} +	get_task_struct(tsk); +	rcu_read_unlock(); +	err = tty_audit_push_task(tsk, loginuid, sessionid); +	put_task_struct(tsk);  	return err;  } @@ -506,7 +499,7 @@ int audit_send_list(void *_dest)  }  struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, -				 int multi, void *payload, int size) +				 int multi, const void *payload, int size)  {  	struct sk_buff	*skb;  	struct nlmsghdr	*nlh; @@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)   * Allocates an skb, builds the netlink message, and sends it to the pid.   * No failure notifications.   */ -void audit_send_reply(int pid, int seq, int type, int done, int multi, -		      void *payload, int size) +static void audit_send_reply(int pid, int seq, int type, int done, int multi, +			     const void *payload, int size)  {  	struct sk_buff *skb;  	struct task_struct *tsk; @@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	case AUDIT_TTY_GET: {  		struct audit_tty_status s;  		struct task_struct *tsk; +		unsigned long flags; -		read_lock(&tasklist_lock); +		rcu_read_lock();  		tsk = find_task_by_vpid(pid); -		if (!tsk) -			err = -ESRCH; -		else { -			spin_lock_irq(&tsk->sighand->siglock); +		if (tsk && lock_task_sighand(tsk, &flags)) {  			s.enabled = tsk->signal->audit_tty != 0; -			spin_unlock_irq(&tsk->sighand->siglock); -		} -		read_unlock(&tasklist_lock); -		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, -				 &s, sizeof(s)); +			unlock_task_sighand(tsk, &flags); +		} else +			err = -ESRCH; +		rcu_read_unlock(); + +		if (!err) +			audit_send_reply(NETLINK_CB(skb).pid, seq, +					 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));  		break;  	}  	case AUDIT_TTY_SET: {  		struct audit_tty_status *s;  		struct task_struct *tsk; +		unsigned long flags;  		if (nlh->nlmsg_len < sizeof(struct audit_tty_status))  			return -EINVAL;  		s = data;  		if (s->enabled != 0 && s->enabled != 1)  			return -EINVAL; -		read_lock(&tasklist_lock); +		rcu_read_lock();  		tsk = find_task_by_vpid(pid); -		if (!tsk) -			err = -ESRCH; -		else { -			spin_lock_irq(&tsk->sighand->siglock); +		if (tsk && lock_task_sighand(tsk, &flags)) {  			tsk->signal->audit_tty = s->enabled != 0; -			spin_unlock_irq(&tsk->sighand->siglock); -		} -		read_unlock(&tasklist_lock); +			unlock_task_sighand(tsk, &flags); +		} else +			err = -ESRCH; +		rcu_read_unlock();  		break;  	}  	default: diff --git a/kernel/audit.h b/kernel/audit.h index f7206db4e13d..91e7071c4d2c 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,  				    int *dirlen);  extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,  					     int done, int multi, -					     void *payload, int size); -extern void		    audit_send_reply(int pid, int 
seq, int type, -					     int done, int multi, -					     void *payload, int size); +					     const void *payload, int size);  extern void		    audit_panic(const char *message);  struct audit_netlink_list { diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 7f18d3a4527e..37b2bea170c8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)  {  	struct audit_chunk *chunk = find_chunk(p);  	struct fsnotify_mark *entry = &chunk->mark; -	struct audit_chunk *new; +	struct audit_chunk *new = NULL;  	struct audit_tree *owner;  	int size = chunk->count - 1;  	int i, j; @@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)  	spin_unlock(&hash_lock); +	if (size) +		new = alloc_chunk(size); +  	spin_lock(&entry->lock);  	if (chunk->dead || !entry->i.inode) {  		spin_unlock(&entry->lock); +		if (new) +			free_chunk(new);  		goto out;  	} @@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)  		goto out;  	} -	new = alloc_chunk(size);  	if (!new)  		goto Fallback; +  	fsnotify_duplicate_mark(&new->mark, entry);  	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {  		free_chunk(new); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f0c9b2e7542d..d2e3c7866460 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -60,7 +60,7 @@ struct audit_parent {  };  /* fsnotify handle. */ -struct fsnotify_group *audit_watch_group; +static struct fsnotify_group *audit_watch_group;  /* fsnotify events we care about. */  #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ @@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)  	}  } -void audit_remove_watch(struct audit_watch *watch) +static void audit_remove_watch(struct audit_watch *watch)  {  	list_del(&watch->wlist);  	audit_put_parent(watch->parent); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eb7675499fb5..add2819af71b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,  		case AUDIT_LOGINUID:  			result = audit_comparator(cb->loginuid, f->op, f->val);  			break; +		case AUDIT_SUBJ_USER: +		case AUDIT_SUBJ_ROLE: +		case AUDIT_SUBJ_TYPE: +		case AUDIT_SUBJ_SEN: +		case AUDIT_SUBJ_CLR: +			if (f->lsm_rule) +				result = security_audit_rule_match(cb->sid, +								   f->type, +								   f->op, +								   f->lsm_rule, +								   NULL); +			break;  		}  		if (!result) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1b31c130d034..f49a0318c2ed 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -241,6 +241,10 @@ struct audit_context {  			pid_t			pid;  			struct audit_cap_data	cap;  		} capset; +		struct { +			int			fd; +			int			flags; +		} mmap;  	};  	int fds[2]; @@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)  		audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);  		audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);  		break; } +	case AUDIT_MMAP: { +		audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, +				 context->mmap.flags); +		break; }  	}  	audit_log_end(ab);  } @@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,  	context->type = AUDIT_CAPSET;  } +void __audit_mmap_fd(int fd, int flags) +{ +	struct audit_context *context = current->audit_context; +	context->mmap.fd = fd; +	context->mmap.flags = flags; +	context->type = AUDIT_MMAP; +} +  /**   * audit_core_dumps - record 
information about processes that end abnormally   * @signr: signal value diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9270d532ec3c..66a416b42c18 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);  } +static int clone_children(const struct cgroup *cgrp) +{ +	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +} +  /*   * for_each_subsys() allows you to iterate on each subsystem attached to   * an active hierarchy @@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)  		seq_puts(seq, ",noprefix");  	if (strlen(root->release_agent_path))  		seq_printf(seq, ",release_agent=%s", root->release_agent_path); +	if (clone_children(&root->top_cgroup)) +		seq_puts(seq, ",clone_children");  	if (strlen(root->name))  		seq_printf(seq, ",name=%s", root->name);  	mutex_unlock(&cgroup_mutex); @@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {  	unsigned long subsys_bits;  	unsigned long flags;  	char *release_agent; +	bool clone_children;  	char *name;  	/* User explicitly requested empty subsystem */  	bool none; @@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {   */  static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  { -	char *token, *o = data ?: "all"; +	char *token, *o = data; +	bool all_ss = false, one_ss = false;  	unsigned long mask = (unsigned long)-1;  	int i;  	bool module_pin_failed = false; @@ -1082,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	while ((token = strsep(&o, ",")) != NULL) {  		if (!*token)  			return -EINVAL; -		if (!strcmp(token, "all")) { -			/* Add all non-disabled subsystems */ -			opts->subsys_bits = 0; -			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -				struct cgroup_subsys *ss = subsys[i]; -				if (ss == NULL) -					continue; -				if (!ss->disabled) -					opts->subsys_bits |= 1ul << i; -			} -		} else if (!strcmp(token, "none")) { +		if (!strcmp(token, "none")) {  			/* Explicitly have no subsystems */  			opts->none = true; -		} else if (!strcmp(token, "noprefix")) { +			continue; +		} +		if (!strcmp(token, "all")) { +			/* Mutually exclusive option 'all' + subsystem name */ +			if (one_ss) +				return -EINVAL; +			all_ss = true; +			continue; +		} +		if (!strcmp(token, "noprefix")) {  			set_bit(ROOT_NOPREFIX, &opts->flags); -		} else if (!strncmp(token, "release_agent=", 14)) { +			continue; +		} +		if (!strcmp(token, "clone_children")) { +			opts->clone_children = true; +			continue; +		} +		if (!strncmp(token, "release_agent=", 14)) {  			/* Specifying two release agents is forbidden */  			if (opts->release_agent)  				return -EINVAL; @@ -1105,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);  			if (!opts->release_agent)  				return -ENOMEM; -		} else if (!strncmp(token, "name=", 5)) { +			continue; +		} +		if (!strncmp(token, "name=", 5)) {  			const char *name = token + 5;  			/* Can't specify an empty name */  			if (!strlen(name)) @@ -1127,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  					      GFP_KERNEL);  			if (!opts->name)  				return -ENOMEM; -		} else { -			struct cgroup_subsys *ss; -			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -				ss = subsys[i]; -				if (ss == NULL) -					continue; -				if (!strcmp(token, ss->name)) { -					if (!ss->disabled) -						set_bit(i, 
&opts->subsys_bits); -					break; -				} -			} -			if (i == CGROUP_SUBSYS_COUNT) -				return -ENOENT; + +			continue; +		} + +		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { +			struct cgroup_subsys *ss = subsys[i]; +			if (ss == NULL) +				continue; +			if (strcmp(token, ss->name)) +				continue; +			if (ss->disabled) +				continue; + +			/* Mutually exclusive option 'all' + subsystem name */ +			if (all_ss) +				return -EINVAL; +			set_bit(i, &opts->subsys_bits); +			one_ss = true; + +			break; +		} +		if (i == CGROUP_SUBSYS_COUNT) +			return -ENOENT; +	} + +	/* +	 * If the 'all' option was specified select all the subsystems, +	 * otherwise 'all, 'none' and a subsystem name options were not +	 * specified, let's default to 'all' +	 */ +	if (all_ss || (!all_ss && !one_ss && !opts->none)) { +		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { +			struct cgroup_subsys *ss = subsys[i]; +			if (ss == NULL) +				continue; +			if (ss->disabled) +				continue; +			set_bit(i, &opts->subsys_bits);  		}  	} @@ -1355,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)  		strcpy(root->release_agent_path, opts->release_agent);  	if (opts->name)  		strcpy(root->name, opts->name); +	if (opts->clone_children) +		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);  	return root;  } @@ -1418,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)  	return 0;  } -static int cgroup_get_sb(struct file_system_type *fs_type, +static struct dentry *cgroup_mount(struct file_system_type *fs_type,  			 int flags, const char *unused_dev_name, -			 void *data, struct vfsmount *mnt) +			 void *data)  {  	struct cgroup_sb_opts opts;  	struct cgroupfs_root *root; @@ -1554,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,  		drop_parsed_module_refcounts(opts.subsys_bits);  	} -	simple_set_mnt(mnt, sb);  	kfree(opts.release_agent);  	kfree(opts.name); -	return 0; +	return dget(sb->s_root);   drop_new_super:  	deactivate_locked_super(sb); @@ -1566,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,   out_err:  	kfree(opts.release_agent);  	kfree(opts.name); -	return ret; +	return ERR_PTR(ret);  }  static void cgroup_kill_sb(struct super_block *sb) { @@ -1616,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {  static struct file_system_type cgroup_fs_type = {  	.name = "cgroup", -	.get_sb = cgroup_get_sb, +	.mount = cgroup_mount,  	.kill_sb = cgroup_kill_sb,  }; @@ -1880,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,  				      const char *buffer)  {  	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); +	if (strlen(buffer) >= PATH_MAX) +		return -EINVAL;  	if (!cgroup_lock_live_group(cgrp))  		return -ENODEV;  	strcpy(cgrp->root->release_agent_path, buffer); @@ -3173,6 +3216,23 @@ fail:  	return ret;  } +static u64 cgroup_clone_children_read(struct cgroup *cgrp, +				    struct cftype *cft) +{ +	return clone_children(cgrp); +} + +static int cgroup_clone_children_write(struct cgroup *cgrp, +				     struct cftype *cft, +				     u64 val) +{ +	if (val) +		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +	else +		clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +	return 0; +} +  /*   * for the common functions, 'private' gives the type of file   */ @@ -3203,6 +3263,11 @@ static struct cftype files[] = {  		.write_string = cgroup_write_event_control,  		.mode = S_IWUGO,  	}, +	{ +		.name = "cgroup.clone_children", +		.read_u64 = cgroup_clone_children_read, +		.write_u64 = 
cgroup_clone_children_write, +	},  };  static struct cftype cft_release_agent = { @@ -3332,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	if (notify_on_release(parent))  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +	if (clone_children(parent)) +		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +  	for_each_subsys(root, ss) {  		struct cgroup_subsys_state *css = ss->create(ss, cgrp); @@ -3346,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  				goto err_destroy;  		}  		/* At error, ->destroy() callback has to free assigned ID. */ +		if (clone_children(parent) && ss->post_clone) +			ss->post_clone(ss, cgrp);  	}  	cgroup_lock_hierarchy(root); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ce71ed53e88f..e7bebb7c6c38 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)  			    struct freezer, css);  } -int cgroup_freezing_or_frozen(struct task_struct *task) +static inline int __cgroup_freezing_or_frozen(struct task_struct *task)  { -	struct freezer *freezer; -	enum freezer_state state; +	enum freezer_state state = task_freezer(task)->state; +	return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); +} +int cgroup_freezing_or_frozen(struct task_struct *task) +{ +	int result;  	task_lock(task); -	freezer = task_freezer(task); -	if (!freezer->css.cgroup->parent) -		state = CGROUP_THAWED; /* root cgroup can't be frozen */ -	else -		state = freezer->state; +	result = __cgroup_freezing_or_frozen(task);  	task_unlock(task); - -	return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); +	return result;  }  /* @@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,  	kfree(cgroup_freezer(cgroup));  } -/* Task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ -	return frozen(task) || -		(task_is_stopped_or_traced(task) && freezing(task)); -} -  /*   * The call to cgroup_lock() in the freezer.state write method prevents   * a write to that file racing against an attach, and hence the @@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,  	/*  	 * Anything frozen can't move or be moved to/from. -	 * -	 * Since orig_freezer->state == FROZEN means that @task has been -	 * frozen, so it's sufficient to check the latter condition.  	 
*/ -	if (is_task_frozen_enough(task)) +	freezer = cgroup_freezer(new_cgroup); +	if (freezer->state != CGROUP_THAWED)  		return -EBUSY; -	freezer = cgroup_freezer(new_cgroup); -	if (freezer->state == CGROUP_FROZEN) +	rcu_read_lock(); +	if (__cgroup_freezing_or_frozen(task)) { +		rcu_read_unlock();  		return -EBUSY; +	} +	rcu_read_unlock();  	if (threadgroup) {  		struct task_struct *c;  		rcu_read_lock();  		list_for_each_entry_rcu(c, &task->thread_group, thread_group) { -			if (is_task_frozen_enough(c)) { +			if (__cgroup_freezing_or_frozen(c)) {  				rcu_read_unlock();  				return -EBUSY;  			} @@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)  /*   * caller must hold freezer->lock   */ -static void update_freezer_state(struct cgroup *cgroup, +static void update_if_frozen(struct cgroup *cgroup,  				 struct freezer *freezer)  {  	struct cgroup_iter it;  	struct task_struct *task;  	unsigned int nfrozen = 0, ntotal = 0; +	enum freezer_state old_state = freezer->state;  	cgroup_iter_start(cgroup, &it);  	while ((task = cgroup_iter_next(cgroup, &it))) {  		ntotal++; -		if (is_task_frozen_enough(task)) +		if (frozen(task))  			nfrozen++;  	} -	/* -	 * Transition to FROZEN when no new tasks can be added ensures -	 * that we never exist in the FROZEN state while there are unfrozen -	 * tasks. -	 */ -	if (nfrozen == ntotal) -		freezer->state = CGROUP_FROZEN; -	else if (nfrozen > 0) -		freezer->state = CGROUP_FREEZING; -	else -		freezer->state = CGROUP_THAWED; +	if (old_state == CGROUP_THAWED) { +		BUG_ON(nfrozen > 0); +	} else if (old_state == CGROUP_FREEZING) { +		if (nfrozen == ntotal) +			freezer->state = CGROUP_FROZEN; +	} else { /* old_state == CGROUP_FROZEN */ +		BUG_ON(nfrozen != ntotal); +	} +  	cgroup_iter_end(cgroup, &it);  } @@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,  	if (state == CGROUP_FREEZING) {  		/* We change from FREEZING to FROZEN lazily if the cgroup was  		 * only partially frozen when we exitted write. */ -		update_freezer_state(cgroup, freezer); +		update_if_frozen(cgroup, freezer);  		state = freezer->state;  	}  	spin_unlock_irq(&freezer->lock); @@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)  	while ((task = cgroup_iter_next(cgroup, &it))) {  		if (!freeze_task(task, true))  			continue; -		if (is_task_frozen_enough(task)) +		if (frozen(task))  			continue;  		if (!freezing(task) && !freezer_should_skip(task))  			num_cant_freeze_now++; @@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,  	spin_lock_irq(&freezer->lock); -	update_freezer_state(cgroup, freezer); +	update_if_frozen(cgroup, freezer);  	if (goal_state == freezer->state)  		goto out; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51b143e2a07a..4349935c2ad8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);   * users. 
If someone tries to mount the "cpuset" filesystem, we   * silently switch it to mount "cgroup" instead   */ -static int cpuset_get_sb(struct file_system_type *fs_type, -			 int flags, const char *unused_dev_name, -			 void *data, struct vfsmount *mnt) +static struct dentry *cpuset_mount(struct file_system_type *fs_type, +			 int flags, const char *unused_dev_name, void *data)  {  	struct file_system_type *cgroup_fs = get_fs_type("cgroup"); -	int ret = -ENODEV; +	struct dentry *ret = ERR_PTR(-ENODEV);  	if (cgroup_fs) {  		char mountopts[] =  			"cpuset,noprefix,"  			"release_agent=/sbin/cpuset_release_agent"; -		ret = cgroup_fs->get_sb(cgroup_fs, flags, -					   unused_dev_name, mountopts, mnt); +		ret = cgroup_fs->mount(cgroup_fs, flags, +					   unused_dev_name, mountopts);  		put_filesystem(cgroup_fs);  	}  	return ret; @@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,  static struct file_system_type cpuset_fs_type = {  	.name = "cpuset", -	.get_sb = cpuset_get_sb, +	.mount = cpuset_mount,  };  /* diff --git a/kernel/cred.c b/kernel/cred.c index 9a3e22641fe7..6a1aa004e376 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);  /*   * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_guard_mutex + * - The caller must hold ->cred_guard_mutex   */  struct cred *prepare_exec_creds(void)  { @@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)  	struct cred *new;  	int ret; -	mutex_init(&p->cred_guard_mutex); -  	if (  #ifdef CONFIG_KEYS  		!p->cred->thread_keyring && diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index fec596da9bd0..cefd4a11f6d9 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)  	return 0;  } -/** - *	kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - *	@regs: Current &struct pt_regs. - * - *	This function will be called if the particular architecture must - *	disable hardware debugging while it is processing gdb packets or - *	handling exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} -  /*   * Some architectures need cache flushes when we set/clear a   * breakpoint: @@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,  		atomic_inc(&masters_in_kgdb);  	else  		atomic_inc(&slaves_in_kgdb); -	kgdb_disable_hw_debug(ks->linux_regs); + +	if (arch_kgdb_ops.disable_hw_break) +		arch_kgdb_ops.disable_hw_break(regs);  acquirelock:  	/* diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d7bda21a106b..37755d621924 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,  		/* special case below */  	} else {  		kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", -			   kdb_current, kdb_current->pid); +			   kdb_current, kdb_current ? 
kdb_current->pid : 0);  #if defined(CONFIG_SMP)  		kdb_printf("on processor %d ", raw_smp_processor_id());  #endif @@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv)   */  static int kdb_per_cpu(int argc, const char **argv)  { -	char buf[256], fmtstr[64]; -	kdb_symtab_t symtab; -	cpumask_t suppress = CPU_MASK_NONE; -	int cpu, diag; -	unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; +	char fmtstr[64]; +	int cpu, diag, nextarg = 1; +	unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;  	if (argc < 1 || argc > 3)  		return KDB_ARGCOUNT; -	snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); -	if (!kdbgetsymval(buf, &symtab)) { -		kdb_printf("%s is not a per_cpu variable\n", argv[1]); -		return KDB_BADADDR; -	} +	diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); +	if (diag) +		return diag; +  	if (argc >= 2) {  		diag = kdbgetularg(argv[2], &bytesperword);  		if (diag) @@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv)  #define KDB_PCU(cpu) 0  #endif  #endif -  	for_each_online_cpu(cpu) { +		if (KDB_FLAG(CMD_INTERRUPT)) +			return 0; +  		if (whichcpu != ~0UL && whichcpu != cpu)  			continue; -		addr = symtab.sym_start + KDB_PCU(cpu); +		addr = symaddr + KDB_PCU(cpu);  		diag = kdb_getword(&val, addr, bytesperword);  		if (diag) {  			kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "  				   "read, diag=%d\n", cpu, addr, diag);  			continue;  		} -#ifdef	CONFIG_SMP -		if (!val) { -			cpu_set(cpu, suppress); -			continue; -		} -#endif	/* CONFIG_SMP */  		kdb_printf("%5d ", cpu);  		kdb_md_line(fmtstr, addr,  			bytesperword == KDB_WORD_SIZE,  			1, bytesperword, 1, 1, 0);  	} -	if (cpus_weight(suppress) == 0) -		return 0; -	kdb_printf("Zero suppressed cpu(s):"); -	for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); -	     cpu = next_cpu(cpu, suppress)) { -		kdb_printf(" %d", cpu); -		if (cpu == num_possible_cpus() - 1 || -		    next_cpu(cpu, suppress) != cpu + 1) -			continue; -		while (cpu < num_possible_cpus() && -		       next_cpu(cpu, suppress) == cpu + 1) -			++cpu; -		kdb_printf("-%d", cpu); -	} -	kdb_printf("\n"); -  #undef KDB_PCU -  	return 0;  } diff --git a/kernel/exit.c b/kernel/exit.c index 894179a32ec1..21aa7b3001fb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -96,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk)  		sig->tty = NULL;  	} else {  		/* +		 * This can only happen if the caller is de_thread(). +		 * FIXME: this is the temporary hack, we should teach +		 * posix-cpu-timers to handle this case correctly. +		 */ +		if (unlikely(has_group_leader_pid(tsk))) +			posix_cpu_timers_exit_group(tsk); + +		/*  		 * If there is any task waiting for the group exit  		 * then notify it:  		 */ @@ -703,6 +711,8 @@ static void exit_mm(struct task_struct * tsk)   * space.   
*/  static struct task_struct *find_new_reaper(struct task_struct *father) +	__releases(&tasklist_lock) +	__acquires(&tasklist_lock)  {  	struct pid_namespace *pid_ns = task_active_pid_ns(father);  	struct task_struct *thread; diff --git a/kernel/fork.c b/kernel/fork.c index e87aaaaf5131..3b159c5991b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -908,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	sig->oom_adj = current->signal->oom_adj;  	sig->oom_score_adj = current->signal->oom_score_adj; +	mutex_init(&sig->cred_guard_mutex); +  	return 0;  } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9d917ff72675..9988d03797f5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)  	struct irq_desc *desc = irq_to_desc(irq);  	return desc ? desc->kstat_irqs[cpu] : 0;  } + +#ifdef CONFIG_GENERIC_HARDIRQS +unsigned int kstat_irqs(unsigned int irq) +{ +	struct irq_desc *desc = irq_to_desc(irq); +	int cpu; +	int sum = 0; + +	if (!desc) +		return 0; +	for_each_possible_cpu(cpu) +		sum += desc->kstat_irqs[cpu]; +	return sum; +} +#endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 644e8d5fa367..5f92acc5f952 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -324,6 +324,10 @@ void enable_irq(unsigned int irq)  	if (!desc)  		return; +	if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, +	    KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) +		return; +  	chip_bus_lock(desc);  	raw_spin_lock_irqsave(&desc->lock, flags);  	__enable_irq(desc, irq, false); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7be868bf25c6..3b79bd938330 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -39,6 +39,16 @@ struct jump_label_module_entry {  	struct module *mod;  }; +void jump_label_lock(void) +{ +	mutex_lock(&jump_label_mutex); +} + +void jump_label_unlock(void) +{ +	mutex_unlock(&jump_label_mutex); +} +  static int jump_label_cmp(const void *a, const void *b)  {  	const struct jump_entry *jea = a; @@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type)  	struct jump_label_module_entry *e_module;  	int count; -	mutex_lock(&jump_label_mutex); +	jump_label_lock();  	entry = get_jump_label_entry((jump_label_t)key);  	if (entry) {  		count = entry->nr_entries; @@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type)  			count = e_module->nr_entries;  			iter = e_module->table;  			while (count--) { -				if (kernel_text_address(iter->code)) +				if (iter->key && +						kernel_text_address(iter->code))  					arch_jump_label_transform(iter, type);  				iter++;  			}  		}  	} -	mutex_unlock(&jump_label_mutex); +	jump_label_unlock();  }  static int addr_conflict(struct jump_entry *entry, void *start, void *end) @@ -231,6 +242,7 @@ out:   * overlaps with any of the jump label patch addresses. Code   * that wants to modify kernel text should first verify that   * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex.   
*   * returns 1 if there is an overlap, 0 otherwise   */ @@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end)  	struct jump_entry *iter_stop = __start___jump_table;  	int conflict = 0; -	mutex_lock(&jump_label_mutex);  	iter = iter_start;  	while (iter < iter_stop) {  		if (addr_conflict(iter, start, end)) { @@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end)  	conflict = module_conflict(start, end);  #endif  out: -	mutex_unlock(&jump_label_mutex);  	return conflict;  } +/* + * Not all archs need this. + */ +void __weak arch_jump_label_text_poke_early(jump_label_t addr) +{ +} +  static __init int init_jump_label(void)  {  	int ret; @@ -267,7 +284,7 @@ static __init int init_jump_label(void)  	struct jump_entry *iter_stop = __stop___jump_table;  	struct jump_entry *iter; -	mutex_lock(&jump_label_mutex); +	jump_label_lock();  	ret = build_jump_label_hashtable(__start___jump_table,  					 __stop___jump_table);  	iter = iter_start; @@ -275,7 +292,7 @@ static __init int init_jump_label(void)  		arch_jump_label_text_poke_early(iter->code);  		iter++;  	} -	mutex_unlock(&jump_label_mutex); +	jump_label_unlock();  	return ret;  }  early_initcall(init_jump_label); @@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod)  	}  } +static void remove_jump_label_module_init(struct module *mod) +{ +	struct hlist_head *head; +	struct hlist_node *node, *node_next, *module_node, *module_node_next; +	struct jump_label_entry *e; +	struct jump_label_module_entry *e_module; +	struct jump_entry *iter; +	int i, count; + +	/* if the module doesn't have jump label entries, just return */ +	if (!mod->num_jump_entries) +		return; + +	for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { +		head = &jump_label_table[i]; +		hlist_for_each_entry_safe(e, node, node_next, head, hlist) { +			hlist_for_each_entry_safe(e_module, module_node, +						  module_node_next, +						  &(e->modules), hlist) { +				if (e_module->mod != mod) +					continue; +				count = e_module->nr_entries; +				iter = e_module->table; +				while (count--) { +					if (within_module_init(iter->code, mod)) +						iter->key = 0; +					iter++; +				} +			} +		} +	} +} +  static int  jump_label_module_notify(struct notifier_block *self, unsigned long val,  			 void *data) @@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,  	switch (val) {  	case MODULE_STATE_COMING: -		mutex_lock(&jump_label_mutex); +		jump_label_lock();  		ret = add_jump_label_module(mod);  		if (ret)  			remove_jump_label_module(mod); -		mutex_unlock(&jump_label_mutex); +		jump_label_unlock();  		break;  	case MODULE_STATE_GOING: -		mutex_lock(&jump_label_mutex); +		jump_label_lock();  		remove_jump_label_module(mod); -		mutex_unlock(&jump_label_mutex); +		jump_label_unlock(); +		break; +	case MODULE_STATE_LIVE: +		jump_label_lock(); +		remove_jump_label_module_init(mod); +		jump_label_unlock();  		break;  	}  	return ret; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 56a891914273..9737a76e106f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];  /* NOTE: change this value only with kprobe_mutex held */  static bool kprobes_all_disarmed; -static DEFINE_MUTEX(kprobe_mutex);	/* Protects kprobe_table */ +/* This protects kprobe_table and optimizing_list */ +static DEFINE_MUTEX(kprobe_mutex);  static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;  static struct {  	spinlock_t lock 
____cacheline_aligned_in_smp; @@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)  }  #ifdef CONFIG_SYSCTL +/* This should be called with kprobe_mutex locked */  static void __kprobes optimize_all_kprobes(void)  {  	struct hlist_head *head; @@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)  		return;  	kprobes_allow_optimization = true; -	mutex_lock(&text_mutex);  	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {  		head = &kprobe_table[i];  		hlist_for_each_entry_rcu(p, node, head, hlist)  			if (!kprobe_disabled(p))  				optimize_kprobe(p);  	} -	mutex_unlock(&text_mutex);  	printk(KERN_INFO "Kprobes globally optimized\n");  } +/* This should be called with kprobe_mutex locked */  static void __kprobes unoptimize_all_kprobes(void)  {  	struct hlist_head *head; @@ -1144,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p)  	if (ret)  		return ret; +	jump_label_lock();  	preempt_disable();  	if (!kernel_text_address((unsigned long) p->addr) ||  	    in_kprobes_functions((unsigned long) p->addr) ||  	    ftrace_text_reserved(p->addr, p->addr) || -	    jump_label_text_reserved(p->addr, p->addr)) { -		preempt_enable(); -		return -EINVAL; -	} +	    jump_label_text_reserved(p->addr, p->addr)) +		goto fail_with_jump_label;  	/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */  	p->flags &= KPROBE_FLAG_DISABLED; @@ -1165,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p)  		 * We must hold a refcount of the probed module while updating  		 * its code to prohibit unexpected unloading.  		 */ -		if (unlikely(!try_module_get(probed_mod))) { -			preempt_enable(); -			return -EINVAL; -		} +		if (unlikely(!try_module_get(probed_mod))) +			goto fail_with_jump_label; +  		/*  		 * If the module freed .init.text, we couldn't insert  		 * kprobes in there. @@ -1176,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p)  		if (within_module_init((unsigned long)p->addr, probed_mod) &&  		    probed_mod->state != MODULE_STATE_COMING) {  			module_put(probed_mod); -			preempt_enable(); -			return -EINVAL; +			goto fail_with_jump_label;  		}  	}  	preempt_enable(); +	jump_label_unlock();  	p->nmissed = 0;  	INIT_LIST_HEAD(&p->list);  	mutex_lock(&kprobe_mutex); +	jump_label_lock(); /* needed to call jump_label_text_reserved() */ +  	get_online_cpus();	/* For avoiding text_mutex deadlock. 
*/  	mutex_lock(&text_mutex); @@ -1213,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p)  out:  	mutex_unlock(&text_mutex);  	put_online_cpus(); +	jump_label_unlock();  	mutex_unlock(&kprobe_mutex);  	if (probed_mod)  		module_put(probed_mod);  	return ret; + +fail_with_jump_label: +	preempt_enable(); +	jump_label_unlock(); +	return -EINVAL;  }  EXPORT_SYMBOL_GPL(register_kprobe); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d415..17110a4a4fc2 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  	account_global_scheduler_latency(tsk, &lat); -	/* -	 * short term hack; if we're > 32 we stop; future we recycle: -	 */ -	tsk->latency_record_count++; -	if (tsk->latency_record_count >= LT_SAVECOUNT) -		goto out_unlock; - -	for (i = 0; i < LT_SAVECOUNT; i++) { +	for (i = 0; i < tsk->latency_record_count; i++) {  		struct latency_record *mylat;  		int same = 1; @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  		}  	} +	/* +	 * short term hack; if we're > 32 we stop; future we recycle: +	 */ +	if (tsk->latency_record_count >= LT_SAVECOUNT) +		goto out_unlock; +  	/* Allocated a new one: */ -	i = tsk->latency_record_count; +	i = tsk->latency_record_count++;  	memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));  out_unlock: diff --git a/kernel/module.c b/kernel/module.c index 2df46301a7a4..437a74a7524a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)  {  } -static void add_kallsyms(struct module *mod, struct load_info *info) +static void add_kallsyms(struct module *mod, const struct load_info *info)  {  }  #endif /* CONFIG_KALLSYMS */ diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 2a5dfec8efe0..2c98ad94ba0e 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,  		return ERR_PTR(-EPERM);  	if (!cgroup_is_descendant(cgroup, current))  		return ERR_PTR(-EPERM); +	if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { +		printk("ns_cgroup can't be created with parent " +		       "'clone_children' set.\n"); +		return ERR_PTR(-EINVAL); +	} + +	printk_once("ns_cgroup deprecated: consider using the " +		    "'clone_children' flag without the ns_cgroup.\n");  	ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);  	if (!ns_cgroup) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f309e8014c78..cb6c0d2af68f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event)  	return event->cpu == -1 || event->cpu == smp_processor_id();  } -static int -__event_sched_out(struct perf_event *event, +static void +event_sched_out(struct perf_event *event,  		  struct perf_cpu_context *cpuctx,  		  struct perf_event_context *ctx)  { @@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event,  	}  	if (event->state != PERF_EVENT_STATE_ACTIVE) -		return 0; +		return;  	event->state = PERF_EVENT_STATE_INACTIVE;  	if (event->pending_disable) {  		event->pending_disable = 0;  		event->state = PERF_EVENT_STATE_OFF;  	} +	event->tstamp_stopped = ctx->time;  	event->pmu->del(event, 0);  	event->oncpu = -1; @@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event,  	ctx->nr_active--;  	if (event->attr.exclusive || !cpuctx->active_oncpu)  		
cpuctx->exclusive = 0; -	return 1; -} - -static void -event_sched_out(struct perf_event *event, -		  struct perf_cpu_context *cpuctx, -		  struct perf_event_context *ctx) -{ -	int ret; - -	ret = __event_sched_out(event, cpuctx, ctx); -	if (ret) -		event->tstamp_stopped = ctx->time;  }  static void @@ -664,7 +652,7 @@ retry:  }  static int -__event_sched_in(struct perf_event *event, +event_sched_in(struct perf_event *event,  		 struct perf_cpu_context *cpuctx,  		 struct perf_event_context *ctx)  { @@ -684,6 +672,10 @@ __event_sched_in(struct perf_event *event,  		return -EAGAIN;  	} +	event->tstamp_running += ctx->time - event->tstamp_stopped; + +	event->shadow_ctx_time = ctx->time - ctx->timestamp; +  	if (!is_software_event(event))  		cpuctx->active_oncpu++;  	ctx->nr_active++; @@ -694,35 +686,6 @@ __event_sched_in(struct perf_event *event,  	return 0;  } -static inline int -event_sched_in(struct perf_event *event, -		 struct perf_cpu_context *cpuctx, -		 struct perf_event_context *ctx) -{ -	int ret = __event_sched_in(event, cpuctx, ctx); -	if (ret) -		return ret; -	event->tstamp_running += ctx->time - event->tstamp_stopped; -	return 0; -} - -static void -group_commit_event_sched_in(struct perf_event *group_event, -	       struct perf_cpu_context *cpuctx, -	       struct perf_event_context *ctx) -{ -	struct perf_event *event; -	u64 now = ctx->time; - -	group_event->tstamp_running += now - group_event->tstamp_stopped; -	/* -	 * Schedule in siblings as one group (if any): -	 */ -	list_for_each_entry(event, &group_event->sibling_list, group_entry) { -		event->tstamp_running += now - event->tstamp_stopped; -	} -} -  static int  group_sched_in(struct perf_event *group_event,  	       struct perf_cpu_context *cpuctx, @@ -730,19 +693,15 @@ group_sched_in(struct perf_event *group_event,  {  	struct perf_event *event, *partial_group = NULL;  	struct pmu *pmu = group_event->pmu; +	u64 now = ctx->time; +	bool simulate = false;  	if (group_event->state == PERF_EVENT_STATE_OFF)  		return 0;  	pmu->start_txn(pmu); -	/* -	 * use __event_sched_in() to delay updating tstamp_running -	 * until the transaction is committed. In case of failure -	 * we will keep an unmodified tstamp_running which is a -	 * requirement to get correct timing information -	 */ -	if (__event_sched_in(group_event, cpuctx, ctx)) { +	if (event_sched_in(group_event, cpuctx, ctx)) {  		pmu->cancel_txn(pmu);  		return -EAGAIN;  	} @@ -751,31 +710,42 @@ group_sched_in(struct perf_event *group_event,  	 * Schedule in siblings as one group (if any):  	 */  	list_for_each_entry(event, &group_event->sibling_list, group_entry) { -		if (__event_sched_in(event, cpuctx, ctx)) { +		if (event_sched_in(event, cpuctx, ctx)) {  			partial_group = event;  			goto group_error;  		}  	} -	if (!pmu->commit_txn(pmu)) { -		/* commit tstamp_running */ -		group_commit_event_sched_in(group_event, cpuctx, ctx); +	if (!pmu->commit_txn(pmu))  		return 0; -	} +  group_error:  	/*  	 * Groups can be scheduled in as one unit only, so undo any  	 * partial group before returning: +	 * The events up to the failed event are scheduled out normally, +	 * tstamp_stopped will be updated.  	 * -	 * use __event_sched_out() to avoid updating tstamp_stopped -	 * because the event never actually ran +	 * The failed events and the remaining siblings need to have +	 * their timings updated as if they had gone thru event_sched_in() +	 * and event_sched_out(). This is required to get consistent timings +	 * across the group. 
This also takes care of the case where the group +	 * could never be scheduled by ensuring tstamp_stopped is set to mark +	 * the time the event was actually stopped, such that time delta +	 * calculation in update_event_times() is correct.  	 */  	list_for_each_entry(event, &group_event->sibling_list, group_entry) {  		if (event == partial_group) -			break; -		__event_sched_out(event, cpuctx, ctx); +			simulate = true; + +		if (simulate) { +			event->tstamp_running += now - event->tstamp_stopped; +			event->tstamp_stopped = now; +		} else { +			event_sched_out(event, cpuctx, ctx); +		}  	} -	__event_sched_out(group_event, cpuctx, ctx); +	event_sched_out(group_event, cpuctx, ctx);  	pmu->cancel_txn(pmu); @@ -3428,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)  }  static void perf_output_read_one(struct perf_output_handle *handle, -				 struct perf_event *event) +				 struct perf_event *event, +				 u64 enabled, u64 running)  {  	u64 read_format = event->attr.read_format;  	u64 values[4]; @@ -3436,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,  	values[n++] = perf_event_count(event);  	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { -		values[n++] = event->total_time_enabled + +		values[n++] = enabled +  			atomic64_read(&event->child_total_time_enabled);  	}  	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { -		values[n++] = event->total_time_running + +		values[n++] = running +  			atomic64_read(&event->child_total_time_running);  	}  	if (read_format & PERF_FORMAT_ID) @@ -3453,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,   * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.   */  static void perf_output_read_group(struct perf_output_handle *handle, -			    struct perf_event *event) +			    struct perf_event *event, +			    u64 enabled, u64 running)  {  	struct perf_event *leader = event->group_leader, *sub;  	u64 read_format = event->attr.read_format; @@ -3463,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,  	values[n++] = 1 + leader->nr_siblings;  	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) -		values[n++] = leader->total_time_enabled; +		values[n++] = enabled;  	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) -		values[n++] = leader->total_time_running; +		values[n++] = running;  	if (leader != event)  		leader->pmu->read(leader); @@ -3491,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,  	}  } +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ +				 PERF_FORMAT_TOTAL_TIME_RUNNING) +  static void perf_output_read(struct perf_output_handle *handle,  			     struct perf_event *event)  { +	u64 enabled = 0, running = 0, now, ctx_time; +	u64 read_format = event->attr.read_format; + +	/* +	 * compute total_time_enabled, total_time_running +	 * based on snapshot values taken when the event +	 * was last scheduled in. 
+	 * +	 * we cannot simply called update_context_time() +	 * because of locking issue as we are called in +	 * NMI context +	 */ +	if (read_format & PERF_FORMAT_TOTAL_TIMES) { +		now = perf_clock(); +		ctx_time = event->shadow_ctx_time + now; +		enabled = ctx_time - event->tstamp_enabled; +		running = ctx_time - event->tstamp_running; +	} +  	if (event->attr.read_format & PERF_FORMAT_GROUP) -		perf_output_read_group(handle, event); +		perf_output_read_group(handle, event, enabled, running);  	else -		perf_output_read_one(handle, event); +		perf_output_read_one(handle, event, enabled, running);  }  void perf_output_sample(struct perf_output_handle *handle, diff --git a/kernel/printk.c b/kernel/printk.c index b2ebaee8c377..38e7d5868d60 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -261,6 +261,12 @@ static inline void boot_delay_msec(void)  }  #endif +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif +  int do_syslog(int type, char __user *buf, int len, bool from_file)  {  	unsigned i, j, limit, count; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f34d798ef4a2..99bbaa3e5b0d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)  	 * under ptrace.  	 */  	retval = -ERESTARTNOINTR; -	if (mutex_lock_interruptible(&task->cred_guard_mutex)) +	if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))  		goto out;  	task_lock(task); @@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)  unlock_tasklist:  	write_unlock_irq(&tasklist_lock);  unlock_creds: -	mutex_unlock(&task->cred_guard_mutex); +	mutex_unlock(&task->signal->cred_guard_mutex);  out:  	return retval;  } @@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)   * and reacquire the lock.   
*/  void exit_ptrace(struct task_struct *tracer) +	__releases(&tasklist_lock) +	__acquires(&tasklist_lock)  {  	struct task_struct *p, *n;  	LIST_HEAD(ptrace_dead); @@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds  	return copied;  } -static int ptrace_setoptions(struct task_struct *child, long data) +static int ptrace_setoptions(struct task_struct *child, unsigned long data)  {  	child->ptrace &= ~PT_TRACE_MASK; @@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)  #define is_sysemu_singlestep(request)	0  #endif -static int ptrace_resume(struct task_struct *child, long request, long data) +static int ptrace_resume(struct task_struct *child, long request, +			 unsigned long data)  {  	if (!valid_signal(data))  		return -EIO; @@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,  #endif  int ptrace_request(struct task_struct *child, long request, -		   long addr, long data) +		   unsigned long addr, unsigned long data)  {  	int ret = -EIO;  	siginfo_t siginfo; +	void __user *datavp = (void __user *) data; +	unsigned long __user *datalp = datavp;  	switch (request) {  	case PTRACE_PEEKTEXT: @@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,  		ret = ptrace_setoptions(child, data);  		break;  	case PTRACE_GETEVENTMSG: -		ret = put_user(child->ptrace_message, (unsigned long __user *) data); +		ret = put_user(child->ptrace_message, datalp);  		break;  	case PTRACE_GETSIGINFO:  		ret = ptrace_getsiginfo(child, &siginfo);  		if (!ret) -			ret = copy_siginfo_to_user((siginfo_t __user *) data, -						   &siginfo); +			ret = copy_siginfo_to_user(datavp, &siginfo);  		break;  	case PTRACE_SETSIGINFO: -		if (copy_from_user(&siginfo, (siginfo_t __user *) data, -				   sizeof siginfo)) +		if (copy_from_user(&siginfo, datavp, sizeof siginfo))  			ret = -EFAULT;  		else  			ret = ptrace_setsiginfo(child, &siginfo); @@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,  		}  		mmput(mm); -		ret = put_user(tmp, (unsigned long __user *) data); +		ret = put_user(tmp, datalp);  		break;  	}  #endif @@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,  	case PTRACE_SETREGSET:  	{  		struct iovec kiov; -		struct iovec __user *uiov = (struct iovec __user *) data; +		struct iovec __user *uiov = datavp;  		if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))  			return -EFAULT; @@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)  #define arch_ptrace_attach(child)	do { } while (0)  #endif -SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) +SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, +		unsigned long, data)  {  	struct task_struct *child;  	long ret; @@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)  	return ret;  } -int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, +			    unsigned long data)  {  	unsigned long tmp;  	int copied; @@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)  	return put_user(tmp, (unsigned long __user *)data);  } -int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, +			    unsigned long data)  {  	int copied; diff 
--git a/kernel/range.c b/kernel/range.c index 471b66acabb5..37fa9b99ad58 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)  int clean_sort_range(struct range *range, int az)  { -	int i, j, k = az - 1, nr_range = 0; +	int i, j, k = az - 1, nr_range = az;  	for (i = 0; i < k; i++) {  		if (range[i].end) diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb929..859ea5a9605f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {   */  static struct page **relay_alloc_page_array(unsigned int n_pages)  { -	struct page **array; -	size_t pa_size = n_pages * sizeof(struct page *); - -	if (pa_size > PAGE_SIZE) { -		array = vmalloc(pa_size); -		if (array) -			memset(array, 0, pa_size); -	} else { -		array = kzalloc(pa_size, GFP_KERNEL); -	} -	return array; +	const size_t pa_size = n_pages * sizeof(struct page *); +	if (pa_size > PAGE_SIZE) +		return vzalloc(pa_size); +	return kzalloc(pa_size, GFP_KERNEL);  }  /* diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5dea..9fad33efd0db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);  static DEFINE_RWLOCK(resource_lock); +/* + * By default, we allocate free space bottom-up.  The architecture can request + * top-down by clearing this flag.  The user can override the architecture's + * choice with the "resource_alloc_from_bottom" kernel boot option, but that + * should only be a debugging tool. + */ +int resource_alloc_from_bottom = 1; + +static __init int setup_alloc_from_bottom(char *s) +{ +	printk(KERN_INFO +	       "resource: allocating from bottom-up; please report a bug\n"); +	resource_alloc_from_bottom = 1; +	return 0; +} +early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); +  static void *r_next(struct seq_file *m, void *v, loff_t *pos)  {  	struct resource *p = v; @@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)  	return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;  } +static resource_size_t simple_align_resource(void *data, +					     const struct resource *avail, +					     resource_size_t size, +					     resource_size_t align) +{ +	return avail->start; +} + +static void resource_clip(struct resource *res, resource_size_t min, +			  resource_size_t max) +{ +	if (res->start < min) +		res->start = min; +	if (res->end > max) +		res->end = max; +} + +static bool resource_contains(struct resource *res1, struct resource *res2) +{ +	return res1->start <= res2->start && res1->end >= res2->end; +} + +/* + * Find the resource before "child" in the sibling list of "root" children. + */ +static struct resource *find_sibling_prev(struct resource *root, struct resource *child) +{ +	struct resource *this; + +	for (this = root->child; this; this = this->sibling) +		if (this->sibling == child) +			return this; + +	return NULL; +} + +/* + * Find empty slot in the resource tree given range and alignment. + * This version allocates from the end of the root resource first. 
+ */ +static int find_resource_from_top(struct resource *root, struct resource *new, +				  resource_size_t size, resource_size_t min, +				  resource_size_t max, resource_size_t align, +				  resource_size_t (*alignf)(void *, +						   const struct resource *, +						   resource_size_t, +						   resource_size_t), +				  void *alignf_data) +{ +	struct resource *this; +	struct resource tmp, avail, alloc; + +	tmp.start = root->end; +	tmp.end = root->end; + +	this = find_sibling_prev(root, NULL); +	for (;;) { +		if (this) { +			if (this->end < root->end) +				tmp.start = this->end + 1; +		} else +			tmp.start = root->start; + +		resource_clip(&tmp, min, max); + +		/* Check for overflow after ALIGN() */ +		avail = *new; +		avail.start = ALIGN(tmp.start, align); +		avail.end = tmp.end; +		if (avail.start >= tmp.start) { +			alloc.start = alignf(alignf_data, &avail, size, align); +			alloc.end = alloc.start + size - 1; +			if (resource_contains(&avail, &alloc)) { +				new->start = alloc.start; +				new->end = alloc.end; +				return 0; +			} +		} + +		if (!this || this->start == root->start) +			break; + +		tmp.end = this->start - 1; +		this = find_sibling_prev(root, this); +	} +	return -EBUSY; +} +  /*   * Find empty slot in the resource tree given range and alignment. + * This version allocates from the beginning of the root resource first.   */  static int find_resource(struct resource *root, struct resource *new,  			 resource_size_t size, resource_size_t min, @@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,  			 void *alignf_data)  {  	struct resource *this = root->child; -	struct resource tmp = *new; +	struct resource tmp = *new, avail, alloc;  	tmp.start = root->start;  	/* -	 * Skip past an allocated resource that starts at 0, since the assignment -	 * of this->start - 1 to tmp->end below would cause an underflow. +	 * Skip past an allocated resource that starts at 0, since the +	 * assignment of this->start - 1 to tmp->end below would cause an +	 * underflow.  	 
*/  	if (this && this->start == 0) {  		tmp.start = this->end + 1;  		this = this->sibling;  	} -	for(;;) { +	for (;;) {  		if (this)  			tmp.end = this->start - 1;  		else  			tmp.end = root->end; -		if (tmp.start < min) -			tmp.start = min; -		if (tmp.end > max) -			tmp.end = max; -		tmp.start = ALIGN(tmp.start, align); -		if (alignf) -			tmp.start = alignf(alignf_data, &tmp, size, align); -		if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { -			new->start = tmp.start; -			new->end = tmp.start + size - 1; -			return 0; + +		resource_clip(&tmp, min, max); + +		/* Check for overflow after ALIGN() */ +		avail = *new; +		avail.start = ALIGN(tmp.start, align); +		avail.end = tmp.end; +		if (avail.start >= tmp.start) { +			alloc.start = alignf(alignf_data, &avail, size, align); +			alloc.end = alloc.start + size - 1; +			if (resource_contains(&avail, &alloc)) { +				new->start = alloc.start; +				new->end = alloc.end; +				return 0; +			}  		} +  		if (!this)  			break; +  		tmp.start = this->end + 1;  		this = this->sibling;  	} @@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,  {  	int err; +	if (!alignf) +		alignf = simple_align_resource; +  	write_lock(&resource_lock); -	err = find_resource(root, new, size, min, max, align, alignf, alignf_data); +	if (resource_alloc_from_bottom) +		err = find_resource(root, new, size, min, max, align, alignf, alignf_data); +	else +		err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);  	if (err >= 0 && __request_resource(root, new))  		err = -EBUSY;  	write_unlock(&resource_lock); @@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou  		if (first == parent)  			return first; +		if (WARN_ON(first == new))	/* duplicated insertion */ +			return first;  		if ((first->start > new->start) || (first->end < new->end))  			break; diff --git a/kernel/sched.c b/kernel/sched.c index d42992bccdfa..aa14a56f9d03 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8510,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk)  	if (unlikely(running))  		tsk->sched_class->put_prev_task(rq, tsk); -	set_task_rq(tsk, task_cpu(tsk)); -  #ifdef CONFIG_FAIR_GROUP_SCHED -	if (tsk->sched_class->moved_group) -		tsk->sched_class->moved_group(tsk, on_rq); +	if (tsk->sched_class->task_move_group) +		tsk->sched_class->task_move_group(tsk, on_rq); +	else  #endif +		set_task_rq(tsk, task_cpu(tsk));  	if (unlikely(running))  		tsk->sched_class->set_curr_task(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 933f3d1b62ea..f4f6a8326dd0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3869,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq)  }  #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int on_rq)  { -	struct cfs_rq *cfs_rq = task_cfs_rq(p); - -	update_curr(cfs_rq); +	/* +	 * If the task was not on the rq at the time of this cgroup movement +	 * it must have been asleep, sleeping tasks keep their ->vruntime +	 * absolute on their old rq until wakeup (needed for the fair sleeper +	 * bonus in place_entity()). +	 * +	 * If it was on the rq, we've just 'preempted' it, which does convert +	 * ->vruntime to a relative base. +	 * +	 * Make sure both cases convert their relative position when migrating +	 * to another cgroup's rq. 
This does somewhat interfere with the +	 * fair sleeper stuff for the first placement, but who cares. +	 */ +	if (!on_rq) +		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; +	set_task_rq(p, task_cpu(p));  	if (!on_rq) -		place_entity(cfs_rq, &p->se, 1); +		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;  }  #endif @@ -3927,7 +3940,7 @@ static const struct sched_class fair_sched_class = {  	.get_rr_interval	= get_rr_interval_fair,  #ifdef CONFIG_FAIR_GROUP_SCHED -	.moved_group		= moved_group_fair, +	.task_move_group	= task_move_group_fair,  #endif  }; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 25c2f962f6fc..48ddf431db0e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)  }  /* - * Called when a process is dequeued from the active array and given - * the cpu.  We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue.  (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * Though we are interested in knowing how long it was from the *first* time a + * We are interested in knowing how long it was from the *first* time a   * task was queued to the time that it finally hit a cpu, we call this routine   * from dequeue_task() to account for possible rq->clock skew across cpus. The   * delta taken on each cpu would annul the skew. @@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)  }  /* - * Called when a process is queued into either the active or expired - * array.  The time is noted and later used to determine how long we - * had to wait for us to reach the cpu.  Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - *   * This function is only called from enqueue_task(), but also only updates   * the timestamp if it is already not set.  It's assumed that   * sched_info_dequeued() will clear that stamp when appropriate. diff --git a/kernel/signal.c b/kernel/signal.c index 919562c3d6b7..4e3cff10fdce 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)  	return count;  } -struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) +struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, +					   unsigned long *flags)  {  	struct sighand_struct *sighand; @@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)   * is gone, we keep current->exit_code unless clear_code.   
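
Stepping back to the task_move_group_fair() hunk above: for a sleeping task the patch converts ->vruntime from an absolute value on the old cfs_rq to a relative one, switches the task's runqueue, then makes it absolute again against the destination queue's min_vruntime. The following is a toy, plain-C illustration of that renormalisation with made-up numbers; the scheduler's real types, locking and set_task_rq() are not modelled.

#include <stdio.h>

int main(void)
{
    /* min_vruntime of the source and destination cfs_rq (made-up values) */
    unsigned long long old_min = 1000000, new_min = 5000000;

    /* A sleeping task keeps an absolute vruntime on its old queue. */
    unsigned long long vruntime = 1002500;

    vruntime -= old_min;   /* now relative: 2500 ahead of the old minimum */
    /* ... set_task_rq() would switch the task to the new group here ... */
    vruntime += new_min;   /* absolute again, preserving the relative lead */

    printf("vruntime on the new queue: %llu\n", vruntime); /* 5002500 */
    return 0;
}

For a task that was on the runqueue, the surrounding dequeue/enqueue already performs the relative/absolute conversion, which is why the hunk only adjusts ->vruntime by hand in the !on_rq case.
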
*/  static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) +	__releases(¤t->sighand->siglock) +	__acquires(¤t->sighand->siglock)  {  	if (arch_ptrace_stop_needed(exit_code, info)) {  		/* diff --git a/kernel/smp.c b/kernel/smp.c index ed6aacfcb7ef..12ed8b013e2d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);   *   * Returns 0 on success, else a negative status code.   */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, +int smp_call_function_single(int cpu, smp_call_func_t func, void *info,  			     int wait)  {  	struct call_single_data d = { @@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);   *	3) any other online cpu in @mask   */  int smp_call_function_any(const struct cpumask *mask, -			  void (*func)(void *info), void *info, int wait) +			  smp_call_func_t func, void *info, int wait)  {  	unsigned int cpu;  	const struct cpumask *nodemask; @@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,   * must be disabled when calling this function.   */  void smp_call_function_many(const struct cpumask *mask, -			    void (*func)(void *), void *info, bool wait) +			    smp_call_func_t func, void *info, bool wait)  {  	struct call_function_data *data;  	unsigned long flags; @@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);   * You must not call this function with disabled interrupts or from a   * hardware interrupt handler or from a bottom half handler.   */ -int smp_call_function(void (*func)(void *), void *info, int wait) +int smp_call_function(smp_call_func_t func, void *info, int wait)  {  	preempt_disable();  	smp_call_function_many(cpu_online_mask, func, info, wait); diff --git a/kernel/softirq.c b/kernel/softirq.c index f02a9dfa19bc..18f4be0d5fe0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -229,18 +229,20 @@ restart:  	do {  		if (pending & 1) { +			unsigned int vec_nr = h - softirq_vec;  			int prev_count = preempt_count(); -			kstat_incr_softirqs_this_cpu(h - softirq_vec); -			trace_softirq_entry(h, softirq_vec); +			kstat_incr_softirqs_this_cpu(vec_nr); + +			trace_softirq_entry(vec_nr);  			h->action(h); -			trace_softirq_exit(h, softirq_vec); +			trace_softirq_exit(vec_nr);  			if (unlikely(prev_count != preempt_count())) { -				printk(KERN_ERR "huh, entered softirq %td %s %p" +				printk(KERN_ERR "huh, entered softirq %u %s %p"  				       "with preempt_count %08x," -				       " exited with %08x?\n", h - softirq_vec, -				       softirq_to_name[h - softirq_vec], -				       h->action, prev_count, preempt_count()); +				       " exited with %08x?\n", vec_nr, +				       softirq_to_name[vec_nr], h->action, +				       prev_count, preempt_count());  				preempt_count() = prev_count;  			} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1edb799f..b65bf634035e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -704,6 +704,15 @@ static struct ctl_table kern_table[] = {  	},  #endif  	{ +		.procname	= "dmesg_restrict", +		.data		= &dmesg_restrict, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +	{  		.procname	= "ngroups_max",  		.data		= &ngroups_max,  		.maxlen		= sizeof (int), diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11281d5792bd..c8231fb15708 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff 
*skb,  	up_write(&listeners->sem);  } -static int fill_pid(pid_t pid, struct task_struct *tsk, -		struct taskstats *stats) +static void fill_stats(struct task_struct *tsk, struct taskstats *stats)  { -	int rc = 0; - -	if (!tsk) { -		rcu_read_lock(); -		tsk = find_task_by_vpid(pid); -		if (tsk) -			get_task_struct(tsk); -		rcu_read_unlock(); -		if (!tsk) -			return -ESRCH; -	} else -		get_task_struct(tsk); -  	memset(stats, 0, sizeof(*stats));  	/*  	 * Each accounting subsystem adds calls to its functions to @@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,  	/* fill in extended acct fields */  	xacct_add_tsk(stats, tsk); +} -	/* Define err: label here if needed */ -	put_task_struct(tsk); -	return rc; +static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) +{ +	struct task_struct *tsk; +	rcu_read_lock(); +	tsk = find_task_by_vpid(pid); +	if (tsk) +		get_task_struct(tsk); +	rcu_read_unlock(); +	if (!tsk) +		return -ESRCH; +	fill_stats(tsk, stats); +	put_task_struct(tsk); +	return 0;  } -static int fill_tgid(pid_t tgid, struct task_struct *first, -		struct taskstats *stats) +static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)  { -	struct task_struct *tsk; +	struct task_struct *tsk, *first;  	unsigned long flags;  	int rc = -ESRCH; @@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,  	 * leaders who are already counted with the dead tasks  	 */  	rcu_read_lock(); -	if (!first) -		first = find_task_by_vpid(tgid); +	first = find_task_by_vpid(tgid);  	if (!first || !lock_task_sighand(first, &flags))  		goto out; @@ -268,7 +263,6 @@ out:  	return rc;  } -  static void fill_tgid_exit(struct task_struct *tsk)  {  	unsigned long flags; @@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)  	struct nlattr *na, *ret;  	int aggr; +	/* If we don't pad, we end up with alignment on a 4 byte boundary. +	 * This causes lots of runtime warnings on systems requiring 8 byte +	 * alignment */ +	u32 pids[2] = { pid, 0 }; +	int pid_size = ALIGN(sizeof(pid), sizeof(long)); +  	aggr = (type == TASKSTATS_TYPE_PID)  			? 
TASKSTATS_TYPE_AGGR_PID  			: TASKSTATS_TYPE_AGGR_TGID; @@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)  	na = nla_nest_start(skb, aggr);  	if (!na)  		goto err; -	if (nla_put(skb, type, sizeof(pid), &pid) < 0) +	if (nla_put(skb, type, pid_size, pids) < 0)  		goto err;  	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));  	if (!ret) @@ -424,39 +424,46 @@ err:  	return rc;  } -static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +static int cmd_attr_register_cpumask(struct genl_info *info)  { -	int rc; -	struct sk_buff *rep_skb; -	struct taskstats *stats; -	size_t size;  	cpumask_var_t mask; +	int rc;  	if (!alloc_cpumask_var(&mask, GFP_KERNEL))  		return -ENOMEM; -  	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);  	if (rc < 0) -		goto free_return_rc; -	if (rc == 0) { -		rc = add_del_listener(info->snd_pid, mask, REGISTER); -		goto free_return_rc; -	} +		goto out; +	rc = add_del_listener(info->snd_pid, mask, REGISTER); +out: +	free_cpumask_var(mask); +	return rc; +} + +static int cmd_attr_deregister_cpumask(struct genl_info *info) +{ +	cpumask_var_t mask; +	int rc; +	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) +		return -ENOMEM;  	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);  	if (rc < 0) -		goto free_return_rc; -	if (rc == 0) { -		rc = add_del_listener(info->snd_pid, mask, DEREGISTER); -free_return_rc: -		free_cpumask_var(mask); -		return rc; -	} +		goto out; +	rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +out:  	free_cpumask_var(mask); +	return rc; +} + +static int cmd_attr_pid(struct genl_info *info) +{ +	struct taskstats *stats; +	struct sk_buff *rep_skb; +	size_t size; +	u32 pid; +	int rc; -	/* -	 * Size includes space for nested attributes -	 */  	size = nla_total_size(sizeof(u32)) +  		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); @@ -465,33 +472,64 @@ free_return_rc:  		return rc;  	rc = -EINVAL; -	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { -		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); -		stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); -		if (!stats) -			goto err; - -		rc = fill_pid(pid, NULL, stats); -		if (rc < 0) -			goto err; -	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { -		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); -		stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); -		if (!stats) -			goto err; - -		rc = fill_tgid(tgid, NULL, stats); -		if (rc < 0) -			goto err; -	} else +	pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); +	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); +	if (!stats) +		goto err; + +	rc = fill_stats_for_pid(pid, stats); +	if (rc < 0) +		goto err; +	return send_reply(rep_skb, info); +err: +	nlmsg_free(rep_skb); +	return rc; +} + +static int cmd_attr_tgid(struct genl_info *info) +{ +	struct taskstats *stats; +	struct sk_buff *rep_skb; +	size_t size; +	u32 tgid; +	int rc; + +	size = nla_total_size(sizeof(u32)) + +		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + +	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); +	if (rc < 0) +		return rc; + +	rc = -EINVAL; +	tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); +	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); +	if (!stats)  		goto err; +	rc = fill_stats_for_tgid(tgid, stats); +	if (rc < 0) +		goto err;  	return send_reply(rep_skb, info);  err:  	nlmsg_free(rep_skb);  	return rc;  } +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info 
*info) +{ +	if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) +		return cmd_attr_register_cpumask(info); +	else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) +		return cmd_attr_deregister_cpumask(info); +	else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) +		return cmd_attr_pid(info); +	else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) +		return cmd_attr_tgid(info); +	else +		return -EINVAL; +} +  static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)  {  	struct signal_struct *sig = tsk->signal; @@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)  	if (!stats)  		goto err; -	rc = fill_pid(-1, tsk, stats); -	if (rc < 0) -		goto err; +	fill_stats(tsk, stats);  	/*  	 * Doesn't matter if tsk is the leader or the last group member leaving diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed66724..7b8ec0281548 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,  static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),  				 BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER	BLK_TC_BARRIER  #define BLK_TC_RAHEAD		BLK_TC_AHEAD  /* The ilog2() calls fall out because they're constant */ @@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  		return;  	what |= ddir_act[rw & WRITE]; -	what |= MASK_TC_BIT(rw, HARDBARRIER);  	what |= MASK_TC_BIT(rw, SYNC);  	what |= MASK_TC_BIT(rw, RAHEAD);  	what |= MASK_TC_BIT(rw, META); @@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)  	if (rw & REQ_RAHEAD)  		rwbs[i++] = 'A'; -	if (rw & REQ_HARDBARRIER) -		rwbs[i++] = 'B';  	if (rw & REQ_SYNC)  		rwbs[i++] = 'S';  	if (rw & REQ_META) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c3dab054d18e..9ed509a015d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -224,6 +224,9 @@ enum {  	RB_LEN_TIME_STAMP = 16,  }; +#define skip_time_extend(event) \ +	((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) +  static inline int rb_null_event(struct ring_buffer_event *event)  {  	return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)  	return length + RB_EVNT_HDR_SIZE;  } -/* inline for ring buffer fast paths */ -static unsigned +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned  rb_event_length(struct ring_buffer_event *event)  {  	switch (event->type_len) { @@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)  	return 0;  } +/* + * Return total length of time extend and data, + *   or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ +	unsigned len = 0; + +	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { +		/* time extends include the data event after it */ +		len = RB_LEN_TIME_EXTEND; +		event = skip_time_extend(event); +	} +	return len + rb_event_length(event); +} +  /**   * ring_buffer_event_length - return the length of the event   * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. 
With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it.   */  unsigned ring_buffer_event_length(struct ring_buffer_event *event)  { -	unsigned length = rb_event_length(event); +	unsigned length; + +	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) +		event = skip_time_extend(event); + +	length = rb_event_length(event);  	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)  		return length;  	length -= RB_EVNT_HDR_SIZE; @@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);  static void *  rb_event_data(struct ring_buffer_event *event)  { +	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) +		event = skip_time_extend(event);  	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);  	/* If length is in len field, then array[0] has the data */  	if (event->type_len) @@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)  /* Max payload is BUF_PAGE_SIZE - header (8bytes) */  #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -/* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) -  int ring_buffer_print_page_header(struct trace_seq *s)  {  	struct buffer_data_page field; @@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)  	iter->head = 0;  } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ +	event->type_len = RINGBUF_TYPE_TIME_EXTEND; + +	/* Not the first event on the page? */ +	if (rb_event_index(event)) { +		event->time_delta = delta & TS_MASK; +		event->array[0] = delta >> TS_SHIFT; +	} else { +		/* nope, just zero it */ +		event->time_delta = 0; +		event->array[0] = 0; +	} + +	return skip_time_extend(event); +} +  /**   * ring_buffer_update_event - update event type and data   * @event: the even to update @@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)   * data field.   */  static void -rb_update_event(struct ring_buffer_event *event, -			 unsigned type, unsigned length) +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, +		struct ring_buffer_event *event, unsigned length, +		int add_timestamp, u64 delta)  { -	event->type_len = type; - -	switch (type) { - -	case RINGBUF_TYPE_PADDING: -	case RINGBUF_TYPE_TIME_EXTEND: -	case RINGBUF_TYPE_TIME_STAMP: -		break; +	/* Only a commit updates the timestamp */ +	if (unlikely(!rb_event_is_commit(cpu_buffer, event))) +		delta = 0; -	case 0: -		length -= RB_EVNT_HDR_SIZE; -		if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) -			event->array[0] = length; -		else -			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -		break; -	default: -		BUG(); +	/* +	 * If we need to add a timestamp, then we +	 * add it to the start of the resevered space. +	 */ +	if (unlikely(add_timestamp)) { +		event = rb_add_time_stamp(event, delta); +		length -= RB_LEN_TIME_EXTEND; +		delta = 0;  	} + +	event->time_delta = delta; +	length -= RB_EVNT_HDR_SIZE; +	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { +		event->type_len = 0; +		event->array[0] = length; +	} else +		event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);  }  /* @@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,  	local_sub(length, &tail_page->write);  } -static struct ring_buffer_event * +/* + * This is the slow path, force gcc not to inline it. 
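
The TIME_EXTEND path above splits an oversized timestamp delta between the event's narrow time_delta field and array[0] (rb_add_time_stamp()), and the reader reassembles it by shifting array[0] back up (rb_update_write_stamp(), further below). Here is a standalone sketch of that split-and-rebuild round trip; the 27-bit TS_SHIFT/TS_MASK constants and struct fake_extend are simplified assumptions for illustration, not the ring buffer's actual definitions.

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

/* Stand-in for the two fields a TIME_EXTEND event uses to carry the delta. */
struct fake_extend {
    uint32_t time_delta;   /* low 27 bits */
    uint32_t array0;       /* remaining high bits */
};

int main(void)
{
    uint64_t delta = (123ULL << TS_SHIFT) + 4567;   /* too large for 27 bits */

    /* Writer side, as in rb_add_time_stamp(): split the delta. */
    struct fake_extend ev = {
        .time_delta = (uint32_t)(delta & TS_MASK),
        .array0     = (uint32_t)(delta >> TS_SHIFT),
    };

    /* Reader side, as in rb_update_write_stamp(): rebuild it. */
    uint64_t rebuilt = ((uint64_t)ev.array0 << TS_SHIFT) + ev.time_delta;

    printf("delta %llu -> rebuilt %llu (%s)\n",
           (unsigned long long)delta, (unsigned long long)rebuilt,
           delta == rebuilt ? "ok" : "mismatch");
    return 0;
}
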
+ */ +static noinline struct ring_buffer_event *  rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,  	     unsigned long length, unsigned long tail, -	     struct buffer_page *tail_page, u64 *ts) +	     struct buffer_page *tail_page, u64 ts)  {  	struct buffer_page *commit_page = cpu_buffer->commit_page;  	struct ring_buffer *buffer = cpu_buffer->buffer; @@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,  		 * Nested commits always have zero deltas, so  		 * just reread the time stamp  		 */ -		*ts = rb_time_stamp(buffer); -		next_page->page->time_stamp = *ts; +		ts = rb_time_stamp(buffer); +		next_page->page->time_stamp = ts;  	}   out_again: @@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,  static struct ring_buffer_event *  __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, -		  unsigned type, unsigned long length, u64 *ts) +		  unsigned long length, u64 ts, +		  u64 delta, int add_timestamp)  {  	struct buffer_page *tail_page;  	struct ring_buffer_event *event;  	unsigned long tail, write; +	/* +	 * If the time delta since the last event is too big to +	 * hold in the time field of the event, then we append a +	 * TIME EXTEND event ahead of the data event. +	 */ +	if (unlikely(add_timestamp)) +		length += RB_LEN_TIME_EXTEND; +  	tail_page = cpu_buffer->tail_page;  	write = local_add_return(length, &tail_page->write); @@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  	tail = write - length;  	/* See if we shot pass the end of this buffer page */ -	if (write > BUF_PAGE_SIZE) +	if (unlikely(write > BUF_PAGE_SIZE))  		return rb_move_tail(cpu_buffer, length, tail,  				    tail_page, ts); @@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  	event = __rb_page_index(tail_page, tail);  	kmemcheck_annotate_bitfield(event, bitfield); -	rb_update_event(event, type, length); +	rb_update_event(cpu_buffer, event, length, add_timestamp, delta); -	/* The passed in type is zero for DATA */ -	if (likely(!type)) -		local_inc(&tail_page->entries); +	local_inc(&tail_page->entries);  	/*  	 * If this is the first commit on the page, then update  	 * its timestamp.  	 */  	if (!tail) -		tail_page->page->time_stamp = *ts; +		tail_page->page->time_stamp = ts;  	return event;  } @@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,  	unsigned long addr;  	new_index = rb_event_index(event); -	old_index = new_index + rb_event_length(event); +	old_index = new_index + rb_event_ts_length(event);  	addr = (unsigned long)event;  	addr &= PAGE_MASK; @@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,  	return 0;  } -static int -rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, -		  u64 *ts, u64 *delta) -{ -	struct ring_buffer_event *event; -	int ret; - -	WARN_ONCE(*delta > (1ULL << 59), -		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", -		  (unsigned long long)*delta, -		  (unsigned long long)*ts, -		  (unsigned long long)cpu_buffer->write_stamp); - -	/* -	 * The delta is too big, we to add a -	 * new timestamp. 
-	 */ -	event = __rb_reserve_next(cpu_buffer, -				  RINGBUF_TYPE_TIME_EXTEND, -				  RB_LEN_TIME_EXTEND, -				  ts); -	if (!event) -		return -EBUSY; - -	if (PTR_ERR(event) == -EAGAIN) -		return -EAGAIN; - -	/* Only a commited time event can update the write stamp */ -	if (rb_event_is_commit(cpu_buffer, event)) { -		/* -		 * If this is the first on the page, then it was -		 * updated with the page itself. Try to discard it -		 * and if we can't just make it zero. -		 */ -		if (rb_event_index(event)) { -			event->time_delta = *delta & TS_MASK; -			event->array[0] = *delta >> TS_SHIFT; -		} else { -			/* try to discard, since we do not need this */ -			if (!rb_try_to_discard(cpu_buffer, event)) { -				/* nope, just zero it */ -				event->time_delta = 0; -				event->array[0] = 0; -			} -		} -		cpu_buffer->write_stamp = *ts; -		/* let the caller know this was the commit */ -		ret = 1; -	} else { -		/* Try to discard the event */ -		if (!rb_try_to_discard(cpu_buffer, event)) { -			/* Darn, this is just wasted space */ -			event->time_delta = 0; -			event->array[0] = 0; -		} -		ret = 0; -	} - -	*delta = 0; - -	return ret; -} -  static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)  {  	local_inc(&cpu_buffer->committing);  	local_inc(&cpu_buffer->commits);  } -static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)  {  	unsigned long commits; @@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,  		      unsigned long length)  {  	struct ring_buffer_event *event; -	u64 ts, delta = 0; -	int commit = 0; +	u64 ts, delta;  	int nr_loops = 0; +	int add_timestamp; +	u64 diff;  	rb_start_commit(cpu_buffer); @@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,  	length = rb_calculate_event_length(length);   again: +	add_timestamp = 0; +	delta = 0; +  	/*  	 * We allow for interrupts to reenter here and do a trace.  	 * If one does, it will cause this original code to loop @@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,  		goto out_fail;  	ts = rb_time_stamp(cpu_buffer->buffer); +	diff = ts - cpu_buffer->write_stamp; -	/* -	 * Only the first commit can update the timestamp. -	 * Yes there is a race here. If an interrupt comes in -	 * just after the conditional and it traces too, then it -	 * will also check the deltas. More than one timestamp may -	 * also be made. But only the entry that did the actual -	 * commit will be something other than zero. -	 */ -	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && -		   rb_page_write(cpu_buffer->tail_page) == -		   rb_commit_index(cpu_buffer))) { -		u64 diff; - -		diff = ts - cpu_buffer->write_stamp; - -		/* make sure this diff is calculated here */ -		barrier(); - -		/* Did the write stamp get updated already? */ -		if (unlikely(ts < cpu_buffer->write_stamp)) -			goto get_event; +	/* make sure this diff is calculated here */ +	barrier(); +	/* Did the write stamp get updated already? */ +	if (likely(ts >= cpu_buffer->write_stamp)) {  		delta = diff;  		if (unlikely(test_time_stamp(delta))) { - -			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); -			if (commit == -EBUSY) -				goto out_fail; - -			if (commit == -EAGAIN) -				goto again; - -			RB_WARN_ON(cpu_buffer, commit < 0); +			WARN_ONCE(delta > (1ULL << 59), +				  KERN_WARNING "Delta way too big! 
%llu ts=%llu write stamp = %llu\n", +				  (unsigned long long)delta, +				  (unsigned long long)ts, +				  (unsigned long long)cpu_buffer->write_stamp); +			add_timestamp = 1;  		}  	} - get_event: -	event = __rb_reserve_next(cpu_buffer, 0, length, &ts); +	event = __rb_reserve_next(cpu_buffer, length, ts, +				  delta, add_timestamp);  	if (unlikely(PTR_ERR(event) == -EAGAIN))  		goto again;  	if (!event)  		goto out_fail; -	if (!rb_event_is_commit(cpu_buffer, event)) -		delta = 0; - -	event->time_delta = delta; -  	return event;   out_fail: @@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,  #define TRACE_RECURSIVE_DEPTH 16 -static int trace_recursive_lock(void) +/* Keep this code out of the fast path cache */ +static noinline void trace_recursive_fail(void)  { -	current->trace_recursion++; - -	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) -		return 0; -  	/* Disable all tracing before we do anything else */  	tracing_off_permanent(); @@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)  		    in_nmi());  	WARN_ON_ONCE(1); +} + +static inline int trace_recursive_lock(void) +{ +	current->trace_recursion++; + +	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) +		return 0; + +	trace_recursive_fail(); +  	return -1;  } -static void trace_recursive_unlock(void) +static inline void trace_recursive_unlock(void)  {  	WARN_ON_ONCE(!current->trace_recursion); @@ -2308,12 +2298,28 @@ static void  rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,  		      struct ring_buffer_event *event)  { +	u64 delta; +  	/*  	 * The event first in the commit queue updates the  	 * time stamp.  	 */ -	if (rb_event_is_commit(cpu_buffer, event)) -		cpu_buffer->write_stamp += event->time_delta; +	if (rb_event_is_commit(cpu_buffer, event)) { +		/* +		 * A commit event that is first on a page +		 * updates the write timestamp with the page stamp +		 */ +		if (!rb_event_index(event)) +			cpu_buffer->write_stamp = +				cpu_buffer->commit_page->page->time_stamp; +		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { +			delta = event->array[0]; +			delta <<= TS_SHIFT; +			delta += event->time_delta; +			cpu_buffer->write_stamp += delta; +		} else +			cpu_buffer->write_stamp += event->time_delta; +	}  }  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, @@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);  static inline void rb_event_discard(struct ring_buffer_event *event)  { +	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) +		event = skip_time_extend(event); +  	/* array[0] holds the actual length for the discarded event */  	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;  	event->type_len = RINGBUF_TYPE_PADDING; @@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,   again:  	/* -	 * We repeat when a timestamp is encountered. It is possible -	 * to get multiple timestamps from an interrupt entering just -	 * as one timestamp is about to be written, or from discarded -	 * commits. The most that we can have is the number on a single page. +	 * We repeat when a time extend is encountered. +	 * Since the time extend is always attached to a data event, +	 * we should never loop more than once. +	 * (We never hit the following condition more than twice).  	 
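
The trace_recursive_lock()/trace_recursive_unlock() change above follows a common guard pattern: bump a per-task counter on entry, push the rare failure handling into a separate noinline function so the hot path stays compact, and decrement on exit. Below is a userspace sketch of the same pattern under stated assumptions: thread-local storage stands in for current->trace_recursion, and the sketch undoes the increment on failure rather than copying the kernel's exact error path.

#include <stdio.h>

#define MAX_DEPTH 16

/* Thread-local stand-in for current->trace_recursion. */
static __thread int recursion_depth;

/* Failure handling kept out of line, echoing trace_recursive_fail(). */
static void __attribute__((noinline)) recursion_fail(void)
{
    fprintf(stderr, "recursion limit (%d) hit, refusing to go deeper\n",
            MAX_DEPTH);
}

static inline int recursive_lock(void)
{
    if (++recursion_depth < MAX_DEPTH)
        return 0;
    recursion_fail();
    recursion_depth--;   /* undo the bump so this sketch stays balanced */
    return -1;
}

static inline void recursive_unlock(void)
{
    recursion_depth--;
}

static void recurse(int n)
{
    if (recursive_lock())
        return;                    /* too deep: back out without recursing */
    if (n)
        recurse(n - 1);
    recursive_unlock();
}

int main(void)
{
    recurse(20);                   /* trips the guard at the 16th nested level */
    return 0;
}
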
*/ -	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) +	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))  		return NULL;  	reader = rb_get_reader_page(cpu_buffer); @@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)  		return NULL;  	/* -	 * We repeat when a timestamp is encountered. -	 * We can get multiple timestamps by nested interrupts or also -	 * if filtering is on (discarding commits). Since discarding -	 * commits can be frequent we can get a lot of timestamps. -	 * But we limit them by not adding timestamps if they begin -	 * at the start of a page. +	 * We repeat when a time extend is encountered. +	 * Since the time extend is always attached to a data event, +	 * we should never loop more than once. +	 * (We never hit the following condition more than twice).  	 */ -	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) +	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))  		return NULL;  	if (rb_per_cpu_empty(cpu_buffer)) @@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  		if (len > (commit - read))  			len = (commit - read); -		size = rb_event_length(event); +		/* Always keep the time extend and data together */ +		size = rb_event_ts_length(event);  		if (len < size)  			goto out_unlock; @@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  				break;  			event = rb_reader_event(cpu_buffer); -			size = rb_event_length(event); +			/* Always keep the time extend and data together */ +			size = rb_event_ts_length(event);  		} while (len > size);  		/* update bpage */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 001bcd2ccf4a..82d9b8106cd0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)  {  	struct dentry *d_percpu = tracing_dentry_percpu();  	struct dentry *d_cpu; -	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */ -	char cpu_dir[7]; +	char cpu_dir[30]; /* 30 characters should be more than enough */ -	if (cpu > 999 || cpu < 0) -		return; - -	sprintf(cpu_dir, "cpu%ld", cpu); +	snprintf(cpu_dir, 30, "cpu%ld", cpu);  	d_cpu = debugfs_create_dir(cpu_dir, d_percpu);  	if (!d_cpu) {  		pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b8d2852baa4a..2dec9bcde8b4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,6 @@  #include <linux/perf_event.h>  #include <linux/stringify.h>  #include <linux/limits.h> -#include <linux/uaccess.h>  #include <asm/bitsperlong.h>  #include "trace.h" diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 0a67e041edf8..24dc60d9fa1f 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)  	stats->ac_ppid	 = pid_alive(tsk) ?  				
rcu_dereference(tsk->real_parent)->tgid : 0;  	rcu_read_unlock(); -	stats->ac_utime	 = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; -	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; -	stats->ac_utimescaled = -		cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; -	stats->ac_stimescaled = -		cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; +	stats->ac_utime = cputime_to_usecs(tsk->utime); +	stats->ac_stime = cputime_to_usecs(tsk->stime); +	stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); +	stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);  	stats->ac_minflt = tsk->min_flt;  	stats->ac_majflt = tsk->maj_flt; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d8..6e3c41a4024c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);  #endif -static int __initdata no_watchdog; +static int no_watchdog;  /* boot commands */ | 
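
The tsacct hunk above replaces a millisecond round trip (cputime_to_msecs() * USEC_PER_MSEC) with a direct cputime_to_usecs() conversion, so sub-millisecond CPU time is no longer truncated. A small illustration with plain integers follows; cputime_t itself is architecture-specific and not modelled here.

#include <stdio.h>

int main(void)
{
    unsigned long long cpu_time_us = 1234567;   /* 1.234567 s of CPU time */

    /* Old conversion: truncate to milliseconds, then scale back up. */
    unsigned long long old_way = (cpu_time_us / 1000) * 1000;

    /* New conversion: report microseconds directly. */
    unsigned long long new_way = cpu_time_us;

    printf("old: %llu us, new: %llu us (lost %llu us)\n",
           old_way, new_way, new_way - old_way);
    return 0;
}

For a task that accumulates less than a millisecond of CPU time, the old formula reports zero for ac_utime/ac_stime, which is the practical motivation for the finer-grained conversion.
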
