| field | value | date |
|---|---|---|
| author | Dan Williams <dan.j.williams@intel.com> | 2010-05-18 03:30:58 +0400 |
| committer | Dan Williams <dan.j.williams@intel.com> | 2010-05-18 03:30:58 +0400 |
| commit | 0b28330e39bbe0ffee4c56b09fc415fcec595ea3 (patch) | |
| tree | fcf504879883763557e696eff81427b1ab78f76b /kernel | |
| parent | 058276303dbc4ed089c1f7dad0871810b1f5ddf1 (diff) | |
| parent | caa20d974c86af496b419eef70010e63b7fab7ac (diff) | |
| download | linux-0b28330e39bbe0ffee4c56b09fc415fcec595ea3.tar.xz | |
Merge branch 'ioat' into dmaengine
Diffstat (limited to 'kernel')
72 files changed, 577 insertions, 275 deletions
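Most of the small, mechanical hunks in the kernel/ diff below add an explicit `#include <linux/slab.h>` or `#include <linux/gfp.h>`; this appears to be the tree-wide preparation for dropping the implicit slab.h inclusion via percpu.h that this merge carries along, rather than dmaengine-specific work. For orientation only, a minimal sketch of that pattern (hypothetical file, not part of the commit):

```c
/*
 * hypothetical-example.c - not part of this commit; illustrates the
 * explicit-include pattern repeated throughout the diff below.
 */
#include <linux/kernel.h>
#include <linux/slab.h>   /* kmalloc()/kfree() must now be included directly */
#include <linux/gfp.h>    /* GFP_KERNEL and other gfp flags */

static int *alloc_one_counter(void)
{
	/*
	 * Previously slab.h was often reachable indirectly (e.g. via
	 * percpu.h); once that indirect path goes away, files relying
	 * on it stop building, hence the explicit includes added in
	 * the hunks that follow.
	 */
	return kmalloc(sizeof(int), GFP_KERNEL);
}
```

The larger hunks (the cpuset NODEMASK_ALLOC conversion, the kgdb CPU round-up rework, the module refcount split into incs/decs, the irq mask/unmask helpers) are functional changes and are quoted verbatim from the merged branches.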
diff --git a/kernel/async.c b/kernel/async.c index 27235f5de198..15319d6c18fe 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.  #include <linux/init.h>  #include <linux/kthread.h>  #include <linux/delay.h> +#include <linux/slab.h>  #include <asm/atomic.h>  static async_cookie_t next_cookie = 1; diff --git a/kernel/audit.c b/kernel/audit.c index 78f7f86aa238..c71bd26631a2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -46,6 +46,7 @@  #include <asm/atomic.h>  #include <linux/mm.h>  #include <linux/module.h> +#include <linux/slab.h>  #include <linux/err.h>  #include <linux/kthread.h> diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 028e85663f27..46a57b57a335 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -3,6 +3,7 @@  #include <linux/namei.h>  #include <linux/mount.h>  #include <linux/kthread.h> +#include <linux/slab.h>  struct audit_tree;  struct audit_chunk; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index cc7e87936cbc..8df43696f4ba 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -27,6 +27,7 @@  #include <linux/namei.h>  #include <linux/netlink.h>  #include <linux/sched.h> +#include <linux/slab.h>  #include <linux/inotify.h>  #include <linux/security.h>  #include "audit.h" diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a70604047f3c..ce08041f578d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -27,6 +27,7 @@  #include <linux/namei.h>  #include <linux/netlink.h>  #include <linux/sched.h> +#include <linux/slab.h>  #include <linux/security.h>  #include "audit.h" diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f3a461c0970a..3828ad5fb8f1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -49,6 +49,7 @@  #include <linux/namei.h>  #include <linux/mm.h>  #include <linux/module.h> +#include <linux/slab.h>  #include <linux/mount.h>  #include <linux/socket.h>  #include <linux/mqueue.h> @@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,  {  	if (context->name_count >= AUDIT_NAMES) {  		if (inode) -			printk(KERN_DEBUG "name_count maxed, losing inode data: " +			printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "  			       "dev=%02x:%02x, inode=%lu\n",  			       MAJOR(inode->i_sb->s_dev),  			       MINOR(inode->i_sb->s_dev), diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef909a329750..e2769e13980c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,7 +27,6 @@   */  #include <linux/cgroup.h> -#include <linux/module.h>  #include <linux/ctype.h>  #include <linux/errno.h>  #include <linux/fs.h> diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 59e9ef6aab40..da5e13975531 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -15,6 +15,7 @@   */  #include <linux/module.h> +#include <linux/slab.h>  #include <linux/cgroup.h>  #include <linux/fs.h>  #include <linux/uaccess.h> @@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)  			    struct freezer, css);  } -int cgroup_frozen(struct task_struct *task) +int cgroup_freezing_or_frozen(struct task_struct *task)  {  	struct freezer *freezer;  	enum freezer_state state;  	task_lock(task);  	freezer = task_freezer(task); -	state = freezer->state; +	if (!freezer->css.cgroup->parent) +		state = CGROUP_THAWED; /* root cgroup can't be frozen */ +	else +		state = freezer->state;  	task_unlock(task); -	return state == CGROUP_FROZEN; +	return (state == CGROUP_FREEZING) || 
(state == CGROUP_FROZEN);  }  /* diff --git a/kernel/compat.c b/kernel/compat.c index f6c204f07ea6..7f40e9275fd9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -25,6 +25,7 @@  #include <linux/posix-timers.h>  #include <linux/times.h>  #include <linux/ptrace.h> +#include <linux/gfp.h>  #include <asm/uaccess.h> diff --git a/kernel/cpu.c b/kernel/cpu.c index f8cced2692b3..25bba73b1be3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@  #include <linux/kthread.h>  #include <linux/stop_machine.h>  #include <linux/mutex.h> +#include <linux/gfp.h>  #ifdef CONFIG_SMP  /* Serializes the updates to cpu_online_mask, cpu_present_mask */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fab459f..d10946748ec2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,   *    call to guarantee_online_mems(), as we know no one is changing   *    our task's cpuset.   * - *    Hold callback_mutex around the two modifications of our tasks - *    mems_allowed to synchronize with cpuset_mems_allowed(). - *   *    While the mm_struct we are migrating is typically from some   *    other task, the task_struct mems_allowed that we are hacking   *    is for our current task, which must allocate new pages for that @@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,  	struct cpuset *cs;  	int migrate;  	const nodemask_t *oldmem = scan->data; -	nodemask_t newmems; +	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); + +	if (!newmems) +		return;  	cs = cgroup_cs(scan->cg); -	guarantee_online_mems(cs, &newmems); +	guarantee_online_mems(cs, newmems);  	task_lock(p); -	cpuset_change_task_nodemask(p, &newmems); +	cpuset_change_task_nodemask(p, newmems);  	task_unlock(p); +	NODEMASK_FREE(newmems); +  	mm = get_task_mm(p);  	if (!mm)  		return; @@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,  static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  			   const char *buf)  { -	nodemask_t oldmem; +	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);  	int retval;  	struct ptr_heap heap; +	if (!oldmem) +		return -ENOMEM; +  	/*  	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];  	 * it's read-only  	 */ -	if (cs == &top_cpuset) -		return -EACCES; +	if (cs == &top_cpuset) { +		retval = -EACCES; +		goto done; +	}  	/*  	 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  			goto done;  		if (!nodes_subset(trialcs->mems_allowed, -				node_states[N_HIGH_MEMORY])) -			return -EINVAL; +				node_states[N_HIGH_MEMORY])) { +			retval =  -EINVAL; +			goto done; +		}  	} -	oldmem = cs->mems_allowed; -	if (nodes_equal(oldmem, trialcs->mems_allowed)) { +	*oldmem = cs->mems_allowed; +	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {  		retval = 0;		/* Too easy - nothing to do */  		goto done;  	} @@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  	cs->mems_allowed = trialcs->mems_allowed;  	mutex_unlock(&callback_mutex); -	update_tasks_nodemask(cs, &oldmem, &heap); +	update_tasks_nodemask(cs, oldmem, &heap);  	heap_free(&heap);  done: +	NODEMASK_FREE(oldmem);  	return retval;  } @@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,  			  struct cgroup *oldcont, struct task_struct *tsk,  			  bool threadgroup)  { -	nodemask_t from, to;  	struct mm_struct *mm;  	struct cpuset *cs = cgroup_cs(cont);  	struct cpuset *oldcs = cgroup_cs(oldcont); +	NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); +	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); + +	if (from == NULL || to == NULL) +		goto alloc_fail;  	if (cs == &top_cpuset) {  		cpumask_copy(cpus_attach, cpu_possible_mask); -		to = node_possible_map;  	} else {  		guarantee_online_cpus(cs, cpus_attach); -		guarantee_online_mems(cs, &to);  	} +	guarantee_online_mems(cs, to);  	/* do per-task migration stuff possibly for each in the threadgroup */ -	cpuset_attach_task(tsk, &to, cs); +	cpuset_attach_task(tsk, to, cs);  	if (threadgroup) {  		struct task_struct *c;  		rcu_read_lock();  		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { -			cpuset_attach_task(c, &to, cs); +			cpuset_attach_task(c, to, cs);  		}  		rcu_read_unlock();  	}  	/* change mm; only needs to be done once even if threadgroup */ -	from = oldcs->mems_allowed; -	to = cs->mems_allowed; +	*from = oldcs->mems_allowed; +	*to = cs->mems_allowed;  	mm = get_task_mm(tsk);  	if (mm) { -		mpol_rebind_mm(mm, &to); +		mpol_rebind_mm(mm, to);  		if (is_memory_migrate(cs)) -			cpuset_migrate_mm(mm, &from, &to); +			cpuset_migrate_mm(mm, from, to);  		mmput(mm);  	} + +alloc_fail: +	NODEMASK_FREE(from); +	NODEMASK_FREE(to);  }  /* The various types of files and directories in a cpuset file system */ @@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)  static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)  { -	nodemask_t mask; +	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); +	int retval; + +	if (mask == NULL) +		return -ENOMEM;  	mutex_lock(&callback_mutex); -	mask = cs->mems_allowed; +	*mask = cs->mems_allowed;  	mutex_unlock(&callback_mutex); -	return nodelist_scnprintf(page, PAGE_SIZE, mask); +	retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); + +	NODEMASK_FREE(mask); + +	return retval;  }  static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)  	struct cpuset *cp;	/* scans cpusets being updated */  	struct cpuset *child;	/* scans child cpusets of cp */  	struct cgroup *cont; -	nodemask_t oldmems; +	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + +	if (oldmems == NULL) +		return;  	list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)  		    
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))  			continue; -		oldmems = cp->mems_allowed; +		*oldmems = cp->mems_allowed;  		/* Remove offline cpus and mems from this cpuset. */  		mutex_lock(&callback_mutex); @@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)  			remove_tasks_in_empty_cpuset(cp);  		else {  			update_tasks_cpumask(cp, NULL); -			update_tasks_nodemask(cp, &oldmems, NULL); +			update_tasks_nodemask(cp, oldmems, NULL);  		}  	} +	NODEMASK_FREE(oldmems);  }  /* @@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,  static int cpuset_track_online_nodes(struct notifier_block *self,  				unsigned long action, void *arg)  { +	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + +	if (oldmems == NULL) +		return NOTIFY_DONE; +  	cgroup_lock();  	switch (action) {  	case MEM_ONLINE: -	case MEM_OFFLINE: +		*oldmems = top_cpuset.mems_allowed;  		mutex_lock(&callback_mutex);  		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];  		mutex_unlock(&callback_mutex); -		if (action == MEM_OFFLINE) -			scan_for_empty_cpusets(&top_cpuset); +		update_tasks_nodemask(&top_cpuset, oldmems, NULL); +		break; +	case MEM_OFFLINE: +		/* +		 * needn't update top_cpuset.mems_allowed explicitly because +		 * scan_for_empty_cpusets() will update it. +		 */ +		scan_for_empty_cpusets(&top_cpuset);  		break;  	default:  		break;  	}  	cgroup_unlock(); + +	NODEMASK_FREE(oldmems);  	return NOTIFY_OK;  }  #endif diff --git a/kernel/cred.c b/kernel/cred.c index 1ed8ca18790c..e1dbe9eef800 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -10,6 +10,7 @@   */  #include <linux/module.h>  #include <linux/cred.h> +#include <linux/slab.h>  #include <linux/sched.h>  #include <linux/key.h>  #include <linux/keyctl.h> @@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void)  	new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);  	if (!new) -		return NULL; +		goto free_tgcred;  	kdebug("prepare_usermodehelper_creds() alloc %p", new); @@ -397,6 +398,10 @@ struct cred *prepare_usermodehelper_creds(void)  error:  	put_cred(new); +free_tgcred: +#ifdef CONFIG_KEYS +	kfree(tgcred); +#endif  	return NULL;  } diff --git a/kernel/early_res.c b/kernel/early_res.c index 3cb2c661bb78..31aa9332ef3f 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -333,6 +333,12 @@ void __init free_early_partial(u64 start, u64 end)  	struct early_res *r;  	int i; +	if (start == end) +		return; + +	if (WARN_ONCE(start > end, "  wrong range [%#llx, %#llx]\n", start, end)) +		return; +  try_next:  	i = find_overlapped_early(start, end);  	if (i >= max_early_res) diff --git a/kernel/exit.c b/kernel/exit.c index cce59cb5ee6a..7f2683a10ac4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -953,7 +953,8 @@ NORET_TYPE void do_exit(long code)  	acct_update_integrals(tsk);  	/* sync mm's RSS info before statistics gathering */ -	sync_mm_rss(tsk, tsk->mm); +	if (tsk->mm) +		sync_mm_rss(tsk, tsk->mm);  	group_dead = atomic_dec_and_test(&tsk->signal->live);  	if (group_dead) {  		hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index 4799c5f0e6d0..44b0791b0a2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1052,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	p->prev_utime = cputime_zero;  	p->prev_stime = cputime_zero;  #endif +#if defined(SPLIT_RSS_COUNTING) +	memset(&p->rss_stat, 0, sizeof(p->rss_stat)); +#endif  	p->default_timer_slack_ns = current->timer_slack_ns; diff --git a/kernel/irq/chip.c 
b/kernel/irq/chip.c index 42ec11b2af8a..b7091d5ca2f8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)  		if (desc->chip->ack)  			desc->chip->ack(irq);  	} +	desc->status |= IRQ_MASKED; +} + +static inline void mask_irq(struct irq_desc *desc, int irq) +{ +	if (desc->chip->mask) { +		desc->chip->mask(irq); +		desc->status |= IRQ_MASKED; +	} +} + +static inline void unmask_irq(struct irq_desc *desc, int irq) +{ +	if (desc->chip->unmask) { +		desc->chip->unmask(irq); +		desc->status &= ~IRQ_MASKED; +	}  }  /* @@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)  	raw_spin_lock(&desc->lock);  	desc->status &= ~IRQ_INPROGRESS; -	if (unlikely(desc->status & IRQ_ONESHOT)) -		desc->status |= IRQ_MASKED; -	else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) -		desc->chip->unmask(irq); +	if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) +		unmask_irq(desc, irq);  out_unlock:  	raw_spin_unlock(&desc->lock);  } @@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)  	action = desc->action;  	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {  		desc->status |= IRQ_PENDING; -		if (desc->chip->mask) -			desc->chip->mask(irq); +		mask_irq(desc, irq);  		goto out;  	} @@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)  		irqreturn_t action_ret;  		if (unlikely(!action)) { -			desc->chip->mask(irq); +			mask_irq(desc, irq);  			goto out_unlock;  		} @@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)  		if (unlikely((desc->status &  			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==  			      (IRQ_PENDING | IRQ_MASKED))) { -			desc->chip->unmask(irq); -			desc->status &= ~IRQ_MASKED; +			unmask_irq(desc, irq);  		}  		desc->status &= ~IRQ_PENDING; @@ -716,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,  	__set_irq_handler(irq, handle, 0, name);  } -void __init set_irq_noprobe(unsigned int irq) +void set_irq_noprobe(unsigned int irq)  {  	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; @@ -731,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)  	raw_spin_unlock_irqrestore(&desc->lock, flags);  } -void __init set_irq_probe(unsigned int irq) +void set_irq_probe(unsigned int irq)  {  	struct irq_desc *desc = irq_to_desc(irq);  	unsigned long flags; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078ca60c7..704e488730a5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)  {  	struct irq_desc *desc = irq_to_desc(irq);  	struct irqaction *action; +	unsigned long flags;  	if (!desc)  		return 0; @@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)  	if (desc->status & IRQ_NOREQUEST)  		return 0; +	raw_spin_lock_irqsave(&desc->lock, flags);  	action = desc->action;  	if (action)  		if (irqflags & action->flags & IRQF_SHARED)  			action = NULL; +	raw_spin_unlock_irqrestore(&desc->lock, flags); +  	return !action;  } @@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)   */  static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)  { +again:  	chip_bus_lock(irq, desc);  	raw_spin_lock_irq(&desc->lock); + +	/* +	 * Implausible though it may be we need to protect us against +	 * the following scenario: +	 * +	 * The thread is faster done than the hard interrupt handler 
+	 * on the other CPU. If we unmask the irq line then the +	 * interrupt can come in again and masks the line, leaves due +	 * to IRQ_INPROGRESS and the irq line is masked forever. +	 */ +	if (unlikely(desc->status & IRQ_INPROGRESS)) { +		raw_spin_unlock_irq(&desc->lock); +		chip_bus_sync_unlock(irq, desc); +		cpu_relax(); +		goto again; +	} +  	if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {  		desc->status &= ~IRQ_MASKED;  		desc->chip->unmask(irq); @@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		if (new->flags & IRQF_ONESHOT)  			desc->status |= IRQ_ONESHOT; +		/* +		 * Force MSI interrupts to run with interrupts +		 * disabled. The multi vector cards can cause stack +		 * overflows due to nested interrupts when enough of +		 * them are directed to a core and fire at the same +		 * time. +		 */ +		if (desc->msi_desc) +			new->flags |= IRQF_DISABLED; +  		if (!(desc->status & IRQ_NOAUTOEN)) {  			desc->depth = 0;  			desc->status &= ~IRQ_DISABLED; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 963559dbd858..65d3845665ac 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -6,6 +6,7 @@   */  #include <linux/irq.h> +#include <linux/slab.h>  #include <linux/module.h>  #include <linux/random.h>  #include <linux/interrupt.h> diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6f50eccc79c0..7a6eb04ef6b5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -7,6 +7,7 @@   */  #include <linux/irq.h> +#include <linux/gfp.h>  #include <linux/proc_fs.h>  #include <linux/seq_file.h>  #include <linux/interrupt.h> diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8e5288a8a355..13aff293f4de 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -21,6 +21,7 @@  #include <linux/sched.h>	/* for cond_resched */  #include <linux/mm.h>  #include <linux/ctype.h> +#include <linux/slab.h>  #include <asm/sections.h> diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 761fdd2b3034..11f3515ca83f 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -69,9 +69,16 @@ struct kgdb_state {  	struct pt_regs		*linux_regs;  }; +/* Exception state values */ +#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ +#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ +#define DCPU_IS_SLAVE    0x4 /* Slave cpu enter exception */ +#define DCPU_SSTEP       0x8 /* CPU is single stepping */ +  static struct debuggerinfo_struct {  	void			*debuggerinfo;  	struct task_struct	*task; +	int 			exception_state;  } kgdb_info[NR_CPUS];  /** @@ -391,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count)  /*   * Copy the binary array pointed to by buf into mem.  Fix $, #, and - * 0x7d escaped with 0x7d.  Return a pointer to the character after - * the last byte written. + * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. + * The input buf is overwitten with the result to write to mem.   
*/  static int kgdb_ebin2mem(char *buf, char *mem, int count)  { -	int err = 0; -	char c; +	int size = 0; +	char *c = buf;  	while (count-- > 0) { -		c = *buf++; -		if (c == 0x7d) -			c = *buf++ ^ 0x20; - -		err = probe_kernel_write(mem, &c, 1); -		if (err) -			break; - -		mem++; +		c[size] = *buf++; +		if (c[size] == 0x7d) +			c[size] = *buf++ ^ 0x20; +		size++;  	} -	return err; +	return probe_kernel_write(mem, c, size);  }  /* @@ -563,49 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)  }  /* - * CPU debug state control: - */ - -#ifdef CONFIG_SMP -static void kgdb_wait(struct pt_regs *regs) -{ -	unsigned long flags; -	int cpu; - -	local_irq_save(flags); -	cpu = raw_smp_processor_id(); -	kgdb_info[cpu].debuggerinfo = regs; -	kgdb_info[cpu].task = current; -	/* -	 * Make sure the above info reaches the primary CPU before -	 * our cpu_in_kgdb[] flag setting does: -	 */ -	smp_wmb(); -	atomic_set(&cpu_in_kgdb[cpu], 1); - -	/* Disable any cpu specific hw breakpoints */ -	kgdb_disable_hw_debug(regs); - -	/* Wait till primary CPU is done with debugging */ -	while (atomic_read(&passive_cpu_wait[cpu])) -		cpu_relax(); - -	kgdb_info[cpu].debuggerinfo = NULL; -	kgdb_info[cpu].task = NULL; - -	/* fix up hardware debug registers on local cpu */ -	if (arch_kgdb_ops.correct_hw_break) -		arch_kgdb_ops.correct_hw_break(); - -	/* Signal the primary CPU that we are done: */ -	atomic_set(&cpu_in_kgdb[cpu], 0); -	touch_softlockup_watchdog_sync(); -	clocksource_touch_watchdog(); -	local_irq_restore(flags); -} -#endif - -/*   * Some architectures need cache flushes when we set/clear a   * breakpoint:   */ @@ -1400,34 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks)  	return 1;  } -/* - * kgdb_handle_exception() - main entry point from a kernel exception - * - * Locking hierarchy: - *	interface locks, if any (begin_session) - *	kgdb lock (kgdb_active) - */ -int -kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)  { -	struct kgdb_state kgdb_var; -	struct kgdb_state *ks = &kgdb_var;  	unsigned long flags;  	int sstep_tries = 100;  	int error = 0;  	int i, cpu; - -	ks->cpu			= raw_smp_processor_id(); -	ks->ex_vector		= evector; -	ks->signo		= signo; -	ks->ex_vector		= evector; -	ks->err_code		= ecode; -	ks->kgdb_usethreadid	= 0; -	ks->linux_regs		= regs; - -	if (kgdb_reenter_check(ks)) -		return 0; /* Ouch, double exception ! */ - +	int trace_on = 0;  acquirelock:  	/*  	 * Interrupts will be restored by the 'trap return' code, except when @@ -1435,13 +1373,43 @@ acquirelock:  	 */  	local_irq_save(flags); -	cpu = raw_smp_processor_id(); +	cpu = ks->cpu; +	kgdb_info[cpu].debuggerinfo = regs; +	kgdb_info[cpu].task = current; +	/* +	 * Make sure the above info reaches the primary CPU before +	 * our cpu_in_kgdb[] flag setting does: +	 */ +	atomic_inc(&cpu_in_kgdb[cpu]);  	/* -	 * Acquire the kgdb_active lock: +	 * CPU will loop if it is a slave or request to become a kgdb +	 * master cpu and acquire the kgdb_active lock:  	 */ -	while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) +	while (1) { +		if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { +			if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) +				break; +		} else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { +			if (!atomic_read(&passive_cpu_wait[cpu])) +				goto return_normal; +		} else { +return_normal: +			/* Return to normal operation by executing any +			 * hw breakpoint fixup. 
+			 */ +			if (arch_kgdb_ops.correct_hw_break) +				arch_kgdb_ops.correct_hw_break(); +			if (trace_on) +				tracing_on(); +			atomic_dec(&cpu_in_kgdb[cpu]); +			touch_softlockup_watchdog_sync(); +			clocksource_touch_watchdog(); +			local_irq_restore(flags); +			return 0; +		}  		cpu_relax(); +	}  	/*  	 * For single stepping, try to only enter on the processor @@ -1475,9 +1443,6 @@ acquirelock:  	if (kgdb_io_ops->pre_exception)  		kgdb_io_ops->pre_exception(); -	kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; -	kgdb_info[ks->cpu].task = current; -  	kgdb_disable_hw_debug(ks->linux_regs);  	/* @@ -1486,15 +1451,9 @@ acquirelock:  	 */  	if (!kgdb_single_step) {  		for (i = 0; i < NR_CPUS; i++) -			atomic_set(&passive_cpu_wait[i], 1); +			atomic_inc(&passive_cpu_wait[i]);  	} -	/* -	 * spin_lock code is good enough as a barrier so we don't -	 * need one here: -	 */ -	atomic_set(&cpu_in_kgdb[ks->cpu], 1); -  #ifdef CONFIG_SMP  	/* Signal the other CPUs to enter kgdb_wait() */  	if ((!kgdb_single_step) && kgdb_do_roundup) @@ -1518,6 +1477,9 @@ acquirelock:  	kgdb_single_step = 0;  	kgdb_contthread = current;  	exception_level = 0; +	trace_on = tracing_is_on(); +	if (trace_on) +		tracing_off();  	/* Talk to debugger with gdbserial protocol */  	error = gdb_serial_stub(ks); @@ -1526,13 +1488,11 @@ acquirelock:  	if (kgdb_io_ops->post_exception)  		kgdb_io_ops->post_exception(); -	kgdb_info[ks->cpu].debuggerinfo = NULL; -	kgdb_info[ks->cpu].task = NULL; -	atomic_set(&cpu_in_kgdb[ks->cpu], 0); +	atomic_dec(&cpu_in_kgdb[ks->cpu]);  	if (!kgdb_single_step) {  		for (i = NR_CPUS-1; i >= 0; i--) -			atomic_set(&passive_cpu_wait[i], 0); +			atomic_dec(&passive_cpu_wait[i]);  		/*  		 * Wait till all the CPUs have quit  		 * from the debugger. @@ -1551,6 +1511,8 @@ kgdb_restore:  		else  			kgdb_sstep_pid = 0;  	} +	if (trace_on) +		tracing_on();  	/* Free kgdb_active */  	atomic_set(&kgdb_active, -1);  	touch_softlockup_watchdog_sync(); @@ -1560,13 +1522,52 @@ kgdb_restore:  	return error;  } +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + *	interface locks, if any (begin_session) + *	kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ +	struct kgdb_state kgdb_var; +	struct kgdb_state *ks = &kgdb_var; +	int ret; + +	ks->cpu			= raw_smp_processor_id(); +	ks->ex_vector		= evector; +	ks->signo		= signo; +	ks->ex_vector		= evector; +	ks->err_code		= ecode; +	ks->kgdb_usethreadid	= 0; +	ks->linux_regs		= regs; + +	if (kgdb_reenter_check(ks)) +		return 0; /* Ouch, double exception ! 
*/ +	kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; +	ret = kgdb_cpu_enter(ks, regs); +	kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER; +	return ret; +} +  int kgdb_nmicallback(int cpu, void *regs)  {  #ifdef CONFIG_SMP +	struct kgdb_state kgdb_var; +	struct kgdb_state *ks = &kgdb_var; + +	memset(ks, 0, sizeof(struct kgdb_state)); +	ks->cpu			= cpu; +	ks->linux_regs		= regs; +  	if (!atomic_read(&cpu_in_kgdb[cpu]) && -			atomic_read(&kgdb_active) != cpu && -			atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { -		kgdb_wait((struct pt_regs *)regs); +	    atomic_read(&kgdb_active) != -1 && +	    atomic_read(&kgdb_active) != cpu) { +		kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; +		kgdb_cpu_enter(ks, regs); +		kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;  		return 0;  	}  #endif @@ -1742,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);   */  void kgdb_breakpoint(void)  { -	atomic_set(&kgdb_setting_breakpoint, 1); +	atomic_inc(&kgdb_setting_breakpoint);  	wmb(); /* Sync point before breakpoint */  	arch_kgdb_breakpoint();  	wmb(); /* Sync point after breakpoint */ -	atomic_set(&kgdb_setting_breakpoint, 0); +	atomic_dec(&kgdb_setting_breakpoint);  }  EXPORT_SYMBOL_GPL(kgdb_breakpoint); diff --git a/kernel/kthread.c b/kernel/kthread.c index 82ed0ea15194..83911c780175 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -219,7 +219,7 @@ int kthreadd(void *unused)  	set_task_comm(tsk, "kthreadd");  	ignore_signals(tsk);  	set_cpus_allowed_ptr(tsk, cpu_all_mask); -	set_mems_allowed(node_possible_map); +	set_mems_allowed(node_states[N_HIGH_MEMORY]);  	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ca07c5c0c914..877fb306d415 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -56,7 +56,6 @@  #include <linux/module.h>  #include <linux/sched.h>  #include <linux/list.h> -#include <linux/slab.h>  #include <linux/stacktrace.h>  static DEFINE_SPINLOCK(latency_lock); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c927a549db2c..2594e1ce41cb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -43,6 +43,7 @@  #include <linux/ftrace.h>  #include <linux/stringify.h>  #include <linux/bitops.h> +#include <linux/gfp.h>  #include <asm/sections.h> @@ -582,9 +583,6 @@ static int static_obj(void *obj)  	unsigned long start = (unsigned long) &_stext,  		      end   = (unsigned long) &_end,  		      addr  = (unsigned long) obj; -#ifdef CONFIG_SMP -	int i; -#endif  	/*  	 * static variable? @@ -595,24 +593,16 @@ static int static_obj(void *obj)  	if (arch_is_kernel_data(addr))  		return 1; -#ifdef CONFIG_SMP  	/* -	 * percpu var? +	 * in-kernel percpu var?  	 */ -	for_each_possible_cpu(i) { -		start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); -		end   = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM -					+ per_cpu_offset(i); - -		if ((addr >= start) && (addr < end)) -			return 1; -	} -#endif +	if (is_kernel_percpu_address(addr)) +		return 1;  	/* -	 * module var? +	 * module static or percpu var?  	 
*/ -	return is_module_address(addr); +	return is_module_address(addr) || is_module_percpu_address(addr);  }  /* diff --git a/kernel/module.c b/kernel/module.c index c968d3606dca..1016b75b026a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -370,27 +370,33 @@ EXPORT_SYMBOL_GPL(find_module);  #ifdef CONFIG_SMP -static void *percpu_modalloc(unsigned long size, unsigned long align, -			     const char *name) +static inline void __percpu *mod_percpu(struct module *mod)  { -	void *ptr; +	return mod->percpu; +} +static int percpu_modalloc(struct module *mod, +			   unsigned long size, unsigned long align) +{  	if (align > PAGE_SIZE) {  		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", -		       name, align, PAGE_SIZE); +		       mod->name, align, PAGE_SIZE);  		align = PAGE_SIZE;  	} -	ptr = __alloc_reserved_percpu(size, align); -	if (!ptr) +	mod->percpu = __alloc_reserved_percpu(size, align); +	if (!mod->percpu) {  		printk(KERN_WARNING  		       "Could not allocate %lu bytes percpu data\n", size); -	return ptr; +		return -ENOMEM; +	} +	mod->percpu_size = size; +	return 0;  } -static void percpu_modfree(void *freeme) +static void percpu_modfree(struct module *mod)  { -	free_percpu(freeme); +	free_percpu(mod->percpu);  }  static unsigned int find_pcpusec(Elf_Ehdr *hdr, @@ -400,24 +406,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,  	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");  } -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +static void percpu_modcopy(struct module *mod, +			   const void *from, unsigned long size)  {  	int cpu;  	for_each_possible_cpu(cpu) -		memcpy(pcpudest + per_cpu_offset(cpu), from, size); +		memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); +} + +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ +	struct module *mod; +	unsigned int cpu; + +	preempt_disable(); + +	list_for_each_entry_rcu(mod, &modules, list) { +		if (!mod->percpu_size) +			continue; +		for_each_possible_cpu(cpu) { +			void *start = per_cpu_ptr(mod->percpu, cpu); + +			if ((void *)addr >= start && +			    (void *)addr < start + mod->percpu_size) { +				preempt_enable(); +				return true; +			} +		} +	} + +	preempt_enable(); +	return false;  }  #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align, -				    const char *name) +static inline void __percpu *mod_percpu(struct module *mod)  {  	return NULL;  } -static inline void percpu_modfree(void *pcpuptr) +static inline int percpu_modalloc(struct module *mod, +				  unsigned long size, unsigned long align) +{ +	return -ENOMEM; +} +static inline void percpu_modfree(struct module *mod)  { -	BUG();  }  static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,  					Elf_Shdr *sechdrs, @@ -425,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,  {  	return 0;  } -static inline void percpu_modcopy(void *pcpudst, const void *src, -				  unsigned long size) +static inline void percpu_modcopy(struct module *mod, +				  const void *from, unsigned long size)  {  	/* pcpusec should be 0, and size of that section should be 0. 
*/  	BUG_ON(size != 0);  } +bool is_module_percpu_address(unsigned long addr) +{ +	return false; +}  #endif /* CONFIG_SMP */ @@ -473,11 +521,13 @@ static void module_unload_init(struct module *mod)  	int cpu;  	INIT_LIST_HEAD(&mod->modules_which_use_me); -	for_each_possible_cpu(cpu) -		per_cpu_ptr(mod->refptr, cpu)->count = 0; +	for_each_possible_cpu(cpu) { +		per_cpu_ptr(mod->refptr, cpu)->incs = 0; +		per_cpu_ptr(mod->refptr, cpu)->decs = 0; +	}  	/* Hold reference count during initialization. */ -	__this_cpu_write(mod->refptr->count, 1); +	__this_cpu_write(mod->refptr->incs, 1);  	/* Backwards compatibility macros put refcount during init. */  	mod->waiter = current;  } @@ -616,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)  unsigned int module_refcount(struct module *mod)  { -	unsigned int total = 0; +	unsigned int incs = 0, decs = 0;  	int cpu;  	for_each_possible_cpu(cpu) -		total += per_cpu_ptr(mod->refptr, cpu)->count; -	return total; +		decs += per_cpu_ptr(mod->refptr, cpu)->decs; +	/* +	 * ensure the incs are added up after the decs. +	 * module_put ensures incs are visible before decs with smp_wmb. +	 * +	 * This 2-count scheme avoids the situation where the refcount +	 * for CPU0 is read, then CPU0 increments the module refcount, +	 * then CPU1 drops that refcount, then the refcount for CPU1 is +	 * read. We would record a decrement but not its corresponding +	 * increment so we would see a low count (disaster). +	 * +	 * Rare situation? But module_refcount can be preempted, and we +	 * might be tallying up 4096+ CPUs. So it is not impossible. +	 */ +	smp_rmb(); +	for_each_possible_cpu(cpu) +		incs += per_cpu_ptr(mod->refptr, cpu)->incs; +	return incs - decs;  }  EXPORT_SYMBOL(module_refcount); @@ -798,10 +864,11 @@ void module_put(struct module *module)  {  	if (module) {  		preempt_disable(); -		__this_cpu_dec(module->refptr->count); +		smp_wmb(); /* see comment in module_refcount */ +		__this_cpu_inc(module->refptr->decs);  		trace_module_put(module, _RET_IP_, -				 __this_cpu_read(module->refptr->count)); +				 __this_cpu_read(module->refptr->decs));  		/* Maybe they're waiting for us to drop reference? */  		if (unlikely(!module_is_live(module)))  			wake_up_process(module->waiter); @@ -1400,8 +1467,7 @@ static void free_module(struct module *mod)  	/* This may be NULL, but that's OK */  	module_free(mod, mod->module_init);  	kfree(mod->args); -	if (mod->percpu) -		percpu_modfree(mod->percpu); +	percpu_modfree(mod);  #if defined(CONFIG_MODULE_UNLOAD)  	if (mod->refptr)  		free_percpu(mod->refptr); @@ -1520,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,  		default:  			/* Divert to percpu allocation if a percpu var. */  			if (sym[i].st_shndx == pcpuindex) -				secbase = (unsigned long)mod->percpu; +				secbase = (unsigned long)mod_percpu(mod);  			else  				secbase = sechdrs[sym[i].st_shndx].sh_addr;  			sym[i].st_value += secbase; @@ -1954,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod,  	unsigned int modindex, versindex, infoindex, pcpuindex;  	struct module *mod;  	long err = 0; -	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ +	void *ptr = NULL; /* Stops spurious gcc warning */  	unsigned long symoffs, stroffs, *strmap;  	mm_segment_t old_fs; @@ -2094,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod,  	if (pcpuindex) {  		/* We have a special allocation for this section. 
*/ -		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, -					 sechdrs[pcpuindex].sh_addralign, -					 mod->name); -		if (!percpu) { -			err = -ENOMEM; +		err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, +				      sechdrs[pcpuindex].sh_addralign); +		if (err)  			goto free_mod; -		}  		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -		mod->percpu = percpu;  	}  	/* Determine total sizes, and put offsets in sh_entsize.  For now @@ -2317,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod,  	sort_extable(mod->extable, mod->extable + mod->num_exentries);  	/* Finally, copy percpu area over. */ -	percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, +	percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,  		       sechdrs[pcpuindex].sh_size);  	add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, @@ -2409,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod,  	module_free(mod, mod->module_core);  	/* mod will be freed with core. Don't access it beyond this line! */   free_percpu: -	if (percpu) -		percpu_modfree(percpu); +	percpu_modfree(mod);   free_mod:  	kfree(args);  	kfree(strmap); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 2ab67233ee8f..f74e6c00e26d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -13,6 +13,7 @@   *             Pavel Emelianov <xemul@openvz.org>   */ +#include <linux/slab.h>  #include <linux/module.h>  #include <linux/nsproxy.h>  #include <linux/init_task.h> diff --git a/kernel/padata.c b/kernel/padata.c index 93caf65ff57c..fd03513c7327 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -25,6 +25,7 @@  #include <linux/padata.h>  #include <linux/mutex.h>  #include <linux/sched.h> +#include <linux/slab.h>  #include <linux/rcupdate.h>  #define MAX_SEQ_NR INT_MAX - NR_CPUS diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 574ee58a3046..2f3fbf84215a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -15,6 +15,7 @@  #include <linux/smp.h>  #include <linux/file.h>  #include <linux/poll.h> +#include <linux/slab.h>  #include <linux/sysfs.h>  #include <linux/dcache.h>  #include <linux/percpu.h> @@ -1164,11 +1165,9 @@ void perf_event_task_sched_out(struct task_struct *task,  	struct perf_event_context *ctx = task->perf_event_ctxp;  	struct perf_event_context *next_ctx;  	struct perf_event_context *parent; -	struct pt_regs *regs;  	int do_switch = 1; -	regs = task_pt_regs(task); -	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); +	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);  	if (likely(!ctx || !cpuctx->task_ctx))  		return; @@ -2786,12 +2785,11 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)  	return NULL;  } -#ifdef CONFIG_EVENT_TRACING  __weak  void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)  {  } -#endif +  /*   * Output @@ -3378,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event,  				     struct perf_task_event *task_event)  {  	struct perf_output_handle handle; -	int size;  	struct task_struct *task = task_event->task; -	int ret; +	unsigned long flags; +	int size, ret; + +	/* +	 * If this CPU attempts to acquire an rq lock held by a CPU spinning +	 * in perf_output_lock() from interrupt context, it's game over. 
+	 */ +	local_irq_save(flags);  	size  = task_event->event_id.header.size;  	ret = perf_output_begin(&handle, event, size, 0, 0); -	if (ret) +	if (ret) { +		local_irq_restore(flags);  		return; +	}  	task_event->event_id.pid = perf_event_pid(event, task);  	task_event->event_id.ppid = perf_event_pid(event, current); @@ -3397,6 +3403,7 @@ static void perf_event_task_output(struct perf_event *event,  	perf_output_put(&handle, task_event->event_id);  	perf_output_end(&handle); +	local_irq_restore(flags);  }  static int perf_event_task_match(struct perf_event *event) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 79aac93acf99..a5aff94e1f0b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -13,6 +13,7 @@  #include <linux/syscalls.h>  #include <linux/err.h>  #include <linux/acct.h> +#include <linux/slab.h>  #define BITS_PER_PAGE		(PAGE_SIZE*8) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 1a22dfd42df9..bc7704b3a443 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1061,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,  	}  } -static void stop_process_timers(struct task_struct *tsk) +static void stop_process_timers(struct signal_struct *sig)  { -	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; +	struct thread_group_cputimer *cputimer = &sig->cputimer;  	unsigned long flags;  	if (!cputimer->running) @@ -1072,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)  	spin_lock_irqsave(&cputimer->lock, flags);  	cputimer->running = 0;  	spin_unlock_irqrestore(&cputimer->lock, flags); + +	sig->cputime_expires.prof_exp = cputime_zero; +	sig->cputime_expires.virt_exp = cputime_zero; +	sig->cputime_expires.sched_exp = 0;  }  static u32 onecputick; @@ -1133,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,  	    list_empty(&timers[CPUCLOCK_VIRT]) &&  	    cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&  	    list_empty(&timers[CPUCLOCK_SCHED])) { -		stop_process_timers(tsk); +		stop_process_timers(sig);  		return;  	} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index da5288ec2392..aa9e916da4d5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -22,6 +22,7 @@  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/freezer.h> +#include <linux/gfp.h>  #include <scsi/scsi_scan.h>  #include <asm/suspend.h> diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c index 39ac698ef836..fdcad9ed5a7b 100644 --- a/kernel/power/hibernate_nvs.c +++ b/kernel/power/hibernate_nvs.c @@ -10,6 +10,7 @@  #include <linux/kernel.h>  #include <linux/list.h>  #include <linux/mm.h> +#include <linux/slab.h>  #include <linux/suspend.h>  /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 5ade1bdcf366..71ae29052ab6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)  		printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "  				"(%d tasks refusing to freeze):\n",  				elapsed_csecs / 100, elapsed_csecs % 100, todo); -		show_state();  		read_lock(&tasklist_lock);  		do_each_thread(g, p) {  			task_lock(p);  			if (freezing(p) && !freezer_should_skip(p)) -				printk(KERN_ERR " %s\n", p->comm); +				sched_show_task(p);  			cancel_freezing(p);  			task_unlock(p);  		} while_each_thread(g, p); @@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)  		if (nosig_only && 
should_send_signal(p))  			continue; -		if (cgroup_frozen(p)) +		if (cgroup_freezing_or_frozen(p))  			continue;  		thaw_process(p); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 830cadecbdfc..be861c26dda7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -26,6 +26,7 @@  #include <linux/console.h>  #include <linux/highmem.h>  #include <linux/list.h> +#include <linux/slab.h>  #include <asm/uaccess.h>  #include <asm/mmu_context.h> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 44cce10b582d..56e7dbb8b996 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -15,6 +15,7 @@  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/syscalls.h> +#include <linux/gfp.h>  #include "power.h" diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1d575733d4e1..66824d71983a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -23,6 +23,7 @@  #include <linux/swap.h>  #include <linux/swapops.h>  #include <linux/pm.h> +#include <linux/slab.h>  #include "power.h" diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f1125c1a6321..63fe25433980 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@  #include <linux/mutex.h>  #include <linux/module.h>  #include <linux/kernel_stat.h> +#include <linux/hardirq.h>  #ifdef CONFIG_DEBUG_LOCK_ALLOC  static struct lock_class_key rcu_lock_key; @@ -66,6 +67,28 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map);  int rcu_scheduler_active __read_mostly;  EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +/** + * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases.  Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + */ +int rcu_read_lock_bh_held(void) +{ +	if (!debug_lockdep_rcu_enabled()) +		return 1; +	return in_softirq(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +  /*   * This function is invoked towards the end of the scheduler's initialization   * process.  Before this is called, the idle task might contain diff --git a/kernel/res_counter.c b/kernel/res_counter.c index bcdabf37c40b..c7eaa37a768b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -10,7 +10,6 @@  #include <linux/types.h>  #include <linux/parser.h>  #include <linux/fs.h> -#include <linux/slab.h>  #include <linux/res_counter.h>  #include <linux/uaccess.h>  #include <linux/mm.h> diff --git a/kernel/resource.c b/kernel/resource.c index 2d5be5d9bf5f..9c358e263534 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -219,19 +219,34 @@ void release_child_resources(struct resource *r)  }  /** - * request_resource - request and reserve an I/O or memory resource + * request_resource_conflict - request and reserve an I/O or memory resource   * @root: root resource descriptor   * @new: resource descriptor desired by caller   * - * Returns 0 for success, negative error code on error. + * Returns 0 for success, conflict resource on error.   
*/ -int request_resource(struct resource *root, struct resource *new) +struct resource *request_resource_conflict(struct resource *root, struct resource *new)  {  	struct resource *conflict;  	write_lock(&resource_lock);  	conflict = __request_resource(root, new);  	write_unlock(&resource_lock); +	return conflict; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ +	struct resource *conflict; + +	conflict = request_resource_conflict(root, new);  	return conflict ? -EBUSY : 0;  } @@ -474,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou  }  /** - * insert_resource - Inserts a resource in the resource tree + * insert_resource_conflict - Inserts resource in the resource tree   * @parent: parent of the new resource   * @new: new resource to insert   * - * Returns 0 on success, -EBUSY if the resource can't be inserted. + * Returns 0 on success, conflict resource if the resource can't be inserted.   * - * This function is equivalent to request_resource when no conflict + * This function is equivalent to request_resource_conflict when no conflict   * happens. If a conflict happens, and the conflicting resources   * entirely fit within the range of the new resource, then the new   * resource is inserted and the conflicting resources become children of   * the new resource.   */ -int insert_resource(struct resource *parent, struct resource *new) +struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)  {  	struct resource *conflict;  	write_lock(&resource_lock);  	conflict = __insert_resource(parent, new);  	write_unlock(&resource_lock); +	return conflict; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ +	struct resource *conflict; + +	conflict = insert_resource_conflict(parent, new);  	return conflict ? 
-EBUSY : 0;  } diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7858d3..6af210a7de70 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@  #include <linux/debugfs.h>  #include <linux/ctype.h>  #include <linux/ftrace.h> +#include <linux/slab.h>  #include <asm/tlb.h>  #include <asm/irq_regs.h> @@ -2650,7 +2651,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)  {  	unsigned long flags;  	struct rq *rq; -	int cpu = get_cpu(); +	int cpu __maybe_unused = get_cpu();  #ifdef CONFIG_SMP  	/* @@ -4902,7 +4903,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,  	int ret;  	cpumask_var_t mask; -	if (len < cpumask_size()) +	if ((len * BITS_PER_BYTE) < nr_cpu_ids) +		return -EINVAL; +	if (len & (sizeof(unsigned long)-1))  		return -EINVAL;  	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -4910,10 +4913,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,  	ret = sched_getaffinity(pid, mask);  	if (ret == 0) { -		if (copy_to_user(user_mask_ptr, mask, cpumask_size())) +		size_t retlen = min_t(size_t, len, cpumask_size()); + +		if (copy_to_user(user_mask_ptr, mask, retlen))  			ret = -EFAULT;  		else -			ret = cpumask_size(); +			ret = retlen;  	}  	free_cpumask_var(mask); @@ -5383,7 +5388,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)  		get_task_struct(mt);  		task_rq_unlock(rq, &flags); -		wake_up_process(rq->migration_thread); +		wake_up_process(mt);  		put_task_struct(mt);  		wait_for_completion(&req.done);  		tlb_migrate_finish(p->mm); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index fccf9fbb0d7b..e6871cb3fc83 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -27,6 +27,7 @@   *  of the License.   */ +#include <linux/gfp.h>  #include "sched_cpupri.h"  /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aada4b9..9b49db144037 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -518,8 +518,4 @@ void proc_sched_set_task(struct task_struct *p)  	p->se.nr_wakeups_idle			= 0;  	p->sched_info.bkl_count			= 0;  #endif -	p->se.sum_exec_runtime			= 0; -	p->se.prev_sum_exec_runtime		= 0; -	p->nvcsw				= 0; -	p->nivcsw				= 0;  } diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 7494bbf5a270..7d3f4fa9ef4f 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,  			goto cancelled;  		/* the timer holds a reference whilst it is pending */ -		ret = work->ops->get_ref(work); +		ret = slow_work_get_ref(work);  		if (ret < 0)  			goto cant_get_ref; diff --git a/kernel/slow-work.h b/kernel/slow-work.h index 321f3c59d732..a29ebd1ef41d 100644 --- a/kernel/slow-work.h +++ b/kernel/slow-work.h @@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);   */  static inline void slow_work_set_thread_pid(int id, pid_t pid)  { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG  	slow_work_pids[id] = pid;  #endif  }  static inline void slow_work_mark_time(struct slow_work *work)  { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG  	work->mark = CURRENT_TIME;  #endif  }  static inline void slow_work_begin_exec(int id, struct slow_work *work)  { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG  	slow_work_execs[id] = work;  #endif  }  static inline void slow_work_end_exec(int id, struct slow_work *work)  { -#ifdef 
CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG  	write_lock(&slow_work_execs_lock);  	slow_work_execs[id] = NULL;  	write_unlock(&slow_work_execs_lock); diff --git a/kernel/smp.c b/kernel/smp.c index 9867b6bfefce..3fc697336183 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -9,6 +9,7 @@  #include <linux/module.h>  #include <linux/percpu.h>  #include <linux/init.h> +#include <linux/gfp.h>  #include <linux/smp.h>  #include <linux/cpu.h> diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0d4c7898ab80..4b493f67dcb5 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -155,11 +155,11 @@ void softlockup_tick(void)  	 * Wake up the high-prio watchdog task twice per  	 * threshold timespan.  	 */ -	if (now > touch_ts + softlockup_thresh/2) +	if (time_after(now - softlockup_thresh/2, touch_ts))  		wake_up_process(per_cpu(softlockup_watchdog, this_cpu));  	/* Warn about unreasonable delays: */ -	if (now <= (touch_ts + softlockup_thresh)) +	if (time_before_eq(now - softlockup_thresh, touch_ts))  		return;  	per_cpu(softlockup_print_ts, this_cpu) = touch_ts; diff --git a/kernel/srcu.c b/kernel/srcu.c index bde4295774c8..2980da3fd509 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -30,7 +30,6 @@  #include <linux/preempt.h>  #include <linux/rcupdate.h>  #include <linux/sched.h> -#include <linux/slab.h>  #include <linux/smp.h>  #include <linux/srcu.h> diff --git a/kernel/sys.c b/kernel/sys.c index 8298878f4f71..6d1a7e0f9d5b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,7 @@  #include <linux/personality.h>  #include <linux/ptrace.h>  #include <linux/fs_struct.h> +#include <linux/gfp.h>  #include <linux/compat.h>  #include <linux/syscalls.h> diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8cd50d8f9bde..59030570f5ca 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,7 @@  #include <linux/file.h>  #include <linux/ctype.h>  #include <linux/netdevice.h> +#include <linux/slab.h>  #ifdef CONFIG_SYSCTL_SYSCALL diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 899ca51be5e8..11281d5792bd 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -22,6 +22,7 @@  #include <linux/delayacct.h>  #include <linux/cpumask.h>  #include <linux/percpu.h> +#include <linux/slab.h>  #include <linux/cgroupstats.h>  #include <linux/cgroup.h>  #include <linux/fs.h> diff --git a/kernel/time.c b/kernel/time.c index 804798005d19..656dccfe1cbb 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -35,7 +35,6 @@  #include <linux/syscalls.h>  #include <linux/security.h>  #include <linux/fs.h> -#include <linux/slab.h>  #include <linux/math64.h>  #include <linux/ptrace.h> diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213016f0..aada0e52680a 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -22,6 +22,29 @@  #include "tick-internal.h" +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ) + +static int tick_increase_min_delta(struct clock_event_device *dev) +{ +	/* Nothing to do if we already reached the limit */ +	if (dev->min_delta_ns >= MIN_DELTA_LIMIT) +		return -ETIME; + +	if (dev->min_delta_ns < 5000) +		dev->min_delta_ns = 5000; +	else +		dev->min_delta_ns += dev->min_delta_ns >> 1; + +	if (dev->min_delta_ns > MIN_DELTA_LIMIT) +		dev->min_delta_ns = MIN_DELTA_LIMIT; + +	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", +	       dev->name ? 
dev->name : "?", +	       (unsigned long long) dev->min_delta_ns); +	return 0; +} +  /**   * tick_program_event internal worker function   */ @@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,  		if (!ret || !force)  			return ret; +		dev->retries++;  		/* -		 * We tried 2 times to program the device with the given -		 * min_delta_ns. If that's not working then we double it +		 * We tried 3 times to program the device with the given +		 * min_delta_ns. If that's not working then we increase it  		 * and emit a warning.  		 */  		if (++i > 2) {  			/* Increase the min. delta and try again */ -			if (!dev->min_delta_ns) -				dev->min_delta_ns = 5000; -			else -				dev->min_delta_ns += dev->min_delta_ns >> 1; - -			printk(KERN_WARNING -			       "CE: %s increasing min_delta_ns to %llu nsec\n", -			       dev->name ? dev->name : "?", -			       (unsigned long long) dev->min_delta_ns << 1); - +			if (tick_increase_min_delta(dev)) { +				/* +				 * Get out of the loop if min_delta_ns +				 * hit the limit already. That's +				 * better than staying here forever. +				 * +				 * We clear next_event so we have a +				 * chance that the box survives. +				 */ +				printk(KERN_WARNING +				       "CE: Reprogramming failure. Giving up\n"); +				dev->next_event.tv64 = KTIME_MAX; +				return -ETIME; +			}  			i = 0;  		} diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 12f5c55090be..ac38fbb176cc 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -19,6 +19,7 @@  #include <linux/timecompare.h>  #include <linux/module.h> +#include <linux/slab.h>  #include <linux/math64.h>  /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 16736379a9ca..39f6177fafac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -818,7 +818,8 @@ void update_wall_time(void)  	shift = min(shift, maxshift);  	while (offset >= timekeeper.cycle_interval) {  		offset = logarithmic_accumulation(offset, shift); -		shift--; +		if(offset < timekeeper.cycle_interval<<shift) +			shift--;  	}  	/* correct the clock when NTP error is too big */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd1050c..1a4a7dd78777 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  	SEQ_printf(m, " event_handler:  ");  	print_name_offset(m, dev->event_handler);  	SEQ_printf(m, "\n"); +	SEQ_printf(m, " retries:        %lu\n", dev->retries);  }  static void timer_list_show_tickdevices(struct seq_file *m) @@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)  	u64 now = ktime_to_ns(ktime_get());  	int cpu; -	SEQ_printf(m, "Timer List Version: v0.5\n"); +	SEQ_printf(m, "Timer List Version: v0.6\n");  	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);  	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index c61a7949387f..aeb6a54f2771 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@  #include <linux/kallsyms.h>  #include <linux/perf_event.h>  #include <linux/sched.h> +#include <linux/slab.h>  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -880,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer)  	if (base->running_timer == timer)  		goto out; +	timer_stats_timer_clear_start_info(timer);  	ret = 0;  	if (timer_pending(timer)) {  		detach_timer(timer, 1); diff --git 
a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 07f945a99430..b3bc91a3f510 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -21,6 +21,7 @@  #include <linux/percpu.h>  #include <linux/init.h>  #include <linux/mutex.h> +#include <linux/slab.h>  #include <linux/debugfs.h>  #include <linux/smp_lock.h>  #include <linux/time.h> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d9062f5cc0c0..2404b59b3097 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -24,6 +24,7 @@  #include <linux/uaccess.h>  #include <linux/ftrace.h>  #include <linux/sysctl.h> +#include <linux/slab.h>  #include <linux/ctype.h>  #include <linux/list.h>  #include <linux/hash.h> diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 9f4f565b01e6..a22582a06161 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -9,7 +9,6 @@  #include <linux/workqueue.h>  #include <linux/sched.h>  #include <linux/module.h> -#include <linux/slab.h>  #define CREATE_TRACE_POINTS  #include <trace/events/power.h> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05a9f83b8819..41ca394feb22 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -14,6 +14,7 @@  #include <linux/module.h>  #include <linux/percpu.h>  #include <linux/mutex.h> +#include <linux/slab.h>  #include <linux/init.h>  #include <linux/hash.h>  #include <linux/list.h> @@ -207,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);  #define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)  #define RB_EVNT_MIN_SIZE	8U	/* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT	0 +# define RB_ARCH_ALIGNMENT		RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT	1 +# define RB_ARCH_ALIGNMENT		8U +#endif +  /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */  #define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -1201,18 +1210,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)  	for (i = 0; i < nr_pages; i++) {  		if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) -			return; +			goto out;  		p = cpu_buffer->pages->next;  		bpage = list_entry(p, struct buffer_page, list);  		list_del_init(&bpage->list);  		free_buffer_page(bpage);  	}  	if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) -		return; +		goto out;  	rb_reset_cpu(cpu_buffer);  	rb_check_pages(cpu_buffer); +out:  	spin_unlock_irq(&cpu_buffer->reader_lock);  } @@ -1229,7 +1239,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,  	for (i = 0; i < nr_pages; i++) {  		if (RB_WARN_ON(cpu_buffer, list_empty(pages))) -			return; +			goto out;  		p = pages->next;  		bpage = list_entry(p, struct buffer_page, list);  		list_del_init(&bpage->list); @@ -1238,6 +1248,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,  	rb_reset_cpu(cpu_buffer);  	rb_check_pages(cpu_buffer); +out:  	spin_unlock_irq(&cpu_buffer->reader_lock);  } @@ -1547,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event,  	case 0:  		length -= RB_EVNT_HDR_SIZE; -		if (length > RB_MAX_SMALL_DATA) +		if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)  			event->array[0] = length;  		else  			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); @@ -1722,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length)  	if (!length)  		length = 1; -	if (length > RB_MAX_SMALL_DATA) +	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)  		length += sizeof(event.array[0]);  	length += RB_EVNT_HDR_SIZE; -	length = ALIGN(length, RB_ALIGNMENT); +	length = ALIGN(length, RB_ARCH_ALIGNMENT);  	return length;  } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ec2ee6f6560..44f916a04065 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -33,10 +33,10 @@  #include <linux/kdebug.h>  #include <linux/string.h>  #include <linux/rwsem.h> +#include <linux/slab.h>  #include <linux/ctype.h>  #include <linux/init.h>  #include <linux/poll.h> -#include <linux/gfp.h>  #include <linux/fs.h>  #include "trace.h" diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6fbfb8f417b9..9d589d8dcd1a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void)  	int this_cpu;  	u64 now; -	raw_local_irq_save(flags); +	local_irq_save(flags);  	this_cpu = raw_smp_processor_id();  	now = cpu_clock(this_cpu); @@ -110,7 +110,7 @@ u64 notrace trace_clock_global(void)  	arch_spin_unlock(&trace_clock_struct.lock);   out: -	raw_local_irq_restore(flags); +	local_irq_restore(flags);  	return now;  } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 81f691eb3a30..0565bb42566f 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -17,7 +17,12 @@ EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);  static char *perf_trace_buf;  static char *perf_trace_buf_nmi; -typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ; +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) +	perf_trace_t;  /* Count the events in use (per event id, not per instance) */  static int	total_ref_count; @@ -130,6 +135,8 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,  	char *trace_buf, *raw_data;  	int pc, cpu; +	
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); +  	pc = preempt_count();  	/* Protect the per cpu buffer, begin the rcu read side */ @@ -152,7 +159,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,  	raw_data = per_cpu_ptr(trace_buf, cpu);  	/* zero the dead bytes from align to not leak stack to user */ -	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; +	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));  	entry = (struct trace_entry *)raw_data;  	tracing_generic_entry_update(entry, *irq_flags, pc); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index beab8bf2f310..c697c7043349 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,6 +15,7 @@  #include <linux/uaccess.h>  #include <linux/module.h>  #include <linux/ctype.h> +#include <linux/slab.h>  #include <linux/delay.h>  #include <asm/setup.h> diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4615f62a04f1..88c0b6dbd7fe 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,6 +22,7 @@  #include <linux/ctype.h>  #include <linux/mutex.h>  #include <linux/perf_event.h> +#include <linux/slab.h>  #include "trace.h"  #include "trace_output.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e6989d9b44da..9aed1a5cf553 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -9,6 +9,7 @@  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> +#include <linux/slab.h>  #include <linux/fs.h>  #include "trace.h" diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 94103cdcf9d8..d59cd6879477 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -23,6 +23,7 @@  #include <linux/debugfs.h>  #include <linux/ftrace.h>  #include <linux/module.h> +#include <linux/slab.h>  #include <linux/fs.h>  #include "trace_output.h" diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0acd834659ed..017fa376505d 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@  #include <linux/kernel.h>  #include <linux/mmiotrace.h>  #include <linux/pci.h> +#include <linux/slab.h>  #include <linux/time.h>  #include <asm/atomic.h> diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d67..81003b4d617f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,6 +3,7 @@  #include <linux/stringify.h>  #include <linux/kthread.h>  #include <linux/delay.h> +#include <linux/slab.h>  static inline int trace_valid_entry(struct trace_entry *entry)  { diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index a4bb239eb987..96cffb269e73 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,6 +10,7 @@  #include <linux/list.h> +#include <linux/slab.h>  #include <linux/rbtree.h>  #include <linux/debugfs.h>  #include "trace_stat.h" diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 33c2a5b769dc..4d6d711717f2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@  #include <trace/syscall.h>  #include <trace/events/syscalls.h> +#include <linux/slab.h>  #include <linux/kernel.h>  #include <linux/ftrace.h>  #include <linux/perf_event.h> diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 40cafb07dffd..cc2d2faa7d9e 
100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@  #include <trace/events/workqueue.h>  #include <linux/list.h>  #include <linux/percpu.h> +#include <linux/slab.h>  #include <linux/kref.h>  #include "trace_stat.h"  #include "trace.h"  | 
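The tick_dev_program_event() hunk above counts every failed reprogram attempt in a new per-device retries field, moves the open-coded min_delta_ns bump (and its warning printk) into tick_increase_min_delta(), and, once that helper reports the minimum delta has hit its limit, clears next_event and returns -ETIME instead of retrying forever. A minimal standalone C sketch of that retry/backoff shape follows; tick_increase_min_delta()'s body and its cap are not part of this hunk, so MIN_DELTA_LIMIT, the 50% growth step and the fake_* names are assumptions made only for illustration.

/* Sketch only: models the retry/backoff shape of the hunk above.
 * The growth step and MIN_DELTA_LIMIT cap are assumed, not taken
 * from the diff. */
#include <stdio.h>

#define MIN_DELTA_LIMIT 200000ULL	/* assumed cap, in nanoseconds */

struct fake_clock_event_device {
	unsigned long long min_delta_ns;
	unsigned long retries;
};

/* Grow min_delta_ns by ~50%; report failure once the cap is reached. */
static int fake_increase_min_delta(struct fake_clock_event_device *dev)
{
	if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
		return -1;			/* caller has to give up */

	if (!dev->min_delta_ns)
		dev->min_delta_ns = 5000;
	else
		dev->min_delta_ns += dev->min_delta_ns >> 1;

	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
		dev->min_delta_ns = MIN_DELTA_LIMIT;
	return 0;
}

/* Stand-in for the hardware programming call; always fails here. */
static int fake_program_event(struct fake_clock_event_device *dev)
{
	(void)dev;
	return -1;
}

static int fake_program_with_backoff(struct fake_clock_event_device *dev)
{
	int i = 0;

	for (;;) {
		if (!fake_program_event(dev))
			return 0;
		dev->retries++;
		if (++i > 2) {			/* three failures in a row */
			if (fake_increase_min_delta(dev))
				return -1;	/* cap hit: give up */
			i = 0;
		}
	}
}

int main(void)
{
	struct fake_clock_event_device dev = { 0, 0 };

	if (fake_program_with_backoff(&dev))
		printf("gave up after %lu retries, min_delta_ns=%llu\n",
		       dev.retries, dev.min_delta_ns);
	return 0;
}

The retries counter bumped in the real hunk is what the timer_list.c change exposes as the new " retries:" line in /proc/timer_list, which is also why the report version moves from v0.5 to v0.6.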

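The ring_buffer.c hunks do two independent things: the rb_remove_pages()/rb_insert_pages() error paths now exit through an out: label so reader_lock is released even when RB_WARN_ON() fires, and a new RB_ARCH_ALIGNMENT forces 8-byte event alignment on 64-bit architectures that lack efficient unaligned access. A small self-contained sketch of the alignment side follows; RB_EVNT_HDR_SIZE, the ALIGN() helper and the simulated CONFIG_* settings are assumptions made for this demo, and the RB_MAX_SMALL_DATA branch is left out for brevity.

/* Sketch only: the alignment selection mirrors the #if block added to
 * ring_buffer.c above; everything else is simplified demo scaffolding. */
#include <stdio.h>

#define RB_ALIGNMENT		4U
#define RB_EVNT_HDR_SIZE	4U	/* assumed event header size */

/* Pretend this build is 64-bit without efficient unaligned access. */
#define CONFIG_64BIT 1
/* #define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1 */

#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
# define RB_FORCE_8BYTE_ALIGNMENT	0
# define RB_ARCH_ALIGNMENT		RB_ALIGNMENT
#else
# define RB_FORCE_8BYTE_ALIGNMENT	1
# define RB_ARCH_ALIGNMENT		8U
#endif

#define ALIGN(x, a)	(((x) + ((a) - 1)) & ~((a) - 1))

/* Round a payload length up to a full event size.  When 8-byte
 * alignment is forced, always reserve the extra length word, as the
 * rb_calculate_event_length() hunk does. */
static unsigned int event_length(unsigned int length)
{
	if (!length)
		length = 1;
	if (RB_FORCE_8BYTE_ALIGNMENT)
		length += sizeof(unsigned int);	/* length word in array[0] */
	length += RB_EVNT_HDR_SIZE;
	return ALIGN(length, RB_ARCH_ALIGNMENT);
}

int main(void)
{
	printf("align=%u, 13-byte payload -> %u-byte event\n",
	       RB_ARCH_ALIGNMENT, event_length(13));
	return 0;
}

When 8-byte alignment is forced, rb_update_event() always stores the length in array[0] instead of packing it into type_len, so every payload starts on an 8-byte boundary; the trace_event_perf.c hunk attacks the same class of problem from the other side by typing the perf buffer as an unsigned long array and zeroing the padding with memset() rather than a potentially misaligned u64 store.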