diff options
Diffstat (limited to 'kernel')
47 files changed, 1670 insertions, 1124 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ac6b27abb1ad..642d4277c2ea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/audit.c b/kernel/audit.c index 4e9d20829681..d13276d41410 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) err = -EPERM; break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d240349cbf0f..88b416dfbc72 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -42,7 +42,6 @@ #include <linux/seq_file.h> #include <linux/security.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/stat.h> #include <linux/string.h> @@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = cpulist_parse(buf, trialcs.cpus_allowed); - if (retval < 0) - return retval; + + /* + * We allow a cpuset's cpus_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + cpus_clear(trialcs.cpus_allowed); + } else { + retval = cpulist_parse(buf, trialcs.cpus_allowed); + if (retval < 0) + return retval; + } cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); - if (cpus_empty(trialcs.cpus_allowed)) + /* cpus_allowed cannot be empty for a cpuset with attached tasks. */ + if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed)) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval < 0) @@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = nodelist_parse(buf, trialcs.mems_allowed); - if (retval < 0) - goto done; + + /* + * We allow a cpuset's mems_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + nodes_clear(trialcs.mems_allowed); + } else { + retval = nodelist_parse(buf, trialcs.mems_allowed); + if (retval < 0) + goto done; + } nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); oldmem = cs->mems_allowed; if (nodes_equal(oldmem, trialcs.mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - if (nodes_empty(trialcs.mems_allowed)) { + /* mems_allowed cannot be empty for a cpuset with attached tasks. */ + if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) { retval = -ENOSPC; goto done; } @@ -2200,10 +2221,6 @@ void cpuset_fork(struct task_struct *child) * it is holding that mutex while calling check_for_release(), * which calls kmalloc(), so can't be called holding callback_mutex(). * - * We don't need to task_lock() this reference to tsk->cpuset, - * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it, or task is a failed fork, never visible to attach_task. - * * the_top_cpuset_hack: * * Set the exiting tasks cpuset to the root cpuset (top_cpuset). @@ -2242,8 +2259,10 @@ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; + task_lock(current); cs = tsk->cpuset; tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ + task_unlock(current); if (notify_on_release(cs)) { char *pathbuf = NULL; diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c new file mode 100644 index 000000000000..0d98827887a7 --- /dev/null +++ b/kernel/die_notifier.c @@ -0,0 +1,38 @@ + +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> +#include <linux/kdebug.h> + + +static ATOMIC_NOTIFIER_HEAD(die_chain); + +int notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + return atomic_notifier_call_chain(&die_chain, val, &args); +} + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); + + diff --git a/kernel/exit.c b/kernel/exit.c index 92369240d91d..f5a7abb621f3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -7,7 +7,6 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> diff --git a/kernel/fork.c b/kernel/fork.c index b7d169def942..a8dd75d4992b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -14,7 +14,6 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/unistd.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> @@ -1516,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the mnt_namespace structure if it is being shared - */ -static int unshare_mnt_namespace(unsigned long unshare_flags, - struct mnt_namespace **new_nsp, struct fs_struct *new_fs) -{ - struct mnt_namespace *ns = current->nsproxy->mnt_ns; - - if ((unshare_flags & CLONE_NEWNS) && ns) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); - if (!*new_nsp) - return -ENOMEM; - } - - return 0; -} - -/* * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) @@ -1593,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } -#ifndef CONFIG_IPC_NS -static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) -{ - if (flags & CLONE_NEWIPC) - return -EINVAL; - - return 0; -} -#endif - /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* @@ -1615,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct mnt_namespace *ns, *new_ns = NULL; struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; - struct uts_namespace *uts, *new_uts = NULL; - struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); @@ -1637,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) - goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_ns; + goto bad_unshare_cleanup_fs; if ((err = unshare_vm(unshare_flags, &new_mm))) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; - if ((err = unshare_utsname(unshare_flags, &new_uts))) + if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_fs))) goto bad_unshare_cleanup_semundo; - if ((err = unshare_ipcs(unshare_flags, &new_ipc))) - goto bad_unshare_cleanup_uts; - - if (new_ns || new_uts || new_ipc) { - old_nsproxy = current->nsproxy; - new_nsproxy = dup_namespaces(old_nsproxy); - if (!new_nsproxy) { - err = -ENOMEM; - goto bad_unshare_cleanup_ipc; - } - } - if (new_fs || new_ns || new_mm || new_fd || new_ulist || - new_uts || new_ipc) { + if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { task_lock(current); if (new_nsproxy) { + old_nsproxy = current->nsproxy; current->nsproxy = new_nsproxy; new_nsproxy = old_nsproxy; } @@ -1677,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fs = fs; } - if (new_ns) { - ns = current->nsproxy->mnt_ns; - current->nsproxy->mnt_ns = new_ns; - new_ns = ns; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1698,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } - if (new_uts) { - uts = current->nsproxy->uts_ns; - current->nsproxy->uts_ns = new_uts; - new_uts = uts; - } - - if (new_ipc) { - ipc = current->nsproxy->ipc_ns; - current->nsproxy->ipc_ns = new_ipc; - new_ipc = ipc; - } - task_unlock(current); } if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_ipc: - if (new_ipc) - put_ipc_ns(new_ipc); - -bad_unshare_cleanup_uts: - if (new_uts) - put_uts_ns(new_uts); - bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) @@ -1738,10 +1666,6 @@ bad_unshare_cleanup_sigh: if (atomic_dec_and_test(&new_sigh->count)) kmem_cache_free(sighand_cachep, new_sigh); -bad_unshare_cleanup_ns: - if (new_ns) - put_mnt_ns(new_ns); - bad_unshare_cleanup_fs: if (new_fs) put_fs_struct(new_fs); diff --git a/kernel/futex.c b/kernel/futex.c index 5a270b5e3f95..600bc9d801f2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -48,6 +48,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <linux/module.h> #include <asm/futex.h> #include "rtmutex_common.h" @@ -55,32 +56,6 @@ #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* - * Futexes are matched on equal values of this key. - * The key type depends on whether it's a shared or private mapping. - * Don't rearrange members without looking at hash_futex(). - * - * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. - * We set bit 0 to indicate if it's an inode-based key. - */ -union futex_key { - struct { - unsigned long pgoff; - struct inode *inode; - int offset; - } shared; - struct { - unsigned long address; - struct mm_struct *mm; - int offset; - } private; - struct { - unsigned long word; - void *ptr; - int offset; - } both; -}; - -/* * Priority Inheritance state: */ struct futex_pi_state { @@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. */ -static int get_futex_key(u32 __user *uaddr, union futex_key *key) +int get_futex_key(u32 __user *uaddr, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) } return err; } +EXPORT_SYMBOL_GPL(get_futex_key); /* * Take a reference to the resource addressed by a key. @@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) * NOTE: mmap_sem MUST be held between get_futex_key() and calling this * function, if it is called at all. mmap_sem keeps key->shared.inode valid. */ -static inline void get_key_refs(union futex_key *key) +inline void get_futex_key_refs(union futex_key *key) { if (key->both.ptr != 0) { if (key->both.offset & 1) @@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key) atomic_inc(&key->private.mm->mm_count); } } +EXPORT_SYMBOL_GPL(get_futex_key_refs); /* * Drop a reference to the resource addressed by a key. * The hash bucket spinlock must not be held. */ -static void drop_key_refs(union futex_key *key) +void drop_futex_key_refs(union futex_key *key) { if (key->both.ptr != 0) { if (key->both.offset & 1) @@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key) mmdrop(key->private.mm); } } +EXPORT_SYMBOL_GPL(drop_futex_key_refs); static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { @@ -873,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, this->lock_ptr = &hb2->lock; } this->key = key2; - get_key_refs(&key2); + get_futex_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) @@ -886,9 +864,9 @@ out_unlock: if (hb1 != hb2) spin_unlock(&hb2->lock); - /* drop_key_refs() must be called outside the spinlocks. */ + /* drop_futex_key_refs() must be called outside the spinlocks. */ while (--drop_count >= 0) - drop_key_refs(&key1); + drop_futex_key_refs(&key1); out: up_read(¤t->mm->mmap_sem); @@ -906,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) init_waitqueue_head(&q->waiters); - get_key_refs(&q->key); + get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -925,7 +903,7 @@ static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { spin_unlock(&hb->lock); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } /* @@ -980,7 +958,7 @@ static int unqueue_me(struct futex_q *q) ret = 1; } - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); return ret; } @@ -999,15 +977,18 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) spin_unlock(&hb->lock); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) +static long futex_wait_restart(struct restart_block *restart); +static int futex_wait_abstime(u32 __user *uaddr, u32 val, + int timed, unsigned long abs_time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; + unsigned long time_left = 0; u32 uval; int ret; @@ -1087,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * !list_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (likely(!list_empty(&q.list))) - time = schedule_timeout(time); + time_left = 0; + if (likely(!list_empty(&q.list))) { + unsigned long rel_time; + + if (timed) { + unsigned long now = jiffies; + if (time_after(now, abs_time)) + rel_time = 0; + else + rel_time = abs_time - now; + } else + rel_time = MAX_SCHEDULE_TIMEOUT; + + time_left = schedule_timeout(rel_time); + } __set_current_state(TASK_RUNNING); /* @@ -1099,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; - if (time == 0) + if (time_left == 0) return -ETIMEDOUT; + /* * We expect signal_pending(current), but another thread may * have handled it for us already. */ - return -EINTR; + if (time_left == MAX_SCHEDULE_TIMEOUT) + return -ERESTARTSYS; + else { + struct restart_block *restart; + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_wait_restart; + restart->arg0 = (unsigned long)uaddr; + restart->arg1 = (unsigned long)val; + restart->arg2 = (unsigned long)timed; + restart->arg3 = abs_time; + return -ERESTART_RESTARTBLOCK; + } out_unlock_release_sem: queue_unlock(&q, hb); @@ -1115,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) return ret; } +static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time) +{ + int timed = (rel_time != MAX_SCHEDULE_TIMEOUT); + return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time); +} + +static long futex_wait_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->arg0; + u32 val = (u32)restart->arg1; + int timed = (int)restart->arg2; + unsigned long abs_time = restart->arg3; + + restart->fn = do_no_restart_syscall; + return (long)futex_wait_abstime(uaddr, val, timed, abs_time); +} + + /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1b3033105b40..c9f4f044a8a8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) return orun; } +EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index aff1f0fabb0d..32e1ab1477d1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * * Controller mappings for all interrupt sources: */ -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_chip, @@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq) if (desc->chip->ack) desc->chip->ack(irq); action_ret = handle_IRQ_event(irq, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); desc->chip->end(irq); return 1; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5597c157442a..203a518b6f14 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) } *p = new; -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif + /* Exclude IRQ from balancing */ if (new->flags & IRQF_NOBALANCING) desc->status |= IRQ_NO_BALANCING; @@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); +#if defined(CONFIG_IRQ_PER_CPU) + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; +#endif + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { if (desc->chip && desc->chip->set_type) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 2db91eb54ad8..ddde0ef9ccdc 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; + unsigned long flags; + int ret = 1; - for (action = desc->action ; action; action = action->next) + spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) - return 0; - return 1; + !strcmp(new_action->name, action->name)) { + ret = 0; + break; + } + } + spin_unlock_irqrestore(&desc->lock, flags); + return ret; } void register_handler_proc(unsigned int irq, struct irqaction *action) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 9d8c79b48823..b0d81aae472f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ - if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { + if ((irqfixup == 2 && ((irq == 0) || + (desc->action->flags & IRQF_IRQPOLL))) || + action_ret == IRQ_NONE) { int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; diff --git a/kernel/itimer.c b/kernel/itimer.c index 307c6a632ef6..3205e8e114fa 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -7,7 +7,6 @@ /* These are all the functions necessary to implement itimers */ #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/syscalls.h> #include <linux/time.h> @@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) } /* - * We do not care about correctness. We just sanitize the values so - * the ktime_t operations which expect normalized values do not - * break. This converts negative values to long timeouts similar to - * the code in kernel versions < 2.6.16 - * - * Print a limited number of warning messages when an invalid timeval - * is detected. - */ -static void fixup_timeval(struct timeval *tv, int interval) -{ - static int warnlimit = 10; - unsigned long tmp; - - if (warnlimit > 0) { - warnlimit--; - printk(KERN_WARNING - "setitimer: %s (pid = %d) provided " - "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", - current->comm, current->pid, - interval ? "it_interval" : "it_value", - tv->tv_sec, (long) tv->tv_usec); - } - - tmp = tv->tv_usec; - if (tmp >= USEC_PER_SEC) { - tv->tv_usec = tmp % USEC_PER_SEC; - tv->tv_sec += tmp / USEC_PER_SEC; - } - - tmp = tv->tv_sec; - if (tmp > LONG_MAX) - tv->tv_sec = LONG_MAX; -} - -/* * Returns true if the timeval is in canonical form */ #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -/* - * Check for invalid timevals, sanitize them and print a limited - * number of warnings. - */ -static void check_itimerval(struct itimerval *value) { - - if (unlikely(!timeval_valid(&value->it_value))) - fixup_timeval(&value->it_value, 0); - - if (unlikely(!timeval_valid(&value->it_interval))) - fixup_timeval(&value->it_interval, 1); -} - int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; @@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) /* * Validate the timevals in value. - * - * Note: Although the spec requires that invalid values shall - * return -EINVAL, we just fixup the value and print a limited - * number of warnings in order not to break users of this - * historical misfeature. - * - * Scheduled for replacement in March 2007 */ - check_itimerval(value); + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; switch (which) { case ITIMER_REAL: diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5a0de8409739..f1bda23140b2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr, symbol_end = (unsigned long)_etext; } - *symbolsize = symbol_end - symbol_start; - *offset = addr - symbol_start; + if (symbolsize) + *symbolsize = symbol_end - symbol_start; + if (offset) + *offset = addr - symbol_start; return low; } @@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr, return NULL; } +int lookup_symbol_name(unsigned long addr, char *symname) +{ + symname[0] = '\0'; + symname[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, NULL, NULL); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), symname); + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_name(addr, symname); +} + +int lookup_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + name[0] = '\0'; + name[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, size, offset); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), name); + modname[0] = '\0'; + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_attrs(addr, size, offset, modname, name); +} + /* Look up a kernel symbol and return it in a text buffer. */ int sprint_symbol(char *buffer, unsigned long address) { @@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address) struct kallsym_iter { loff_t pos; - struct module *owner; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols */ char type; char name[KSYM_NAME_LEN+1]; + char module_name[MODULE_NAME_LEN + 1]; + int exported; }; static int get_ksymbol_mod(struct kallsym_iter *iter) { - iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, &iter->type, - iter->name, sizeof(iter->name)); - if (iter->owner == NULL) + if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, + &iter->type, iter->name, iter->module_name, + &iter->exported) < 0) return 0; - - /* Label it "global" if it is exported, "local" if not exported. */ - iter->type = is_exported(iter->name, iter->owner) - ? toupper(iter->type) : tolower(iter->type); - return 1; } @@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; - iter->owner = NULL; + iter->module_name[0] = '\0'; iter->value = kallsyms_addresses[iter->pos]; iter->type = kallsyms_get_symbol_type(off); @@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p) if (!iter->name[0]) return 0; - if (iter->owner) + if (iter->module_name[0]) { + char type; + + /* Label it "global" if it is exported, + * "local" if not exported. */ + type = iter->exported ? toupper(iter->type) : + tolower(iter->type); seq_printf(m, "%0*lx %c %s\t[%s]\n", (int)(2*sizeof(void*)), - iter->value, iter->type, iter->name, - module_name(iter->owner)); - else + iter->value, type, iter->name, iter->module_name); + } else seq_printf(m, "%0*lx %c %s\n", (int)(2*sizeof(void*)), iter->value, iter->type, iter->name); @@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return ret; } -static int kallsyms_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - kfree(m->private); - return seq_release(inode, file); -} - static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, .llseek = seq_lseek, - .release = kallsyms_release, + .release = seq_release_private, }; static int __init kallsyms_init(void) diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a59c8a01ae0..25db14b89e82 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); final_note(buf); } diff --git a/kernel/kmod.c b/kernel/kmod.c index 796276141e51..49cc4b9c1a8d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -23,7 +23,6 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/mnt_namespace.h> #include <linux/completion.h> @@ -166,6 +165,12 @@ static int ____call_usermodehelper(void *data) /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); + /* + * Our parent is keventd, which runs with elevated scheduling priority. + * Avoid propagating that into the userspace child. + */ + set_user_nice(current, 0); + retval = -EPERM; if (current->fs->root) retval = kernel_execve(sub_info->path, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d25a9ada3f8e..9e47d8c493f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,16 +35,19 @@ #include <linux/hash.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/stddef.h> #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/kdebug.h> + #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> -#include <asm/kdebug.h> +#include <asm/uaccess.h> #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) @@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobe_enabled; + DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct kprobe_insn_page *kip; struct hlist_node *pos; - retry: - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + retry: + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } /* All out of space. Need to allocate a new page. Use slot 0. */ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { + if (!kip) return NULL; - } /* * Use module_alloc so this page is within +/- 2GB of where the @@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void) if (check_safety() != 0) return -EAGAIN; - hlist_for_each_safe(pos, next, &kprobe_insn_pages) { + hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { int i; - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ @@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; @@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) break; } } - if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { + + if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); - } } #endif @@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, reset_kprobe_instance(); } } - return; } static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, @@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) } /* Called with kretprobe_lock held */ -struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->free_instances, uflist) - return ri; - return NULL; -} - -/* Called with kretprobe_lock held */ -static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe - *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->used_instances, uflist) - return ri; - return NULL; -} - -/* Called with kretprobe_lock held */ -void __kprobes add_rp_inst(struct kretprobe_instance *ri) -{ - /* - * Remove rp inst off the free list - - * Add it back when probed function returns - */ - hlist_del(&ri->uflist); - - /* Add rp inst onto table */ - INIT_HLIST_NODE(&ri->hlist); - hlist_add_head(&ri->hlist, - &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); - - /* Also add this rp inst to the used list. */ - INIT_HLIST_NODE(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->used_instances); -} - -/* Called with kretprobe_lock held */ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head) { @@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - while ((ri = get_free_rp_inst(rp)) != NULL) { + struct hlist_node *pos, *next; + + hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { hlist_del(&ri->uflist); kfree(ri); } @@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { - if (addr >= (unsigned long)__kprobes_text_start - && addr < (unsigned long)__kprobes_text_end) + if (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) return -EINVAL; return 0; } @@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); - if ((!kernel_text_address((unsigned long) p->addr)) || - in_kprobes_functions((unsigned long) p->addr)) + if (!kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; p->mod_refcounted = 0; - /* Check are we probing a module */ - if ((probed_mod = module_text_address((unsigned long) p->addr))) { + + /* + * Check if are we probing a module. + */ + probed_mod = module_text_address((unsigned long) p->addr); + if (probed_mod) { struct module *calling_mod = module_text_address(called_from); - /* We must allow modules to probe themself and - * in this case avoid incrementing the module refcount, - * so as to allow unloading of self probing modules. + /* + * We must allow modules to probe themself and in this case + * avoid incrementing the module refcount, so as to allow + * unloading of self probing modules. */ - if (calling_mod && (calling_mod != probed_mod)) { + if (calling_mod && calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) return -EINVAL; p->mod_refcounted = 1; @@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, goto out; } - if ((ret = arch_prepare_kprobe(p)) != 0) + ret = arch_prepare_kprobe(p); + if (ret) goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (atomic_add_return(1, &kprobe_count) == \ + if (kprobe_enabled) { + if (atomic_add_return(1, &kprobe_count) == \ (ARCH_INACTIVE_KPROBE_COUNT + 1)) - register_page_fault_notifier(&kprobe_page_fault_nb); - - arch_arm_kprobe(p); + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); + } out: mutex_unlock(&kprobe_mutex); @@ -616,8 +586,7 @@ out: int __kprobes register_kprobe(struct kprobe *p) { - return __register_kprobe(p, - (unsigned long)__builtin_return_address(0)); + return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_kprobe(struct kprobe *p) @@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p) return; } valid_p: - if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && - (p->list.next == &old_p->list) && - (p->list.prev == &old_p->list))) { - /* Only probe on the hash list */ - arch_disarm_kprobe(p); + if (old_p == p || + (old_p->pre_handler == aggr_pre_handler && + p->list.next == &old_p->list && p->list.prev == &old_p->list)) { + /* + * Only probe on the hash list. Disarm only if kprobes are + * enabled - otherwise, the breakpoint would already have + * been removed. We save on flushing icache. + */ + if (kprobe_enabled) + arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); cleanup_p = 1; } else { @@ -656,9 +630,11 @@ valid_p: mutex_unlock(&kprobe_mutex); synchronize_sched(); - if (p->mod_refcounted && - (mod = module_text_address((unsigned long)p->addr))) - module_put(mod); + if (p->mod_refcounted) { + mod = module_text_address((unsigned long)p->addr); + if (mod) + module_put(mod); + } if (cleanup_p) { if (p != old_p) { @@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ spin_lock_irqsave(&kretprobe_lock, flags); - arch_prepare_kretprobe(rp, regs); + if (!hlist_empty(&rp->free_instances)) { + struct kretprobe_instance *ri; + + ri = hlist_entry(rp->free_instances.first, + struct kretprobe_instance, uflist); + ri->rp = rp; + ri->task = current; + arch_prepare_kretprobe(ri, regs); + + /* XXX(hch): why is there no hlist_move_head? */ + hlist_del(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->used_instances); + hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); + } else + rp->nmissed++; spin_unlock_irqrestore(&kretprobe_lock, flags); return 0; } @@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; + struct hlist_node *pos, *next; unregister_kprobe(&rp->kp); + /* No race here */ spin_lock_irqsave(&kretprobe_lock, flags); - while ((ri = get_used_rp_inst(rp)) != NULL) { + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { ri->rp = NULL; hlist_del(&ri->uflist); } @@ -816,6 +808,9 @@ static int __init init_kprobes(void) } atomic_set(&kprobe_count, 0); + /* By default, kprobes are enabled */ + kprobe_enabled = true; + err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); @@ -825,7 +820,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset,char *modname) { char *kprobe_type; @@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; - unsigned long size, offset = 0; + unsigned long offset = 0; char *modname, namebuf[128]; head = &kprobe_table[i]; preempt_disable(); hlist_for_each_entry_rcu(p, node, head, hlist) { - sym = kallsyms_lookup((unsigned long)p->addr, &size, + sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (p->pre_handler == aggr_pre_handler) { list_for_each_entry_rcu(kp, &p->list, list) @@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +static void __kprobes enable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already enabled, just return */ + if (kprobe_enabled) + goto already_enabled; + + /* + * Re-register the page fault notifier only if there are any + * active probes at the time of enabling kprobes globally + */ + if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT) + register_page_fault_notifier(&kprobe_page_fault_nb); + + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + arch_arm_kprobe(p); + } + + kprobe_enabled = true; + printk(KERN_INFO "Kprobes globally enabled\n"); + +already_enabled: + mutex_unlock(&kprobe_mutex); + return; +} + +static void __kprobes disable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already disabled, just return */ + if (!kprobe_enabled) + goto already_disabled; + + kprobe_enabled = false; + printk(KERN_INFO "Kprobes globally disabled\n"); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!arch_trampoline_kprobe(p)) + arch_disarm_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); + + mutex_lock(&kprobe_mutex); + /* Unconditionally unregister the page_fault notifier */ + unregister_page_fault_notifier(&kprobe_page_fault_nb); + +already_disabled: + mutex_unlock(&kprobe_mutex); + return; +} + +/* + * XXX: The debugfs bool file interface doesn't allow for callbacks + * when the bool state is switched. We can reuse that facility when + * available + */ +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (kprobe_enabled) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_all_kprobes(); + break; + case 'n': + case 'N': + case '0': + disable_all_kprobes(); + break; + } + + return count; +} + +static struct file_operations fops_kp = { + .read = read_enabled_file_bool, + .write = write_enabled_file_bool, +}; + static int __kprobes debugfs_kprobe_init(void) { struct dentry *dir, *file; + unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir , 0 , + file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_kprobes_operations); if (!file) { debugfs_remove(dir); return -ENOMEM; } + file = debugfs_create_file("enabled", 0600, dir, + &value, &fops_kp); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + return 0; } diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7065a687ac54..1a5ff2211d88 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace) trace->entries = stack_trace + nr_stack_trace_entries; trace->skip = 3; - trace->all_contexts = 0; - save_stack_trace(trace, NULL); + save_stack_trace(trace); trace->max_entries = trace->nr_entries; @@ -341,10 +340,7 @@ static const char *usage_str[] = const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { - unsigned long offs, size; - char *modname; - - return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); + return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } void @@ -1313,8 +1309,9 @@ out_unlock_set: /* * Look up a dependency chain. If the key is not present yet then - * add it and return 0 - in this case the new dependency chain is - * validated. If the key is already hashed, return 1. + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) */ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) { @@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, * Mark a lock with a usage bit, and validate the state transition: */ static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, unsigned long ip) + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, this->class->usage_mask |= new_mask; -#ifdef CONFIG_TRACE_IRQFLAGS - if (new_bit == LOCK_ENABLED_HARDIRQS || - new_bit == LOCK_ENABLED_HARDIRQS_READ) - ip = curr->hardirq_enable_ip; - else if (new_bit == LOCK_ENABLED_SOFTIRQS || - new_bit == LOCK_ENABLED_SOFTIRQS_READ) - ip = curr->softirq_enable_ip; -#endif if (!save_trace(this->class->usage_traces + new_bit)) return 0; @@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) +mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; struct held_lock *hlock; @@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) else usage_bit = LOCK_ENABLED_SOFTIRQS; } - if (!mark_lock(curr, hlock, usage_bit, ip)) + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, 1, ip)) + if (!mark_held_locks(curr, 1)) return; /* * If we have softirqs enabled, then set the usage @@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0, ip)) + if (!mark_held_locks(curr, 0)) return; curr->hardirq_enable_ip = ip; @@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, 0, ip); + mark_held_locks(curr, 0); } /* @@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (read) { if (curr->hardirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ, ip)) + LOCK_USED_IN_HARDIRQ_READ)) return 0; if (curr->softirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ, ip)) + LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) return 0; } } if (!hardirqs_off) { if (read) { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ, ip)) + LOCK_ENABLED_HARDIRQS_READ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ, ip)) + LOCK_ENABLED_SOFTIRQS_READ)) return 0; } else { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS, ip)) + LOCK_ENABLED_HARDIRQS)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS, ip)) + LOCK_ENABLED_SOFTIRQS)) return 0; } } #endif /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED, ip)) + if (!mark_lock(curr, hlock, LOCK_USED)) return 0; out_calc_hash: /* diff --git a/kernel/module.c b/kernel/module.c index 1eb8ca565ba0..d36e45477fac 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/init.h> +#include <linux/kallsyms.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -310,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size) { /* Reallocation required? */ if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, - GFP_KERNEL); + int *new; + + new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, + GFP_KERNEL); if (!new) return 0; - memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); pcpu_num_allocated *= 2; - kfree(pcpu_size); pcpu_size = new; } @@ -1471,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS -int is_exported(const char *name, const struct module *mod) +static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; @@ -2097,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod, if (!best) return NULL; - *size = nextval - mod->symtab[best].st_value; - *offset = addr - mod->symtab[best].st_value; + if (size) + *size = nextval - mod->symtab[best].st_value; + if (offset) + *offset = addr - mod->symtab[best].st_value; return mod->strtab + mod->symtab[best].st_name; } @@ -2123,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, size_t namelen) +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + strlcpy(symname, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); + if (name) + strlcpy(name, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) { struct module *mod; @@ -2134,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - namelen); + KSYM_NAME_LEN + 1); + strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); + *exported = is_exported(name, mod); mutex_unlock(&module_mutex); - return mod; + return 0; } symnum -= mod->num_symtab; } mutex_unlock(&module_mutex); - return NULL; + return -ERANGE; } static unsigned long mod_find_symname(struct module *mod, const char *name) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5b9ee6f6bbb..1bc4b55241a8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk) /* * creates a copy of "orig" with refcount 1. - * This does not grab references to the contained namespaces, - * so that needs to be done by dup_namespaces. */ -static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) { struct nsproxy *ns; @@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) } /* - * copies the nsproxy, setting refcount to 1, and grabbing a - * reference to all contained namespaces. Called from - * sys_unshare() + * Create new nsproxy and all of its the associated namespaces. + * Return the newly created nsproxy. Do not attach this to the task, + * leave it to the caller to do proper locking and attach it to task. */ -struct nsproxy *dup_namespaces(struct nsproxy *orig) +static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, + struct fs_struct *new_fs) { - struct nsproxy *ns = clone_namespaces(orig); + struct nsproxy *new_nsp; - if (ns) { - if (ns->mnt_ns) - get_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - get_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - get_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - get_pid_ns(ns->pid_ns); - } + new_nsp = clone_nsproxy(tsk->nsproxy); + if (!new_nsp) + return ERR_PTR(-ENOMEM); - return ns; + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + if (IS_ERR(new_nsp->mnt_ns)) + goto out_ns; + + new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + if (IS_ERR(new_nsp->uts_ns)) + goto out_uts; + + new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + if (IS_ERR(new_nsp->ipc_ns)) + goto out_ipc; + + new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); + if (IS_ERR(new_nsp->pid_ns)) + goto out_pid; + + return new_nsp; + +out_pid: + if (new_nsp->ipc_ns) + put_ipc_ns(new_nsp->ipc_ns); +out_ipc: + if (new_nsp->uts_ns) + put_uts_ns(new_nsp->uts_ns); +out_uts: + if (new_nsp->mnt_ns) + put_mnt_ns(new_nsp->mnt_ns); +out_ns: + kfree(new_nsp); + return ERR_PTR(-ENOMEM); } /* @@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk) if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) return 0; - new_ns = clone_namespaces(old_ns); - if (!new_ns) { - err = -ENOMEM; + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; goto out; } - tsk->nsproxy = new_ns; - - err = copy_mnt_ns(flags, tsk); - if (err) - goto out_ns; - - err = copy_utsname(flags, tsk); - if (err) - goto out_uts; - - err = copy_ipcs(flags, tsk); - if (err) - goto out_ipc; - - err = copy_pid_ns(flags, tsk); - if (err) - goto out_pid; + new_ns = create_new_namespaces(flags, tsk, tsk->fs); + if (IS_ERR(new_ns)) { + err = PTR_ERR(new_ns); + goto out; + } + tsk->nsproxy = new_ns; out: put_nsproxy(old_ns); return err; - -out_pid: - if (new_ns->ipc_ns) - put_ipc_ns(new_ns->ipc_ns); -out_ipc: - if (new_ns->uts_ns) - put_uts_ns(new_ns->uts_ns); -out_uts: - if (new_ns->mnt_ns) - put_mnt_ns(new_ns->mnt_ns); -out_ns: - tsk->nsproxy = old_ns; - kfree(new_ns); - goto out; } void free_nsproxy(struct nsproxy *ns) @@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); kfree(ns); } + +/* + * Called from unshare. Unshare all the namespaces part of nsproxy. + * On sucess, returns the new nsproxy and a reference to old nsproxy + * to make sure it stays around. + */ +int unshare_nsproxy_namespaces(unsigned long unshare_flags, + struct nsproxy **new_nsp, struct fs_struct *new_fs) +{ + struct nsproxy *old_ns = current->nsproxy; + int err = 0; + + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + +#ifndef CONFIG_IPC_NS + if (unshare_flags & CLONE_NEWIPC) + return -EINVAL; +#endif + +#ifndef CONFIG_UTS_NS + if (unshare_flags & CLONE_NEWUTS) + return -EINVAL; +#endif + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + get_nsproxy(old_ns); + + *new_nsp = create_new_namespaces(unshare_flags, current, + new_fs ? new_fs : current->fs); + if (IS_ERR(*new_nsp)) { + err = PTR_ERR(*new_nsp); + put_nsproxy(old_ns); + } + return err; +} diff --git a/kernel/params.c b/kernel/params.c index 312172320b4c..e61c46c97ce7 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) return param_get_bool(buffer, &dummy); } -/* We cheat here and temporarily mangle the string. */ +/* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, unsigned int min, unsigned int max, diff --git a/kernel/pid.c b/kernel/pid.c index 9c80bc23d6b8..d3ad724afa83 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr) } EXPORT_SYMBOL_GPL(find_get_pid); -int copy_pid_ns(int flags, struct task_struct *tsk) +struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) { - struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; - int err = 0; - - if (!old_ns) - return 0; - + BUG_ON(!old_ns); get_pid_ns(old_ns); - return err; + return old_ns; } void free_pid_ns(struct kref *kref) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 657f77697415..1de710e18373 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { @@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { @@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->sched_time < t->expires.sched) { @@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { @@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { @@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || sched_time < t->expires.sched) { @@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, */ head = &tsk->signal->cpu_timers[clock_idx]; if (list_empty(head) || - cputime_ge(list_entry(head->next, + cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44318ca71978..588c99da0307 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -31,7 +31,6 @@ * POSIX clocks & timers */ #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/time.h> diff --git a/kernel/power/process.c b/kernel/power/process.c index 0eb5c420e8ed..088419387388 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -8,7 +8,6 @@ #undef DEBUG -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/module.h> @@ -25,10 +24,9 @@ static inline int freezeable(struct task_struct * p) { - if ((p == current) || + if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->exit_state == EXIT_ZOMBIE) || - (p->exit_state == EXIT_DEAD)) + (p->exit_state != 0)) return 0; return 1; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 128da11f01c2..b7039772b05c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -14,7 +14,6 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> -#include <linux/smp_lock.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/spinlock.h> diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e83ed9945a80..b8b235cc19d1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,7 +12,6 @@ */ #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/file.h> #include <linux/utsname.h> #include <linux/version.h> diff --git a/kernel/printk.c b/kernel/printk.c index 4b47e59248df..0bbdeac2810c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -20,7 +20,6 @@ #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> -#include <linux/smp_lock.h> #include <linux/console.h> #include <linux/init.h> #include <linux/module.h> @@ -931,8 +930,16 @@ void register_console(struct console *console) { int i; unsigned long flags; + struct console *bootconsole = NULL; - if (preferred_console < 0) + if (console_drivers) { + if (console->flags & CON_BOOT) + return; + if (console_drivers->flags & CON_BOOT) + bootconsole = console_drivers; + } + + if (preferred_console < 0 || bootconsole || !console_drivers) preferred_console = selected_console; /* @@ -978,8 +985,11 @@ void register_console(struct console *console) if (!(console->flags & CON_ENABLED)) return; - if (console_drivers && (console_drivers->flags & CON_BOOT)) { - unregister_console(console_drivers); + if (bootconsole) { + printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", + bootconsole->name, bootconsole->index, + console->name, console->index); + unregister_console(bootconsole); console->flags &= ~CON_PRINTBUFFER; } @@ -1030,16 +1040,11 @@ int unregister_console(struct console *console) } } - /* If last console is removed, we re-enable picking the first - * one that gets registered. Without that, pmac early boot console - * would prevent fbcon from taking over. - * + /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. */ - if (console_drivers == NULL) - preferred_console = selected_console; - else if (console->flags & CON_CONSDEV) + if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; release_console_sem(); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index bcd14e83ef39..55ba82a85a66 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, - &sched_ops, NULL }; - /* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -534,7 +530,7 @@ rcu_torture_writer(void *arg) rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); - if (old_rp != NULL) { + if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; @@ -685,7 +681,7 @@ rcu_torture_printk(char *page) atomic_read(&rcu_torture_wcount[i])); } cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats != NULL) + if (cur_ops->stats) cnt += cur_ops->stats(&page[cnt]); return cnt; } @@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed(current, tmp_mask); - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed(reader_tasks[i], tmp_mask); } - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed(fakewriter_tasks[i], tmp_mask); @@ -808,21 +804,21 @@ rcu_torture_cleanup(void) int i; fullstop = 1; - if (shuffler_task != NULL) { + if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); } shuffler_task = NULL; - if (writer_task != NULL) { + if (writer_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); } writer_task = NULL; - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i] != NULL) { + if (reader_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_reader task"); kthread_stop(reader_tasks[i]); @@ -834,9 +830,9 @@ rcu_torture_cleanup(void) } rcu_torture_current = NULL; - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i] != NULL) { + if (fakewriter_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_fakewriter task"); kthread_stop(fakewriter_tasks[i]); @@ -847,7 +843,7 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - if (stats_task != NULL) { + if (stats_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); } @@ -858,7 +854,7 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (cur_ops->cleanup != NULL) + if (cur_ops->cleanup) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms("End of test: FAILURE"); @@ -866,27 +862,28 @@ rcu_torture_cleanup(void) rcu_torture_print_module_parms("End of test: SUCCESS"); } -static int +static int __init rcu_torture_init(void) { int i; int cpu; int firsterr = 0; + static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &sched_ops, }; /* Process args and tell the world that the torturer is on the job. */ - - for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) { + if (strcmp(torture_type, cur_ops->name) == 0) break; - } } - if (cur_ops == NULL) { + if (i == ARRAY_SIZE(torture_ops)) { printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); return (-EINVAL); } - if (cur_ops->init != NULL) + if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ if (nreaders >= 0) @@ -899,7 +896,7 @@ rcu_torture_init(void) /* Set up the freelist. */ INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { rcu_tortures[i].rtort_mbtest = 0; list_add_tail(&rcu_tortures[i].rtort_free, &rcu_torture_freelist); diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 291ded556aa0..9a87886b022e 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem) int ret = __down_write_trylock(sem); if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); return ret; } diff --git a/kernel/sched.c b/kernel/sched.c index 0227f1625a75..a3a04085e794 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -52,8 +52,9 @@ #include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> -#include <asm/tlb.h> +#include <linux/reciprocal_div.h> +#include <asm/tlb.h> #include <asm/unistd.h> /* @@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) #define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) + (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active)) #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +#ifdef CONFIG_SMP +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif + /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ... 5ms] @@ -223,6 +245,10 @@ struct rq { unsigned long raw_weighted_load; #ifdef CONFIG_SMP unsigned long cpu_load[3]; + unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned char in_nohz_recently; +#endif #endif unsigned long long nr_switches; @@ -278,7 +304,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; static inline int cpu_of(struct rq *rq) { @@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p) if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } + +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} #else static inline void resched_task(struct task_struct *p) { @@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); if (local_group) { this_load = avg_load; @@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p) struct sched_domain *sd; int i; - if (idle_cpu(cpu)) + /* + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. + */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) return cpu; for_each_domain(cpu, sd) { @@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } total_load += avg_load; - total_pwr += group->cpu_power; + total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); - group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { this_load = avg_load; @@ -2468,8 +2516,8 @@ group_next: max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->cpu_power, - (avg_load - this_load) * this->cpu_power) + *imbalance = min(max_pull * busiest->__cpu_power, + (avg_load - this_load) * this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -2503,28 +2551,29 @@ small_imbalance: * moving them. */ - pwr_now += busiest->cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->cpu_power * - min(this_load_per_task, this_load); + pwr_now += busiest->__cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->__cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - busiest->cpu_power; + tmp = sg_div_cpu_power(busiest, + busiest_load_per_task * SCHED_LOAD_SCALE); if (max_load > tmp) - pwr_move += busiest->cpu_power * + pwr_move += busiest->__cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->cpu_power < + if (max_load * busiest->__cpu_power < busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = max_load * busiest->cpu_power / this->cpu_power; + tmp = sg_div_cpu_power(this, + max_load * busiest->__cpu_power); else - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - this->cpu_power; - pwr_move += this->cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(this, + busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += this->__cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2657,6 +2706,12 @@ redo: double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + /* + * some other cpu did the load balance for us. + */ + if (nr_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); @@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq) } } +#ifdef CONFIG_NO_HZ +static struct { + atomic_t load_balancer; + cpumask_t cpu_mask; +} nohz ____cacheline_aligned = { + .load_balancer = ATOMIC_INIT(-1), + .cpu_mask = CPU_MASK_NONE, +}; + /* - * run_rebalance_domains is triggered when needed from the scheduler tick. + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. If all the cpus in the system + * go into this tickless mode, then there will be no ilb owner (as there is + * no need for one) and all the cpus will sleep till the next wakeup event + * arrives... * + * For the ilb owner, tick is not stopped. And this tick will be used + * for idle load balancing. ilb owner will still be part of + * nohz.cpu_mask.. + * + * While stopping the tick, this cpu will become the ilb owner if there + * is no other owner. And will be the owner till that cpu becomes busy + * or if all cpus in the system stop their ticks at which point + * there is no need for ilb owner. + * + * When the ilb owner becomes busy, it nominates another owner, during the + * next busy scheduler_tick() + */ +int select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + cpu_set(cpu, nohz.cpu_mask); + cpu_rq(cpu)->in_nohz_recently = 1; + + /* + * If we are going offline and still the leader, give up! + */ + if (cpu_is_offline(cpu) && + atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + return 0; + } + + /* time for ilb owner also to sleep */ + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (atomic_read(&nohz.load_balancer) == cpu) + atomic_set(&nohz.load_balancer, -1); + return 0; + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) + return 1; + } else if (atomic_read(&nohz.load_balancer) == cpu) + return 1; + } else { + if (!cpu_isset(cpu, nohz.cpu_mask)) + return 0; + + cpu_clear(cpu, nohz.cpu_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + } + return 0; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in arch_init_sched_domains. */ -static DEFINE_SPINLOCK(balancing); - -static void run_rebalance_domains(struct softirq_action *h) +static inline void rebalance_domains(int cpu, enum idle_type idle) { - int this_cpu = smp_processor_id(), balance = 1; - struct rq *this_rq = cpu_rq(this_cpu); + int balance = 1; + struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; - /* - * We are idle if there are no processes running. This - * is valid even if we are the idle process (SMT). - */ - enum idle_type idle = !this_rq->nr_running ? - SCHED_IDLE : NOT_IDLE; - /* Earliest time when we have to call run_rebalance_domains again */ + /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; - for_each_domain(this_cpu, sd) { + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2995,7 +3116,114 @@ out: if (!balance) break; } - this_rq->next_balance = next_balance; + rq->next_balance = next_balance; +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int local_cpu = smp_processor_id(); + struct rq *local_rq = cpu_rq(local_cpu); + enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; + + rebalance_domains(local_cpu, idle); + +#ifdef CONFIG_NO_HZ + /* + * If this cpu is the owner for idle load balancing, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + if (local_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == local_cpu) { + cpumask_t cpus = nohz.cpu_mask; + struct rq *rq; + int balance_cpu; + + cpu_clear(local_cpu, cpus); + for_each_cpu_mask(balance_cpu, cpus) { + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) + break; + + rebalance_domains(balance_cpu, SCHED_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(local_rq->next_balance, rq->next_balance)) + local_rq->next_balance = rq->next_balance; + } + } +#endif +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * + * In case of CONFIG_NO_HZ, this is the place where we nominate a new + * idle load balancing owner or decide to stop the periodic load balancing, + * if the whole system is idle. + */ +static inline void trigger_load_balance(int cpu) +{ + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_NO_HZ + /* + * If we were in the nohz mode recently and busy at the current + * scheduler tick, then check if we need to nominate new idle + * load balancer. + */ + if (rq->in_nohz_recently && !rq->idle_at_tick) { + rq->in_nohz_recently = 0; + + if (atomic_read(&nohz.load_balancer) == cpu) { + cpu_clear(cpu, nohz.cpu_mask); + atomic_set(&nohz.load_balancer, -1); + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* + * simple selection for now: Nominate the + * first cpu in the nohz list to be the next + * ilb owner. + * + * TBD: Traverse the sched domains and nominate + * the nearest cpu in the nohz.cpu_mask. + */ + int ilb = first_cpu(nohz.cpu_mask); + + if (ilb != NR_CPUS) + resched_cpu(ilb); + } + } + + /* + * If this cpu is idle and doing idle load balancing for all the + * cpus with ticks stopped, is it time for that to stop? + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && + cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + resched_cpu(cpu); + return; + } + + /* + * If this cpu is idle and the idle load balancing is done by + * someone else, then no need raise the SCHED_SOFTIRQ + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && + cpu_isset(cpu, nohz.cpu_mask)) + return; +#endif + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); } #else /* @@ -3218,16 +3446,17 @@ void scheduler_tick(void) unsigned long long now = sched_clock(); struct task_struct *p = current; int cpu = smp_processor_id(); + int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); update_cpu_clock(p, rq, now); - if (p != rq->idle) + if (!idle_at_tick) task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); + rq->idle_at_tick = idle_at_tick; + trigger_load_balance(cpu); #endif } @@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) struct prio_array *array; unsigned long flags; struct rq *rq; - int oldprio; + int delta; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - oldprio = p->prio; + delta = prio - p->prio; array = p->array; if (array) dequeue_task(p, array); @@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(p, array); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's + * our priority decreased, or if our priority became higher + * than the current's. */ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) + if (TASK_PREEMPTS_CURR(p, rq) || + (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } task_rq_unlock(rq, &flags); @@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice) enqueue_task(p, array); inc_raw_weighted_load(rq, p); /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if our priority became higher + * than the current's. */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) + if (TASK_PREEMPTS_CURR(p, rq) || + (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -4153,13 +4382,11 @@ recheck: __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's + * our priority decreased, or our priority became higher + * than the current's. */ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) + if (TASK_PREEMPTS_CURR(p, rq) || + (task_running(rq, p) && p->prio > oldprio)) resched_task(rq->curr); } __task_rq_unlock(rq); @@ -4750,6 +4977,8 @@ void show_state_filter(unsigned long state_filter) show_task(p); } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -5304,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } - if (!group->cpu_power) { + if (!group->__cpu_power) { printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -5481,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, continue; sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; + sg->__cpu_power = 0; for_each_cpu_mask(j, span) { if (group_fn(j, cpu_map, NULL) != group) @@ -6170,7 +6399,7 @@ next_sg: continue; } - sg->cpu_power += sd->groups->cpu_power; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); } sg = sg->next; if (sg != group_head) @@ -6245,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; + sd->groups->__cpu_power = 0; + /* * For perf policy, if the groups in child domain share resources * (for example cores sharing some portions of the cache hierarchy @@ -6255,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && (child->flags & (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); return; } - sd->groups->cpu_power = 0; - /* * add cpu_power of each child group to this groups cpu_power */ group = child->groups; do { - sd->groups->cpu_power += group->cpu_power; + sg_inc_cpu_power(sd->groups, group->__cpu_power); group = group->next; } while (group != child->groups); } @@ -6426,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(node_domains, j); sd->groups = sg; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = nodemask; sg->next = sg; cpus_or(covered, covered, nodemask); @@ -6454,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) "Can not alloc domain group for node %d\n", j); goto error; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = tmp; sg->next = prev->next; cpus_or(covered, covered, tmp); diff --git a/kernel/signal.c b/kernel/signal.c index 2b4087d545a3..1368e67c8482 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -12,7 +12,6 @@ #include <linux/slab.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/fs.h> diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 50afeb813305..8fa7040247ad 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -34,12 +34,32 @@ static struct notifier_block panic_block = { .notifier_call = softlock_panic, }; +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(void) +{ + return sched_clock() >> 30; /* 2^30 ~= 10^9 */ +} + void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(touch_timestamp) = jiffies; + __raw_get_cpu_var(touch_timestamp) = get_timestamp(); } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* Cause each CPU to re-update its timestamp rather than complain */ + for_each_online_cpu(cpu) + per_cpu(touch_timestamp, cpu) = 0; +} +EXPORT_SYMBOL(touch_all_softlockup_watchdogs); + /* * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: @@ -48,9 +68,18 @@ void softlockup_tick(void) { int this_cpu = smp_processor_id(); unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); + unsigned long print_timestamp; + unsigned long now; + + if (touch_timestamp == 0) { + touch_softlockup_watchdog(); + return; + } + + print_timestamp = per_cpu(print_timestamp, this_cpu); - /* prevent double reports: */ - if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || + /* report at most once a second */ + if (print_timestamp < (touch_timestamp + 1) || did_panic || !per_cpu(watchdog_task, this_cpu)) return; @@ -61,12 +90,14 @@ void softlockup_tick(void) return; } + now = get_timestamp(); + /* Wake up the high-prio watchdog task every second: */ - if (time_after(jiffies, touch_timestamp + HZ)) + if (now > (touch_timestamp + 1)) wake_up_process(per_cpu(watchdog_task, this_cpu)); /* Warn about unreasonable 10+ seconds delays: */ - if (time_after(jiffies, touch_timestamp + 10*HZ)) { + if (now > (touch_timestamp + 10)) { per_cpu(print_timestamp, this_cpu) = touch_timestamp; spin_lock(&print_lock); @@ -82,11 +113,14 @@ void softlockup_tick(void) */ static int watchdog(void * __bind_cpu) { - struct sched_param param = { .sched_priority = 99 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); current->flags |= PF_NOFREEZE; + /* initialize timestamp */ + touch_softlockup_watchdog(); + /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 10 seconds then the @@ -118,7 +152,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) printk("watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } - per_cpu(touch_timestamp, hotcpu) = jiffies; + per_cpu(touch_timestamp, hotcpu) = 0; per_cpu(watchdog_task, hotcpu) = p; kthread_bind(p, hotcpu); break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12458040e665..daabb74ee0bc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,11 +1,12 @@ /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ -#include <linux/stop_machine.h> -#include <linux/kthread.h> -#include <linux/sched.h> #include <linux/cpu.h> #include <linux/err.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/stop_machine.h> #include <linux/syscalls.h> #include <asm/atomic.h> #include <asm/semaphore.h> @@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) return ret; } +EXPORT_SYMBOL_GPL(stop_machine_run); diff --git a/kernel/sys.c b/kernel/sys.c index fe1f3ab20477..926bf9d7ac45 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (retval) return retval; + if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim.rlim_cur = 1; + } + task_lock(current->group_leader); *old_rlim = new_rlim; task_unlock(current->group_leader); @@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) unsigned long rlim_cur = new_rlim.rlim_cur; cputime_t cputime; - if (rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - rlim_cur = 1; - } cputime = secs_to_cputime(rlim_cur); read_lock(&tasklist_lock); spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c904748f2290..f0664bd5011c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int maps_protect; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -603,6 +604,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_FS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "maps_protect", + .data = &maps_protect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; diff --git a/kernel/time.c b/kernel/time.c index ba18ec4899bd..f04791f69408 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -31,7 +31,6 @@ #include <linux/timex.h> #include <linux/capability.h> #include <linux/errno.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> @@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb) } EXPORT_SYMBOL(current_fs_time); +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int inline jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int inline jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + /** * timespec_trunc - Truncate timespec to a granularity * @t: Timespec @@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec) EXPORT_SYMBOL(ns_to_timeval); /* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - -/* * When we convert to jiffies then we interpret incoming values * the following way: * diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 93bccba1f265..99b6034fc86b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bfda3f7f0716..a96ec9ab3454 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -static int tick_do_timer_cpu = -1; +int tick_do_timer_cpu __read_mostly = -1; DEFINE_SPINLOCK(tick_device_lock); /* @@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } + /* Transfer the do_timer job away from this cpu */ + if (*cpup == tick_do_timer_cpu) { + int cpu = first_cpu(cpu_online_map); + + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c9d203bde518..bb13f2724905 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; +extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 51556b95f60f..3483e6cb9549 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { + if (select_nohz_load_balancer(1)) { + /* + * sched tick not stopped! + */ + cpu_clear(cpu, nohz_cpu_mask); + goto out; + } + ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; } + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + /* * calculate the expiry time for the next timer wheel * timer @@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void) now = ktime_get(); local_irq_disable(); + select_nohz_load_balancer(0); tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); @@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); + int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; + /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * When we are idle and the tick is stopped, we have to touch @@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct hrtimer_cpu_base *base = timer->base->cpu_base; struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; +#endif /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * Do not call, when we are not in irq context and have diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 000000000000..f9217bf644f6 --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,476 @@ +/* + * linux/kernel/time/timekeeping.c + * + * Kernel timekeeping code and accessor functions + * + * This code was moved from linux/kernel/timer.c. + * Please see that file for copyright and history logs. + * + */ + +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/sysdev.h> +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/tick.h> + + +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime and avenrun. + */ +__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + +EXPORT_SYMBOL(xtime_lock); + + +/* + * The current time + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + */ +struct timespec xtime __attribute__ ((aligned (16))); +struct timespec wall_to_monotonic __attribute__ ((aligned (16))); + +EXPORT_SYMBOL(xtime); + + +static struct clocksource *clock; /* pointer to current clocksource */ + + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). + */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + clock->error = 0; + ntp_clear(); + + update_vsyscall(&xtime, clock); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static void change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + + new = clocksource_get_next(); + + if (clock == new) + return; + + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + + tick_clock_notify(); + + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); +} +#else +static inline void change_clocksource(void) { } +#endif + +/** + * timekeeping_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + + do { + seq = read_seqbegin(&xtime_lock); + + ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/** + * read_persistent_clock - Return time in seconds from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Returns seconds from epoch using the battery backed persistent clock. + * Returns zero if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +unsigned long __attribute__((weak)) read_persistent_clock(void) +{ + return 0; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + unsigned long sec = read_persistent_clock(); + + write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + clock->cycle_last = clocksource_read(clock); + + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} + +/* flag for if timekeeping is suspended */ +static int timekeeping_suspended; +/* time in seconds when suspend began */ +static unsigned long timekeeping_suspend_time; + +/** + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/etc are + * still managed by arch specific suspend/resume code. + */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + unsigned long now = read_persistent_clock(); + + write_seqlock_irqsave(&xtime_lock, flags); + + if (now && (now > timekeeping_suspend_time)) { + unsigned long sleep_length = now - timekeeping_suspend_time; + + xtime.tv_sec += sleep_length; + wall_to_monotonic.tv_sec -= sleep_length; + } + /* re-base the last cycle value */ + clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); + + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); + + /* Resume hrtimers */ + hres_timers_resume(); + + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; + timekeeping_suspend_time = read_persistent_clock(); + write_sequnlock_irqrestore(&xtime_lock, flags); + + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + .suspend = timekeeping_suspend, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * If the error is already larger, we look ahead even further + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) +{ + s64 tick_error, i; + u32 look_ahead, adj; + s32 error2, mult; + + /* + * Use the current error value to determine how much to look ahead. + * The larger the error the slower we adjust for it to avoid problems + * with losing too many ticks, otherwise we would overadjust and + * produce an even larger error. The smaller the adjustment the + * faster we try to adjust for it, as lost ticks can do less harm + * here. This is tuned so that an error of about 1 msec is adusted + * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). + */ + error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = abs(error2); + for (look_ahead = 0; error2 > 0; look_ahead++) + error2 >>= 2; + + /* + * Now calculate the error in (1 << look_ahead) ticks, but first + * remove the single look ahead already included in the error. + */ + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error -= clock->xtime_interval >> 1; + error = ((error - tick_error) >> look_ahead) + tick_error; + + /* Finally calculate the adjustment shift value. */ + i = *interval; + mult = 1; + if (error < 0) { + error = -error; + *interval = -*interval; + *offset = -*offset; + mult = -1; + } + for (adj = 0; error > i; adj++) + error >>= 1; + + *interval <<= adj; + *offset <<= adj; + return mult << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. + */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + error >>= 2; + if (likely(error <= interval)) + adj = 1; + else + adj = clocksource_bigadjust(error, &interval, &offset); + } else if (error < -interval) { + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else + adj = clocksource_bigadjust(error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); +} + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +void update_wall_time(void) +{ + cycle_t offset; + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; + +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. + */ + while (offset >= clock->cycle_interval) { + /* accumulate one interval */ + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + xtime.tv_sec++; + second_overflow(); + } + + /* interpolator bits */ + time_interpolator_update(clock->xtime_interval + >> clock->shift); + + /* accumulate error between NTP and clock interval */ + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } + + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); + + /* store full nanoseconds into xtime */ + xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + change_clocksource(); + update_vsyscall(&xtime, clock); +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 59df5e8555a8..b734ca4bc75e 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); static void print_name_offset(struct seq_file *m, void *sym) { - unsigned long addr = (unsigned long)sym; - char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; - const char *sym_name; - char *modname; - - sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); - if (sym_name) - SEQ_printf(m, "%s", sym_name); - else + char symname[KSYM_NAME_LEN+1]; + + if (lookup_symbol_name((unsigned long)sym, symname) < 0) SEQ_printf(m, "<%p>", sym); + else + SEQ_printf(m, "%s", symname); } static void diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1bc4882e28e0..868f1bceb07f 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static void print_name_offset(struct seq_file *m, unsigned long addr) { - char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; - const char *sym_name; - char *modname; - - sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); - if (sym_name) - seq_printf(m, "%s", sym_name); - else + char symname[KSYM_NAME_LEN+1]; + + if (lookup_symbol_name(addr, symname) < 0) seq_printf(m, "<%p>", (void *)addr); + else + seq_printf(m, "%s", symname); } static int tstats_show(struct seq_file *m, void *v) diff --git a/kernel/timer.c b/kernel/timer.c index b22bd39740dd..7a6448340f90 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,7 +1,7 @@ /* * linux/kernel/timer.c * - * Kernel internal timers, kernel timekeeping, basic process system calls + * Kernel internal timers, basic process system calls * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -74,7 +74,7 @@ struct tvec_t_base_s { tvec_t tv3; tvec_t tv4; tvec_t tv5; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned; typedef struct tvec_t_base_s tvec_base_t; @@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/* + * Note that all tvec_bases is 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. Use the LSB for + * the new flag to indicate whether the timer is deferrable + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(tvec_base_t *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline tvec_base_t *tbase_get_base(tvec_base_t *base) +{ + return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); +} + +static inline void +timer_set_base(struct timer_list *timer, tvec_base_t *new_base) +{ + timer->base = (tvec_base_t *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} + /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded @@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer) } EXPORT_SYMBOL(init_timer); +void fastcall init_timer_deferrable(struct timer_list *timer) +{ + init_timer(timer); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable); + static inline void detach_timer(struct timer_list *timer, int clear_pending) { @@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, tvec_base_t *base; for (;;) { - base = timer->base; + tvec_base_t *prelock_base = timer->base; + base = tbase_get_base(prelock_base); if (likely(base != NULL)) { spin_lock_irqsave(&base->lock, *flags); - if (likely(base == timer->base)) + if (likely(prelock_base == timer->base)) return base; /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); @@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer->base = NULL; + timer_set_base(timer, NULL); spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->base = base; + timer_set_base(timer, base); } } @@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer->base = base; + timer_set_base(timer, base); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); } @@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * don't have to detach them individually. */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(timer->base != base); + BUG_ON(tbase_get_base(timer->base) != base); internal_add_timer(base, timer); } @@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base) void (*fn)(unsigned long); unsigned long data; - timer = list_entry(head->next,struct timer_list,entry); + timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; @@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + found = 1; expires = nte->expires; /* Look at the cascade bucket(s)? */ @@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void) #endif -/******************************************************************/ - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); - -EXPORT_SYMBOL(xtime); - - -/* XXX - all of this timekeeping code should be later moved to time.c */ -#include <linux/clocksource.h> -static struct clocksource *clock; /* pointer to current clocksource */ - -#ifdef CONFIG_GENERIC_TIME -/** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook - * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) - */ -static inline s64 __get_nsec_offset(void) -{ - cycle_t cycle_now, cycle_delta; - s64 ns_offset; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); - - return ns_offset; -} - -/** - * __get_realtime_clock_ts - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. Used by - * do_gettimeofday() and get_realtime_clock_ts(). - */ -static inline void __get_realtime_clock_ts(struct timespec *ts) -{ - unsigned long seq; - s64 nsecs; - - do { - seq = read_seqbegin(&xtime_lock); - - *ts = xtime; - nsecs = __get_nsec_offset(); - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} - -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. - */ -void getnstimeofday(struct timespec *ts) -{ - __get_realtime_clock_ts(ts); -} - -EXPORT_SYMBOL(getnstimeofday); - -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using get_realtime_clock_ts() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec now; - - __get_realtime_clock_ts(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} - -EXPORT_SYMBOL(do_gettimeofday); -/** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time - * - * Sets the time of day to the new time and update NTP and notify hrtimers - */ -int do_settimeofday(struct timespec *tv) -{ - unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - nsec -= __get_nsec_offset(); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - clock->error = 0; - ntp_clear(); - - update_vsyscall(&xtime, clock); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -/** - * change_clocksource - Swaps clocksources if a new one is available - * - * Accumulates current time interval and initializes new clocksource - */ -static void change_clocksource(void) -{ - struct clocksource *new; - cycle_t now; - u64 nsec; - - new = clocksource_get_next(); - - if (clock == new) - return; - - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - - tick_clock_notify(); - - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); -} -#else -static inline void change_clocksource(void) { } -#endif - -/** - * timekeeping_is_continuous - check to see if timekeeping is free running - */ -int timekeeping_is_continuous(void) -{ - unsigned long seq; - int ret; - - do { - seq = read_seqbegin(&xtime_lock); - - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - - } while (read_seqretry(&xtime_lock, seq)); - - return ret; -} - -/** - * read_persistent_clock - Return time in seconds from the persistent clock. - * - * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. - */ -unsigned long __attribute__((weak)) read_persistent_clock(void) -{ - return 0; -} - -/* - * timekeeping_init - Initializes the clocksource and common timekeeping values - */ -void __init timekeeping_init(void) -{ - unsigned long flags; - unsigned long sec = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - ntp_clear(); - - clock = clocksource_get_next(); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - write_sequnlock_irqrestore(&xtime_lock, flags); -} - -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; -/* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; - -/** - * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. - */ -static int timekeeping_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long now = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - } - /* re-base the last cycle value */ - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - - touch_softlockup_watchdog(); - - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ - hres_timers_resume(); - - return 0; -} - -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); - write_sequnlock_irqrestore(&xtime_lock, flags); - - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - - return 0; -} - -/* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, - set_kset_name("timekeeping"), -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) -{ - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(timekeeping_init_device); - -/* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. - */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, - s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). - */ - error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; - - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = current_tick_length() >> - (TICK_LENGTH_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void clocksource_adjust(struct clocksource *clock, s64 offset) -{ - s64 error, interval = clock->cycle_interval; - int adj; - - error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); - if (error > interval) { - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = clocksource_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = clocksource_bigadjust(error, &interval, &offset); - } else - return; - - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (TICK_LENGTH_SHIFT - clock->shift); -} - -/** - * update_wall_time - Uses the current clocksource to increment the wall time - * - * Called from the timer interrupt, must hold a write on xtime_lock. - */ -static void update_wall_time(void) -{ - cycle_t offset; - - /* Make sure we're fully resumed: */ - if (unlikely(timekeeping_suspended)) - return; - -#ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; -#else - offset = clock->cycle_interval; -#endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; - - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. - */ - while (offset >= clock->cycle_interval) { - /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; - offset -= clock->cycle_interval; - - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; - xtime.tv_sec++; - second_overflow(); - } - - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - - /* accumulate error between NTP and clock interval */ - clock->error += current_tick_length(); - clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); - } - - /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); - - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - - /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); -} - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. @@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks) } /* - * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. - */ -__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - -EXPORT_SYMBOL(xtime_lock); - -/* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) @@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu) cpu_to_node(cpu)); if (!base) return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } memset(base, 0, sizeof(*base)); per_cpu(tvec_bases, cpu) = base; } else { @@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) struct timer_list *timer; while (!list_empty(head)) { - timer = list_entry(head->next, struct timer_list, entry); + timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); - timer->base = new_base; + timer_set_base(timer, new_base); internal_add_timer(new_base, timer); } } diff --git a/kernel/uid16.c b/kernel/uid16.c index 187e2a423878..dd308ba4e03b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -6,7 +6,6 @@ #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> -#include <linux/smp_lock.h> #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index c859164a6993..160c8c5136bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) } /* - * unshare the current process' utsname namespace. - * called only in sys_unshare() - */ -int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) -{ - if (unshare_flags & CLONE_NEWUTS) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_uts = clone_uts_ns(current->nsproxy->uts_ns); - if (!*new_uts) - return -ENOMEM; - } - - return 0; -} - -/* * Copy task tsk's utsname namespace, or clone it if flags * specifies CLONE_NEWUTS. In latter case, changes to the * utsname of this process won't be seen by parent, and vice * versa. */ -int copy_utsname(int flags, struct task_struct *tsk) +struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; - int err = 0; - - if (!old_ns) - return 0; + BUG_ON(!old_ns); get_uts_ns(old_ns); if (!(flags & CLONE_NEWUTS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } + return old_ns; new_ns = clone_uts_ns(old_ns); - if (!new_ns) { - err = -ENOMEM; - goto out; - } - tsk->nsproxy->uts_ns = new_ns; -out: put_uts_ns(old_ns); - return err; + return new_ns; } void free_uts_ns(struct kref *kref) |