diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/futex.c | 176 | ||||
-rw-r--r-- | kernel/kgdb.c | 8 | ||||
-rw-r--r-- | kernel/module.c | 44 | ||||
-rw-r--r-- | kernel/sched.c | 323 | ||||
-rw-r--r-- | kernel/sched_clock.c | 236 | ||||
-rw-r--r-- | kernel/sched_debug.c | 7 | ||||
-rw-r--r-- | kernel/sched_fair.c | 39 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 2 | ||||
-rw-r--r-- | kernel/sched_rt.c | 9 |
10 files changed, 419 insertions, 427 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 188c43223f52..1c9938addb9d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/futex.c b/kernel/futex.c index 98092c9817f4..449def8074fe 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -104,10 +104,6 @@ struct futex_q { /* Key which the futex is hashed on: */ union futex_key key; - /* For fd, sigio sent using these: */ - int fd; - struct file *filp; - /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; @@ -126,9 +122,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; -/* Futex-fs vfsmount entry: */ -static struct vfsmount *futex_mnt; - /* * Take mm->mmap_sem, when futex is shared */ @@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static void wake_futex(struct futex_q *q) { plist_del(&q->list, &q->list.plist); - if (q->filp) - send_sigio(&q->filp->f_owner, q->fd, POLL_IN); /* * The lock in wake_up_all() is a crucial memory barrier after the * plist_del() and also before assigning to q->lock_ptr. @@ -988,14 +979,10 @@ out: } /* The key must be already stored in q->key. */ -static inline struct futex_hash_bucket * -queue_lock(struct futex_q *q, int fd, struct file *filp) +static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - q->fd = fd; - q->filp = filp; - init_waitqueue_head(&q->waiters); get_futex_key_refs(&q->key); @@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) return hb; } -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * exactly once. They are called with the hashed spinlock held. */ -/* The key must be already stored in q->key. */ -static void queue_me(struct futex_q *q, int fd, struct file *filp) -{ - struct futex_hash_bucket *hb; - - hb = queue_lock(q, fd, filp); - __queue_me(q, hb); -} - /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { @@ -1194,7 +1172,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret != 0)) goto out_release_sem; - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); /* * Access the page AFTER the futex is queued. @@ -1238,7 +1216,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1386,7 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_release_sem; retry_unlocked: - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); retry_locked: ret = lock_taken = 0; @@ -1499,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* * Only actually queue now that the atomic ops are done: */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1746,121 +1724,6 @@ pi_faulted: return ret; } -static int futex_close(struct inode *inode, struct file *filp) -{ - struct futex_q *q = filp->private_data; - - unqueue_me(q); - kfree(q); - - return 0; -} - -/* This is one-shot: once it's gone off you need a new fd */ -static unsigned int futex_poll(struct file *filp, - struct poll_table_struct *wait) -{ - struct futex_q *q = filp->private_data; - int ret = 0; - - poll_wait(filp, &q->waiters, wait); - - /* - * plist_node_empty() is safe here without any lock. - * q->lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (plist_node_empty(&q->list)) - ret = POLLIN | POLLRDNORM; - - return ret; -} - -static const struct file_operations futex_fops = { - .release = futex_close, - .poll = futex_poll, -}; - -/* - * Signal allows caller to avoid the race which would occur if they - * set the sigio stuff up afterwards. - */ -static int futex_fd(u32 __user *uaddr, int signal) -{ - struct futex_q *q; - struct file *filp; - int ret, err; - struct rw_semaphore *fshared; - static unsigned long printk_interval; - - if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { - printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " - "will be removed from the kernel in June 2007\n", - current->comm); - } - - ret = -EINVAL; - if (!valid_signal(signal)) - goto out; - - ret = get_unused_fd(); - if (ret < 0) - goto out; - filp = get_empty_filp(); - if (!filp) { - put_unused_fd(ret); - ret = -ENFILE; - goto out; - } - filp->f_op = &futex_fops; - filp->f_path.mnt = mntget(futex_mnt); - filp->f_path.dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; - - if (signal) { - err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); - if (err < 0) { - goto error; - } - filp->f_owner.signum = signal; - } - - q = kmalloc(sizeof(*q), GFP_KERNEL); - if (!q) { - err = -ENOMEM; - goto error; - } - q->pi_state = NULL; - - fshared = ¤t->mm->mmap_sem; - down_read(fshared); - err = get_futex_key(uaddr, fshared, &q->key); - - if (unlikely(err != 0)) { - up_read(fshared); - kfree(q); - goto error; - } - - /* - * queue_me() must be called before releasing mmap_sem, because - * key->shared.inode needs to be referenced while holding it. - */ - filp->private_data = q; - - queue_me(q, ret, filp); - up_read(fshared); - - /* Now we map fd to filp, so userspace can access it */ - fd_install(ret, filp); -out: - return ret; -error: - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; -} - /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -2092,10 +1955,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_BITSET: ret = futex_wake(uaddr, fshared, val, val3); break; - case FUTEX_FD: - /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ - ret = futex_fd(uaddr, val); - break; case FUTEX_REQUEUE: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); break; @@ -2156,19 +2015,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt); -} - -static struct file_system_type futex_fs_type = { - .name = "futexfs", - .get_sb = futexfs_get_sb, - .kill_sb = kill_anon_super, -}; - static int __init futex_init(void) { u32 curval; @@ -2193,16 +2039,6 @@ static int __init futex_init(void) spin_lock_init(&futex_queues[i].lock); } - i = register_filesystem(&futex_fs_type); - if (i) - return i; - - futex_mnt = kern_mount(&futex_fs_type); - if (IS_ERR(futex_mnt)) { - unregister_filesystem(&futex_fs_type); - return PTR_ERR(futex_mnt); - } - return 0; } __initcall(futex_init); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 1bd0ec1c80b2..39e31a036f5b 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -61,7 +61,7 @@ struct kgdb_state { int err_code; int cpu; int pass_exception; - long threadid; + unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; }; @@ -146,7 +146,7 @@ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); * the other CPUs might interfere with your debugging context, so * use this with care: */ -int kgdb_do_roundup = 1; +static int kgdb_do_roundup = 1; static int __init opt_nokgdbroundup(char *str) { @@ -438,7 +438,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count) * While we find nice hex chars, build a long_val. * Return number of chars processed. */ -int kgdb_hex2long(char **ptr, long *long_val) +int kgdb_hex2long(char **ptr, unsigned long *long_val) { int hex_val; int num = 0; @@ -709,7 +709,7 @@ int kgdb_isremovedbreak(unsigned long addr) return 0; } -int remove_all_break(void) +static int remove_all_break(void) { unsigned long addr; int error; diff --git a/kernel/module.c b/kernel/module.c index 8674a390a2e8..8e4528c9909f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -890,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = { static const char vermagic[] = VERMAGIC_STRING; +static int try_to_force_load(struct module *mod, const char *symname) +{ +#ifdef CONFIG_MODULE_FORCE_LOAD + if (!(tainted & TAINT_FORCED_MODULE)) + printk("%s: no version for \"%s\" found: kernel tainted.\n", + mod->name, symname); + add_taint_module(mod, TAINT_FORCED_MODULE); + return 0; +#else + return -ENOEXEC; +#endif +} + #ifdef CONFIG_MODVERSIONS static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, @@ -914,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == *crc) return 1; - printk("%s: disagrees about version of symbol %s\n", - mod->name, symname); DEBUGP("Found checksum %lX vs module %lX\n", *crc, versions[i].crc); - return 0; + goto bad_version; } - /* Not in module's version table. OK, but that taints the kernel. */ - if (!(tainted & TAINT_FORCED_MODULE)) - printk("%s: no version for \"%s\" found: kernel tainted.\n", - mod->name, symname); - add_taint_module(mod, TAINT_FORCED_MODULE); - return 1; + + if (!try_to_force_load(mod, symname)) + return 1; + +bad_version: + printk("%s: disagrees about version of symbol %s\n", + mod->name, symname); + return 0; } static inline int check_modstruct_version(Elf_Shdr *sechdrs, @@ -1853,9 +1866,9 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - add_taint_module(mod, TAINT_FORCED_MODULE); - printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", - mod->name); + err = try_to_force_load(mod, "magic"); + if (err) + goto free_hdr; } else if (!same_magic(modmagic, vermagic)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); @@ -2006,9 +2019,10 @@ static struct module *load_module(void __user *umod, (mod->num_gpl_future_syms && !gplfuturecrcindex) || (mod->num_unused_syms && !unusedcrcindex) || (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { - printk(KERN_WARNING "%s: No versions for exported symbols." - " Tainting kernel.\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE); + printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); + err = try_to_force_load(mod, "nocrc"); + if (err) + goto cleanup; } #endif markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); diff --git a/kernel/sched.c b/kernel/sched.c index 34bcc5bc120e..58fb8af15776 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,16 +75,6 @@ #include <asm/irq_regs.h> /* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - -/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) } #endif +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +static DEFINE_MUTEX(sched_domains_mutex); + #ifdef CONFIG_GROUP_SCHED #include <linux/cgroup.h> @@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); -/* doms_cur_mutex serializes access to doms_cur[] array */ -static DEFINE_MUTEX(doms_cur_mutex); - #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +/* + * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ #define MIN_SHARES 2 +#define MAX_SHARES (ULONG_MAX - 1) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -static inline void lock_doms_cur(void) -{ - mutex_lock(&doms_cur_mutex); -} - -static inline void unlock_doms_cur(void) -{ - mutex_unlock(&doms_cur_mutex); -} - #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_doms_cur(void) { } -static inline void unlock_doms_cur(void) { } #endif /* CONFIG_GROUP_SCHED */ @@ -560,13 +547,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows, clock_underflows; - u64 idle_clock; - unsigned int clock_deep_idle_events; - u64 tick_timestamp; + u64 clock; atomic_t nr_iowait; @@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq) #endif } -#ifdef CONFIG_NO_HZ -static inline bool nohz_on(int cpu) -{ - return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; -} - -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ - rq->last_tick_seen = jiffies; -} -#else -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ -} -#endif - -/* - * Update the per-runqueue clock, as finegrained as the platform can give - * us, but without assuming monotonicity, etc.: - */ -static void __update_rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -#endif - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; - u64 max_time = rq->tick_timestamp + max_jump; - - if (unlikely(clock + delta > max_time)) { - if (clock < max_time) - clock = max_time; - else - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; -} - -static void update_rq_clock(struct rq *rq) -{ - if (likely(smp_processor_id() == cpu_of(rq))) - __update_rq_clock(rq); -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static inline void update_rq_clock(struct rq *rq) +{ + rq->clock = sched_clock_cpu(cpu_of(rq)); +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -__read_mostly char *sched_feat_names[] = { +static __read_mostly char *sched_feat_names[] = { #include "sched_features.h" NULL }; #undef SCHED_FEAT -int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_open(struct inode *inode, struct file *filp) { filp->private_data = inode->i_private; return 0; @@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static const unsigned long long time_sync_thresh = 100000; +unsigned long long time_sync_thresh = 100000; static DEFINE_PER_CPU(unsigned long long, time_offset); static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); @@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); static DEFINE_SPINLOCK(time_sync_lock); static unsigned long long prev_global_time; -static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) +static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) { - unsigned long flags; - - spin_lock_irqsave(&time_sync_lock, flags); + /* + * We want this inlined, to not get tracer function calls + * in this critical section: + */ + spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); + __raw_spin_lock(&time_sync_lock.raw_lock); if (time < prev_global_time) { per_cpu(time_offset, cpu) += prev_global_time - time; @@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) prev_global_time = time; } - spin_unlock_irqrestore(&time_sync_lock, flags); + __raw_spin_unlock(&time_sync_lock.raw_lock); + spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); return time; } @@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) static unsigned long long __cpu_clock(int cpu) { unsigned long long now; - unsigned long flags; - struct rq *rq; /* * Only call sched_clock() if the scheduler has already been @@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu) if (unlikely(!scheduler_running)) return 0; - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); - now = rq->clock; - local_irq_restore(flags); + now = sched_clock_cpu(cpu); return now; } @@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu) unsigned long long cpu_clock(int cpu) { unsigned long long prev_cpu_time, time, delta_time; + unsigned long flags; + local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; - if (unlikely(delta_time > time_sync_thresh)) + if (unlikely(delta_time > time_sync_thresh)) { time = __sync_cpu_clock(time, cpu); + per_cpu(prev_cpu_time, cpu) = time; + } + local_irq_restore(flags); return time; } @@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void) return rq; } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - - spin_lock(&rq->lock); - __update_rq_clock(rq); - spin_unlock(&rq->lock); - rq->clock_deep_idle_events++; -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - u64 now = sched_clock(); - - rq->idle_clock += delta_ns; - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - spin_lock(&rq->lock); - rq->prev_clock_raw = now; - rq->clock += delta_ns; - spin_unlock(&rq->lock); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - static void __resched_task(struct task_struct *p, int tif_bit); static inline void resched_task(struct task_struct *p) @@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq) enum { HRTICK_SET, /* re-programm hrtick_timer */ HRTICK_RESET, /* not a new slice */ + HRTICK_BLOCK, /* stop hrtick operations */ }; /* @@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; + if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } @@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); spin_lock(&rq->lock); - __update_rq_clock(rq); + update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); spin_unlock(&rq->lock); return HRTIMER_NORESTART; } -static inline void init_rq_hrtick(struct rq *rq) +static void hotplug_hrtick_disable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + rq->hrtick_flags = 0; + __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); + + hrtick_clear(rq); +} + +static void hotplug_hrtick_enable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hotplug_hrtick_disable(cpu); + return NOTIFY_OK; + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hotplug_hrtick_enable(cpu); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} + +static void init_rq_hrtick(struct rq *rq) { rq->hrtick_flags = 0; hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq) void hrtick_resched(void) { } + +static inline void init_hrtick(void) +{ +} #endif /* @@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); + if (!lw->inv_weight) + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); tmp = (u64)delta_exec * weight; /* @@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, if (shares < MIN_SHARES) shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; __set_se_shares(tg->se[tcpu], shares); } @@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - return account_guest_time(p, cputime); + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime); + return; + } p->stime = cputime_add(p->stime, cputime); @@ -4404,19 +4347,11 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; - u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + sched_clock_tick(); spin_lock(&rq->lock); - __update_rq_clock(rq); - /* - * Let rq->clock advance by at least TICK_NSEC: - */ - if (unlikely(rq->clock < next_tick)) { - rq->clock = next_tick; - rq->clock_underflows++; - } - rq->tick_timestamp = rq->clock; - update_last_tick_seen(rq); + update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); @@ -4570,7 +4505,7 @@ need_resched_nonpreemptible: * Do the rq-clock update outside the rq lock: */ local_irq_disable(); - __update_rq_clock(rq); + update_rq_clock(rq); spin_lock(&rq->lock); clear_tsk_need_resched(prev); @@ -4595,9 +4530,9 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); - sched_info_switch(prev, next); - if (likely(prev != next)) { + sched_info_switch(prev, next); + rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -7755,7 +7690,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, { int i, j; - lock_doms_cur(); + mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); @@ -7804,7 +7739,7 @@ match2: register_sched_domain_sysctl(); - unlock_doms_cur(); + mutex_unlock(&sched_domains_mutex); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -7813,8 +7748,10 @@ int arch_reinit_sched_domains(void) int err; get_online_cpus(); + mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); return err; @@ -7932,13 +7869,16 @@ void __init sched_init_smp(void) BUG_ON(sched_group_nodes_bycpu == NULL); #endif get_online_cpus(); + mutex_lock(&sched_domains_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) @@ -8025,7 +7965,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->my_q = cfs_rq; se->load.weight = tg->shares; - se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight); + se->load.inv_weight = 0; se->parent = parent; } #endif @@ -8149,8 +8089,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; - update_last_tick_seen(rq); init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8294,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) @@ -8325,7 +8264,6 @@ void normalize_rt_tasks(void) p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->clock = 0; if (!rt_task(p)) { /* @@ -8692,7 +8630,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) dequeue_entity(cfs_rq, se, 0); se->load.weight = shares; - se->load.inv_weight = div64_u64((1ULL<<32), shares); + se->load.inv_weight = 0; if (on_rq) enqueue_entity(cfs_rq, se, 0); @@ -8722,13 +8660,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (!tg->se[0]) return -EINVAL; - /* - * A weight of 0 or 1 can cause arithmetics problems. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ if (shares < MIN_SHARES) shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; mutex_lock(&shares_mutex); if (tg->shares == shares) @@ -8753,7 +8688,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * force a rebalance */ cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares/nr_cpu_ids); + set_se_shares(tg->se[i], shares); } /* diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c new file mode 100644 index 000000000000..9c597e37f7de --- /dev/null +++ b/kernel/sched_clock.c @@ -0,0 +1,236 @@ +/* + * sched_clock for unstable cpu clocks + * + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Based on code by: + * Ingo Molnar <mingo@redhat.com> + * Guillaume Chazarain <guichaz@gmail.com> + * + * Create a semi stable clock from a mixture of other events, including: + * - gtod + * - jiffies + * - sched_clock() + * - explicit idle events + * + * We use gtod as base and the unstable clock deltas. The deltas are filtered, + * making it monotonic and keeping it within an expected window. This window + * is set up using jiffies. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat + * consistent between cpus (never more than 1 jiffies difference). + */ +#include <linux/sched.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/ktime.h> +#include <linux/module.h> + + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + +struct sched_clock_data { + /* + * Raw spinlock - this is a special case: this might be called + * from within instrumentation code so we dont want to do any + * instrumentation ourselves. + */ + raw_spinlock_t lock; + + unsigned long prev_jiffies; + u64 prev_raw; + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + +void sched_clock_init(void) +{ + u64 ktime_now = ktime_to_ns(ktime_get()); + u64 now = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + scd->prev_jiffies = jiffies; + scd->prev_raw = now; + scd->tick_raw = now; + scd->tick_gtod = ktime_now; + scd->clock = ktime_now; + } +} + +/* + * update the percpu scd from the raw @now value + * + * - filter out backward motion + * - use jiffies to generate a min,max window to clip the raw values + */ +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +{ + unsigned long now_jiffies = jiffies; + long delta_jiffies = now_jiffies - scd->prev_jiffies; + u64 clock = scd->clock; + u64 min_clock, max_clock; + s64 delta = now - scd->prev_raw; + + WARN_ON_ONCE(!irqs_disabled()); + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + if (unlikely(delta < 0)) { + clock++; + goto out; + } + + max_clock = min_clock + TICK_NSEC; + + if (unlikely(clock + delta > max_clock)) { + if (clock < max_clock) + clock = max_clock; + else + clock++; + } else { + clock += delta; + } + + out: + if (unlikely(clock < min_clock)) + clock = min_clock; + + scd->prev_raw = now; + scd->prev_jiffies = now_jiffies; + scd->clock = clock; +} + +static void lock_double_clock(struct sched_clock_data *data1, + struct sched_clock_data *data2) +{ + if (data1 < data2) { + __raw_spin_lock(&data1->lock); + __raw_spin_lock(&data2->lock); + } else { + __raw_spin_lock(&data2->lock); + __raw_spin_lock(&data1->lock); + } +} + +u64 sched_clock_cpu(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + u64 now, clock; + + WARN_ON_ONCE(!irqs_disabled()); + now = sched_clock(); + + if (cpu != raw_smp_processor_id()) { + /* + * in order to update a remote cpu's clock based on our + * unstable raw time rebase it against: + * tick_raw (offset between raw counters) + * tick_gotd (tick offset between cpus) + */ + struct sched_clock_data *my_scd = this_scd(); + + lock_double_clock(scd, my_scd); + + now -= my_scd->tick_raw; + now += scd->tick_raw; + + now -= my_scd->tick_gtod; + now += scd->tick_gtod; + + __raw_spin_unlock(&my_scd->lock); + } else { + __raw_spin_lock(&scd->lock); + } + + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); + + return clock; +} + +void sched_clock_tick(void) +{ + struct sched_clock_data *scd = this_scd(); + u64 now, now_gtod; + + WARN_ON_ONCE(!irqs_disabled()); + + now = sched_clock(); + now_gtod = ktime_to_ns(ktime_get()); + + __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now); + /* + * update tick_gtod after __update_sched_clock() because that will + * already observe 1 new jiffy; adding a new tick_gtod to that would + * increase the clock 2 jiffies. + */ + scd->tick_raw = now; + scd->tick_gtod = now_gtod; + __raw_spin_unlock(&scd->lock); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct sched_clock_data *scd = this_scd(); + u64 now = sched_clock(); + + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + __raw_spin_lock(&scd->lock); + scd->prev_raw = now; + scd->clock += delta_ns; + __raw_spin_unlock(&scd->lock); + + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +#endif + +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6b4a12558e88..5f06118fbc31 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu) PN(next_balance); P(curr->pid); PN(clock); - PN(idle_clock); - PN(prev_clock_raw); - P(clock_warps); - P(clock_overflows); - P(clock_underflows); - P(clock_deep_idle_events); - PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 89fa32b4edf2..c863663d204d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + account_entity_enqueue(cfs_rq, se); if (wakeup) { place_entity(cfs_rq, se, 0); @@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - account_entity_enqueue(cfs_rq, se); } static void update_avg(u64 *avg, u64 sample) @@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ - if (queued) - return resched_task(rq_of(cfs_rq)->curr); + if (queued) { + resched_task(rq_of(cfs_rq)->curr); + return; + } /* * don't let the period tick interfere with the hrtick preemption */ @@ -957,7 +959,7 @@ static void yield_task_fair(struct rq *rq) return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - __update_rq_clock(rq); + update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ @@ -1007,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p) * sibling runqueue info. This will avoid the checks and cache miss * penalities associated with that. */ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) return cpu; for_each_domain(cpu, sd) { @@ -1611,30 +1613,6 @@ static const struct sched_class fair_sched_class = { }; #ifdef CONFIG_SCHED_DEBUG -static void -print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) -{ - struct sched_entity *se; - - if (!cfs_rq) - return; - - list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { - int i; - - for (i = depth; i; i--) - seq_puts(m, " "); - - seq_printf(m, "%lu %s %lu\n", - se->load.weight, - entity_is_task(se) ? "T" : "G", - calc_delta_weight(SCHED_LOAD_SCALE, se) - ); - if (!entity_is_task(se)) - print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); - } -} - static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; @@ -1642,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu) rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); - - seq_printf(m, "\nWeight tree:\n"); - print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); rcu_read_unlock(); } #endif diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 2bcafa375633..3a4f92dbbe66 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { +static const struct sched_class idle_sched_class = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c2730a5a4f05..060e87b0cb1c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq) } } - +/* + * If we are not running and we are not going to reschedule soon, we should + * try to push tasks away now + */ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && - (p->prio >= rq->rt.highest_prio) && + !test_tsk_need_resched(rq->curr) && rq->rt.overloaded) push_rt_tasks(rq); } @@ -1309,7 +1312,7 @@ static void set_curr_task_rt(struct rq *rq) p->se.exec_start = rq->clock; } -const struct sched_class rt_sched_class = { +static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, |