diff options
-rw-r--r-- | arch/x86/include/asm/mwait.h | 8 | ||||
-rw-r--r-- | arch/x86/kernel/process.c | 51 | ||||
-rw-r--r-- | kernel/sched/core.c | 17 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 43 |
5 files changed, 110 insertions, 15 deletions
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index a1410db38a1a..653dfa7662e1 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + /* * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, * which can obviate IPI to trigger checking of need_resched. diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 046e2d620bbe..a388bb883128 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -24,6 +24,7 @@ #include <asm/syscalls.h> #include <asm/idle.h> #include <asm/uaccess.h> +#include <asm/mwait.h> #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/debugreg.h> @@ -399,6 +400,53 @@ static void amd_e400_idle(void) default_idle(); } +/* + * Intel Core2 and older machines prefer MWAIT over HALT for C1. + * We can't rely on cpuidle installing MWAIT, because it will not load + * on systems that support only C1 -- so the boot default must be MWAIT. + * + * Some AMD machines are the opposite, they depend on using HALT. + * + * So for default C1, which is used during boot until cpuidle loads, + * use MWAIT-C1 on Intel HW that has it, else use HALT. + */ +static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (!cpu_has(c, X86_FEATURE_MWAIT)) + return 0; + + return 1; +} + +/* + * MONITOR/MWAIT with no hints, used for default C1 state. + * This invokes MWAIT with interrupts enabled and no flags, + * which is backwards compatible with the original MWAIT implementation. 
+ */ + +static void mwait_idle(void) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { + smp_mb(); /* quirk */ + clflush((void *)&current_thread_info()->flags); + smp_mb(); /* quirk */ + } + + __monitor((void *)&current_thread_info()->flags, 0, 0); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else { + local_irq_enable(); + } + __current_clr_polling(); +} + void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP @@ -412,6 +460,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; + } else if (prefer_mwait_c1_over_halt(c)) { + pr_info("using mwait in idle threads\n"); + x86_idle = mwait_idle; } else x86_idle = default_idle; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 62671f53202a..feda520bd034 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -690,6 +690,23 @@ static inline bool got_nohz_idle_kick(void) bool sched_can_stop_tick(void) { /* + * FIFO realtime policy runs the highest priority task. Other runnable + * tasks are of a lower priority. The scheduler tick does nothing. + */ + if (current->policy == SCHED_FIFO) + return true; + + /* + * Round-robin realtime tasks time slice with other tasks at the same + * realtime priority. Is this task the only one at this priority? + */ + if (current->policy == SCHED_RR) { + struct sched_rt_entity *rt_se = &current->rt; + + return rt_se->run_list.prev == rt_se->run_list.next; + } + + /* * More than one running task need preemption. * nr_running update is assumed to be visible * after IPI is sent from wakers. 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6d9403..0a81a954c041 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -914,6 +914,12 @@ static void yield_task_dl(struct rq *rq) } update_rq_clock(rq); update_curr_dl(rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..ee595ef30470 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1196,9 +1196,11 @@ static void task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { - long imb, old_imb; - long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; + long orig_src_load; + long load_a, load_b; + long moved_load; + long imb; /* * The load is corrected for the CPU capacity available on each node. @@ -1211,30 +1213,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + load_a = dst_load; + load_b = src_load; + if (load_a < load_b) + swap(load_a, load_b); /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; + imb = load_a * src_capacity * 100 - + load_b * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. + * Allow a move that brings us closer to a balanced situation, + * without moving things past the point of balance. 
*/ orig_src_load = env->src_stats.load; - orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + /* + * In a task swap, there will be one load moving from src to dst, + * and another moving back. This is the net sum of both moves. + * A simple task move will always have a positive value. + * Allow the move if it brings the system closer to a balanced + * situation, without crossing over the balance point. + */ + moved_load = orig_src_load - src_load; - /* Would this change make things worse? */ - return (imb > old_imb); + if (moved_load > 0) + /* Moving src -> dst. Did we overshoot balance? */ + return src_load * dst_capacity < dst_load * src_capacity; + else + /* Moving dst -> src. Did we overshoot balance? */ + return dst_load * src_capacity < src_load * dst_capacity; } /* @@ -1763,6 +1774,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) } } /* Next round, evaluate the nodes within max_group. */ + if (!max_faults) + break; nodes = max_group; } return nid; |