From 92f6a5e37a2e2d3342dafb2b39c2f8bc340bbf84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2009 12:43:07 +0200 Subject: sched: Do less agressive buddy clearing Yanmin reported a hackbench regression due to: > commit de69a80be32445b0a71e8e3b757e584d7beb90f7 > Author: Peter Zijlstra > Date: Thu Sep 17 09:01:20 2009 +0200 > > sched: Stop buddies from hogging the system I really liked de69a80b, and it affecting hackbench shows I wasn't crazy ;-) So hackbench is a multi-cast, with one sender spraying multiple receivers, who in their turn don't spray back. This would be exactly the scenario that patch 'cures'. Previously we would not clear the last buddy after running the next task, allowing the sender to get back to work sooner than it otherwise ought to have been, increasing latencies for other tasks. Now, since those receivers don't poke back, they don't enforce the buddy relation, which means there's nothing to re-elect the sender. Cure this by less agressively clearing the buddy stats. Only clear buddies when they were not chosen. It should still avoid a buddy sticking around long after its served its time. Reported-by: "Zhang, Yanmin" Signed-off-by: Peter Zijlstra CC: Mike Galbraith LKML-Reference: <1255084986.8802.46.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b47eeda..c32c3e643daa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -861,12 +861,21 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *buddy; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) - return cfs_rq->next; + if (cfs_rq->next) { + buddy = cfs_rq->next; + cfs_rq->next = NULL; + if (wakeup_preempt_entity(buddy, se) < 1) + return buddy; + } - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) - return cfs_rq->last; + if (cfs_rq->last) { + buddy = cfs_rq->last; + cfs_rq->last = NULL; + if (wakeup_preempt_entity(buddy, se) < 1) + return buddy; + } return se; } @@ -1654,16 +1663,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); - /* - * If se was a buddy, clear it so that it will have to earn - * the favour again. - * - * If se was not a buddy, clear the buddies because neither - * was elegible to run, let them earn it again. - * - * IOW. unconditionally clear buddies. - */ - __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From f685ceacab07d3f6c236f04803e2f2f0dbcc5afb Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Oct 2009 23:09:22 +0200 Subject: sched: Strengthen buddies and mitigate buddy induced latencies This patch restores the effectiveness of LAST_BUDDY in preventing pgsql+oltp from collapsing due to wakeup preemption. It also switches LAST_BUDDY to exclusively do what it does best, namely mitigate the effects of aggressive wakeup preemption, which improves vmark throughput markedly, and restores mysql+oltp scalability. Since buddies are about scalability, enable them beginning at the point where we begin expanding sched_latency, namely sched_nr_latency. Previously, buddies were cleared aggressively, which seriously reduced their effectiveness. Not clearing aggressively however, produces a small drop in mysql+oltp throughput immediately after peak, indicating that LAST_BUDDY is actually doing some harm. This is right at the point where X on the desktop in competition with another load wants low latency service. Ergo, do not enable until we need to scale. To mitigate latency induced by buddies, or by a task just missing wakeup preemption, check latency at tick time. Last hunk prevents buddies from stymieing BALANCE_NEWIDLE via CACHE_HOT_BUDDY. Supporting performance tests: tip = v2.6.32-rc5-1497-ga525b32 tipx = NO_GENTLE_FAIR_SLEEPERS NEXT_BUDDY granularity knobs = 31 knobs + 31 buddies tip+x = NO_GENTLE_FAIR_SLEEPERS granularity knobs = 31 knobs (Three run averages except where noted.) vmark: ------ tip 108466 messages per second tip+ 125307 messages per second tip+x 125335 messages per second tipx 117781 messages per second 2.6.31.3 122729 messages per second mysql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... tip 9949.89 18690.20 34801.24 34460.04 32682.88 30765.97 28305.27 25059.64 19548.08 tip+ 10013.90 18526.84 34900.38 34420.14 33069.83 32083.40 30578.30 28010.71 25605.47 tipx 9698.71 18002.70 34477.56 33420.01 32634.30 31657.27 29932.67 26827.52 21487.18 2.6.31.3 8243.11 18784.20 34404.83 33148.38 31900.32 31161.90 29663.81 25995.94 18058.86 pgsql+oltp: ----------- clients 1 2 4 8 16 32 64 128 256 .......................................................................................... tip 13686.37 26609.25 51934.28 51347.81 49479.51 45312.65 36691.91 26851.57 24145.35 tip+ (1x) 13907.85 27135.87 52951.98 52514.04 51742.52 50705.43 49947.97 48374.19 46227.94 tip+x 13906.78 27065.81 52951.19 52542.59 52176.11 51815.94 50838.90 49439.46 46891.00 tipx 13742.46 26769.81 52351.99 51891.73 51320.79 50938.98 50248.65 48908.70 46553.84 2.6.31.3 13815.35 26906.46 52683.34 52061.31 51937.10 51376.80 50474.28 49394.47 47003.25 Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 73 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 27 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched.c b/kernel/sched.c index 789001da0a94..cae6700bedb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2008,7 +2008,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c32c3e643daa..37087a7fac22 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } } @@ -861,21 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - struct sched_entity *buddy; + struct sched_entity *left = se; - if (cfs_rq->next) { - buddy = cfs_rq->next; - cfs_rq->next = NULL; - if (wakeup_preempt_entity(buddy, se) < 1) - return buddy; - } + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; - if (cfs_rq->last) { - buddy = cfs_rq->last; - cfs_rq->last = NULL; - if (wakeup_preempt_entity(buddy, se) < 1) - return buddy; - } + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + clear_buddies(cfs_rq, se); return se; } @@ -1577,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; + int scale = cfs_rq->nr_running >= sched_nr_latency; update_curr(cfs_rq); @@ -1591,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - /* - * Only set the backward buddy when the current task is still on the - * rq. This can happen when a wakeup gets interleaved with schedule on - * the ->pre_schedule() or idle_balance() point, either of which can - * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, for - * obvious reasons its a bad idea to schedule back to the idle thread. - */ - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* @@ -1648,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.2.3