From b1a0fbfdde65dffd83c84c006f84fa12041907c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Dec 2013 10:12:40 -0500 Subject: percpu: fix spurious sparse warnings from DEFINE_PER_CPU() When CONFIG_DEBUG_FORCE_WEAK_PER_CPU or CONFIG_ARCH_NEEDS_WEAK_PER_CPU is set, DEFINE_PER_CPU() explodes into cryptic series of definitions to still allow using "static" for percpu variables while keeping all per-cpu symbols unique in the kernel image which is required for weak symbols. This ultimately converts the actual symbol to global whether DEFINE_PER_CPU() is prefixed with static or not. Unfortunately, the macro forgot to add explicit extern declartion of the actual symbol ending up defining global symbol without preceding declaration for static definitions which naturally don't have matching DECLARE_PER_CPU(). The only ill effect is triggering of the following warnings. fs/inode.c:74:8: warning: symbol 'nr_inodes' was not declared. Should it be static? fs/inode.c:75:8: warning: symbol 'nr_unused' was not declared. Should it be static? Fix it by adding extern declaration in the DEFINE_PER_CPU() macro. Signed-off-by: Tejun Heo Reported-by: Wanlong Gao Tested-by: Wanlong Gao --- include/linux/percpu-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 57e890abe1f0..a5fc7d01aad6 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -69,6 +69,7 @@ __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ extern __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ + extern __PCPU_ATTRS(sec) __typeof__(type) name; \ __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \ __typeof__(type) name #else -- cgit v1.2.3 From ba1f14fbe70965ae0fb1655a5275a62723f65b77 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2013 14:26:41 +0100 Subject: sched: Remove PREEMPT_NEED_RESCHED from generic code While hunting a preemption issue with Alexander, Ben noticed that the currently generic PREEMPT_NEED_RESCHED stuff is horribly broken for load-store architectures. We currently rely on the IPI to fold TIF_NEED_RESCHED into PREEMPT_NEED_RESCHED, but when this IPI lands while we already have a load for the preempt-count but before the store, the store will erase the PREEMPT_NEED_RESCHED change. The current preempt-count only works on load-store archs because interrupts are assumed to be completely balanced wrt their preempt_count fiddling; the previous preempt_count load will match the preempt_count state after the interrupt and therefore nothing gets lost. This patch removes the PREEMPT_NEED_RESCHED usage from generic code and pushes it into x86 arch code; the generic code goes back to relying on TIF_NEED_RESCHED. Boot tested on x86_64 and compile tested on ppc64. Reported-by: Benjamin Herrenschmidt Reported-and-Tested-by: Alexander Graf Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20131128132641.GP10022@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 11 +++++++++++ include/asm-generic/preempt.h | 35 +++++++++++------------------------ include/linux/sched.h | 2 -- 3 files changed, 22 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 8729723636fd..c8b051933b1b 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -7,6 +7,12 @@ DECLARE_PER_CPU(int, __preempt_count); +/* + * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such + * that a decrement hitting 0 means we can and should reschedule. + */ +#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) + /* * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users * that think a non-zero value indicates we cannot preempt. @@ -74,6 +80,11 @@ static __always_inline void __preempt_count_sub(int val) __this_cpu_add_4(__preempt_count, -val); } +/* + * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule + * a decrement which hits zero means we have no preempt_count and should + * reschedule. + */ static __always_inline bool __preempt_count_dec_and_test(void) { GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index ddf2b420ac8f..1cd3f5d767a8 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -3,13 +3,11 @@ #include -/* - * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users - * that think a non-zero value indicates we cannot preempt. - */ +#define PREEMPT_ENABLED (0) + static __always_inline int preempt_count(void) { - return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; + return current_thread_info()->preempt_count; } static __always_inline int *preempt_count_ptr(void) @@ -17,11 +15,6 @@ static __always_inline int *preempt_count_ptr(void) return ¤t_thread_info()->preempt_count; } -/* - * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the - * alternative is loosing a reschedule. Better schedule too often -- also this - * should be a very rare operation. - */ static __always_inline void preempt_count_set(int pc) { *preempt_count_ptr() = pc; @@ -41,28 +34,17 @@ static __always_inline void preempt_count_set(int pc) task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ } while (0) -/* - * We fold the NEED_RESCHED bit into the preempt count such that - * preempt_enable() can decrement and test for needing to reschedule with a - * single instruction. - * - * We invert the actual bit, so that when the decrement hits 0 we know we both - * need to resched (the bit is cleared) and can resched (no preempt count). - */ - static __always_inline void set_preempt_need_resched(void) { - *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; } static __always_inline void clear_preempt_need_resched(void) { - *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; } static __always_inline bool test_preempt_need_resched(void) { - return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); + return false; } /* @@ -81,7 +63,12 @@ static __always_inline void __preempt_count_sub(int val) static __always_inline bool __preempt_count_dec_and_test(void) { - return !--*preempt_count_ptr(); + /* + * Because of load-store architectures cannot do per-cpu atomic + * operations; we cannot use PREEMPT_NEED_RESCHED because it might get + * lost. + */ + return !--*preempt_count_ptr() && tif_need_resched(); } /* @@ -89,7 +76,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(void) { - return unlikely(!*preempt_count_ptr()); + return unlikely(!preempt_count() && tif_need_resched()); } #ifdef CONFIG_PREEMPT diff --git a/include/linux/sched.h b/include/linux/sched.h index 768b037dfacb..96d674ba3876 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -440,8 +440,6 @@ struct task_cputime { .sum_exec_runtime = 0, \ } -#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) - #ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) #else -- cgit v1.2.3 From be5e610c0fd6ef772cafb9e0bd4128134804aef3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Nov 2013 18:27:06 +0100 Subject: math64: Add mul_u64_u32_shr() Introduce mul_u64_u32_shr() as proposed by Andy a while back; it allows using 64x64->128 muls on 64bit archs and recent GCC which defines __SIZEOF_INT128__ and __int128. (This new method will be used by the scheduler.) Signed-off-by: Peter Zijlstra Cc: fweisbec@gmail.com Cc: Andy Lutomirski Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-hxjoeuzmrcaumR0uZwjpe2pv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + include/linux/math64.h | 30 ++++++++++++++++++++++++++++++ init/Kconfig | 6 ++++++ 3 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e903c71f7e69..0952ecd60eca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,6 +26,7 @@ config X86 select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE diff --git a/include/linux/math64.h b/include/linux/math64.h index 69ed5f5e9f6e..c45c089bfdac 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -133,4 +133,34 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) return ret; } +#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) + +#ifndef mul_u64_u32_shr +static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +{ + return (u64)(((unsigned __int128)a * mul) >> shift); +} +#endif /* mul_u64_u32_shr */ + +#else + +#ifndef mul_u64_u32_shr +static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) +{ + u32 ah, al; + u64 ret; + + al = a; + ah = a >> 32; + + ret = ((u64)al * mul) >> shift; + if (ah) + ret += ((u64)ah * mul) << (32 - shift); + + return ret; +} +#endif /* mul_u64_u32_shr */ + +#endif + #endif /* _LINUX_MATH64_H */ diff --git a/init/Kconfig b/init/Kconfig index 79383d3aa5dc..4e5d96ab2034 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -809,6 +809,12 @@ config GENERIC_SCHED_CLOCK config ARCH_SUPPORTS_NUMA_BALANCING bool +# +# For architectures that know their GCC __int128 support is sound +# +config ARCH_SUPPORTS_INT128 + bool + # For architectures that (ab)use NUMA to represent different memory regions # all cpu-local but of different latencies, such as SuperH. # -- cgit v1.2.3 From 9dbdb155532395ba000c5d5d187658b0e17e529f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Nov 2013 18:27:06 +0100 Subject: sched/fair: Rework sched_fair time accounting Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This results in him occasionally seeing time going backwards - which crashes the scheduler ... Most of our time accounting can actually handle that except the most common one; the tick time update of sched_fair. There is a further problem with that code; previously we assumed that because we get a tick every TICK_NSEC our time delta could never exceed 32bits and math was simpler. However, ever since Frederic managed to get NO_HZ_FULL merged; this is no longer the case since now a task can run for a long time indeed without getting a tick. It only takes about ~4.2 seconds to overflow our u32 in nanoseconds. This means we not only need to better deal with time going backwards; but also means we need to be able to deal with large deltas. This patch reworks the entire code and uses mul_u64_u32_shr() as proposed by Andy a long while ago. We express our virtual time scale factor in a u32 multiplier and shift right and the 32bit mul_u64_u32_shr() implementation reduces to a single 32x32->64 multiply if the time delta is still short (common case). For 64bit a 64x64->128 multiply can be used if ARCH_SUPPORTS_INT128. Reported-and-Tested-by: Christian Engelmayer Signed-off-by: Peter Zijlstra Cc: fweisbec@gmail.com Cc: Paul Turner Cc: Stanislaw Gruszka Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131118172706.GI3866@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +- kernel/sched/fair.c | 144 ++++++++++++++++++++++---------------------------- 2 files changed, 66 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 96d674ba3876..53f97eb8dbc7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -930,7 +930,8 @@ struct pipe_inode_info; struct uts_namespace; struct load_weight { - unsigned long weight, inv_weight; + unsigned long weight; + u32 inv_weight; }; struct sched_avg { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd773ade1a31..9030da7bcb15 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -178,59 +178,61 @@ void sched_init_granularity(void) update_sysctl(); } -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - +#define WMULT_CONST (~0U) #define WMULT_SHIFT 32 -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +static void __update_inv_weight(struct load_weight *lw) +{ + unsigned long w; + + if (likely(lw->inv_weight)) + return; + + w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; +} /* - * delta *= weight / lw + * delta_exec * weight / lw.weight + * OR + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT + * + * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * we're guaranteed shift stays positive because inv_weight is guaranteed to + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. + * + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus + * weight/lw.weight <= 1, and therefore our shift will also be positive. */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { - u64 tmp; + u64 fact = scale_load_down(weight); + int shift = WMULT_SHIFT; - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; + __update_inv_weight(lw); - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; + if (unlikely(fact >> 32)) { + while (fact >> 32) { + fact >>= 1; + shift--; + } } - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + /* hint to use a 32x32->64 mul */ + fact = (u64)(u32)fact * lw->inv_weight; + + while (fact >> 32) { + fact >>= 1; + shift--; + } - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); + return mul_u64_u32_shr(delta_exec, fact, shift); } @@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, /* * delta /= w */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); return delta; } @@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&lw, se->load.weight); load = &lw; } - slice = calc_delta_mine(slice, se->load.weight, load); + slice = __calc_delta(slice, se->load.weight, load); } return slice; } @@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p) #endif /* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. + * Update the current task's runtime statistics. */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->statistics.exec_max, - max((u64)delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - - curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); -} - static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; u64 now = rq_clock_task(rq_of(cfs_rq)); - unsigned long delta_exec; + u64 delta_exec; if (unlikely(!curr)) return; - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - if (!delta_exec) + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) return; - __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) } } -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; @@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, } static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) return rq_clock_task(rq_of(cfs_rq)); } -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} -- cgit v1.2.3 From f78dea064c5f7de07de4912a6e5136dbc443d614 Mon Sep 17 00:00:00 2001 From: Marc Carino Date: Mon, 16 Dec 2013 18:15:53 -0800 Subject: libata: implement ATA_HORKAGE_NO_NCQ_TRIM and apply it to Micro M500 SSDs Certain drives cannot handle queued TRIM commands properly, even though support is indicated in the IDENTIFY DEVICE buffer. This patch allows for disabling the commands for the affected drives and apply it to the Micron/Crucial M500 SSDs which exhibit incorrect protocol behavior when issued queued TRIM commands, which could lead to silent data corruption. tj: Merged two unnecessarily split patches and made minor edits including shortening horkage name. Signed-off-by: Marc Carino Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/1387246554-7311-1-git-send-email-marc.ceeeee@gmail.com Cc: stable@vger.kernel.org # 3.12+ --- drivers/ata/libata-core.c | 15 +++++++++++++-- include/linux/libata.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ff0158481d53..1393a5890ed5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2149,9 +2149,16 @@ static int ata_dev_config_ncq(struct ata_device *dev, "failed to get NCQ Send/Recv Log Emask 0x%x\n", err_mask); } else { + u8 *cmds = dev->ncq_send_recv_cmds; + dev->flags |= ATA_DFLAG_NCQ_SEND_RECV; - memcpy(dev->ncq_send_recv_cmds, ap->sector_buf, - ATA_LOG_NCQ_SEND_RECV_SIZE); + memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_SEND_RECV_SIZE); + + if (dev->horkage & ATA_HORKAGE_NO_NCQ_TRIM) { + ata_dev_dbg(dev, "disabling queued TRIM support\n"); + cmds[ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET] &= + ~ATA_LOG_NCQ_SEND_RECV_DSM_TRIM; + } } } @@ -4205,6 +4212,10 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "PIONEER DVD-RW DVR-212D", NULL, ATA_HORKAGE_NOSETXFER }, { "PIONEER DVD-RW DVR-216D", NULL, ATA_HORKAGE_NOSETXFER }, + /* devices that don't properly handle queued TRIM commands */ + { "Micron_M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, + { "Crucial_CT???M500SSD1", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, + /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 0e23c26485f4..9b503376738f 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -418,6 +418,7 @@ enum { ATA_HORKAGE_DUMP_ID = (1 << 16), /* dump IDENTIFY data */ ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ ATA_HORKAGE_ATAPI_DMADIR = (1 << 18), /* device requires dmadir */ + ATA_HORKAGE_NO_NCQ_TRIM = (1 << 19), /* don't use queued TRIM */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 85328240c625f322af9f69c7b60e619717101d77 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 26 Nov 2013 06:33:52 +0000 Subject: net: allow netdev_all_upper_get_next_dev_rcu with rtnl lock held It is useful to be able to walk all upper devices when bringing a device online where the RTNL lock is held. In this case it is safe to walk the all_adj_list because the RTNL lock is used to protect the write side as well. This patch adds a check to see if the rtnl lock is held before throwing a warning in netdev_all_upper_get_next_dev_rcu(). Also because we now have a call site for lockdep_rtnl_is_held() outside COFIG_LOCK_PROVING an inline definition returning 1 is needed. Similar to the rcu_read_lock_is_held(). Fixes: 2a47fa45d4df ("ixgbe: enable l2 forwarding acceleration for macvlans") CC: Veaceslav Falico Reported-by: Yuanhan Liu Signed-off-by: John Fastabend Tested-by: Phil Schmitt Signed-off-by: Jeff Kirsher --- include/linux/rtnetlink.h | 5 +++++ net/core/dev.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 939428ad25ac..8e3e66ac0a52 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -24,6 +24,11 @@ extern int rtnl_trylock(void); extern int rtnl_is_locked(void); #ifdef CONFIG_PROVE_LOCKING extern int lockdep_rtnl_is_held(void); +#else +static inline int lockdep_rtnl_is_held(void) +{ + return 1; +} #endif /* #ifdef CONFIG_PROVE_LOCKING */ /** diff --git a/net/core/dev.c b/net/core/dev.c index ba3b7ea5ebb3..4fc17221545d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4500,7 +4500,7 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, { struct netdev_adjacent *upper; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); -- cgit v1.2.3 From 0e3da5bb8da45890b1dc413404e0f978ab71173e Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Mon, 16 Dec 2013 11:02:09 +0200 Subject: ip_gre: fix msg_name parsing for recvfrom/recvmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ipgre_header_parse() needs to parse the tunnel's ip header and it uses mac_header to locate the iphdr. This got broken when gre tunneling was refactored as mac_header is no longer updated to point to iphdr. Introduce skb_pop_mac_header() helper to do the mac_header assignment and use it in ipgre_rcv() to fix msg_name parsing. Bug introduced in commit c54419321455 (GRE: Refactor GRE tunneling code.) Cc: Pravin B Shelar Signed-off-by: Timo Teräs Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ net/ipv4/ip_gre.c | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 215b5ea1cb30..6aae83890520 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1638,6 +1638,11 @@ static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) skb->mac_header += offset; } +static inline void skb_pop_mac_header(struct sk_buff *skb) +{ + skb->mac_header = skb->network_header; +} + static inline void skb_probe_transport_header(struct sk_buff *skb, const int offset_hint) { diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d7aea4c5b940..e560ef34cf4b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -217,6 +217,7 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) iph->saddr, iph->daddr, tpi->key); if (tunnel) { + skb_pop_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); return PACKET_RCVD; } -- cgit v1.2.3 From c97102ba96324da330078ad8619ba4dfe840dbe3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 18 Dec 2013 17:08:31 -0800 Subject: kexec: migrate to reboot cpu Commit 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") moved reboot= handling to generic code. In the process it also removed the code in native_machine_shutdown() which are moving reboot process to reboot_cpu/cpu0. I guess that thought must have been that all reboot paths are calling migrate_to_reboot_cpu(), so we don't need this special handling. But kexec reboot path (kernel_kexec()) is not calling migrate_to_reboot_cpu() so above change broke kexec. Now reboot can happen on non-boot cpu and when INIT is sent in second kerneo to bring up BP, it brings down the machine. So start calling migrate_to_reboot_cpu() in kexec reboot path to avoid this problem. Bisected by WANG Chao. Reported-by: Matthew Whitehead Reported-by: Dave Young Signed-off-by: Vivek Goyal Tested-by: Baoquan He Tested-by: WANG Chao Acked-by: H. Peter Anvin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 1 + kernel/kexec.c | 1 + kernel/reboot.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 8e00f9f6f963..9e7db9e73cc1 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -43,6 +43,7 @@ extern int unregister_reboot_notifier(struct notifier_block *); * Architecture-specific implementations of sys_reboot commands. */ +extern void migrate_to_reboot_cpu(void); extern void machine_restart(char *cmd); extern void machine_halt(void); extern void machine_power_off(void); diff --git a/kernel/kexec.c b/kernel/kexec.c index d0d8fca54065..9c970167e402 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1680,6 +1680,7 @@ int kernel_kexec(void) { kexec_in_progress = true; kernel_restart_prepare(NULL); + migrate_to_reboot_cpu(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/reboot.c b/kernel/reboot.c index f813b3474646..662c83fc16b7 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -static void migrate_to_reboot_cpu(void) +void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ int cpu = reboot_cpu; -- cgit v1.2.3 From de466bd628e8d663fdf3f791bc8db318ee85c714 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:42 -0800 Subject: mm: numa: avoid unnecessary disruption of NUMA hinting during migration do_huge_pmd_numa_page() handles the case where there is parallel THP migration. However, by the time it is checked the NUMA hinting information has already been disrupted. This patch adds an earlier check with some helpers. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++++++++ mm/huge_memory.c | 22 ++++++++++++++++------ mm/migrate.c | 12 ++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f5096b58b20d..b7717d74da7f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -90,10 +90,19 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING +extern bool pmd_trans_migrating(pmd_t pmd); +extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); #else +static inline bool pmd_trans_migrating(pmd_t pmd) +{ + return false; +} +static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) +{ +} static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 70e7429fd8ea..7de1bf85f683 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -882,6 +882,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = 0; goto out_unlock; } + + /* mmap_sem prevents this happening but warn if that changes */ + WARN_ON(pmd_trans_migrating(pmd)); + if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ spin_unlock(src_ptl); @@ -1299,6 +1303,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; + /* + * If there are potential migrations, wait for completion and retry + * without disrupting NUMA hinting information. Do not relock and + * check_same as the page may no longer be mapped. + */ + if (unlikely(pmd_trans_migrating(*pmdp))) { + spin_unlock(ptl); + wait_migrate_huge_page(vma->anon_vma, pmdp); + goto out; + } + page = pmd_page(pmd); BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); @@ -1329,12 +1344,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto clear_pmdnuma; } - /* - * If there are potential migrations, wait for completion and retry. We - * do not relock and check_same as the page may no longer be mapped. - * Furtermore, even if the page is currently misplaced, there is no - * guarantee it is still misplaced after the migration completes. - */ + /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { spin_unlock(ptl); wait_on_page_locked(page); diff --git a/mm/migrate.c b/mm/migrate.c index a987525810ae..cfb419085261 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1655,6 +1655,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 1; } +bool pmd_trans_migrating(pmd_t pmd) +{ + struct page *page = pmd_page(pmd); + return PageLocked(page); +} + +void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + wait_on_page_locked(page); +} + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on -- cgit v1.2.3 From 20841405940e7be0617612d521e206e4b6b325db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 18 Dec 2013 17:08:44 -0800 Subject: mm: fix TLB flush race between migration, and change_protection_range There are a few subtle races, between change_protection_range (used by mprotect and change_prot_numa) on one side, and NUMA page migration and compaction on the other side. The basic race is that there is a time window between when the PTE gets made non-present (PROT_NONE or NUMA), and the TLB is flushed. During that time, a CPU may continue writing to the page. This is fine most of the time, however compaction or the NUMA migration code may come in, and migrate the page away. When that happens, the CPU may continue writing, through the cached translation, to what is no longer the current memory location of the process. This only affects x86, which has a somewhat optimistic pte_accessible. All other architectures appear to be safe, and will either always flush, or flush whenever there is a valid mapping, even with no permissions (SPARC). The basic race looks like this: CPU A CPU B CPU C load TLB entry make entry PTE/PMD_NUMA fault on entry read/write old page start migrating page change PTE/PMD to new page read/write old page [*] flush TLB reload TLB from new entry read/write new page lose data [*] the old page may belong to a new user at this point! The obvious fix is to flush remote TLB entries, by making sure that pte_accessible aware of the fact that PROT_NONE and PROT_NUMA memory may still be accessible if there is a TLB flush pending for the mm. This should fix both NUMA migration and compaction. [mgorman@suse.de: fix build] Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable.h | 11 ++++++++-- include/asm-generic/pgtable.h | 2 +- include/linux/mm_types.h | 44 +++++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + mm/huge_memory.c | 7 ++++++ mm/mprotect.c | 2 ++ mm/pgtable-generic.c | 5 +++-- 8 files changed, 69 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 8358dc144959..0f9e94537eee 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -619,7 +619,7 @@ static inline unsigned long pte_present(pte_t pte) } #define pte_accessible pte_accessible -static inline unsigned long pte_accessible(pte_t a) +static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a) { return pte_val(a) & _PAGE_VALID; } @@ -847,7 +847,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U * and SUN4V pte layout, so this inline test is fine. */ - if (likely(mm != &init_mm) && pte_accessible(orig)) + if (likely(mm != &init_mm) && pte_accessible(mm, orig)) tlb_batch_add(mm, addr, ptep, orig, fullmm); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3d1999458709..bbc8b12fa443 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -452,9 +452,16 @@ static inline int pte_present(pte_t a) } #define pte_accessible pte_accessible -static inline int pte_accessible(pte_t a) +static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { - return pte_flags(a) & _PAGE_PRESENT; + if (pte_flags(a) & _PAGE_PRESENT) + return true; + + if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + mm_tlb_flush_pending(mm)) + return true; + + return false; } static inline int pte_hidden(pte_t pte) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f330d28e4d0e..b12079afbd5f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -217,7 +217,7 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #endif #ifndef pte_accessible -# define pte_accessible(pte) ((void)(pte),1) +# define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bd299418a934..e5c49c30460f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -442,6 +442,14 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; +#endif +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) + /* + * An operation with batched TLB flushing is going on. Anything that + * can move process memory needs to flush the TLB when moving a + * PROT_NONE or PROT_NUMA mapped page. + */ + bool tlb_flush_pending; #endif struct uprobes_state uprobes_state; }; @@ -459,4 +467,40 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return mm->cpu_vm_mask_var; } +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) +/* + * Memory barriers to keep this state in sync are graciously provided by + * the page table locks, outside of which no page table modifications happen. + * The barriers below prevent the compiler from re-ordering the instructions + * around the memory barriers that are already present in the code. + */ +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + barrier(); + return mm->tlb_flush_pending; +} +static inline void set_tlb_flush_pending(struct mm_struct *mm) +{ + mm->tlb_flush_pending = true; + barrier(); +} +/* Clearing is done after a TLB flush, which also provides a barrier. */ +static inline void clear_tlb_flush_pending(struct mm_struct *mm) +{ + barrier(); + mm->tlb_flush_pending = false; +} +#else +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + return false; +} +static inline void set_tlb_flush_pending(struct mm_struct *mm) +{ +} +static inline void clear_tlb_flush_pending(struct mm_struct *mm) +{ +} +#endif + #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 728d5be9548c..5721f0e3f2da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); mm_init_owner(mm, p); + clear_tlb_flush_pending(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7de1bf85f683..3d2783e10596 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,6 +1376,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto clear_pmdnuma; } + /* + * The page_table_lock above provides a memory barrier + * with change_protection_range. + */ + if (mm_tlb_flush_pending(mm)) + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + /* * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. diff --git a/mm/mprotect.c b/mm/mprotect.c index f8421722acb9..bb53a6591aea 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -188,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + set_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -199,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); + clear_tlb_flush_pending(mm); return pages; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e84cad27a801..a8b919925934 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + struct mm_struct *mm = (vma)->vm_mm; pte_t pte; - pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - if (pte_accessible(pte)) + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } -- cgit v1.2.3 From af2c1401e6f9177483be4fad876d0073669df9df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:45 -0800 Subject: mm: numa: guarantee that tlb_flush_pending updates are visible before page table updates According to documentation on barriers, stores issued before a LOCK can complete after the lock implying that it's possible tlb_flush_pending can be visible after a page table update. As per revised documentation, this patch adds a smp_mb__before_spinlock to guarantee the correct ordering. Signed-off-by: Mel Gorman Acked-by: Paul E. McKenney Reviewed-by: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e5c49c30460f..ad0616f2fe2c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -482,7 +482,12 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) static inline void set_tlb_flush_pending(struct mm_struct *mm) { mm->tlb_flush_pending = true; - barrier(); + + /* + * Guarantee that the tlb_flush_pending store does not leak into the + * critical section updating the page tables + */ + smp_mb__before_spinlock(); } /* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void clear_tlb_flush_pending(struct mm_struct *mm) -- cgit v1.2.3 From 597d795a2a786d22dd872332428e2b9439ede639 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 20 Dec 2013 13:35:58 +0200 Subject: mm: do not allocate page->ptl dynamically, if spinlock_t fits to long In struct page we have enough space to fit long-size page->ptl there, but we use dynamically-allocated page->ptl if size(spinlock_t) is larger than sizeof(int). It hurts 64-bit architectures with CONFIG_GENERIC_LOCKBREAK, where sizeof(spinlock_t) == 8, but it easily fits into struct page. Signed-off-by: Kirill A. Shutemov Acked-by: Hugh Dickins Signed-off-by: Linus Torvalds --- include/linux/lockref.h | 2 +- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 3 ++- kernel/bounds.c | 2 +- mm/memory.c | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c8929c3832db..4bfde0e99ed5 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -19,7 +19,7 @@ #define USE_CMPXCHG_LOCKREF \ (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \ - IS_ENABLED(CONFIG_SMP) && !BLOATED_SPINLOCKS) + IS_ENABLED(CONFIG_SMP) && SPINLOCK_SIZE <= 4) struct lockref { union { diff --git a/include/linux/mm.h b/include/linux/mm.h index 1cedd000cf29..35527173cf50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1317,7 +1317,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ #if USE_SPLIT_PTE_PTLOCKS -#if BLOATED_SPINLOCKS +#if ALLOC_SPLIT_PTLOCKS extern bool ptlock_alloc(struct page *page); extern void ptlock_free(struct page *page); @@ -1325,7 +1325,7 @@ static inline spinlock_t *ptlock_ptr(struct page *page) { return page->ptl; } -#else /* BLOATED_SPINLOCKS */ +#else /* ALLOC_SPLIT_PTLOCKS */ static inline bool ptlock_alloc(struct page *page) { return true; @@ -1339,7 +1339,7 @@ static inline spinlock_t *ptlock_ptr(struct page *page) { return &page->ptl; } -#endif /* BLOATED_SPINLOCKS */ +#endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ad0616f2fe2c..290901a8c1de 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -26,6 +26,7 @@ struct address_space; #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) +#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) /* * Each physical page in the system has a struct page associated with @@ -155,7 +156,7 @@ struct page { * system if PG_buddy is set. */ #if USE_SPLIT_PTE_PTLOCKS -#if BLOATED_SPINLOCKS +#if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else spinlock_t ptl; diff --git a/kernel/bounds.c b/kernel/bounds.c index 5253204afdca..9fd4246b04b8 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -22,6 +22,6 @@ void foo(void) #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif - DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); /* End of constants */ } diff --git a/mm/memory.c b/mm/memory.c index 5d9025f3b3e1..b6e211b779d0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4271,7 +4271,7 @@ void copy_user_huge_page(struct page *dst, struct page *src, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS +#if ALLOC_SPLIT_PTLOCKS bool ptlock_alloc(struct page *page) { spinlock_t *ptl; -- cgit v1.2.3 From df36ac1bc2a166eef90785d584e4cfed6f52bd32 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 18 Dec 2013 15:17:10 -0800 Subject: pstore: Don't allow high traffic options on fragile devices Some pstore backing devices use on board flash as persistent storage. These have limited numbers of write cycles so it is a poor idea to use them from high frequency operations. Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- drivers/acpi/apei/erst.c | 1 + drivers/firmware/efi/efi-pstore.c | 1 + fs/pstore/platform.c | 7 +++++-- include/linux/pstore.h | 3 +++ 4 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 26311f23c824..cb1d557fc22c 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -942,6 +942,7 @@ static int erst_clearer(enum pstore_type_id type, u64 id, int count, static struct pstore_info erst_info = { .owner = THIS_MODULE, .name = "erst", + .flags = PSTORE_FLAGS_FRAGILE, .open = erst_open_pstore, .close = erst_close_pstore, .read = erst_reader, diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 743fd426f21b..4b9dc836dcf9 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -356,6 +356,7 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, static struct pstore_info efi_pstore_info = { .owner = THIS_MODULE, .name = "efi", + .flags = PSTORE_FLAGS_FRAGILE, .open = efi_pstore_open, .close = efi_pstore_close, .read = efi_pstore_read, diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b8e93a40a5d3..78c3c2097787 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi) pstore_get_records(0); kmsg_dump_register(&pstore_dumper); - pstore_register_console(); - pstore_register_ftrace(); + + if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + pstore_register_console(); + pstore_register_ftrace(); + } if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + diff --git a/include/linux/pstore.h b/include/linux/pstore.h index abd437d0a8a7..ece0c6bbfcc5 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -51,6 +51,7 @@ struct pstore_info { char *buf; size_t bufsize; struct mutex read_mutex; /* serialize open/read/close */ + int flags; int (*open)(struct pstore_info *psi); int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, @@ -70,6 +71,8 @@ struct pstore_info { void *data; }; +#define PSTORE_FLAGS_FRAGILE 1 + #ifdef CONFIG_PSTORE extern int pstore_register(struct pstore_info *); extern bool pstore_cannot_block_path(enum kmsg_dump_reason reason); -- cgit v1.2.3 From 8e321fefb0e60bae4e2a28d20fc4fa30758d27c6 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sat, 21 Dec 2013 17:56:08 -0500 Subject: aio/migratepages: make aio migrate pages sane The arbitrary restriction on page counts offered by the core migrate_page_move_mapping() code results in rather suspicious looking fiddling with page reference counts in the aio_migratepage() operation. To fix this, make migrate_page_move_mapping() take an extra_count parameter that allows aio to tell the code about its own reference count on the page being migrated. While cleaning up aio_migratepage(), make it validate that the old page being passed in is actually what aio_migratepage() expects to prevent misbehaviour in the case of races. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 52 +++++++++++++++++++++++++++++++++++++++++-------- include/linux/migrate.h | 3 ++- mm/migrate.c | 13 +++++++------ 3 files changed, 53 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index fd1c0baf15bb..efa708b29054 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx) int i; for (i = 0; i < ctx->nr_pages; i++) { + struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, page_count(ctx->ring_pages[i])); - put_page(ctx->ring_pages[i]); + page = ctx->ring_pages[i]; + if (!page) + continue; + ctx->ring_pages[i] = NULL; + put_page(page); } put_aio_ring_file(ctx); @@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, unsigned long flags; int rc; + rc = 0; + + /* Make sure the old page hasn't already been changed */ + spin_lock(&mapping->private_lock); + ctx = mapping->private_data; + if (ctx) { + pgoff_t idx; + spin_lock_irqsave(&ctx->completion_lock, flags); + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; + } else + rc = -EINVAL; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else + rc = -EINVAL; + spin_unlock(&mapping->private_lock); + + if (rc != 0) + return rc; + /* Writeback must be complete */ BUG_ON(PageWriteback(old)); - put_page(old); + get_page(new); - rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { - get_page(old); + put_page(new); return rc; } - get_page(new); - /* We can potentially race against kioctx teardown here. Use the * address_space's private data lock to protect the mapping's * private_data. @@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, spin_lock_irqsave(&ctx->completion_lock, flags); migrate_page_copy(new, old); idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) - ctx->ring_pages[idx] = new; + if (idx < (pgoff_t)ctx->nr_pages) { + /* And only do the move if things haven't changed */ + if (ctx->ring_pages[idx] == old) + ctx->ring_pages[idx] = new; + else + rc = -EAGAIN; + } else + rc = -EINVAL; spin_unlock_irqrestore(&ctx->completion_lock, flags); } else rc = -EBUSY; spin_unlock(&mapping->private_lock); + if (rc == MIGRATEPAGE_SUCCESS) + put_page(old); + else + put_page(new); + return rc; } #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index b7717d74da7f..f015c059e159 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -55,7 +55,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, - struct buffer_head *head, enum migrate_mode mode); + struct buffer_head *head, enum migrate_mode mode, + int extra_count); #else static inline void putback_lru_pages(struct list_head *l) {} diff --git a/mm/migrate.c b/mm/migrate.c index e9b710201335..9194375b2307 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -317,14 +317,15 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, */ int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, - struct buffer_head *head, enum migrate_mode mode) + struct buffer_head *head, enum migrate_mode mode, + int extra_count) { - int expected_count = 0; + int expected_count = 1 + extra_count; void **pslot; if (!mapping) { /* Anonymous page without mapping */ - if (page_count(page) != 1) + if (page_count(page) != expected_count) return -EAGAIN; return MIGRATEPAGE_SUCCESS; } @@ -334,7 +335,7 @@ int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + page_has_private(page); + expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -584,7 +585,7 @@ int migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -611,7 +612,7 @@ int buffer_migrate_page(struct address_space *mapping, head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; -- cgit v1.2.3 From f60900f2609e893c7f8d0bccc7ada4947dac4cd5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 23 Dec 2013 18:49:30 +0100 Subject: auxvec.h: account for AT_HWCAP2 in AT_VECTOR_SIZE_BASE Commit 2171364d1a92 ("powerpc: Add HWCAP2 aux entry") introduced a new AT_ auxv entry type AT_HWCAP2 but failed to update AT_VECTOR_SIZE_BASE accordingly. Signed-off-by: Ard Biesheuvel Fixes: 2171364d1a92 (powerpc: Add HWCAP2 aux entry) Cc: stable@vger.kernel.org Acked-by: Michael Neuling Cc: Nishanth Aravamudan Cc: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- include/linux/auxvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index 669fef5c745a..3e0fbe441763 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -3,6 +3,6 @@ #include -#define AT_VECTOR_SIZE_BASE 19 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 20 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif /* _LINUX_AUXVEC_H */ -- cgit v1.2.3 From 73409f3b0ff0ae7bc1f647936b23e6d5d5dcbe28 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 27 Dec 2013 13:04:33 -0500 Subject: net: Add some clarification to skb_tx_timestamp() comment. We've seen so many instances of people invoking skb_tx_timestamp() after the device already has been given the packet, that it's worth being a little bit more verbose and explicit in this comment. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6aae83890520..6f69b3f914fb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2531,6 +2531,10 @@ static inline void sw_tx_timestamp(struct sk_buff *skb) * Ethernet MAC Drivers should call this function in their hard_xmit() * function immediately before giving the sk_buff to the MAC hardware. * + * Specifically, one should make absolutely sure that this function is + * called before TX completion of this packet can trigger. Otherwise + * the packet could potentially already be freed. + * * @skb: A socket buffer. */ static inline void skb_tx_timestamp(struct sk_buff *skb) -- cgit v1.2.3 From 2205369a314e12fcec4781cc73ac9c08fc2b47de Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 Dec 2013 16:23:35 -0500 Subject: vlan: Fix header ops passthru when doing TX VLAN offload. When the vlan code detects that the real device can do TX VLAN offloads in hardware, it tries to arrange for the real device's header_ops to be invoked directly. But it does so illegally, by simply hooking the real device's header_ops up to the VLAN device. This doesn't work because we will end up invoking a set of header_ops routines which expect a device type which matches the real device, but will see a VLAN device instead. Fix this by providing a pass-thru set of header_ops which will arrange to pass the proper real device instead. To facilitate this add a dev_rebuild_header(). There are implementations which provide a ->cache and ->create but not a ->rebuild (f.e. PLIP). So we need a helper function just like dev_hard_header() to avoid crashes. Use this helper in the one existing place where the header_ops->rebuild was being invoked, the neighbour code. With lots of help from Florian Westphal. Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ net/8021q/vlan_dev.c | 19 ++++++++++++++++++- net/core/neighbour.c | 2 +- 3 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d9a550bf3e8e..7514b9c37a39 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1912,6 +1912,15 @@ static inline int dev_parse_header(const struct sk_buff *skb, return dev->header_ops->parse(skb, haddr); } +static inline int dev_rebuild_header(struct sk_buff *skb) +{ + const struct net_device *dev = skb->dev; + + if (!dev->header_ops || !dev->header_ops->rebuild) + return 0; + return dev->header_ops->rebuild(skb); +} + typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len); int register_gifconf(unsigned int family, gifconf_func_t *gifconf); static inline int unregister_gifconf(unsigned int family) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 762896ebfcf5..47c908f1f626 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -530,6 +530,23 @@ static const struct header_ops vlan_header_ops = { .parse = eth_header_parse, }; +static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, + unsigned int len) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; + + return dev_hard_header(skb, real_dev, type, daddr, saddr, len); +} + +static const struct header_ops vlan_passthru_header_ops = { + .create = vlan_passthru_hard_header, + .rebuild = dev_rebuild_header, + .parse = eth_header_parse, +}; + static struct device_type vlan_type = { .name = "vlan", }; @@ -573,7 +590,7 @@ static int vlan_dev_init(struct net_device *dev) dev->needed_headroom = real_dev->needed_headroom; if (real_dev->features & NETIF_F_HW_VLAN_CTAG_TX) { - dev->header_ops = real_dev->header_ops; + dev->header_ops = &vlan_passthru_header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { dev->header_ops = &vlan_header_ops; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 36b1443f9ae4..932c6d7cf666 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1275,7 +1275,7 @@ int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb) if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 && - dev->header_ops->rebuild(skb)) + dev_rebuild_header(skb)) return 0; return dev_queue_xmit(skb); -- cgit v1.2.3 From 7a7ffbabf99445704be01bff5d7e360da908cf8e Mon Sep 17 00:00:00 2001 From: Wei-Chun Chao Date: Thu, 26 Dec 2013 13:10:22 -0800 Subject: ipv4: fix tunneled VM traffic over hw VXLAN/GRE GSO NIC VM to VM GSO traffic is broken if it goes through VXLAN or GRE tunnel and the physical NIC on the host supports hardware VXLAN/GRE GSO offload (e.g. bnx2x and next-gen mlx4). Two issues - (VXLAN) VM traffic has SKB_GSO_DODGY and SKB_GSO_UDP_TUNNEL with SKB_GSO_TCP/UDP set depending on the inner protocol. GSO header integrity check fails in udp4_ufo_fragment if inner protocol is TCP. Also gso_segs is calculated incorrectly using skb->len that includes tunnel header. Fix: robust check should only be applied to the inner packet. (VXLAN & GRE) Once GSO header integrity check passes, NULL segs is returned and the original skb is sent to hardware. However the tunnel header is already pulled. Fix: tunnel header needs to be restored so that hardware can perform GSO properly on the original packet. Signed-off-by: Wei-Chun Chao Signed-off-by: David S. Miller --- include/linux/netdevice.h | 13 +++++++++++++ net/ipv4/gre_offload.c | 11 +++++++---- net/ipv4/udp.c | 6 +++++- net/ipv4/udp_offload.c | 37 +++++++++++++++++++------------------ 4 files changed, 44 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7514b9c37a39..5faaadb0c74f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3017,6 +3017,19 @@ static inline void netif_set_gso_max_size(struct net_device *dev, dev->gso_max_size = size; } +static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, + int pulled_hlen, u16 mac_offset, + int mac_len) +{ + skb->protocol = protocol; + skb->encapsulation = 1; + skb_push(skb, pulled_hlen); + skb_reset_transport_header(skb); + skb->mac_header = mac_offset; + skb->network_header = skb->mac_header + mac_len; + skb->mac_len = mac_len; +} + static inline bool netif_is_macvlan(struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index e5d436188464..2cd02f32f99f 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -28,6 +28,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t enc_features; int ghl = GRE_HEADER_SECTION; struct gre_base_hdr *greh; + u16 mac_offset = skb->mac_header; int mac_len = skb->mac_len; __be16 protocol = skb->protocol; int tnl_hlen; @@ -58,13 +59,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } else csum = false; + if (unlikely(!pskb_may_pull(skb, ghl))) + goto out; + /* setup inner skb. */ skb->protocol = greh->protocol; skb->encapsulation = 0; - if (unlikely(!pskb_may_pull(skb, ghl))) - goto out; - __skb_pull(skb, ghl); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); @@ -73,8 +74,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); goto out; + } skb = segs; tnl_hlen = skb_tnl_header_len(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f140048334ce..a7e4729e974b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2478,6 +2478,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; int mac_len = skb->mac_len; int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; @@ -2497,8 +2498,11 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); goto out; + } outer_hlen = skb_tnl_header_len(skb); skb = segs; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 83206de2bc76..79c62bdcd3c5 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -41,6 +41,14 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; + int offset; + __wsum csum; + + if (skb->encapsulation && + skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) { + segs = skb_udp_tunnel_segment(skb, features); + goto out; + } mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -63,27 +71,20 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ - if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) - segs = skb_udp_tunnel_segment(skb, features); - else { - int offset; - __wsum csum; - - /* Do software UFO. Complete and fill in the UDP checksum as - * HW cannot do checksum of UDP packets sent as multiple - * IP fragments. - */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - - segs = skb_segment(skb, features); - } + segs = skb_segment(skb, features); out: return segs; } -- cgit v1.2.3 From f663dd9aaf9ed124f25f0f8452edf238f087ad50 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 10 Jan 2014 16:18:26 +0800 Subject: net: core: explicitly select a txq before doing l2 forwarding Currently, the tx queue were selected implicitly in ndo_dfwd_start_xmit(). The will cause several issues: - NETIF_F_LLTX were removed for macvlan, so txq lock were done for macvlan instead of lower device which misses the necessary txq synchronization for lower device such as txq stopping or frozen required by dev watchdog or control path. - dev_hard_start_xmit() was called with NULL txq which bypasses the net device watchdog. - dev_hard_start_xmit() does not check txq everywhere which will lead a crash when tso is disabled for lower device. Fix this by explicitly introducing a new param for .ndo_select_queue() for just selecting queues in the case of l2 forwarding offload. netdev_pick_tx() was also extended to accept this parameter and dev_queue_xmit_accel() was used to do l2 forwarding transmission. With this fixes, NETIF_F_LLTX could be preserved for macvlan and there's no need to check txq against NULL in dev_hard_start_xmit(). Also there's no need to keep a dedicated ndo_dfwd_start_xmit() and we can just reuse the code of dev_queue_xmit() to do the transmission. In the future, it was also required for macvtap l2 forwarding support since it provides a necessary synchronization method. Cc: John Fastabend Cc: Neil Horman Cc: e1000-devel@lists.sourceforge.net Signed-off-by: Jason Wang Acked-by: Neil Horman Acked-by: John Fastabend Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 3 ++- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 ++- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 33 ++++++++++--------------- drivers/net/ethernet/lantiq_etop.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++- drivers/net/ethernet/tile/tilegx.c | 3 ++- drivers/net/macvlan.c | 9 +++---- drivers/net/team/team.c | 3 ++- drivers/net/tun.c | 3 ++- drivers/net/wireless/mwifiex/main.c | 3 ++- drivers/staging/bcm/Bcmnet.c | 3 ++- drivers/staging/netlogic/xlr_net.c | 3 ++- drivers/staging/rtl8188eu/os_dep/os_intfs.c | 3 ++- include/linux/netdevice.h | 12 ++++++--- net/core/dev.c | 29 +++++++++++++--------- net/core/flow_dissector.c | 10 +++++--- net/core/netpoll.c | 2 +- net/mac80211/iface.c | 6 +++-- net/sched/sch_generic.c | 2 +- 21 files changed, 80 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 398e299ee1bd..4b8c58b0ec24 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3732,7 +3732,8 @@ static inline int bond_slave_override(struct bonding *bond, } -static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb) +static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 550412088dd0..bf811565ee24 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1833,7 +1833,8 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw) bnx2x_napi_disable_cnic(bp); } -u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb) +u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { struct bnx2x *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index da8fcaa74495..41f3ca5ad972 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -524,7 +524,8 @@ int bnx2x_set_vf_mac(struct net_device *dev, int queue, u8 *mac); int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); /* select_queue callback */ -u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb); +u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv); static inline void bnx2x_update_rx_prod(struct bnx2x *bp, struct bnx2x_fastpath *fp, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index cc06854296a3..5bcc870f8367 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6827,12 +6827,20 @@ static inline int ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size) return __ixgbe_maybe_stop_tx(tx_ring, size); } -#ifdef IXGBE_FCOE -static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb) +static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { + struct ixgbe_fwd_adapter *fwd_adapter = accel_priv; +#ifdef IXGBE_FCOE struct ixgbe_adapter *adapter; struct ixgbe_ring_feature *f; int txq; +#endif + + if (fwd_adapter) + return skb->queue_mapping + fwd_adapter->tx_base_queue; + +#ifdef IXGBE_FCOE /* * only execute the code below if protocol is FCoE @@ -6858,9 +6866,11 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb) txq -= f->indices; return txq + f->offset; +#else + return __netdev_pick_tx(dev, skb); +#endif } -#endif netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct ixgbe_adapter *adapter, struct ixgbe_ring *tx_ring) @@ -7629,27 +7639,11 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv) kfree(fwd_adapter); } -static netdev_tx_t ixgbe_fwd_xmit(struct sk_buff *skb, - struct net_device *dev, - void *priv) -{ - struct ixgbe_fwd_adapter *fwd_adapter = priv; - unsigned int queue; - struct ixgbe_ring *tx_ring; - - queue = skb->queue_mapping + fwd_adapter->tx_base_queue; - tx_ring = fwd_adapter->real_adapter->tx_ring[queue]; - - return __ixgbe_xmit_frame(skb, dev, tx_ring); -} - static const struct net_device_ops ixgbe_netdev_ops = { .ndo_open = ixgbe_open, .ndo_stop = ixgbe_close, .ndo_start_xmit = ixgbe_xmit_frame, -#ifdef IXGBE_FCOE .ndo_select_queue = ixgbe_select_queue, -#endif .ndo_set_rx_mode = ixgbe_set_rx_mode, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ixgbe_set_mac, @@ -7689,7 +7683,6 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_bridge_getlink = ixgbe_ndo_bridge_getlink, .ndo_dfwd_add_station = ixgbe_fwd_add, .ndo_dfwd_del_station = ixgbe_fwd_del, - .ndo_dfwd_start_xmit = ixgbe_fwd_xmit, }; /** diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 6a6c1f76d8e0..ec94a20d7099 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -619,7 +619,8 @@ ltq_etop_set_multicast_list(struct net_device *dev) } static u16 -ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb) +ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { /* we are currently only using the first queue */ return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index f54ebd5a1702..a7fcd593b2db 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -592,7 +592,8 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk } } -u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb) +u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { struct mlx4_en_priv *priv = netdev_priv(dev); u16 rings_p_up = priv->num_tx_rings_p_up; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index f3758de59c05..d5758adceaa2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -714,7 +714,8 @@ int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_tx_irq(struct mlx4_cq *mcq); -u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb); +u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 628b736e5ae7..0e9fb3301b11 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -2080,7 +2080,8 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) } /* Return subqueue id on this core (one per core). */ -static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb) +static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { return smp_processor_id(); } diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 5360f73c9817..bc8faaec33f5 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -299,7 +299,7 @@ netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, if (vlan->fwd_priv) { skb->dev = vlan->lowerdev; - ret = dev_hard_start_xmit(skb, skb->dev, NULL, vlan->fwd_priv); + ret = dev_queue_xmit_accel(skb, vlan->fwd_priv); } else { ret = macvlan_queue_xmit(skb, dev); } @@ -365,10 +365,8 @@ static int macvlan_open(struct net_device *dev) */ if (IS_ERR_OR_NULL(vlan->fwd_priv)) { vlan->fwd_priv = NULL; - } else { - dev->features &= ~NETIF_F_LLTX; + } else return 0; - } } err = -EBUSY; @@ -702,8 +700,7 @@ static netdev_features_t macvlan_fix_features(struct net_device *dev, features = netdev_increment_features(vlan->lowerdev->features, features, mask); - if (!vlan->fwd_priv) - features |= NETIF_F_LLTX; + features |= NETIF_F_LLTX; return features; } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 736050d6b451..b75ae5bde673 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1647,7 +1647,8 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb) +static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7c8343a4f918..ecec8029c5e8 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -348,7 +348,8 @@ unlock: * different rxq no. here. If we could not get rxhash, then we would * hope the rxq no. may help here. */ -static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) +static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { struct tun_struct *tun = netdev_priv(dev); struct tun_flow_entry *e; diff --git a/drivers/net/wireless/mwifiex/main.c b/drivers/net/wireless/mwifiex/main.c index 78e8a6666cc6..8bb8988c435c 100644 --- a/drivers/net/wireless/mwifiex/main.c +++ b/drivers/net/wireless/mwifiex/main.c @@ -746,7 +746,8 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev) } static u16 -mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb) +mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { skb->priority = cfg80211_classify8021d(skb); return mwifiex_1d_to_wmm_queue[skb->priority]; diff --git a/drivers/staging/bcm/Bcmnet.c b/drivers/staging/bcm/Bcmnet.c index 53fee2f9a498..8dfdd2732bdc 100644 --- a/drivers/staging/bcm/Bcmnet.c +++ b/drivers/staging/bcm/Bcmnet.c @@ -39,7 +39,8 @@ static INT bcm_close(struct net_device *dev) return 0; } -static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb) +static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { return ClassifyPacket(netdev_priv(dev), skb); } diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c index 235d2b1ec593..eedffed17e39 100644 --- a/drivers/staging/netlogic/xlr_net.c +++ b/drivers/staging/netlogic/xlr_net.c @@ -306,7 +306,8 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } -static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb) +static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb, + void *accel_priv) { return (u16)smp_processor_id(); } diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c index 17659bb04bef..dd69e344e409 100644 --- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c +++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c @@ -652,7 +652,8 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) return dscp >> 5; } -static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb) +static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5faaadb0c74f..ce2a1f5f9a1e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -769,7 +769,8 @@ struct netdev_phys_port_id { * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * - * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb); + * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, + * void *accel_priv); * Called to decide which queue to when device supports multiple * transmit queues. * @@ -990,7 +991,8 @@ struct net_device_ops { netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, struct net_device *dev); u16 (*ndo_select_queue)(struct net_device *dev, - struct sk_buff *skb); + struct sk_buff *skb, + void *accel_priv); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); @@ -1529,7 +1531,8 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, } struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb); + struct sk_buff *skb, + void *accel_priv); u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb); /* @@ -1819,6 +1822,7 @@ int dev_close(struct net_device *dev); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); @@ -2426,7 +2430,7 @@ int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq, void *accel_priv); + struct netdev_queue *txq); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); extern int netdev_budget; diff --git a/net/core/dev.c b/net/core/dev.c index 4fc17221545d..0ce469e5ec80 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2539,7 +2539,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb, } int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq, void *accel_priv) + struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; @@ -2605,13 +2605,10 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); skb_len = skb->len; - if (accel_priv) - rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv); - else rc = ops->ndo_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK && txq) + if (rc == NETDEV_TX_OK) txq_trans_update(txq); return rc; } @@ -2627,10 +2624,7 @@ gso: dev_queue_xmit_nit(nskb, dev); skb_len = nskb->len; - if (accel_priv) - rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv); - else - rc = ops->ndo_start_xmit(nskb, dev); + rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) @@ -2811,7 +2805,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ -int dev_queue_xmit(struct sk_buff *skb) +int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -2827,7 +2821,7 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = netdev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT @@ -2863,7 +2857,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq, NULL); + rc = dev_hard_start_xmit(skb, dev, txq); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); @@ -2892,8 +2886,19 @@ out: rcu_read_unlock_bh(); return rc; } + +int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} EXPORT_SYMBOL(dev_queue_xmit); +int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +{ + return __dev_queue_xmit(skb, accel_priv); +} +EXPORT_SYMBOL(dev_queue_xmit_accel); + /*======================================================================= Receiver routines diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d6ef17322500..2fc5beaf5783 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -395,17 +395,21 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) EXPORT_SYMBOL(__netdev_pick_tx); struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + void *accel_priv) { int queue_index = 0; if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb); + queue_index = ops->ndo_select_queue(dev, skb, + accel_priv); else queue_index = __netdev_pick_tx(dev, skb); - queue_index = dev_cap_txqueue(dev, queue_index); + + if (!accel_priv) + queue_index = dev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 303097874633..19fe9c717ced 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -375,7 +375,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - txq = netdev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 36c3a4cbcabf..a0757913046e 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1061,7 +1061,8 @@ static void ieee80211_uninit(struct net_device *dev) } static u16 ieee80211_netdev_select_queue(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + void *accel_priv) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } @@ -1078,7 +1079,8 @@ static const struct net_device_ops ieee80211_dataif_ops = { }; static u16 ieee80211_monitor_select_queue(struct net_device *dev, - struct sk_buff *skb) + struct sk_buff *skb, + void *accel_priv) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 922a09406ba7..7fc899a943a8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -126,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq, NULL); + ret = dev_hard_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); -- cgit v1.2.3 From 0c3351d451ae2fa438d5d1ed719fc43354fbffbb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 2 Jan 2014 15:11:13 -0800 Subject: seqlock: Use raw_ prefix instead of _no_lockdep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linus disliked the _no_lockdep() naming, so instead use the more-consistent raw_* prefix to the non-lockdep enabled seqcount methods. This also adds raw_ methods for the write operations as well, which will be utilized in a following patch. Acked-by: Linus Torvalds Reviewed-by: Stephen Boyd Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra Cc: Krzysztof Hałasa Cc: Uwe Kleine-König Cc: Willy Tarreau Link: http://lkml.kernel.org/r/1388704274-5278-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/vdso/vclock_gettime.c | 8 ++++---- include/linux/seqlock.h | 27 +++++++++++++++++++-------- 2 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 2ada505067cc..eb5d7a56f8d4 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -178,7 +178,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) ts->tv_nsec = 0; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; @@ -198,7 +198,7 @@ notrace static int do_monotonic(struct timespec *ts) ts->tv_nsec = 0; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; @@ -214,7 +214,7 @@ notrace static int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -225,7 +225,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqcount_begin_no_lockdep(>od->seq); + seq = raw_read_seqcount_begin(>od->seq); ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index cf87a24c0f92..535f158977b9 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -117,15 +117,15 @@ repeat: } /** - * read_seqcount_begin_no_lockdep - start seq-read critical section w/o lockdep + * raw_read_seqcount_begin - start seq-read critical section w/o lockdep * @s: pointer to seqcount_t * Returns: count to be passed to read_seqcount_retry * - * read_seqcount_begin_no_lockdep opens a read critical section of the given + * raw_read_seqcount_begin opens a read critical section of the given * seqcount, but without any lockdep checking. Validity of the critical * section is tested by checking read_seqcount_retry function. */ -static inline unsigned read_seqcount_begin_no_lockdep(const seqcount_t *s) +static inline unsigned raw_read_seqcount_begin(const seqcount_t *s) { unsigned ret = __read_seqcount_begin(s); smp_rmb(); @@ -144,7 +144,7 @@ static inline unsigned read_seqcount_begin_no_lockdep(const seqcount_t *s) static inline unsigned read_seqcount_begin(const seqcount_t *s) { seqcount_lockdep_reader_access(s); - return read_seqcount_begin_no_lockdep(s); + return raw_read_seqcount_begin(s); } /** @@ -206,14 +206,26 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) } + +static inline void raw_write_seqcount_begin(seqcount_t *s) +{ + s->sequence++; + smp_wmb(); +} + +static inline void raw_write_seqcount_end(seqcount_t *s) +{ + smp_wmb(); + s->sequence++; +} + /* * Sequence counter only version assumes that callers are using their * own mutexing. */ static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass) { - s->sequence++; - smp_wmb(); + raw_write_seqcount_begin(s); seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); } @@ -225,8 +237,7 @@ static inline void write_seqcount_begin(seqcount_t *s) static inline void write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, 1, _RET_IP_); - smp_wmb(); - s->sequence++; + raw_write_seqcount_end(s); } /** -- cgit v1.2.3 From 2fac2b891f287691c27ee8d2eeecf39571b27fea Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Mon, 13 Jan 2014 14:29:04 -0700 Subject: i2c: Re-instate body of i2c_parent_is_i2c_adapter() The body of i2c_parent_is_i2c_adapter() is currently guarded by I2C_MUX. It should be CONFIG_I2C_MUX instead. Among potentially other problems, this resulted in i2c_lock_adapter() only locking I2C mux child adapters, and not the parent adapter. In turn, this could allow inter-mingling of mux child selection and I2C transactions, which could result in I2C transactions being directed to the wrong I2C bus, and possibly even switching between busses in the middle of a transaction. One concrete issue caused by this bug was corrupted HDMI EDID reads during boot on the NVIDIA Tegra Seaboard system, although this only became apparent in recent linux-next, when the boot timing was changed just enough to trigger the race condition. Fixes: 3923172b3d70 ("i2c: reduce parent checking to a NOOP in non-I2C_MUX case") Cc: Phil Carmody Cc: Signed-off-by: Stephen Warren Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index eff50e062be8..d9c8dbd3373f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -445,7 +445,7 @@ static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data) static inline struct i2c_adapter * i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter) { -#if IS_ENABLED(I2C_MUX) +#if IS_ENABLED(CONFIG_I2C_MUX) struct device *parent = adapter->dev.parent; if (parent != NULL && parent->type == &i2c_adapter_type) -- cgit v1.2.3 From 5a610fcc7390ee60308deaf09426ada87a1eeec2 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 14 Jan 2014 17:56:41 -0800 Subject: crash_dump: fix compilation error (on MIPS at least) In file included from kernel/crash_dump.c:2:0: include/linux/crash_dump.h:22:27: error: unknown type name `pgprot_t' when CONFIG_CRASH_DUMP=y The error was traced back to commit 9cb218131de1 ("vmcore: introduce remap_oldmem_pfn_range()") include to get the missing definition Signed-off-by: Qais Yousef Reviewed-by: James Hogan Cc: Michael Holzheu Acked-by: Vivek Goyal Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index fe68a5a98583..7032518f8542 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -6,6 +6,8 @@ #include #include +#include /* for pgprot_t */ + #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) -- cgit v1.2.3