From 34c06254ff82a815fdccdfae7517a06c9b768cee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 00:12:24 -0500 Subject: cgroup: fix cftype->file_offset handling 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") introduced cftype->file_offset so that the handles for per-css file instances can be recorded. These handles then can be used, for example, to generate file modified notifications. Unfortunately, it made the wrong assumption that files are created once for a given css and removed on its destruction. Due to the dependencies among subsystems, a css may be hidden from userland and then later shown again. This is implemented by removing and re-creating the affected files, so the associated kernfs_node for a given cgroup file may change over time. This incorrect assumption led to the corruption of css->files lists. Reimplement cftype->file_offset handling so that cgroup_file->kn is protected by a lock and updated as files are created and destroyed. This also makes keeping them on per-cgroup list unnecessary. Signed-off-by: Tejun Heo Reported-by: James Sedgwick Fixes: 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") Acked-by: Johannes Weiner Acked-by: Zefan Li --- include/linux/cgroup-defs.h | 4 ---- include/linux/cgroup.h | 14 +------------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b26276d..869fd4a3d28e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -90,7 +90,6 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ - struct list_head node; /* anchored at css->files */ struct kernfs_node *kn; }; @@ -134,9 +133,6 @@ struct cgroup_subsys_state { */ u64 serial_nr; - /* all cgroup_files associated with this css */ - struct list_head files; - /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754f89c5..f64083030ad5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -88,6 +88,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_file_notify(struct cgroup_file *cfile); char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); @@ -516,19 +517,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -/** - * cgroup_file_notify - generate a file modified event for a cgroup_file - * @cfile: target cgroup_file - * - * @cfile must have been obtained by setting cftype->file_offset. - */ -static inline void cgroup_file_notify(struct cgroup_file *cfile) -{ - /* might not have been created due to one of the CFTYPE selector flags */ - if (cfile->kn) - kernfs_notify(cfile->kn); -} - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; -- cgit v1.2.3 From 614e4c4ebc75517295bccd29b20ddbc5b52af6fc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 12 Nov 2015 11:00:04 +0100 Subject: perf/core: Robustify the perf_cgroup_from_task() RCU checks This patch reinforces the lockdep checks performed by perf_cgroup_from_tsk() by passing the perf_event_context whenever possible. It is okay to not hold the RCU read lock when we know we hold the ctx->lock. This patch makes sure this property holds. In some functions, such as perf_cgroup_sched_in(), we do not pass the context because we are sure we are holding the RCU read lock. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: edumazet@google.com Link: http://lkml.kernel.org/r/1447322404-10920-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2 +- include/linux/perf_event.h | 6 ++++-- kernel/events/core.c | 20 +++++++++++++------- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8ed391..a316ca96f1b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d841d33bcdc9..f9828a48f16a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -697,9 +697,11 @@ struct perf_cgroup { * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) +perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { - return container_of(task_css(task, perf_event_cgrp_id), + return container_of(task_css_check(task, perf_event_cgrp_id, + ctx ? lockdep_is_held(&ctx->lock) + : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 60e71ca42c22..1ac857aff7b0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -521,8 +521,10 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -542,15 +544,17 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* * next is NULL when called from perf_event_enable_on_exec() * that will systematically cause a cgroup_switch() */ if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -572,11 +576,13 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing -- cgit v1.2.3 From 90eec103b96e30401c0b846045bf8a1c7159b6da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2015 11:08:45 +0100 Subject: treewide: Remove old email address There were still a number of references to my old Red Hat email address in the kernel source. Remove these while keeping the Red Hat copyright notices intact. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/blackfin/kernel/perf_event.c | 2 +- arch/sh/kernel/perf_event.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/tile/kernel/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.h | 2 +- arch/x86/kernel/irq_work.c | 2 +- include/asm-generic/tlb.h | 2 +- include/linux/jump_label.h | 2 +- include/linux/lockdep.h | 2 +- include/linux/proportions.h | 2 +- include/linux/uprobes.h | 2 +- kernel/events/callchain.c | 2 +- kernel/events/core.c | 2 +- kernel/events/ring_buffer.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/irq_work.c | 2 +- kernel/jump_label.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/locking/lockdep_proc.c | 2 +- kernel/sched/clock.c | 2 +- kernel/sched/fair.c | 2 +- kernel/trace/trace_event_perf.c | 2 +- lib/btree.c | 2 +- lib/proportions.c | 2 +- mm/page-writeback.c | 2 +- 26 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/arch/blackfin/kernel/perf_event.c b/arch/blackfin/kernel/perf_event.c index 1e9c8b0bf486..170d786807c4 100644 --- a/arch/blackfin/kernel/perf_event.c +++ b/arch/blackfin/kernel/perf_event.c @@ -14,7 +14,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 7cfd7f153966..4dca18347ee9 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -10,7 +10,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index b0da5aedb336..3091267c5cc3 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -9,7 +9,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c index bb509cee3b59..8767060d70fb 100644 --- a/arch/tile/kernel/perf_event.c +++ b/arch/tile/kernel/perf_event.c @@ -21,7 +21,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4562cf070c27..2bf79d7c97df 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ffa7a92025e1..ab18b8a91583 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index dc5fa6a1e8d6..3512ba607361 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index db284bff29dc..9dbb739cafa0 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -5,7 +5,7 @@ * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * - * Copyright 2011 Red Hat, Inc., Peter Zijlstra + * Copyright 2011 Red Hat, Inc., Peter Zijlstra * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 8dde55974f18..0536524bb9eb 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -5,7 +5,7 @@ * Jump label support * * Copyright (C) 2009-2012 Jason Baron - * Copyright (C) 2011-2012 Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 70400dc7660f..c57e424d914b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -2,7 +2,7 @@ * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.txt for more details. */ diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 5440f64d2942..21221338ad18 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -1,7 +1,7 @@ /* * FLoating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * This file contains the public data structure and API definitions. */ diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 0bdc72f36905..4a29c75b146e 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -21,7 +21,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index d659487254d5..9c418002b8c1 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/core.c b/kernel/events/core.c index 1ac857aff7b0..5854fcf7f05a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b5d1ea79c595..adfdc0536117 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e5e9798aa0c..7dad84913abf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/irq_work.c b/kernel/irq_work.c index cbf9fb899d92..bcf107ce0854 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. diff --git a/kernel/jump_label.c b/kernel/jump_label.c index f7dd15d537f9..05254eeb4b4e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,7 +2,7 @@ * jump label support * * Copyright (C) 2009 Jason Baron - * Copyright (C) 2011 Peter Zijlstra + * Copyright (C) 2011 Peter Zijlstra * */ #include diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index deae3907ac1e..60ace56618f6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index d83d798bef95..dbb61a302548 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a205101c23..caf4041f5b0a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8f669c..90e26b11deaa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -17,7 +17,7 @@ * Copyright (C) 2007, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index abfc903e741e..cc9f7a9319be 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,7 +1,7 @@ /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker */ diff --git a/lib/btree.c b/lib/btree.c index 4264871ea1a0..f93a945274af 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -5,7 +5,7 @@ * * Copyright (c) 2007-2008 Joern Engel * Bits and pieces stolen from Peter Zijlstra's code, which is - * Copyright 2007, Red Hat Inc. Peter Zijlstra + * Copyright 2007, Red Hat Inc. Peter Zijlstra * GPLv2 * * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch diff --git a/lib/proportions.c b/lib/proportions.c index 6f724298f67a..efa54f259ea9 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -1,7 +1,7 @@ /* * Floating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Description: * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e4d65445fa7..d15d88c8efa1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2,7 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. -- cgit v1.2.3 From 1f7dd3e5a6e4f093017fff12232572ee1aa4639b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup: fix handling of multi-destination migration from subtree_control enabling Consider the following v2 hierarchy. P0 (+memory) --- P1 (-memory) --- A \- B P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes the latter. IOW, enabling controllers can cause atomic migrations into different csses. The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses often triggering the following warning by driving a counter negative. WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40() Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29 ... ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000 ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00 ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8 Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] pids_cancel.constprop.6+0x31/0x40 [] pids_can_attach+0x6d/0xf0 [] cgroup_taskset_migrate+0x6c/0x330 [] cgroup_migrate+0xf5/0x190 [] cgroup_attach_task+0x176/0x200 [] __cgroup_procs_write+0x2ad/0x460 [] cgroup_procs_write+0x14/0x20 [] cgroup_file_write+0x35/0x1c0 [] kernfs_fop_write+0x141/0x190 [] __vfs_write+0x28/0xe0 [] vfs_write+0xac/0x1a0 [] SyS_write+0x49/0xb0 [] entry_SYSCALL_64_fastpath+0x12/0x76 This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach, ->cancel_attach() and ->attach() and updating cgroup_taskset iteration helpers also return the destination css in addition to the task being migrated. All controllers are updated accordingly. * Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category. * cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained. * memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accomodate the change. * pids is the only controller which was affected by this bug. It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting. Signed-off-by: Tejun Heo Reported-and-tested-by: Daniel Wagner Cc: Aleksa Sarai --- block/blk-cgroup.c | 6 +++--- include/linux/cgroup-defs.h | 9 +++------ include/linux/cgroup.h | 33 +++++++++++++++++++++----------- kernel/cgroup.c | 43 +++++++++++++++++++++++++++++++++--------- kernel/cgroup_freezer.c | 6 +++--- kernel/cgroup_pids.c | 16 ++++++++-------- kernel/cpuset.c | 33 ++++++++++++++++++++------------ kernel/events/core.c | 6 +++--- kernel/sched/core.c | 12 ++++++------ mm/memcontrol.c | 45 ++++++++++++++++++++++---------------------- net/core/netclassid_cgroup.c | 11 ++++++----- net/core/netprio_cgroup.c | 9 +++++---- 12 files changed, 137 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5bcdfc10c23a..5a37188b559f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 869fd4a3d28e..06b77f9dd3f2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -422,12 +422,9 @@ struct cgroup_subsys { void (*css_reset)(struct cgroup_subsys_state *css); void (*css_e_css_changed)(struct cgroup_subsys_state *css); - int (*can_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); + int (*can_attach)(struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_taskset *tset); + void (*attach)(struct cgroup_taskset *tset); int (*can_fork)(struct task_struct *task, void **priv_p); void (*cancel_fork)(struct task_struct *task, void *priv); void (*fork)(struct task_struct *task, void *priv); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f64083030ad5..cb91b44f5f78 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -120,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it); @@ -236,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor + * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple - * processes. When there are multiple tasks in @tset, if a task of a - * process is in @tset, all tasks of the process are in @tset. Also, all - * are guaranteed to share the same source and destination csses. + * processes. + * + * On the v2 hierarchy, there may be tasks from multiple processes and they + * may not share the source or destination csses. + * + * On traditional hierarchies, when there are multiple tasks in @tset, if a + * task of a process is in @tset, all tasks of the process are in @tset. + * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ -#define cgroup_taskset_for_each(task, tset) \ - for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) +#define cgroup_taskset_for_each(task, dst_css, tset) \ + for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ + (task); \ + (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor + * @dst_css: the destination css * @tset: takset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ -#define cgroup_taskset_for_each_leader(leader, tset) \ - for ((leader) = cgroup_taskset_first((tset)); (leader); \ - (leader) = cgroup_taskset_next((tset))) \ +#define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ + for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ + (leader); \ + (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cea63fe4095..470f6536b9e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2237,6 +2237,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -2299,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task, /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2332,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2367,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, /* check that we can legitimately attach to the cgroup */ for_each_e_css(css, i, dst_cgrp) { if (css->ss->can_attach) { - ret = css->ss->can_attach(css, tset); + tset->ssid = i; + ret = css->ss->can_attach(tset); if (ret) { failed_css = css; goto out_cancel_attach; @@ -2400,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) - if (css->ss->attach) - css->ss->attach(css, tset); + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->attach) { + tset->ssid = i; + css->ss->attach(tset); + } + } ret = 0; goto out_release_tset; @@ -2411,8 +2434,10 @@ out_cancel_attach: for_each_e_css(css, i, dst_cgrp) { if (css == failed_css) break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, tset); + if (css->ss->cancel_attach) { + tset->ssid = i; + css->ss->cancel_attach(tset); + } } out_release_tset: spin_lock_bh(&css_set_lock); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ff02a8e51bb3..2d3df82c54f2 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -155,10 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -172,7 +172,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { struct freezer *freezer = css_freezer(new_css); if (!(freezer->state & CGROUP_FREEZING)) { diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index de3359a48dbb..8e27fc5dbb20 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -162,13 +162,13 @@ revert: return -EAGAIN; } -static int pids_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int pids_can_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css, return 0; } -static void pids_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void pids_cancel_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10ae73611d80..02a8ea5c9963 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1467,9 +1468,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 36babfd20648..026305dfe523 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9456,12 +9456,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9319e..a9db4819e586 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8217,12 +8217,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8235,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9acfb165eb52..c92a65b2b4ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4779,23 +4779,18 @@ static void mem_cgroup_clear_mc(void) spin_unlock(&mc.lock); } -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; struct mem_cgroup *from; struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; - /* - * We are now commited to this value whatever it is. Changes in this - * tunable will only affect upcoming migrations, not the current one. - * So we need to save it, and keep it going. - */ - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (!move_flags) + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; /* @@ -4805,13 +4800,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * multiple. */ p = NULL; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { WARN_ON_ONCE(p); p = leader; + memcg = mem_cgroup_from_css(css); } if (!p) return 0; + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (!move_flags) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -4842,8 +4847,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); @@ -4985,10 +4989,10 @@ retry: atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); + struct cgroup_subsys_state *css; + struct task_struct *p = cgroup_taskset_first(tset, &css); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5000,17 +5004,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { } #endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6441f47b1a8f..81cb3c72efe8 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -67,14 +67,15 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_taskset *tset) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; struct task_struct *p; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cbd0a199bf52..40fd09fe06ae 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v = (void *)(unsigned long)css->cgroup->id; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); -- cgit v1.2.3 From ae5515d66362b9d96cdcfce504567f0b8b7bd83e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 4 Dec 2015 08:38:42 -0700 Subject: Revert: "vfio: Include No-IOMMU mode" Revert commit 033291eccbdb ("vfio: Include No-IOMMU mode") due to lack of a user. This was originally intended to fill a need for the DPDK driver, but uptake has been slow so rather than support an unproven kernel interface revert it and revisit when userspace catches up. Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 15 ---- drivers/vfio/pci/vfio_pci.c | 8 +- drivers/vfio/vfio.c | 186 ++------------------------------------------ include/linux/vfio.h | 3 - include/uapi/linux/vfio.h | 7 -- 5 files changed, 10 insertions(+), 209 deletions(-) (limited to 'include') diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce77495..850d86ca685b 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -31,21 +31,6 @@ menuconfig VFIO If you don't know what to do here, say N. -menuconfig VFIO_NOIOMMU - bool "VFIO No-IOMMU support" - depends on VFIO - help - VFIO is built on the ability to isolate devices using the IOMMU. - Only with an IOMMU can userspace access to DMA capable devices be - considered secure. VFIO No-IOMMU mode enables IOMMU groups for - devices without IOMMU backing for the purpose of re-using the VFIO - infrastructure in a non-secure mode. Use of this mode will result - in an unsupportable kernel and will therefore taint the kernel. - Device assignment to virtual machines is also not possible with - this mode since there is no IOMMU to provide DMA translation. - - If you don't know what to do here, say N. - source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2760a7ba3f30..56bf6dbb93db 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -940,13 +940,13 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; - group = vfio_iommu_group_get(&pdev->dev); + group = iommu_group_get(&pdev->dev); if (!group) return -EINVAL; vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); return -ENOMEM; } @@ -957,7 +957,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); kfree(vdev); return ret; } @@ -993,7 +993,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) if (!vdev) return; - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); + iommu_group_put(pdev->dev.iommu_group); kfree(vdev); if (vfio_pci_is_vga(pdev)) { diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 9da0703e09d0..6070b793cbcb 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -62,7 +62,6 @@ struct vfio_container { struct rw_semaphore group_lock; struct vfio_iommu_driver *iommu_driver; void *iommu_data; - bool noiommu; }; struct vfio_unbound_dev { @@ -85,7 +84,6 @@ struct vfio_group { struct list_head unbound_list; struct mutex unbound_lock; atomic_t opened; - bool noiommu; }; struct vfio_device { @@ -97,147 +95,6 @@ struct vfio_device { void *device_data; }; -#ifdef CONFIG_VFIO_NOIOMMU -static bool noiommu __read_mostly; -module_param_named(enable_unsafe_noiommu_support, - noiommu, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); -#endif - -/* - * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe - * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_add_group_dev() or removing - * that symmetric reference after vfio_del_group_dev() should use the raw - * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() - * removes the device from the dummy group and cannot be nested. - */ -struct iommu_group *vfio_iommu_group_get(struct device *dev) -{ - struct iommu_group *group; - int __maybe_unused ret; - - group = iommu_group_get(dev); - -#ifdef CONFIG_VFIO_NOIOMMU - /* - * With noiommu enabled, an IOMMU group will be created for a device - * that doesn't already have one and doesn't have an iommu_ops on their - * bus. We use iommu_present() again in the main code to detect these - * fake groups. - */ - if (group || !noiommu || iommu_present(dev->bus)) - return group; - - group = iommu_group_alloc(); - if (IS_ERR(group)) - return NULL; - - iommu_group_set_name(group, "vfio-noiommu"); - ret = iommu_group_add_device(group, dev); - iommu_group_put(group); - if (ret) - return NULL; - - /* - * Where to taint? At this point we've added an IOMMU group for a - * device that is not backed by iommu_ops, therefore any iommu_ - * callback using iommu_ops can legitimately Oops. So, while we may - * be about to give a DMA capable device to a user without IOMMU - * protection, which is clearly taint-worthy, let's go ahead and do - * it here. - */ - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); -#endif - - return group; -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_get); - -void vfio_iommu_group_put(struct iommu_group *group, struct device *dev) -{ -#ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_present(dev->bus)) - iommu_group_remove_device(dev); -#endif - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_put); - -#ifdef CONFIG_VFIO_NOIOMMU -static void *vfio_noiommu_open(unsigned long arg) -{ - if (arg != VFIO_NOIOMMU_IOMMU) - return ERR_PTR(-EINVAL); - if (!capable(CAP_SYS_RAWIO)) - return ERR_PTR(-EPERM); - - return NULL; -} - -static void vfio_noiommu_release(void *iommu_data) -{ -} - -static long vfio_noiommu_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) -{ - if (cmd == VFIO_CHECK_EXTENSION) - return arg == VFIO_NOIOMMU_IOMMU ? 1 : 0; - - return -ENOTTY; -} - -static int vfio_iommu_present(struct device *dev, void *unused) -{ - return iommu_present(dev->bus) ? 1 : 0; -} - -static int vfio_noiommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ - return iommu_group_for_each_dev(iommu_group, NULL, - vfio_iommu_present) ? -EINVAL : 0; -} - -static void vfio_noiommu_detach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ -} - -static struct vfio_iommu_driver_ops vfio_noiommu_ops = { - .name = "vfio-noiommu", - .owner = THIS_MODULE, - .open = vfio_noiommu_open, - .release = vfio_noiommu_release, - .ioctl = vfio_noiommu_ioctl, - .attach_group = vfio_noiommu_attach_group, - .detach_group = vfio_noiommu_detach_group, -}; - -static struct vfio_iommu_driver vfio_noiommu_driver = { - .ops = &vfio_noiommu_ops, -}; - -/* - * Wrap IOMMU drivers, the noiommu driver is the one and only driver for - * noiommu groups (and thus containers) and not available for normal groups. - */ -#define vfio_for_each_iommu_driver(con, pos) \ - for (pos = con->noiommu ? &vfio_noiommu_driver : \ - list_first_entry(&vfio.iommu_drivers_list, \ - struct vfio_iommu_driver, vfio_next); \ - (con->noiommu ? pos != NULL : \ - &pos->vfio_next != &vfio.iommu_drivers_list); \ - pos = con->noiommu ? NULL : list_next_entry(pos, vfio_next)) -#else -#define vfio_for_each_iommu_driver(con, pos) \ - list_for_each_entry(pos, &vfio.iommu_drivers_list, vfio_next) -#endif - - /** * IOMMU driver registration */ @@ -342,8 +199,7 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) /** * Group objects - create, release, get, put, search */ -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - bool noiommu) +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) { struct vfio_group *group, *tmp; struct device *dev; @@ -361,7 +217,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, atomic_set(&group->container_users, 0); atomic_set(&group->opened, 0); group->iommu_group = iommu_group; - group->noiommu = noiommu; group->nb.notifier_call = vfio_iommu_group_notifier; @@ -397,8 +252,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.group_devt), minor), - group, "%s%d", noiommu ? "noiommu-" : "", - iommu_group_id(iommu_group)); + group, "%d", iommu_group_id(iommu_group)); if (IS_ERR(dev)) { vfio_free_group_minor(minor); vfio_group_unlock_and_free(group); @@ -786,8 +640,7 @@ int vfio_add_group_dev(struct device *dev, group = vfio_group_get_from_iommu(iommu_group); if (!group) { - group = vfio_create_group(iommu_group, - !iommu_present(dev->bus)); + group = vfio_create_group(iommu_group); if (IS_ERR(group)) { iommu_group_put(iommu_group); return PTR_ERR(group); @@ -999,7 +852,8 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, */ if (!driver) { mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, + vfio_next) { if (!try_module_get(driver->ops->owner)) continue; @@ -1068,7 +922,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, } mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { void *data; if (!try_module_get(driver->ops->owner)) @@ -1333,9 +1187,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) if (atomic_read(&group->container_users)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - f = fdget(container_fd); if (!f.file) return -EBADF; @@ -1351,13 +1202,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) down_write(&container->group_lock); - /* Real groups and fake groups cannot mix */ - if (!list_empty(&container->group_list) && - container->noiommu != group->noiommu) { - ret = -EPERM; - goto unlock_out; - } - driver = container->iommu_driver; if (driver) { ret = driver->ops->attach_group(container->iommu_data, @@ -1367,7 +1211,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) } group->container = container; - container->noiommu = group->noiommu; list_add(&group->container_next, &container->group_list); /* Get a reference on the container and mark a user within the group */ @@ -1398,9 +1241,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) !group->container->iommu_driver || !vfio_group_viable(group)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - device = vfio_device_get_from_name(group, buf); if (!device) return -ENODEV; @@ -1443,10 +1283,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) fd_install(ret, filep); - if (group->noiommu) - dev_warn(device->dev, "vfio-noiommu device opened by user " - "(%s:%d)\n", current->comm, task_pid_nr(current)); - return ret; } @@ -1535,11 +1371,6 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) if (!group) return -ENODEV; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) { - vfio_group_put(group); - return -EPERM; - } - /* Do we need multiple instances of the group open? Seems not. */ opened = atomic_cmpxchg(&group->opened, 0, 1); if (opened) { @@ -1702,11 +1533,6 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) if (!atomic_inc_not_zero(&group->container_users)) return ERR_PTR(-EINVAL); - if (group->noiommu) { - atomic_dec(&group->container_users); - return ERR_PTR(-EPERM); - } - if (!group->container->iommu_driver || !vfio_group_viable(group)) { atomic_dec(&group->container_users); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 610a86a892b8..ddb440975382 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -44,9 +44,6 @@ struct vfio_device_ops { void (*request)(void *device_data, unsigned int count); }; -extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); - extern int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, void *device_data); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 751b69f858c8..9fd7b5d8df2f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -38,13 +38,6 @@ #define VFIO_SPAPR_TCE_v2_IOMMU 7 -/* - * The No-IOMMU IOMMU offers no translation or isolation for devices and - * supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU - * code will taint the host kernel and should be used with extreme caution. - */ -#define VFIO_NOIOMMU_IOMMU 8 - /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between -- cgit v1.2.3 From 3cf92222a39cc7842c373dd90a0c204fa7d7cced Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 3 Dec 2015 20:41:29 +0800 Subject: rhashtable: Prevent spurious EBUSY errors on insertion Thomas and Phil observed that under stress rhashtable insertion sometimes failed with EBUSY, even though this error should only ever been seen when we're under attack and our hash chain length has grown to an unacceptable level, even after a rehash. It turns out that the logic for detecting whether there is an existing rehash is faulty. In particular, when two threads both try to grow the same table at the same time, one of them may see the newly grown table and thus erroneously conclude that it had been rehashed. This is what leads to the EBUSY error. This patch fixes this by remembering the current last table we used during insertion so that rhashtable_insert_rehash can detect when another thread has also done a resize/rehash. When this is detected we will give up our resize/rehash and simply retry the insertion with the new table. Reported-by: Thomas Graf Reported-by: Phil Sutter Signed-off-by: Herbert Xu Tested-by: Phil Sutter Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 18 +++++++++++------- lib/rhashtable.c | 45 ++++++++++++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 843ceca9a21e..e50b31d18462 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -339,10 +340,11 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params); -int rhashtable_insert_slow(struct rhashtable *ht, const void *key, - struct rhash_head *obj, - struct bucket_table *old_tbl); -int rhashtable_insert_rehash(struct rhashtable *ht); +struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, + const void *key, + struct rhash_head *obj, + struct bucket_table *old_tbl); +int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); @@ -598,9 +600,11 @@ restart: new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(new_tbl)) { - err = rhashtable_insert_slow(ht, key, obj, new_tbl); - if (err == -EAGAIN) + tbl = rhashtable_insert_slow(ht, key, obj, new_tbl); + if (!IS_ERR_OR_NULL(tbl)) goto slow_path; + + err = PTR_ERR(tbl); goto out; } @@ -611,7 +615,7 @@ restart: if (unlikely(rht_grow_above_100(ht, tbl))) { slow_path: spin_unlock_bh(lock); - err = rhashtable_insert_rehash(ht); + err = rhashtable_insert_rehash(ht, tbl); rcu_read_unlock(); if (err) return err; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a54ff8949f91..2ff7ed91663a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -389,33 +389,31 @@ static bool rhashtable_check_elasticity(struct rhashtable *ht, return false; } -int rhashtable_insert_rehash(struct rhashtable *ht) +int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; - struct bucket_table *tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = rhashtable_last_table(ht, old_tbl); size = tbl->size; + err = -EBUSY; + if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) - return -EBUSY; + goto fail; + + err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); - if (new_tbl == NULL) { - /* Schedule async resize/rehash to try allocation - * non-atomic context. - */ - schedule_work(&ht->run_work); - return -ENOMEM; - } + if (new_tbl == NULL) + goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { @@ -426,12 +424,24 @@ int rhashtable_insert_rehash(struct rhashtable *ht) schedule_work(&ht->run_work); return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_dereference_raw(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. */ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; } EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -int rhashtable_insert_slow(struct rhashtable *ht, const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, + const void *key, + struct rhash_head *obj, + struct bucket_table *tbl) { struct rhash_head *head; unsigned int hash; @@ -467,7 +477,12 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key, exit: spin_unlock(rht_bucket_lock(tbl, hash)); - return err; + if (err == 0) + return NULL; + else if (err == -EAGAIN) + return tbl; + else + return ERR_PTR(err); } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); -- cgit v1.2.3 From c5fb8caaf91ea6a92920cf24db10cfc94d58de0f Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 4 Dec 2015 13:54:03 +0100 Subject: vxlan: fix incorrect RCO bit in VXLAN header Commit 3511494ce2f3d ("vxlan: Group Policy extension") changed definition of VXLAN_HF_RCO from 0x00200000 to BIT(24). This is obviously incorrect. It's also in violation with the RFC draft. Fixes: 3511494ce2f3d ("vxlan: Group Policy extension") Cc: Thomas Graf Cc: Tom Herbert Signed-off-by: Jiri Benc Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index c1c899c3a51b..e289ada6adf6 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -79,7 +79,7 @@ struct vxlanhdr { }; /* VXLAN header flags. */ -#define VXLAN_HF_RCO BIT(24) +#define VXLAN_HF_RCO BIT(21) #define VXLAN_HF_VNI BIT(27) #define VXLAN_HF_GBP BIT(31) -- cgit v1.2.3 From 01ce63c90170283a9855d1db4fe81934dddce648 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 4 Dec 2015 15:14:04 -0200 Subject: sctp: update the netstamp_needed counter when copying sockets Dmitry Vyukov reported that SCTP was triggering a WARN on socket destroy related to disabling sock timestamp. When SCTP accepts an association or peel one off, it copies sock flags but forgot to call net_enable_timestamp() if a packet timestamping flag was copied, leading to extra calls to net_disable_timestamp() whenever such clones were closed. The fix is to call net_enable_timestamp() whenever we copy a sock with that flag on, like tcp does. Reported-by: Dmitry Vyukov Signed-off-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 2 -- net/sctp/socket.c | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 52d27ee924f4..b1d475b5db68 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -740,6 +740,8 @@ enum sock_flags { SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ }; +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) { nsk->sk_flags = osk->sk_flags; diff --git a/net/core/sock.c b/net/core/sock.c index e31dfcee1729..d01c8f42dbb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -433,8 +433,6 @@ static bool sock_needs_netstamp(const struct sock *sk) } } -#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) - static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 03c8256063ec..4c9282bdd067 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7199,6 +7199,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; + + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); } static inline void sctp_copy_descendant(struct sock *sk_to, -- cgit v1.2.3 From 8a0d19c5ed417c78d03f4e0fa7215e58c40896d8 Mon Sep 17 00:00:00 2001 From: lucien Date: Sat, 5 Dec 2015 15:35:36 +0800 Subject: sctp: start t5 timer only when peer rwnd is 0 and local state is SHUTDOWN_PENDING when A sends a data to B, then A close() and enter into SHUTDOWN_PENDING state, if B neither claim his rwnd is 0 nor send SACK for this data, A will keep retransmitting this data until t5 timeout, Max.Retrans times can't work anymore, which is bad. if B's rwnd is not 0, it should send abort after Max.Retrans times, only when B's rwnd == 0 and A's retransmitting beyonds Max.Retrans times, A will start t5 timer, which is also commit f8d960524328 ("sctp: Enforce retransmission limit during shutdown") means, but it lacks the condition peer rwnd == 0. so fix it by adding a bit (zero_window_announced) in peer to record if the last rwnd is 0. If it was, zero_window_announced will be set. and use this bit to decide if start t5 timer when local.state is SHUTDOWN_PENDING. Fixes: commit f8d960524328 ("sctp: Enforce retransmission limit during shutdown") Signed-off-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 ++- net/sctp/outqueue.c | 1 + net/sctp/sm_statefuns.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7bbb71081aeb..eea9bdeecba2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1493,7 +1493,8 @@ struct sctp_association { * : SACK's are not delayed (see Section 6). */ __u8 sack_needed:1, /* Do we need to sack the peer? */ - sack_generation:1; + sack_generation:1, + zero_window_announced:1; __u32 sack_cnt; __u32 adaptation_ind; /* Adaptation Code point. */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 0b3d8189f140..c0380cfb16ae 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1252,6 +1252,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) */ sack_a_rwnd = ntohl(sack->a_rwnd); + asoc->peer.zero_window_announced = !sack_a_rwnd; outstanding = q->outstanding_bytes; if (outstanding < sack_a_rwnd) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 6f46aa16cb76..cd34a4a34065 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5412,7 +5412,8 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS); if (asoc->overall_error_count >= asoc->max_retrans) { - if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { + if (asoc->peer.zero_window_announced && + asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { /* * We are here likely because the receiver had its rwnd * closed for a while and we have not been able to -- cgit v1.2.3 From 326fcfa5acca446b3f71e99f6d19881145556e5c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 5 Dec 2015 13:58:11 +0100 Subject: net: remove unnecessary semicolon in netdev_alloc_pcpu_stats() This semicolon causes a build error if the function call is wrapped in parentheses. Fixes: aabc92bbe3cf ("net: add __netdev_alloc_pcpu_stats() to indicate gfp flags") Reported-by: Imre Kaloz Signed-off-by: Felix Fietkau Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3b5d134e945a..3143c847bddb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2084,7 +2084,7 @@ struct pcpu_sw_netstats { }) #define netdev_alloc_pcpu_stats(type) \ - __netdev_alloc_pcpu_stats(type, GFP_KERNEL); + __netdev_alloc_pcpu_stats(type, GFP_KERNEL) #include -- cgit v1.2.3 From 7c23b7c1996597dd9d60bb282fb5fa1be6ebd18b Mon Sep 17 00:00:00 2001 From: "Lu, Han" Date: Mon, 7 Dec 2015 15:59:13 +0800 Subject: ALSA: hda - Fix playback noise with 24/32 bit sample size on BXT In BXT-P A0, HD-Audio DMA requests is later than expected, and makes an audio stream sensitive to system latencies when 24/32 bits are playing. Adjusting threshold of DMA fifo to force the DMA request sooner to improve latency tolerance at the expense of power. v2: move Intel specific code to hda_intel.c Signed-off-by: Lu, Han Signed-off-by: Takashi Iwai --- include/sound/hda_register.h | 3 +++ sound/pci/hda/hda_intel.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/sound/hda_register.h b/include/sound/hda_register.h index 2ae8812d7b1a..94dc6a9772e0 100644 --- a/include/sound/hda_register.h +++ b/include/sound/hda_register.h @@ -93,6 +93,9 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 }; #define AZX_REG_HSW_EM4 0x100c #define AZX_REG_HSW_EM5 0x1010 +/* Skylake/Broxton display HD-A controller Extended Mode registers */ +#define AZX_REG_SKL_EM4L 0x1040 + /* PCI space */ #define AZX_PCIREG_TCSEL 0x44 diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 963f82430938..bff5c8b329d1 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -355,6 +355,8 @@ enum { ((pci)->device == 0x0d0c) || \ ((pci)->device == 0x160c)) +#define IS_BROXTON(pci) ((pci)->device == 0x5a98) + static char *driver_short_names[] = { [AZX_DRIVER_ICH] = "HDA Intel", [AZX_DRIVER_PCH] = "HDA Intel PCH", @@ -506,15 +508,36 @@ static void azx_init_pci(struct azx *chip) } } +/* + * In BXT-P A0, HD-Audio DMA requests is later than expected, + * and makes an audio stream sensitive to system latencies when + * 24/32 bits are playing. + * Adjusting threshold of DMA fifo to force the DMA request + * sooner to improve latency tolerance at the expense of power. + */ +static void bxt_reduce_dma_latency(struct azx *chip) +{ + u32 val; + + val = azx_readl(chip, SKL_EM4L); + val &= (0x3 << 20); + azx_writel(chip, SKL_EM4L, val); +} + static void hda_intel_init_chip(struct azx *chip, bool full_reset) { struct hdac_bus *bus = azx_bus(chip); + struct pci_dev *pci = chip->pci; if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) snd_hdac_set_codec_wakeup(bus, true); azx_init_chip(chip, full_reset); if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) snd_hdac_set_codec_wakeup(bus, false); + + /* reduce dma latency to avoid noise */ + if (IS_BROXTON(pci)) + bxt_reduce_dma_latency(chip); } /* calculate runtime delay from LPIB */ -- cgit v1.2.3 From ea013a9b205b47b1fcbc72522146fad560af0712 Mon Sep 17 00:00:00 2001 From: Andreas Werner Date: Fri, 4 Dec 2015 18:12:49 +0100 Subject: libata-eh.c: Introduce new ata port flag for controller which lockup on read log page Some controller lockup on a ata_read_log_page. Add new ata port flag ATA_FLAG_NO_LOG_PAGE which can used to blacklist a controller. If this flag is set, any attempt to read a log page returns an error without actually issuing the command. Signed-off-by: Andreas Werner Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 8 ++++++++ include/linux/libata.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index cb0508af1459..961acc788f44 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1505,12 +1505,20 @@ static const char *ata_err_string(unsigned int err_mask) unsigned int ata_read_log_page(struct ata_device *dev, u8 log, u8 page, void *buf, unsigned int sectors) { + unsigned long ap_flags = dev->link->ap->flags; struct ata_taskfile tf; unsigned int err_mask; bool dma = false; DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page); + /* + * Return error without actually issuing the command on controllers + * which e.g. lockup on a read log page. + */ + if (ap_flags & ATA_FLAG_NO_LOG_PAGE) + return AC_ERR_DEV; + retry: ata_tf_init(dev, &tf); if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) && diff --git a/include/linux/libata.h b/include/linux/libata.h index 83577f8fd15b..600c1e0626a5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -210,6 +210,7 @@ enum { ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ /* (doesn't imply presence) */ ATA_FLAG_SATA = (1 << 1), + ATA_FLAG_NO_LOG_PAGE = (1 << 5), /* do not issue log page read */ ATA_FLAG_NO_ATAPI = (1 << 6), /* No ATAPI support */ ATA_FLAG_PIO_DMA = (1 << 7), /* PIO cmds via DMA */ ATA_FLAG_PIO_LBA48 = (1 << 8), /* Host DMA engine is LBA28 only */ -- cgit v1.2.3 From 57b4bd06ff0372fe1e3617889c4b37fbd500364a Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sun, 6 Dec 2015 11:25:47 +0100 Subject: lightnvm: comments on constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not obvious what NVM_IO_* and NVM_BLK_T_* are used for. Make sure to comment them appropriately as the other constants. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index c6916aec43b6..935ef3844c05 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -50,9 +50,16 @@ enum { NVM_IO_DUAL_ACCESS = 0x1, NVM_IO_QUAD_ACCESS = 0x2, + /* NAND Access Modes */ NVM_IO_SUSPEND = 0x80, NVM_IO_SLC_MODE = 0x100, NVM_IO_SCRAMBLE_DISABLE = 0x200, + + /* Block Types */ + NVM_BLK_T_FREE = 0x0, + NVM_BLK_T_BAD = 0x1, + NVM_BLK_T_DEV = 0x2, + NVM_BLK_T_HOST = 0x4, }; struct nvm_id_group { -- cgit v1.2.3 From 16f26c3aa9b9c36a9d1092ae3258461d1008481e Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sun, 6 Dec 2015 11:25:48 +0100 Subject: lightnvm: replace req queue with nvmdev for lld MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case where a request queue is passed to the low lever lightnvm device drive integration, the device driver might pass its admin commands through another queue. Instead pass nvm_dev, and let the low level drive the appropriate queue. Reported-by: Christoph Hellwig Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 9 +++++---- drivers/lightnvm/core.c | 7 +++---- drivers/lightnvm/gennvm.c | 8 ++++---- drivers/lightnvm/rrpc.c | 2 +- drivers/nvme/host/lightnvm.c | 24 +++++++++++++----------- include/linux/lightnvm.h | 14 +++++++------- 6 files changed, 33 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 0c3940ec5e62..7981b7407305 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -444,8 +444,9 @@ static void null_lnvm_end_io(struct request *rq, int error) blk_put_request(rq); } -static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct request *rq; struct bio *bio = rqd->bio; @@ -470,7 +471,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) +static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) { sector_t size = gb * 1024 * 1024 * 1024ULL; sector_t blksize; @@ -523,7 +524,7 @@ static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) return 0; } -static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name) +static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) { mempool_t *virtmem_pool; @@ -541,7 +542,7 @@ static void null_lnvm_destroy_dma_pool(void *pool) mempool_destroy(pool); } -static void *null_lnvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return mempool_alloc(pool, mem_flags); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 86ce887b2ed6..4a8d1fe34c4e 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -74,7 +74,7 @@ EXPORT_SYMBOL(nvm_unregister_target); void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, dma_addr_t *dma_handler) { - return dev->ops->dev_dma_alloc(dev->q, dev->ppalist_pool, mem_flags, + return dev->ops->dev_dma_alloc(dev, dev->ppalist_pool, mem_flags, dma_handler); } EXPORT_SYMBOL(nvm_dev_dma_alloc); @@ -246,7 +246,7 @@ static int nvm_init(struct nvm_dev *dev) if (!dev->q || !dev->ops) return ret; - if (dev->ops->identity(dev->q, &dev->identity)) { + if (dev->ops->identity(dev, &dev->identity)) { pr_err("nvm: device could not be identified\n"); goto err; } @@ -326,8 +326,7 @@ int nvm_register(struct request_queue *q, char *disk_name, } if (dev->ops->max_phys_sect > 1) { - dev->ppalist_pool = dev->ops->create_dma_pool(dev->q, - "ppalist"); + dev->ppalist_pool = dev->ops->create_dma_pool(dev, "ppalist"); if (!dev->ppalist_pool) { pr_err("nvm: could not create ppa pool\n"); ret = -ENOMEM; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ce6025487a5c..52b513a65946 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -195,7 +195,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_l2p_tbl) { - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, gennvm_block_map, dev); if (ret) { pr_err("gennvm: could not read L2P table.\n"); @@ -346,7 +346,7 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) gennvm_generic_to_addr_mode(dev, rqd); rqd->dev = dev; - return dev->ops->submit_io(dev->q, rqd); + return dev->ops->submit_io(dev, rqd); } static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, @@ -382,7 +382,7 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) if (!dev->ops->set_bb_tbl) return; - if (dev->ops->set_bb_tbl(dev->q, rqd, 1)) + if (dev->ops->set_bb_tbl(dev, rqd, 1)) return; gennvm_addr_to_generic_mode(dev, rqd); @@ -450,7 +450,7 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, gennvm_generic_to_addr_mode(dev, &rqd); - ret = dev->ops->erase_block(dev->q, &rqd); + ret = dev->ops->erase_block(dev, &rqd); if (plane_cnt) nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index cf1a4a515b76..134e4faba482 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1016,7 +1016,7 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; /* Bring up the mapping table from device */ - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, rrpc_l2p_update, rrpc); if (ret) { pr_err("nvm: rrpc: could not read L2P table.\n"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 762c9a7cbfa6..15f2acb4d5cd 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -271,9 +271,9 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) return 0; } -static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) +static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; @@ -308,10 +308,10 @@ out: return ret; } -static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, +static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; u32 len = queue_max_hw_sectors(dev->admin_q) << 9; @@ -415,10 +415,10 @@ out: return ret; } -static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, int type) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -463,8 +463,9 @@ static void nvme_nvm_end_io(struct request *rq, int error) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; struct bio *bio = rqd->bio; @@ -502,8 +503,9 @@ static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct nvme_nvm_command c = {}; @@ -515,9 +517,9 @@ static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); } -static void *nvme_nvm_create_dma_pool(struct request_queue *q, char *name) +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); @@ -530,7 +532,7 @@ static void nvme_nvm_destroy_dma_pool(void *pool) dma_pool_destroy(dma_pool); } -static void *nvme_nvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return dma_pool_alloc(pool, mem_flags, dma_handler); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 935ef3844c05..034117b3be5f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -183,17 +183,17 @@ struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct request_queue *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct request_queue *, char *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct request_queue *, void *, gfp_t, +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, dma_addr_t *); typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); -- cgit v1.2.3 From 4639d60d2bfb7f5007b5d93788fd93c19b63f000 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 7 Dec 2015 06:25:56 -0500 Subject: qed: Fix corner case for chain in-between pages The amount of chain next pointer elements between the producer and the consumer indices depends on which pages they currently point to. The current calculation is based only on their difference, and it can lead to a number of free elements which is higher by 1 than the actual value. Signed-off-by: Tomer Tayar Signed-off-by: Manish Chopra Signed-off-by: David S. Miller --- include/linux/qed/qed_chain.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index b920c3605c46..41b9049b57e2 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -111,7 +111,8 @@ static inline u16 qed_chain_get_elem_left(struct qed_chain *p_chain) used = ((u32)0x10000u + (u32)(p_chain->prod_idx)) - (u32)p_chain->cons_idx; if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) - used -= (used / p_chain->elem_per_page); + used -= p_chain->prod_idx / p_chain->elem_per_page - + p_chain->cons_idx / p_chain->elem_per_page; return p_chain->capacity - used; } -- cgit v1.2.3 From 76a9a3642a0b72d5687d680150580d55b6ea9804 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 7 Dec 2015 06:25:57 -0500 Subject: qed: fix handling of concurrent ramrods. Concurrent non-blocking slowpath ramrods can be completed out-of-order on the completion chain. Recycling completed elements, while previously sent elements are still completion pending, can lead to overriding of active elements on the chain. Furthermore, sending pending slowpath ramrods currently lacks the update of the chain element physical pointer. This patch: * Ensures that ramrods are sent to the FW with consecutive echo values. * Handles out-of-order completions by freeing only first successive completed entries. * Updates the chain element physical pointer when copying a pending element into a free element for sending. Signed-off-by: Tomer Tayar Signed-off-by: Manish Chopra Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_sp.h | 8 +++- drivers/net/ethernet/qlogic/qed/qed_spq.c | 63 +++++++++++++++++++++++-------- include/linux/qed/common_hsi.h | 2 + 3 files changed, 56 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h index 31a1f1eb4f56..287fadfab52d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h @@ -124,8 +124,12 @@ struct qed_spq { dma_addr_t p_phys; struct qed_spq_entry *p_virt; - /* Used as index for completions (returns on EQ by FW) */ - u16 echo_idx; +#define SPQ_RING_SIZE \ + (CORE_SPQE_PAGE_SIZE_BYTES / sizeof(struct slow_path_element)) + + /* Bitmap for handling out-of-order completions */ + DECLARE_BITMAP(p_comp_bitmap, SPQ_RING_SIZE); + u8 comp_bitmap_idx; /* Statistics */ u32 unlimited_pending_count; diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index 7c0b8459666e..3dd548ab8df1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -112,8 +112,6 @@ static int qed_spq_fill_entry(struct qed_hwfn *p_hwfn, struct qed_spq_entry *p_ent) { - p_ent->elem.hdr.echo = 0; - p_hwfn->p_spq->echo_idx++; p_ent->flags = 0; switch (p_ent->comp_mode) { @@ -195,10 +193,12 @@ static int qed_spq_hw_post(struct qed_hwfn *p_hwfn, struct qed_spq *p_spq, struct qed_spq_entry *p_ent) { - struct qed_chain *p_chain = &p_hwfn->p_spq->chain; + struct qed_chain *p_chain = &p_hwfn->p_spq->chain; + u16 echo = qed_chain_get_prod_idx(p_chain); struct slow_path_element *elem; struct core_db_data db; + p_ent->elem.hdr.echo = cpu_to_le16(echo); elem = qed_chain_produce(p_chain); if (!elem) { DP_NOTICE(p_hwfn, "Failed to produce from SPQ chain\n"); @@ -437,7 +437,9 @@ void qed_spq_setup(struct qed_hwfn *p_hwfn) p_spq->comp_count = 0; p_spq->comp_sent_count = 0; p_spq->unlimited_pending_count = 0; - p_spq->echo_idx = 0; + + bitmap_zero(p_spq->p_comp_bitmap, SPQ_RING_SIZE); + p_spq->comp_bitmap_idx = 0; /* SPQ cid, cannot fail */ qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_CORE, &p_spq->cid); @@ -582,26 +584,32 @@ qed_spq_add_entry(struct qed_hwfn *p_hwfn, struct qed_spq *p_spq = p_hwfn->p_spq; if (p_ent->queue == &p_spq->unlimited_pending) { - struct qed_spq_entry *p_en2; if (list_empty(&p_spq->free_pool)) { list_add_tail(&p_ent->list, &p_spq->unlimited_pending); p_spq->unlimited_pending_count++; return 0; - } + } else { + struct qed_spq_entry *p_en2; - p_en2 = list_first_entry(&p_spq->free_pool, - struct qed_spq_entry, - list); - list_del(&p_en2->list); + p_en2 = list_first_entry(&p_spq->free_pool, + struct qed_spq_entry, + list); + list_del(&p_en2->list); + + /* Copy the ring element physical pointer to the new + * entry, since we are about to override the entire ring + * entry and don't want to lose the pointer. + */ + p_ent->elem.data_ptr = p_en2->elem.data_ptr; - /* Strcut assignment */ - *p_en2 = *p_ent; + *p_en2 = *p_ent; - kfree(p_ent); + kfree(p_ent); - p_ent = p_en2; + p_ent = p_en2; + } } /* entry is to be placed in 'pending' queue */ @@ -777,13 +785,38 @@ int qed_spq_completion(struct qed_hwfn *p_hwfn, list_for_each_entry_safe(p_ent, tmp, &p_spq->completion_pending, list) { if (p_ent->elem.hdr.echo == echo) { + u16 pos = le16_to_cpu(echo) % SPQ_RING_SIZE; + list_del(&p_ent->list); - qed_chain_return_produced(&p_spq->chain); + /* Avoid overriding of SPQ entries when getting + * out-of-order completions, by marking the completions + * in a bitmap and increasing the chain consumer only + * for the first successive completed entries. + */ + bitmap_set(p_spq->p_comp_bitmap, pos, SPQ_RING_SIZE); + + while (test_bit(p_spq->comp_bitmap_idx, + p_spq->p_comp_bitmap)) { + bitmap_clear(p_spq->p_comp_bitmap, + p_spq->comp_bitmap_idx, + SPQ_RING_SIZE); + p_spq->comp_bitmap_idx++; + qed_chain_return_produced(&p_spq->chain); + } + p_spq->comp_count++; found = p_ent; break; } + + /* This is relatively uncommon - depends on scenarios + * which have mutliple per-PF sent ramrods. + */ + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Got completion for echo %04x - doesn't match echo %04x in completion pending list\n", + le16_to_cpu(echo), + le16_to_cpu(p_ent->elem.hdr.echo)); } /* Release lock before callback, as callback may post diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 6a4347639c03..1d1ba2c5ee7a 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -9,6 +9,8 @@ #ifndef __COMMON_HSI__ #define __COMMON_HSI__ +#define CORE_SPQE_PAGE_SIZE_BYTES 4096 + #define FW_MAJOR_VERSION 8 #define FW_MINOR_VERSION 4 #define FW_REVISION_VERSION 2 -- cgit v1.2.3 From d144da8c6f51f48ec39d891ea9dff80169c45f3b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 2 Nov 2015 12:13:25 -0500 Subject: IB/core: use RCU for uverbs id lookup The current implementation gets a spin_lock, and at any scale with qib and hfi1 post send, the lock contention grows exponentially with the number of QPs. idr_find() is RCU compatibile, so read doesn't need the lock. Change to use rcu_read_lock() and rcu_read_unlock() in __idr_get_uobj(). kfree_rcu() is used to insure a grace period between the idr removal and actual free. Reviewed-by: Ira Weiny Signed-off-by: Mike Marciniszyn Reviewed-By: Jason Gunthorpe Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 12 +++++++----- include/rdma/ib_verbs.h | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4cb8e9d9966c..1c02deab068f 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -62,9 +62,11 @@ static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; * The ib_uobject locking scheme is as follows: * * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it - * needs to be held during all idr operations. When an object is + * needs to be held during all idr write operations. When an object is * looked up, a reference must be taken on the object's kref before - * dropping this lock. + * dropping this lock. For read operations, the rcu_read_lock() + * and rcu_write_lock() but similarly the kref reference is grabbed + * before the rcu_read_unlock(). * * - Each object also has an rwsem. This rwsem must be held for * reading while an operation that uses the object is performed. @@ -96,7 +98,7 @@ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, static void release_uobj(struct kref *kref) { - kfree(container_of(kref, struct ib_uobject, ref)); + kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu); } static void put_uobj(struct ib_uobject *uobj) @@ -145,7 +147,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, { struct ib_uobject *uobj; - spin_lock(&ib_uverbs_idr_lock); + rcu_read_lock(); uobj = idr_find(idr, id); if (uobj) { if (uobj->context == context) @@ -153,7 +155,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, else uobj = NULL; } - spin_unlock(&ib_uverbs_idr_lock); + rcu_read_unlock(); return uobj; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9a68a19532ba..120da1d7f57e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1271,6 +1271,7 @@ struct ib_uobject { int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ + struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; -- cgit v1.2.3 From bd5eb35f16a9c55afcf5eb1c920cbbaf09747369 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Dec 2015 08:53:17 -0800 Subject: xfrm: take care of request sockets TCP SYNACK messages might now be attached to request sockets. XFRM needs to get back to a listener socket. Adds new helpers that might be used elsewhere : sk_to_full_sk() and sk_const_to_full_sk() Note: We also need to add RCU protection for xfrm lookups, now TCP/DCCP have lockless listener processing. This will be addressed in separate patches. Fixes: ca6fb0651883 ("tcp: attach SYNACK messages to request sockets instead of listener") Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Steffen Klassert Signed-off-by: David S. Miller --- include/net/inet_sock.h | 27 +++++++++++++++++++++++---- net/xfrm/xfrm_policy.c | 2 ++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 2134e6d815bc..625bdf95d673 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -210,18 +210,37 @@ struct inet_sock { #define IP_CMSG_ORIGDSTADDR BIT(6) #define IP_CMSG_CHECKSUM BIT(7) -/* SYNACK messages might be attached to request sockets. +/** + * sk_to_full_sk - Access to a full socket + * @sk: pointer to a socket + * + * SYNACK messages might be attached to request sockets. * Some places want to reach the listener in this case. */ -static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) +static inline struct sock *sk_to_full_sk(struct sock *sk) { - struct sock *sk = skb->sk; - +#ifdef CONFIG_INET if (sk && sk->sk_state == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; +#endif + return sk; +} + +/* sk_to_full_sk() variant with a const argument */ +static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) +{ +#ifdef CONFIG_INET + if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + sk = ((const struct request_sock *)sk)->rsk_listener; +#endif return sk; } +static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) +{ + return sk_to_full_sk(skb->sk); +} + static inline struct inet_sock *inet_sk(const struct sock *sk) { return (struct inet_sock *)sk; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 09bfcbac63bb..18276f0cc32b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2198,6 +2198,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, xdst = NULL; route = NULL; + sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); @@ -2477,6 +2478,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } pol = NULL; + sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl); if (IS_ERR(pol)) { -- cgit v1.2.3 From 533708867dd6388f643f12c87465b59e732d729d Mon Sep 17 00:00:00 2001 From: Hal Rosenstock Date: Fri, 13 Nov 2015 15:22:22 -0500 Subject: IB/mad: Require CM send method for everything except ClassPortInfo Receipt of CM MAD with other than the Send method for an attribute other than the ClassPortInfo attribute is invalid. CM attributes other than ClassPortInfo only use the send method. The SRP initiator does not maintain a timeout policy for CM connect requests relies on the CM layer to do that. The result was that the SRP initiator hung as the connect request never completed. A new SRP target has been observed to respond to Send CM REQ with GetResp of CM REQ with bad status. This is non conformant with IBA spec but exposes a vulnerability in the current MAD/CM code which will respond to the incoming GetResp of CM REQ as if it was a valid incoming Send of CM REQ rather than tossing this on the floor. It also causes the MAD layer not to retransmit the original REQ even though it has not received a REP. Reviewed-by: Sagi Grimberg Signed-off-by: Hal Rosenstock Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad.c | 5 +++++ include/rdma/ib_mad.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 8d8af7a41a30..2281de122038 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1811,6 +1811,11 @@ static int validate_mad(const struct ib_mad_hdr *mad_hdr, if (qp_num == 0) valid = 1; } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; /* Filter GSI packets sent to QP0 */ if (qp_num != 0) valid = 1; diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 188df91d5851..ec9b44dd3d80 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -237,6 +237,8 @@ struct ib_vendor_mad { u8 data[IB_MGMT_VENDOR_DATA]; }; +#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001) + struct ib_class_port_info { u8 base_version; u8 class_version; -- cgit v1.2.3 From a5e14ba334e202c58e45ef47414ec94c585c1a8c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 28 Oct 2015 13:28:15 +0200 Subject: mlx4: Expose correct max_sge_rd limit mlx4 devices (ConnectX-2, ConnectX-3) has a limitation where rdma read work queue entries cannot exceed 512 bytes. A rdma_read wqe needs to fit in 512 bytes: - wqe control segment (16 bytes) - rdma segment (16 bytes) - scatter elements (16 bytes each) So max_sge_rd should be: (512 - 16 - 16) / 16 = 30. Signed-off-by: Sagi Grimberg Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 2 +- include/linux/mlx4/device.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f567160a4a56..97d6878f9938 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -456,7 +456,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_sge_rd = props->max_sge; + props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7501626ab529..d3133be12d92 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -426,6 +426,17 @@ enum { MLX4_MAX_FAST_REG_PAGES = 511, }; +enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX4_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + enum { MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, -- cgit v1.2.3 From 4c3141e09cfa6460bfcd5e90f73e498db654c917 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Tue, 1 Dec 2015 17:24:17 +0100 Subject: of/irq: Export of_irq_find_parent again of_irq_find_parent was made static since it had no users outside of of_irq.c. Export it again since we are going to use it again. Signed-off-by: Carlo Caione [robh: move of_irq_find_parent to correct ifdef section] Signed-off-by: Rob Herring --- drivers/of/irq.c | 3 ++- include/linux/of_irq.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 902b89be7217..4fa916dffc91 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(irq_of_parse_and_map); * Returns a pointer to the interrupt parent node, or NULL if the interrupt * parent could not be determined. */ -static struct device_node *of_irq_find_parent(struct device_node *child) +struct device_node *of_irq_find_parent(struct device_node *child) { struct device_node *p; const __be32 *parp; @@ -77,6 +77,7 @@ static struct device_node *of_irq_find_parent(struct device_node *child) return p; } +EXPORT_SYMBOL_GPL(of_irq_find_parent); /** * of_irq_parse_raw - Low level interrupt tree parsing diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 039f2eec49ce..f648acf27ed7 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -46,6 +46,7 @@ extern int of_irq_get(struct device_node *dev, int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); +extern struct device_node *of_irq_find_parent(struct device_node *child); extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); @@ -70,6 +71,11 @@ static inline int of_irq_to_resource_table(struct device_node *dev, { return 0; } +static inline void *of_irq_find_parent(struct device_node *child) +{ + return NULL; +} + static inline struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token) -- cgit v1.2.3 From eaddb5725357e9f05ffe5d271630f8197d089da4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 9 Dec 2015 09:11:10 -0600 Subject: of/irq: move of_msi_map_rid declaration to the correct ifdef section In checking fixes for of_irq_find_parent declaration location, I found that of_msi_map_rid is also wrong. of_msi_map_rid is not implemented for Sparc, so it should not be in the Sparc specific section of the header. Move it to just depend on OF_IRQ. Cc: Frank Rowand Signed-off-by: Rob Herring --- include/linux/of_irq.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index f648acf27ed7..1e0deb8e8494 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -53,6 +53,7 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid); extern void of_msi_configure(struct device *dev, struct device_node *np); +u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else static inline int of_irq_count(struct device_node *dev) { @@ -90,6 +91,11 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev static inline void of_msi_configure(struct device *dev, struct device_node *np) { } +static inline u32 of_msi_map_rid(struct device *dev, + struct device_node *msi_np, u32 rid_in) +{ + return rid_in; +} #endif #if defined(CONFIG_OF_IRQ) || defined(CONFIG_SPARC) @@ -99,7 +105,6 @@ static inline void of_msi_configure(struct device *dev, struct device_node *np) * so declare it here regardless of the CONFIG_OF_IRQ setting. */ extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else /* !CONFIG_OF && !CONFIG_SPARC */ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, @@ -107,12 +112,6 @@ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, { return 0; } - -static inline u32 of_msi_map_rid(struct device *dev, - struct device_node *msi_np, u32 rid_in) -{ - return rid_in; -} #endif /* !CONFIG_OF */ #endif /* __OF_IRQ_H */ -- cgit v1.2.3 From d7e35dfa2531b53618b9e6edcd8752ce988ac555 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Dec 2015 22:04:01 -0500 Subject: bitops.h: correctly handle rol32 with 0 byte shift ROL on a 32 bit integer with a shift of 32 or more is undefined and the result is arch-dependent. Avoid this by handling the trivial case of roling by 0 correctly. The trivial solution of checking if shift is 0 breaks gcc's detection of this code as a ROL instruction, which is unacceptable. This bug was reported and fixed in GCC (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157): The standard rotate idiom, (x << n) | (x >> (32 - n)) is recognized by gcc (for concreteness, I discuss only the case that x is an uint32_t here). However, this is portable C only for n in the range 0 < n < 32. For n == 0, we get x >> 32 which gives undefined behaviour according to the C standard (6.5.7, Bitwise shift operators). To portably support n == 0, one has to write the rotate as something like (x << n) | (x >> ((-n) & 31)) And this is apparently not recognized by gcc. Note that this is broken on older GCCs and will result in slower ROL. Acked-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 2b8ed123ad36..defeaac0745f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -107,7 +107,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift) */ static inline __u32 rol32(__u32 word, unsigned int shift) { - return (word << shift) | (word >> (32 - shift)); + return (word << shift) | (word >> ((-shift) & 31)); } /** -- cgit v1.2.3 From ecb7deceff2a51d3be50518969bc06411f485a62 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 9 Dec 2015 10:18:10 +0200 Subject: dmaengine: edma: DT: Change memcpy channel array from 16bit to 32bit type This change makes the DT file to be easier to read since the memcpy channels array does not need the '/bits/ 16' to be specified, which might confuse some people. Signed-off-by: Peter Ujfalusi Acked-by: Arnd Bergmann Acked-by: Rob Herring Acked-by: Tony Lindgren Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/ti-edma.txt | 5 ++--- drivers/dma/edma.c | 22 ++++++++++------------ include/linux/platform_data/edma.h | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/dma/ti-edma.txt b/Documentation/devicetree/bindings/dma/ti-edma.txt index d3d0a4fb1c73..ae8b8f1d6e69 100644 --- a/Documentation/devicetree/bindings/dma/ti-edma.txt +++ b/Documentation/devicetree/bindings/dma/ti-edma.txt @@ -22,8 +22,7 @@ Required properties: Optional properties: - ti,hwmods: Name of the hwmods associated to the eDMA CC - ti,edma-memcpy-channels: List of channels allocated to be used for memcpy, iow - these channels will be SW triggered channels. The list must - contain 16 bits numbers, see example. + these channels will be SW triggered channels. See example. - ti,edma-reserved-slot-ranges: PaRAM slot ranges which should not be used by the driver, they are allocated to be used by for example the DSP. See example. @@ -56,7 +55,7 @@ edma: edma@49000000 { ti,tptcs = <&edma_tptc0 7>, <&edma_tptc1 7>, <&edma_tptc2 0>; /* Channel 20 and 21 is allocated for memcpy */ - ti,edma-memcpy-channels = /bits/ 16 <20 21>; + ti,edma-memcpy-channels = <20 21>; /* The following PaRAM slots are reserved: 35-45 and 100-110 */ ti,edma-reserved-slot-ranges = /bits/ 16 <35 10>, /bits/ 16 <100 10>; diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c index 6b03e4e84e6b..3da20291db56 100644 --- a/drivers/dma/edma.c +++ b/drivers/dma/edma.c @@ -1752,16 +1752,14 @@ static enum dma_status edma_tx_status(struct dma_chan *chan, return ret; } -static bool edma_is_memcpy_channel(int ch_num, u16 *memcpy_channels) +static bool edma_is_memcpy_channel(int ch_num, s32 *memcpy_channels) { - s16 *memcpy_ch = memcpy_channels; - if (!memcpy_channels) return false; - while (*memcpy_ch != -1) { - if (*memcpy_ch == ch_num) + while (*memcpy_channels != -1) { + if (*memcpy_channels == ch_num) return true; - memcpy_ch++; + memcpy_channels++; } return false; } @@ -1775,7 +1773,7 @@ static void edma_dma_init(struct edma_cc *ecc, bool legacy_mode) { struct dma_device *s_ddev = &ecc->dma_slave; struct dma_device *m_ddev = NULL; - s16 *memcpy_channels = ecc->info->memcpy_channels; + s32 *memcpy_channels = ecc->info->memcpy_channels; int i, j; dma_cap_zero(s_ddev->cap_mask); @@ -1996,16 +1994,16 @@ static struct edma_soc_info *edma_setup_info_from_dt(struct device *dev, prop = of_find_property(dev->of_node, "ti,edma-memcpy-channels", &sz); if (prop) { const char pname[] = "ti,edma-memcpy-channels"; - size_t nelm = sz / sizeof(s16); - s16 *memcpy_ch; + size_t nelm = sz / sizeof(s32); + s32 *memcpy_ch; - memcpy_ch = devm_kcalloc(dev, nelm + 1, sizeof(s16), + memcpy_ch = devm_kcalloc(dev, nelm + 1, sizeof(s32), GFP_KERNEL); if (!memcpy_ch) return ERR_PTR(-ENOMEM); - ret = of_property_read_u16_array(dev->of_node, pname, - (u16 *)memcpy_ch, nelm); + ret = of_property_read_u32_array(dev->of_node, pname, + (u32 *)memcpy_ch, nelm); if (ret) return ERR_PTR(ret); diff --git a/include/linux/platform_data/edma.h b/include/linux/platform_data/edma.h index e2878baeb90e..4299f4ba03bd 100644 --- a/include/linux/platform_data/edma.h +++ b/include/linux/platform_data/edma.h @@ -72,7 +72,7 @@ struct edma_soc_info { struct edma_rsv_info *rsv; /* List of channels allocated for memcpy, terminated with -1 */ - s16 *memcpy_channels; + s32 *memcpy_channels; s8 (*queue_priority_mapping)[2]; const s16 (*xbar_chans)[2]; -- cgit v1.2.3 From 633c9a840d0bf1cce690f3165bdacd8ab412949e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Dec 2015 12:08:26 +0100 Subject: netfilter: nfnetlink: avoid recurrent netns lookups in call_batch Pass the net pointer to the call_batch callback functions so we can skip recurrent lookups. Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez --- include/linux/netfilter/nfnetlink.h | 2 +- net/netfilter/nf_tables_api.c | 96 +++++++++++++++++-------------------- net/netfilter/nfnetlink.c | 2 +- 3 files changed, 47 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 249d1bb01e03..5646b24bfc64 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -14,7 +14,7 @@ struct nfnl_callback { int (*call_rcu)(struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); - int (*call_batch)(struct sock *nl, struct sk_buff *skb, + int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); const struct nla_policy *policy; /* netlink attribute policy */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 93cc4737018f..f1002dcfa1c9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -89,6 +89,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) } static void nft_ctx_init(struct nft_ctx *ctx, + struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, struct nft_af_info *afi, @@ -96,7 +97,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, struct nft_chain *chain, const struct nlattr * const *nla) { - ctx->net = sock_net(skb->sk); + ctx->net = net; ctx->afi = afi; ctx->table = table; ctx->chain = chain; @@ -672,15 +673,14 @@ err: return ret; } -static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; u32 flags = 0; struct nft_ctx ctx; @@ -706,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); return nf_tables_updtable(&ctx); } @@ -730,7 +730,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&table->sets); table->flags = flags; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err3; @@ -810,18 +810,17 @@ out: return err; } -static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_deltable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; - nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla); if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) return nft_flush(&ctx, family); @@ -1221,8 +1220,8 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) } } -static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -1232,7 +1231,6 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct net_device *dev = NULL; u8 policy = NF_ACCEPT; @@ -1313,7 +1311,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(stats); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, sizeof(struct nft_trans_chain)); if (trans == NULL) { @@ -1461,7 +1459,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (err < 0) goto err1; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); if (err < 0) goto err2; @@ -1476,15 +1474,14 @@ err1: return err; } -static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; @@ -1506,7 +1503,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, if (chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); return nft_delchain(&ctx); } @@ -2010,13 +2007,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, static struct nft_expr_info *info; -static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2075,7 +2071,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(old_rule); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); n = 0; size = 0; @@ -2176,13 +2172,12 @@ err1: return err; } -static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; @@ -2205,7 +2200,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(chain); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -2344,12 +2339,11 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; struct nft_table *table = NULL; @@ -2371,7 +2365,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, return -ENOENT; } - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -2623,6 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; @@ -2630,7 +2625,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -2693,14 +2688,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, return 0; } -static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -2798,7 +2792,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) { @@ -2882,8 +2876,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set nft_set_destroy(set); } -static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2896,7 +2890,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -3024,7 +3018,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -3033,7 +3027,6 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); if (IS_ERR(afi)) @@ -3045,7 +3038,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, if (!trans && (table->flags & NFT_TABLE_INACTIVE)) return -ENOENT; - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -3135,6 +3128,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3150,8 +3144,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, - false); + err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, + (void *)nla, false); if (err < 0) return err; @@ -3212,11 +3206,12 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false); if (err < 0) return err; @@ -3528,11 +3523,10 @@ err1: return err; } -static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3541,7 +3535,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, true); if (err < 0) return err; @@ -3623,8 +3617,8 @@ err1: return err; } -static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nlattr *attr; @@ -3635,7 +3629,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false); if (err < 0) return err; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 46453ab318db..445590f2c673 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -381,7 +381,7 @@ replay: goto ack; if (nc->call_batch) { - err = nc->call_batch(net->nfnl, skb, nlh, + err = nc->call_batch(net, net->nfnl, skb, nlh, (const struct nlattr **)cda); } -- cgit v1.2.3 From 059393c5bdd1420bdf1bed2972f33196dff263ae Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 7 Dec 2015 10:11:11 +0000 Subject: irqchip/gic-v3: Add missing struct device_node declaration When the GICv3 header file is used in a C file that doesn't include any of the OF stuff, we end up with a bunch of ugly warnings. Let's keep GCC quiet by adding a forward declaration. Signed-off-by: Marc Zyngier Cc: Cc: Jason Cooper Link: http://lkml.kernel.org/r/1449483072-17694-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqchip/arm-gic-v3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c9ae0c6ec050..d5d798b35c1f 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -330,6 +330,7 @@ struct rdists { }; struct irq_domain; +struct device_node; int its_cpu_init(void); int its_init(struct device_node *node, struct rdists *rdists, struct irq_domain *domain); -- cgit v1.2.3 From 5e1033561da1152c57b97ee84371dba2b3d64c25 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Fri, 11 Dec 2015 09:16:38 -0800 Subject: ses: fix additional element traversal bug KASAN found that our additional element processing scripts drop off the end of the VPD page into unallocated space. The reason is that not every element has additional information but our traversal routines think they do, leading to them expecting far more additional information than is present. Fix this by adding a gate to the traversal routine so that it only processes elements that are expected to have additional information (list is in SES-2 section 6.1.13.1: Additional Element Status diagnostic page overview) Reported-by: Pavel Tikhomirov Tested-by: Pavel Tikhomirov Cc: stable@vger.kernel.org Signed-off-by: James Bottomley --- drivers/scsi/ses.c | 10 +++++++++- include/linux/enclosure.h | 4 ++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 7d9cec50b77d..044d06410d4c 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -559,7 +559,15 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, if (desc_ptr) desc_ptr += len; - if (addl_desc_ptr) + if (addl_desc_ptr && + /* only find additional descriptions for specific devices */ + (type_ptr[0] == ENCLOSURE_COMPONENT_DEVICE || + type_ptr[0] == ENCLOSURE_COMPONENT_ARRAY_DEVICE || + type_ptr[0] == ENCLOSURE_COMPONENT_SAS_EXPANDER || + /* these elements are optional */ + type_ptr[0] == ENCLOSURE_COMPONENT_SCSI_TARGET_PORT || + type_ptr[0] == ENCLOSURE_COMPONENT_SCSI_INITIATOR_PORT || + type_ptr[0] == ENCLOSURE_COMPONENT_CONTROLLER_ELECTRONICS)) addl_desc_ptr += addl_desc_ptr[1] + 2; } diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 7be22da321f3..a4cf57cd0f75 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -29,7 +29,11 @@ /* A few generic types ... taken from ses-2 */ enum enclosure_component_type { ENCLOSURE_COMPONENT_DEVICE = 0x01, + ENCLOSURE_COMPONENT_CONTROLLER_ELECTRONICS = 0x07, + ENCLOSURE_COMPONENT_SCSI_TARGET_PORT = 0x14, + ENCLOSURE_COMPONENT_SCSI_INITIATOR_PORT = 0x15, ENCLOSURE_COMPONENT_ARRAY_DEVICE = 0x17, + ENCLOSURE_COMPONENT_SAS_EXPANDER = 0x18, }; /* ses-2 common element status */ -- cgit v1.2.3 From ad87e03213b552a5c33d5e1e7a19a73768397010 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 10 Dec 2015 15:27:21 -0500 Subject: USB: add quirk for devices with broken LPM Some USB device / host controller combinations seem to have problems with Link Power Management. For example, Steinar found that his xHCI controller wouldn't handle bandwidth calculations correctly for two video cards simultaneously when LPM was enabled, even though the bus had plenty of bandwidth available. This patch introduces a new quirk flag for devices that should remain disabled for LPM, and creates quirk entries for Steinar's devices. Signed-off-by: Alan Stern Reported-by: Steinar H. Gunderson Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 7 ++++++- drivers/usb/core/quirks.c | 6 ++++++ include/linux/usb/quirks.h | 3 +++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 585c3cb07da6..a5cc032ef77a 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -124,6 +124,10 @@ struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev) int usb_device_supports_lpm(struct usb_device *udev) { + /* Some devices have trouble with LPM */ + if (udev->quirks & USB_QUIRK_NO_LPM) + return 0; + /* USB 2.1 (and greater) devices indicate LPM support through * their USB 2.0 Extended Capabilities BOS descriptor. */ @@ -4512,6 +4516,8 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, goto fail; } + usb_detect_quirks(udev); + if (udev->wusb == 0 && le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0201) { retval = usb_get_bos_descriptor(udev); if (!retval) { @@ -4710,7 +4716,6 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, if (status < 0) goto loop; - usb_detect_quirks(udev); if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(1000); diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index fcd6ac0c667f..6dc810bce295 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -202,6 +202,12 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, + /* Blackmagic Design Intensity Shuttle */ + { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, + + /* Blackmagic Design UltraStudio SDI */ + { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, + { } /* terminating entry must be last */ }; diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 9948c874e3f1..1d0043dc34e4 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -47,4 +47,7 @@ /* device generates spurious wakeup, ignore remote wakeup capability */ #define USB_QUIRK_IGNORE_REMOTE_WAKEUP BIT(9) +/* device can't handle Link Power Management */ +#define USB_QUIRK_NO_LPM BIT(10) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From 56f047305dd4b6b61771ac4f523718e4111052a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Dec 2015 07:22:01 -0800 Subject: xfrm: add rcu grace period in xfrm_policy_destroy() We will soon switch sk->sk_policy[] to RCU protection, as SYNACK packets are sent while listener socket is not locked. This patch simply adds RCU grace period before struct xfrm_policy freeing, and the corresponding rcu_head in struct xfrm_policy. Signed-off-by: Eric Dumazet Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/xfrm.h | 1 + net/xfrm/xfrm_policy.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4a9c21f9b4ea..8bae1ef647cd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -548,6 +548,7 @@ struct xfrm_policy { u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; + struct rcu_head rcu; }; static inline struct net *xp_net(const struct xfrm_policy *xp) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 18276f0cc32b..f57a5712cedd 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -303,6 +303,14 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) } EXPORT_SYMBOL(xfrm_policy_alloc); +static void xfrm_policy_destroy_rcu(struct rcu_head *head) +{ + struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); + + security_xfrm_policy_free(policy->security); + kfree(policy); +} + /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) @@ -312,8 +320,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); - security_xfrm_policy_free(policy->security); - kfree(policy); + call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); -- cgit v1.2.3 From d188ba86dd07a72ebebfa22fe9cb0b0572e57740 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Dec 2015 07:22:02 -0800 Subject: xfrm: add rcu protection to sk->sk_policy[] XFRM can deal with SYNACK messages, sent while listener socket is not locked. We add proper rcu protection to __xfrm_sk_clone_policy() and xfrm_sk_policy_lookup() This might serve as the first step to remove xfrm.xfrm_policy_lock use in fast path. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- include/net/xfrm.h | 24 +++++++++++++++--------- net/core/sock.c | 2 +- net/xfrm/xfrm_policy.c | 37 +++++++++++++++++++++++++------------ 4 files changed, 42 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index b1d475b5db68..eaef41433d7a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -388,7 +388,7 @@ struct sock { struct socket_wq *sk_wq_raw; }; #ifdef CONFIG_XFRM - struct xfrm_policy *sk_policy[2]; + struct xfrm_policy __rcu *sk_policy[2]; #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8bae1ef647cd..d6f6e5006ee9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1142,12 +1142,14 @@ static inline int xfrm6_route_forward(struct sk_buff *skb) return xfrm_route_forward(skb, AF_INET6); } -int __xfrm_sk_clone_policy(struct sock *sk); +int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk); -static inline int xfrm_sk_clone_policy(struct sock *sk) +static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { - if (unlikely(sk->sk_policy[0] || sk->sk_policy[1])) - return __xfrm_sk_clone_policy(sk); + sk->sk_policy[0] = NULL; + sk->sk_policy[1] = NULL; + if (unlikely(osk->sk_policy[0] || osk->sk_policy[1])) + return __xfrm_sk_clone_policy(sk, osk); return 0; } @@ -1155,12 +1157,16 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir); static inline void xfrm_sk_free_policy(struct sock *sk) { - if (unlikely(sk->sk_policy[0] != NULL)) { - xfrm_policy_delete(sk->sk_policy[0], XFRM_POLICY_MAX); + struct xfrm_policy *pol; + + pol = rcu_dereference_protected(sk->sk_policy[0], 1); + if (unlikely(pol != NULL)) { + xfrm_policy_delete(pol, XFRM_POLICY_MAX); sk->sk_policy[0] = NULL; } - if (unlikely(sk->sk_policy[1] != NULL)) { - xfrm_policy_delete(sk->sk_policy[1], XFRM_POLICY_MAX+1); + pol = rcu_dereference_protected(sk->sk_policy[1], 1); + if (unlikely(pol != NULL)) { + xfrm_policy_delete(pol, XFRM_POLICY_MAX+1); sk->sk_policy[1] = NULL; } } @@ -1170,7 +1176,7 @@ void xfrm_garbage_collect(struct net *net); #else static inline void xfrm_sk_free_policy(struct sock *sk) {} -static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; } +static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; } static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) diff --git a/net/core/sock.c b/net/core/sock.c index d01c8f42dbb2..765be835b06c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1550,7 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ is_charged = sk_filter_charge(newsk, filter); - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f57a5712cedd..948fa5560de5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1221,8 +1221,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, struct xfrm_policy *pol; struct net *net = sock_net(sk); + rcu_read_lock(); read_lock_bh(&net->xfrm.xfrm_policy_lock); - if ((pol = sk->sk_policy[dir]) != NULL) { + pol = rcu_dereference(sk->sk_policy[dir]); + if (pol != NULL) { bool match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); int err = 0; @@ -1246,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, } out: read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return pol; } @@ -1314,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) #endif write_lock_bh(&net->xfrm.xfrm_policy_lock); - old_pol = sk->sk_policy[dir]; - sk->sk_policy[dir] = pol; + old_pol = rcu_dereference_protected(sk->sk_policy[dir], + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = get_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } + rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); @@ -1368,17 +1372,26 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) return newp; } -int __xfrm_sk_clone_policy(struct sock *sk) +int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { - struct xfrm_policy *p0 = sk->sk_policy[0], - *p1 = sk->sk_policy[1]; + const struct xfrm_policy *p; + struct xfrm_policy *np; + int i, ret = 0; - sk->sk_policy[0] = sk->sk_policy[1] = NULL; - if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL) - return -ENOMEM; - if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL) - return -ENOMEM; - return 0; + rcu_read_lock(); + for (i = 0; i < 2; i++) { + p = rcu_dereference(osk->sk_policy[i]); + if (p) { + np = clone_policy(p, i); + if (unlikely(!np)) { + ret = -ENOMEM; + break; + } + rcu_assign_pointer(sk->sk_policy[i], np); + } + } + rcu_read_unlock(); + return ret; } static int -- cgit v1.2.3 From f7fc6bc414121954c45c5f18b70e2a8717d0d5b4 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 10 Dec 2015 09:14:20 -0800 Subject: uapi: export ila.h The file ila.h used for lightweight tunnels is being used by iproute2 but is not exported yet. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 628e6e64c2fb..c2e5d6cb34e3 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -186,6 +186,7 @@ header-y += if_tunnel.h header-y += if_vlan.h header-y += if_x25.h header-y += igmp.h +header-y += ila.h header-y += in6.h header-y += inet_diag.h header-y += in.h -- cgit v1.2.3 From 98e89cf02aed11166698dd53c6f14865613babb3 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Fri, 11 Dec 2015 13:40:43 -0800 Subject: mm: kmemleak: mark kmemleak_init prototype as __init The kmemleak_init() definition in mm/kmemleak.c is marked __init but its prototype in include/linux/kmemleak.h is marked __ref since commit a6186d89c913 ("kmemleak: Mark the early log buffer as __initdata"). This causes a section mismatch which is reported as a warning when building with clang -Wsection, because kmemleak_init() is declared in section .ref.text but defined in .init.text. Fix this by marking kmemleak_init() prototype __init. Signed-off-by: Nicolas Iooss Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index d0a1f99e24e3..4894c6888bc6 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -25,7 +25,7 @@ #ifdef CONFIG_DEBUG_KMEMLEAK -extern void kmemleak_init(void) __ref; +extern void kmemleak_init(void) __init; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, -- cgit v1.2.3 From 86fffe4a61dd972d5a4e23260d530be6da02f614 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 11 Dec 2015 13:40:46 -0800 Subject: kernel: remove stop_machine() Kconfig dependency Currently the full stop_machine() routine is only enabled on SMP if module unloading is enabled, or if the CPUs are hotpluggable. This leads to configurations where stop_machine() is broken as it will then only run the callback on the local CPU with irqs disabled, and not stop the other CPUs or run the callback on them. For example, this breaks MTRR setup on x86 in certain configs since ea8596bb2d8d379 ("kprobes/x86: Remove unused text_poke_smp() and text_poke_smp_batch() functions") as the MTRR is only established on the boot CPU. This patch removes the Kconfig option for STOP_MACHINE and uses the SMP and HOTPLUG_CPU config options to compile the correct stop_machine() for the architecture, removing the false dependency on MODULE_UNLOAD in the process. Link: https://lkml.org/lkml/2014/10/8/124 References: https://bugs.freedesktop.org/show_bug.cgi?id=84794 Signed-off-by: Chris Wilson Acked-by: Ingo Molnar Cc: "Paul E. McKenney" Cc: Pranith Kumar Cc: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: H. Peter Anvin Cc: Tejun Heo Cc: Iulia Manda Cc: Andy Lutomirski Cc: Rusty Russell Cc: Peter Zijlstra Cc: Chuck Ebbert Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stop_machine.h | 6 +++--- init/Kconfig | 7 ------- kernel/stop_machine.c | 4 ++-- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0adedca24c5b..0e1b1540597a 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preemption. */ -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -118,7 +118,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); -#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ static inline int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) @@ -137,5 +137,5 @@ static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return stop_machine(fn, data, cpus); } -#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/init/Kconfig b/init/Kconfig index c24b6f767bf0..235c7a2c0d20 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2030,13 +2030,6 @@ config INIT_ALL_POSSIBLE it was better to provide this option than to break all the archs and have several arch maintainers pursuing me down dark alleys. -config STOP_MACHINE - bool - default y - depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU - help - Need stop_machine() primitive. - source "block/Kconfig" config PREEMPT_NOTIFIERS diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20e1ef1..a3bbaee77c58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -531,7 +531,7 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { @@ -631,4 +631,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From dfd01f026058a59a513f8a365b439a0681b803af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 13 Dec 2015 22:11:16 +0100 Subject: sched/wait: Fix the signal handling fix Jan Stancek reported that I wrecked things for him by fixing things for Vladimir :/ His report was due to an UNINTERRUPTIBLE wait getting -EINTR, which should not be possible, however my previous patch made this possible by unconditionally checking signal_pending(). We cannot use current->state as was done previously, because the instruction after the store to that variable it can be changed. We must instead pass the initial state along and use that. Fixes: 68985633bccb ("sched/wait: Fix signal handling in bit wait helpers") Reported-by: Jan Stancek Reported-by: Chris Mason Tested-by: Jan Stancek Tested-by: Vladimir Murzin Tested-by: Chris Mason Reviewed-by: Paul Turner Cc: Ingo Molnar Cc: tglx@linutronix.de Cc: Oleg Nesterov Cc: hpa@zytor.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- fs/cifs/inode.c | 6 +++--- fs/nfs/inode.c | 6 +++--- fs/nfs/internal.h | 2 +- fs/nfs/pagelist.c | 2 +- fs/nfs/pnfs.c | 4 ++-- include/linux/wait.h | 10 +++++----- kernel/sched/wait.c | 20 ++++++++++---------- net/sunrpc/sched.c | 6 +++--- 8 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6b66dd5d1540..a329f5ba35aa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1831,11 +1831,11 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(struct wait_bit_key *key) +cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 31b0a52223a7..c7e8b87da5b2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,11 +75,11 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * @word: long word containing the bit lock */ -int nfs_wait_bit_killable(struct wait_bit_key *key) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 56cfde26fb9c..9dea85f7f918 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -379,7 +379,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(struct wait_bit_key *key); +extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index fe3ddd20ff89..452a011ba0d8 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -129,7 +129,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&q.key); + ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE); } while (atomic_read(&c->io_count) != 0 && !ret); finish_wait(wq, &q.wait); return ret; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5a8ae2125b50..bec0384499f7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1466,11 +1466,11 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, } /* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) +static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) { if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) return 1; - return nfs_wait_bit_killable(key); + return nfs_wait_bit_killable(key, mode); } static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..513b36f04dfd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -145,7 +145,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) list_del(&old->task_list); } -typedef int wait_bit_action_f(struct wait_bit_key *); +typedef int wait_bit_action_f(struct wait_bit_key *, int mode); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); @@ -960,10 +960,10 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); } while (0) -extern int bit_wait(struct wait_bit_key *); -extern int bit_wait_io(struct wait_bit_key *); -extern int bit_wait_timeout(struct wait_bit_key *); -extern int bit_wait_io_timeout(struct wait_bit_key *); +extern int bit_wait(struct wait_bit_key *, int); +extern int bit_wait_io(struct wait_bit_key *, int); +extern int bit_wait_timeout(struct wait_bit_key *, int); +extern int bit_wait_io_timeout(struct wait_bit_key *, int); /** * wait_on_bit - wait for a bit to be cleared diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f10bd873e684..f15d6b6a538a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,43 +581,43 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { io_schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f14f24ee9983..73ad57a59989 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(struct wait_bit_key *key) +static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } -- cgit v1.2.3 From e5f5d74747afa799bff109644be04b00af36043e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 14 Dec 2015 14:29:58 +0100 Subject: openvswitch: fix trivial comment typo The commit 33db4125ec74 ("openvswitch: Rename LABEL->LABELS") left over an old OVS_CT_ATTR_LABEL instance, fix it. Fixes: 33db4125ec74 ("openvswitch: Rename LABEL->LABELS") Signed-off-by: Paolo Abeni Acked-by: Joe Stringer Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 28ccedd000f5..a27222d5b413 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -628,7 +628,7 @@ struct ovs_action_hash { * @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the * mask, the corresponding bit in the value is copied to the connection * tracking mark field in the connection. - * @OVS_CT_ATTR_LABEL: %OVS_CT_LABELS_LEN value followed by %OVS_CT_LABELS_LEN + * @OVS_CT_ATTR_LABELS: %OVS_CT_LABELS_LEN value followed by %OVS_CT_LABELS_LEN * mask. For each bit set in the mask, the corresponding bit in the value is * copied to the connection tracking label field in the connection. * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. -- cgit v1.2.3 From 79462ad02e861803b3840cc782248c7359451cd9 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 14 Dec 2015 22:03:39 +0100 Subject: net: add validation for the socket syscall protocol argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 郭永刚 reported that one could simply crash the kernel as root by using a simple program: int socket_fd; struct sockaddr_in addr; addr.sin_port = 0; addr.sin_addr.s_addr = INADDR_ANY; addr.sin_family = 10; socket_fd = socket(10,3,0x40000000); connect(socket_fd , &addr,16); AF_INET, AF_INET6 sockets actually only support 8-bit protocol identifiers. inet_sock's skc_protocol field thus is sized accordingly, thus larger protocol identifiers simply cut off the higher bits and store a zero in the protocol fields. This could lead to e.g. NULL function pointer because as a result of the cut off inet_num is zero and we call down to inet_autobind, which is NULL for raw sockets. kernel: Call Trace: kernel: [] ? inet_autobind+0x2e/0x70 kernel: [] inet_dgram_connect+0x54/0x80 kernel: [] SYSC_connect+0xd9/0x110 kernel: [] ? ptrace_notify+0x5b/0x80 kernel: [] ? syscall_trace_enter_phase2+0x108/0x200 kernel: [] SyS_connect+0xe/0x10 kernel: [] tracesys_phase2+0x84/0x89 I found no particular commit which introduced this problem. CVE: CVE-2015-8543 Cc: Cong Wang Reported-by: 郭永刚 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/ax25/af_ax25.c | 3 +++ net/decnet/af_decnet.c | 3 +++ net/ipv4/af_inet.c | 3 +++ net/ipv6/af_inet6.c | 3 +++ net/irda/af_irda.c | 3 +++ 6 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index eaef41433d7a..c4205e0a3a2d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -403,6 +403,7 @@ struct sock { sk_no_check_rx : 1, sk_userlocks : 4, sk_protocol : 8, +#define SK_PROTOCOL_MAX U8_MAX sk_type : 16; kmemcheck_bitfield_end(flags); int sk_wmem_queued; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ae3a47f9d1d5..fbd0acf80b13 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -805,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index eebf5ac8ce18..13d6b1a6e0fc 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 11c4ca13ec3b..5c5db6636704 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -257,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8ec0df75f1c4..9f5137cd604e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index e6aa48b5395c..923abd6b3064 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; struct irda_sock *self; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (net != &init_net) return -EAFNOSUPPORT; -- cgit v1.2.3 From 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Dec 2015 14:08:53 -0800 Subject: net: fix IP early demux races David Wilder reported crashes caused by dst reuse. I am seeing a crash on a distro V4.2.3 kernel caused by a double release of a dst_entry. In ipv4_dst_destroy() the call to list_empty() finds a poisoned next pointer, indicating the dst_entry has already been removed from the list and freed. The crash occurs 18 to 24 hours into a run of a network stress exerciser. Thanks to his detailed report and analysis, we were able to understand the core issue. IP early demux can associate a dst to skb, after a lookup in TCP/UDP sockets. When socket cache is not properly set, we want to store into sk->sk_dst_cache the dst for future IP early demux lookups, by acquiring a stable refcount on the dst. Problem is this acquisition is simply using an atomic_inc(), which works well, unless the dst was queued for destruction from dst_release() noticing dst refcount went to zero, if DST_NOCACHE was set on dst. We need to make sure current refcount is not zero before incrementing it, or risk double free as David reported. This patch, being a stable candidate, adds two new helpers, and use them only from IP early demux problematic paths. It might be possible to merge in net-next skb_dst_force() and skb_dst_force_safe(), but I prefer having the smallest patch for stable kernels : Maybe some skb_dst_force() callers do not expect skb->dst can suddenly be cleared. Can probably be backported back to linux-3.6 kernels Reported-by: David J. Wilder Tested-by: David J. Wilder Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 33 +++++++++++++++++++++++++++++++++ include/net/sock.h | 2 +- net/ipv4/tcp_ipv4.c | 5 ++--- net/ipv6/tcp_ipv6.c | 3 +-- 4 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 1279f9b09791..c7329dcd90cc 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -322,6 +322,39 @@ static inline void skb_dst_force(struct sk_buff *skb) } } +/** + * dst_hold_safe - Take a reference on a dst if possible + * @dst: pointer to dst entry + * + * This helper returns false if it could not safely + * take a reference on a dst. + */ +static inline bool dst_hold_safe(struct dst_entry *dst) +{ + if (dst->flags & DST_NOCACHE) + return atomic_inc_not_zero(&dst->__refcnt); + dst_hold(dst); + return true; +} + +/** + * skb_dst_force_safe - makes sure skb dst is refcounted + * @skb: buffer + * + * If dst is not yet refcounted and not destroyed, grab a ref on it. + */ +static inline void skb_dst_force_safe(struct sk_buff *skb) +{ + if (skb_dst_is_noref(skb)) { + struct dst_entry *dst = skb_dst(skb); + + if (!dst_hold_safe(dst)) + dst = NULL; + + skb->_skb_refdst = (unsigned long)dst; + } +} + /** * __skb_tunnel_rx - prepare skb for rx reinsert diff --git a/include/net/sock.h b/include/net/sock.h index c4205e0a3a2d..28790fe18206 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -817,7 +817,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force(skb); + skb_dst_force_safe(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index db003438aaf5..d8841a2f1569 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1493,7 +1493,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (likely(sk->sk_rx_dst)) skb_dst_drop(skb); else - skb_dst_force(skb); + skb_dst_force_safe(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; @@ -1721,8 +1721,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { - dst_hold(dst); + if (dst && dst_hold_safe(dst)) { sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e7aab561b7b4..6b8a8a9091fa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { + if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); -- cgit v1.2.3 From 887dc9f2cef6e98dcccf807da5e6faf4f60ba483 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Dec 2015 20:56:44 -0800 Subject: inet: tcp: fix inetpeer_set_addr_v4() David Ahern added a vif field in the a4 part of inetpeer_addr struct. This broke IPv4 TCP fast open client side and more generally tcp metrics cache, because inetpeer_addr_cmp() is now comparing two u32 instead of one. inetpeer_set_addr_v4() needs to properly init vif field, otherwise the comparison result depends on uninitialized data. Fixes: 192132b9a034 ("net: Add support for VRFs to inetpeer cache") Reported-by: Yuchung Cheng Signed-off-by: Eric Dumazet Cc: Neal Cardwell Signed-off-by: David S. Miller --- include/net/inetpeer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 4a6009d4486b..235c7811a86a 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -78,6 +78,7 @@ void inet_initpeers(void) __init; static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { iaddr->a4.addr = ip; + iaddr->a4.vif = 0; iaddr->family = AF_INET; } -- cgit v1.2.3 From 7bbadd2d1009575dad675afc16650ebb5aa10612 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 14 Dec 2015 23:30:43 +0100 Subject: net: fix warnings in 'make htmldocs' by moving macro definition out of field declaration Docbook does not like the definition of macros inside a field declaration and adds a warning. Move the definition out. Fixes: 79462ad02e86180 ("net: add validation for the socket syscall protocol argument") Reported-by: kbuild test robot Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 28790fe18206..14d3c0734007 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -403,8 +403,8 @@ struct sock { sk_no_check_rx : 1, sk_userlocks : 4, sk_protocol : 8, -#define SK_PROTOCOL_MAX U8_MAX sk_type : 16; +#define SK_PROTOCOL_MAX U8_MAX kmemcheck_bitfield_end(flags); int sk_wmem_queued; gfp_t sk_allocation; -- cgit v1.2.3 From 454d5d882c7e412b840e3c99010fe81a9862f6fb Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 30 Oct 2015 14:58:08 +0000 Subject: xen: Add RING_COPY_REQUEST() Using RING_GET_REQUEST() on a shared ring is easy to use incorrectly (i.e., by not considering that the other end may alter the data in the shared ring while it is being inspected). Safe usage of a request generally requires taking a local copy. Provide a RING_COPY_REQUEST() macro to use instead of RING_GET_REQUEST() and an open-coded memcpy(). This takes care of ensuring that the copy is done correctly regardless of any possible compiler optimizations. Use a volatile source to prevent the compiler from reordering or omitting the copy. This is part of XSA155. CC: stable@vger.kernel.org Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/interface/io/ring.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 7d28aff605c7..7dc685b4057d 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -181,6 +181,20 @@ struct __name##_back_ring { \ #define RING_GET_REQUEST(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req)) +/* + * Get a local copy of a request. + * + * Use this in preference to RING_GET_REQUEST() so all processing is + * done on a local copy that cannot be modified by the other end. + * + * Note that https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 may cause this + * to be ineffective where _req is a struct which consists of only bitfields. + */ +#define RING_COPY_REQUEST(_r, _idx, _req) do { \ + /* Use volatile to force the copy into _req. */ \ + *(_req) = *(volatile typeof(_req))RING_GET_REQUEST(_r, _idx); \ +} while (0) + #define RING_GET_RESPONSE(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) -- cgit v1.2.3 From 1d5cda4076d930d6d52088ed2c7753f7c564cbd7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Dec 2015 14:22:07 -0800 Subject: include/linux/mmdebug.h: should include linux/bug.h mmdebug.h uses BUILD_BUG_ON_INVALID(), assuming someone else included linux/bug.h. Include it ourselves. This saves build-failures such as: arch/arm64/include/asm/pgtable.h: In function 'set_pte_at': arch/arm64/include/asm/pgtable.h:281:3: error: implicit declaration of function 'BUILD_BUG_ON_INVALID' [-Werror=implicit-function-declaration] VM_WARN_ONCE(!pte_young(pte), Fixes: 02602a18c32d7 ("bug: completely remove code generated by disabled VM_BUG_ON()") Signed-off-by: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 877ef226f90f..772362adf471 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -1,6 +1,7 @@ #ifndef LINUX_MM_DEBUG_H #define LINUX_MM_DEBUG_H 1 +#include #include struct page; -- cgit v1.2.3