From 5bfd126e80dca70431aef8fdbc1cf14535f3c338 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 15 Apr 2014 13:49:04 +0200 Subject: sched/deadline: Fix sched_yield() behavior yield_task_dl() is broken: o it forces current to be throttled setting its runtime to zero; o it sets current's dl_se->dl_new to one, expecting that dl_task_timer() will queue it back with proper parameters at replenish time. Unfortunately, dl_task_timer() has this check at the very beginning: if (!dl_task(p) || dl_se->dl_new) goto unlock; So, it just bails out and the task is never replenished. It actually yielded forever. To fix this, introduce a new flag indicating that the task properly yielded the CPU before its current runtime expired. While this is a little overdoing at the moment, the flag would be useful in the future to discriminate between "good" jobs (of which remaining runtime could be reclaimed, i.e. recycled) and "bad" jobs (for which dl_throttled task has been set) that needed to be stopped. Reported-by: yjay.kim Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140429103953.e68eba1b2ac3309214e3dc5a@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 25f54c79f757..2a4298fb0d85 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1153,9 +1153,12 @@ struct sched_dl_entity { * * @dl_boosted tells if we are boosted due to DI. If so we are * outside bandwidth enforcement mechanism (but only until we - * exit the critical section). + * exit the critical section); + * + * @dl_yielded tells if task gave up the cpu before consuming + * all its available runtime during the last job. */ - int dl_throttled, dl_new, dl_boosted; + int dl_throttled, dl_new, dl_boosted, dl_yielded; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From c1e756bfcbcac838a86a23f3e4501b556a961e3c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 5 May 2014 15:00:44 +0200 Subject: Revert "net: core: introduce netif_skb_dev_features" This reverts commit d206940319c41df4299db75ed56142177bb2e5f6, there are no more callers. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +------ net/core/dev.c | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ed3a3aa6604..20e99efb1ca6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3180,12 +3180,7 @@ void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -netdev_features_t netif_skb_dev_features(struct sk_buff *skb, - const struct net_device *dev); -static inline netdev_features_t netif_skb_features(struct sk_buff *skb) -{ - return netif_skb_dev_features(skb, skb->dev); -} +netdev_features_t netif_skb_features(struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { diff --git a/net/core/dev.c b/net/core/dev.c index d2c8a06b3a98..c619b8641337 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2418,7 +2418,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2493,38 +2493,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) } static netdev_features_t harmonize_features(struct sk_buff *skb, - const struct net_device *dev, - netdev_features_t features) + netdev_features_t features) { int tmp; if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) { features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(dev, skb)) { + } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } return features; } -netdev_features_t netif_skb_dev_features(struct sk_buff *skb, - const struct net_device *dev) +netdev_features_t netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - netdev_features_t features = dev->features; + netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } - features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) @@ -2532,9 +2530,9 @@ netdev_features_t netif_skb_dev_features(struct sk_buff *skb, NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } -EXPORT_SYMBOL(netif_skb_dev_features); +EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) -- cgit v1.2.3 From 23a456f05353035d1a2b3f1b9a92707acdc036e0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 6 May 2014 18:52:16 +0200 Subject: net: mdio: of_mdiobus_register(): fall back to mdiobus_register() for !CONFIG_OF If CONFIG_OF is not set, make of_mdiobus_register() call mdiobus_register() instead of returning -ENOSYS. This way, we can just call of_mdiobus_register() from all DT-enabled drivers to handle the compat cases. Signed-off-by: Daniel Mack Suggested-by: Florian Fainelli Acked-by: Florian Fainelli Acked-by: Mugunthan V N Signed-off-by: David S. Miller --- include/linux/of_mdio.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 6fe8464ed767..881a7c3571f4 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -31,7 +31,12 @@ extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np); #else /* CONFIG_OF */ static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) { - return -ENOSYS; + /* + * Fall back to the non-DT function to register a bus. + * This way, we don't have to keep compat bits around in drivers. + */ + + return mdiobus_register(mdio); } static inline struct phy_device *of_phy_find_device(struct device_node *phy_np) -- cgit v1.2.3 From 555724a831b4a146e7bdf16ecc989cda032b076d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 May 2014 13:56:27 -0400 Subject: kernfs, sysfs, cgroup: restrict extra perm check on open to sysfs The kernfs open method - kernfs_fop_open() - inherited extra permission checks from sysfs. While the vfs layer allows ignoring the read/write permissions checks if the issuer has CAP_DAC_OVERRIDE, sysfs explicitly denied open regardless of the cap if the file doesn't have any of the UGO perms of the requested access or doesn't implement the requested operation. It can be debated whether this was a good idea or not but the behavior is too subtle and dangerous to change at this point. After cgroup got converted to kernfs, this extra perm check also got applied to cgroup breaking libcgroup which opens write-only files with O_RDWR as root. This patch gates the extra open permission check with a new flag KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK and enables it for sysfs. For sysfs, nothing changes. For cgroup, root now can perform any operation regardless of the permissions as it was before kernfs conversion. Note that kernfs still fails unimplemented operations with -EINVAL. While at it, add comments explaining KERNFS_ROOT flags. Signed-off-by: Tejun Heo Reported-by: Andrey Wagin Tested-by: Andrey Wagin Cc: Li Zefan References: http://lkml.kernel.org/g/CANaxB-xUm3rJ-Cbp72q-rQJO5mZe1qK6qXsQM=vh0U8upJ44+A@mail.gmail.com Fixes: 2bd59d48ebfb ("cgroup: convert to kernfs") Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 17 ++++++++++------- fs/sysfs/mount.c | 3 ++- include/linux/kernfs.h | 19 ++++++++++++++++++- 3 files changed, 30 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e01ea4a14a01..5e9a80cfc3d8 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -610,6 +610,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; @@ -624,14 +625,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) has_write = ops->write || ops->mmap; has_mmap = ops->mmap; - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index a66ad6196f59..8794423f7efb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -63,7 +63,8 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, 0, NULL); + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b0122dc6f96a..ca1be5c9136c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -50,7 +50,24 @@ enum kernfs_node_flag { /* @flags for kernfs_create_root() */ enum kernfs_root_flag { - KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, + /* + * kernfs_nodes are created in the deactivated state and invisible. + * They require explicit kernfs_activate() to become visible. This + * can be used to make related nodes become visible atomically + * after all nodes are created successfully. + */ + KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, + + /* + * For regular flies, if the opener has CAP_DAC_OVERRIDE, open(2) + * succeeds regardless of the RW permissions. sysfs had an extra + * layer of enforcement where open(2) fails with -EACCES regardless + * of CAP_DAC_OVERRIDE if the permission doesn't have the + * respective read or write access at all (none of S_IRUGO or + * S_IWUGO) or the respective operation isn't implemented. The + * following flag enables that behavior. + */ + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, }; /* type-specific structures for kernfs_node union members */ -- cgit v1.2.3 From 5024ae29cd285ce9e736776414da645d3a91828c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 May 2014 21:31:17 -0400 Subject: cgroup: introduce task_css_is_root() Determining the css of a task usually requires RCU read lock as that's the only thing which keeps the returned css accessible till its reference is acquired; however, testing whether a task belongs to the root can be performed without dereferencing the returned css by comparing the returned pointer against the root one in init_css_set[] which never changes. Implement task_css_is_root() which can be invoked in any context. This will be used by the scheduled cgroup_freezer change. v2: cgroup no longer supports modular controllers. No need to export init_css_set. Pointed out by Li. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 15 +++++++++++++++ kernel/cgroup.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c2515851c1aa..d60904b9e505 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -473,6 +473,7 @@ struct cftype { }; extern struct cgroup_root cgrp_dfl_root; +extern struct css_set init_css_set; static inline bool cgroup_on_dfl(const struct cgroup *cgrp) { @@ -700,6 +701,20 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, return task_css_check(task, subsys_id, false); } +/** + * task_css_is_root - test whether a task belongs to the root css + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Test whether @task belongs to the root css on the specified subsystem. + * May be invoked in any context. + */ +static inline bool task_css_is_root(struct task_struct *task, int subsys_id) +{ + return task_css_check(task, subsys_id, true) == + init_css_set.subsys[subsys_id]; +} + static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11a03d67635a..3f1ca934a237 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -348,7 +348,7 @@ struct cgrp_cset_link { * reference-counted, to improve performance when child cgroups * haven't been created. */ -static struct css_set init_css_set = { +struct css_set init_css_set = { .refcount = ATOMIC_INIT(1), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), -- cgit v1.2.3 From 3d4405226d27b3a215e4d03cfa51f536244e5de7 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sun, 11 May 2014 22:59:30 +0200 Subject: net: avoid dependency of net_get_random_once on nop patching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net_get_random_once depends on the static keys infrastructure to patch up the branch to the slow path during boot. This was realized by abusing the static keys api and defining a new initializer to not enable the call site while still indicating that the branch point should get patched up. This was needed to have the fast path considered likely by gcc. The static key initialization during boot up normally walks through all the registered keys and either patches in ideal nops or enables the jump site but omitted that step on x86 if ideal nops where already placed at static_key branch points. Thus net_get_random_once branches not always became active. This patch switches net_get_random_once to the ordinary static_key api and thus places the kernel fast path in the - by gcc considered - unlikely path. Microbenchmarks on Intel and AMD x86-64 showed that the unlikely path actually beats the likely path in terms of cycle cost and that different nop patterns did not make much difference, thus this switch should not be noticeable. Fixes: a48e42920ff38b ("net: introduce new macro net_get_random_once") Reported-by: Tuomas Räsänen Cc: Linus Torvalds Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/net.h | 15 ++++----------- net/core/utils.c | 8 ++++---- 2 files changed, 8 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 94734a6259a4..17d83393afcc 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -248,24 +248,17 @@ do { \ bool __net_get_random_once(void *buf, int nbytes, bool *done, struct static_key *done_key); -#ifdef HAVE_JUMP_LABEL -#define ___NET_RANDOM_STATIC_KEY_INIT ((struct static_key) \ - { .enabled = ATOMIC_INIT(0), .entries = (void *)1 }) -#else /* !HAVE_JUMP_LABEL */ -#define ___NET_RANDOM_STATIC_KEY_INIT STATIC_KEY_INIT_FALSE -#endif /* HAVE_JUMP_LABEL */ - #define net_get_random_once(buf, nbytes) \ ({ \ bool ___ret = false; \ static bool ___done = false; \ - static struct static_key ___done_key = \ - ___NET_RANDOM_STATIC_KEY_INIT; \ - if (!static_key_true(&___done_key)) \ + static struct static_key ___once_key = \ + STATIC_KEY_INIT_TRUE; \ + if (static_key_true(&___once_key)) \ ___ret = __net_get_random_once(buf, \ nbytes, \ &___done, \ - &___done_key); \ + &___once_key); \ ___ret; \ }) diff --git a/net/core/utils.c b/net/core/utils.c index 2f737bf90b3f..eed34338736c 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -348,8 +348,8 @@ static void __net_random_once_deferred(struct work_struct *w) { struct __net_random_once_work *work = container_of(w, struct __net_random_once_work, work); - if (!static_key_enabled(work->key)) - static_key_slow_inc(work->key); + BUG_ON(!static_key_enabled(work->key)); + static_key_slow_dec(work->key); kfree(work); } @@ -367,7 +367,7 @@ static void __net_random_once_disable_jump(struct static_key *key) } bool __net_get_random_once(void *buf, int nbytes, bool *done, - struct static_key *done_key) + struct static_key *once_key) { static DEFINE_SPINLOCK(lock); unsigned long flags; @@ -382,7 +382,7 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done, *done = true; spin_unlock_irqrestore(&lock, flags); - __net_random_once_disable_jump(done_key); + __net_random_once_disable_jump(once_key); return true; } -- cgit v1.2.3 From 4c358e15553ed88bf2ddae422624624e1dd663d1 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 15 May 2014 14:44:30 +1000 Subject: of: fix CONFIG_OF=n prototype of of_node_full_name() Make the CONFIG_OF=n prototpe of of_node_full_name() mateh the CONFIG_OF=y version. Fixes compile warnings like this: sound/soc/soc-core.c: In function 'soc_check_aux_dev': sound/soc/soc-core.c:1667:3: warning: passing argument 1 of 'of_node_full_name' discards 'const' qualifier from pointer target type [enabled by default] codecname = of_node_full_name(aux_dev->codec_of_node); when CONFIG_OF is not defined. Signed-off-by: Stephen Rothwell Signed-off-by: Grant Likely --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 3bad8d106e0e..e6f0988c1c68 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -349,7 +349,7 @@ int of_device_is_stdout_path(struct device_node *dn); #else /* CONFIG_OF */ -static inline const char* of_node_full_name(struct device_node *np) +static inline const char* of_node_full_name(const struct device_node *np) { return ""; } -- cgit v1.2.3 From 200b916f3575bdf11609cb447661b8d5957b0bbf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 May 2014 15:11:20 -0700 Subject: rtnetlink: wait for unregistering devices in rtnl_link_unregister() From: Cong Wang commit 50624c934db18ab90 (net: Delay default_device_exit_batch until no devices are unregistering) introduced rtnl_lock_unregistering() for default_device_exit_batch(). Same race could happen we when rmmod a driver which calls rtnl_link_unregister() as we call dev->destructor without rtnl lock. For long term, I think we should clean up the mess of netdev_run_todo() and net namespce exit code. Cc: Eric W. Biederman Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 5 +++++ net/core/dev.c | 2 +- net/core/net_namespace.c | 2 +- net/core/rtnetlink.c | 33 ++++++++++++++++++++++++++++++++- 4 files changed, 39 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 8e3e66ac0a52..953937ea5233 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -4,6 +4,7 @@ #include #include +#include #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); @@ -22,6 +23,10 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); + +extern wait_queue_head_t netdev_unregistering_wq; +extern struct mutex net_mutex; + #ifdef CONFIG_PROVE_LOCKING extern int lockdep_rtnl_is_held(void); #else diff --git a/net/core/dev.c b/net/core/dev.c index c619b8641337..6da649bde4f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5541,7 +5541,7 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); -static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 81d3a9a08453..7c8ffd974961 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -24,7 +24,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); +DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9837bebf93ce..2d8d8fcfa060 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -353,15 +353,46 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. + */ +static void rtnl_lock_unregistering_all(void) +{ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + for_each_net(net) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - rtnl_lock(); + /* Close the race with cleanup_net() */ + mutex_lock(&net_mutex); + rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); + mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); -- cgit v1.2.3 From ce8d9e0d6746ff67c1870386b7121a4448f21130 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 15 May 2014 15:29:27 +0300 Subject: net/mlx4_core: Add UPDATE_QP SRIOV wrapper support This patch adds UPDATE_QP SRIOV wrapper support. The mechanism is a general one, but currently only source MAC index changes are allowed for VFs. Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 4 +- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 6 +++ drivers/net/ethernet/mellanox/mlx4/qp.c | 35 ++++++++++++++ .../net/ethernet/mellanox/mlx4/resource_tracker.c | 54 ++++++++++++++++++++++ include/linux/mlx4/qp.h | 11 +++++ 5 files changed, 108 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 78099eab7673..92d3249f63f1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1253,12 +1253,12 @@ static struct mlx4_cmd_info cmd_info[] = { }, { .opcode = MLX4_CMD_UPDATE_QP, - .has_inbox = false, + .has_inbox = true, .has_outbox = false, .out_is_imm = false, .encode_slave_id = false, .verify = NULL, - .wrapper = mlx4_CMD_EPERM_wrapper + .wrapper = mlx4_UPDATE_QP_wrapper }, { .opcode = MLX4_CMD_GET_OP_REQ, diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index f9c465101963..212cea440f90 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1195,6 +1195,12 @@ int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd); +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 61d64ebffd56..fbd32af89c7c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -389,6 +389,41 @@ err_icm: EXPORT_SYMBOL_GPL(mlx4_qp_alloc); +#define MLX4_UPDATE_QP_SUPPORTED_ATTRS MLX4_UPDATE_QP_SMAC +int mlx4_update_qp(struct mlx4_dev *dev, struct mlx4_qp *qp, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_update_qp_context *cmd; + u64 pri_addr_path_mask = 0; + int err = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cmd = (struct mlx4_update_qp_context *)mailbox->buf; + + if (!attr || (attr & ~MLX4_UPDATE_QP_SUPPORTED_ATTRS)) + return -EINVAL; + + if (attr & MLX4_UPDATE_QP_SMAC) { + pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX; + cmd->qp_context.pri_path.grh_mylmc = params->smac_index; + } + + cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask); + + err = mlx4_cmd(dev, mailbox->dma, qp->qpn & 0xffffff, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_update_qp); + void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp) { struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 1c3fdd4a1f7d..8f1254a79832 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -3895,6 +3895,60 @@ static int add_eth_header(struct mlx4_dev *dev, int slave, } +#define MLX4_UPD_QP_PATH_MASK_SUPPORTED (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX) +int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd_info) +{ + int err; + u32 qpn = vhcr->in_modifier & 0xffffff; + struct res_qp *rqp; + u64 mac; + unsigned port; + u64 pri_addr_path_mask; + struct mlx4_update_qp_context *cmd; + int smac_index; + + cmd = (struct mlx4_update_qp_context *)inbox->buf; + + pri_addr_path_mask = be64_to_cpu(cmd->primary_addr_path_mask); + if (cmd->qp_mask || cmd->secondary_addr_path_mask || + (pri_addr_path_mask & ~MLX4_UPD_QP_PATH_MASK_SUPPORTED)) + return -EPERM; + + /* Just change the smac for the QP */ + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) { + mlx4_err(dev, "Updating qpn 0x%x for slave %d rejected\n", qpn, slave); + return err; + } + + port = (rqp->sched_queue >> 6 & 1) + 1; + smac_index = cmd->qp_context.pri_path.grh_mylmc; + err = mac_find_smac_ix_in_slave(dev, slave, port, + smac_index, &mac); + if (err) { + mlx4_err(dev, "Failed to update qpn 0x%x, MAC is invalid. smac_ix: %d\n", + qpn, smac_index); + goto err_mac; + } + + err = mlx4_cmd(dev, inbox->dma, + vhcr->in_modifier, 0, + MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); + if (err) { + mlx4_err(dev, "Failed to update qpn on qpn 0x%x, command failed\n", qpn); + goto err_mac; + } + +err_mac: + put_res(dev, slave, qpn, RES_QP); + return err; +} + int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index b66e7610d4ee..7040dc98ff8b 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -421,6 +421,17 @@ struct mlx4_wqe_inline_seg { __be32 byte_count; }; +enum mlx4_update_qp_attr { + MLX4_UPDATE_QP_SMAC = 1 << 0, +}; + +struct mlx4_update_qp_params { + u8 smac_index; +}; + +int mlx4_update_qp(struct mlx4_dev *dev, struct mlx4_qp *qp, + enum mlx4_update_qp_attr attr, + struct mlx4_update_qp_params *params); int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, -- cgit v1.2.3 From 4085ebe8c31face855fd01ee40372cb4aab1df3a Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:04:53 -0400 Subject: net: Find the nesting level of a given device by type. Multiple devices in the kernel can be stacked/nested and they need to know their nesting level for the purposes of lockdep. This patch provides a generic function that determines a nesting level of a particular device by its type (ex: vlan, macvlan, etc). We only care about nesting of the same type of devices. For example: eth0 <- vlan0.10 <- macvlan0 <- vlan1.20 The nesting level of vlan1.20 would be 1, since there is another vlan in the stack under it. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ net/core/dev.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20e99efb1ca6..fb912e8e5c7f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3077,6 +3077,14 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, priv; \ priv = netdev_lower_get_next_private_rcu(dev, &(iter))) +void *netdev_lower_get_next(struct net_device *dev, + struct list_head **iter); +#define netdev_for_each_lower_dev(dev, ldev, iter) \ + for (iter = &(dev)->adj_list.lower, \ + ldev = netdev_lower_get_next(dev, &(iter)); \ + ldev; \ + ldev = netdev_lower_get_next(dev, &(iter))) + void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); @@ -3092,6 +3100,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); diff --git a/net/core/dev.c b/net/core/dev.c index ed928e846559..6ee3ac25ed72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4622,6 +4622,32 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU @@ -5072,6 +5098,30 @@ void *netdev_lower_dev_get_private(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_dev_get_private); + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) +{ + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; + + ASSERT_RTNL(); + + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; + } + + if (type_check(dev)) + max_nest++; + + return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.2.3 From 25175ba5c9bff9aaf0229df34bb5d54c81633ec3 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:04:54 -0400 Subject: net: Allow for more then a single subclass for netif_addr_lock Currently netif_addr_lock_nested assumes that there can be only a single nesting level between 2 devices. However, if we have multiple devices of the same type stacked, this fails. For example: eth0 <-- vlan0.10 <-- vlan0.10.20 A more complicated configuration may stack more then one type of device in different order. Ex: eth0 <-- vlan0.10 <-- macvlan0 <-- vlan1.10.20 <-- macvlan1 This patch adds an ndo_* function that allows each stackable device to report its nesting level. If the device doesn't provide this function default subclass of 1 is used. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fb912e8e5c7f..9d4b1f1b6b75 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1144,6 +1144,7 @@ struct net_device_ops { netdev_tx_t (*ndo_dfwd_start_xmit) (struct sk_buff *skb, struct net_device *dev, void *priv); + int (*ndo_get_lock_subclass)(struct net_device *dev); }; /** @@ -2950,7 +2951,12 @@ static inline void netif_addr_lock(struct net_device *dev) static inline void netif_addr_lock_nested(struct net_device *dev) { - spin_lock_nested(&dev->addr_list_lock, SINGLE_DEPTH_NESTING); + int subclass = SINGLE_DEPTH_NESTING; + + if (dev->netdev_ops->ndo_get_lock_subclass) + subclass = dev->netdev_ops->ndo_get_lock_subclass(dev); + + spin_lock_nested(&dev->addr_list_lock, subclass); } static inline void netif_addr_lock_bh(struct net_device *dev) -- cgit v1.2.3 From d38569ab2bba6e6b3233acfc3a84cdbcfbd1f79f Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:04:55 -0400 Subject: vlan: Fix lockdep warning with stacked vlan devices. This reverts commit dc8eaaa006350d24030502a4521542e74b5cb39f. vlan: Fix lockdep warning when vlan dev handle notification Instead we use the new new API to find the lock subclass of our vlan device. This way we can support configurations where vlans are interspersed with other devices: bond -> vlan -> macvlan -> vlan Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 3 ++- net/8021q/vlan.c | 1 + net/8021q/vlan_dev.c | 52 +++++++++---------------------------------------- net/core/dev.c | 1 - 4 files changed, 12 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 13bbbde00e68..724bde8477b2 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -73,7 +73,7 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); -static inline int is_vlan_dev(struct net_device *dev) +static inline bool is_vlan_dev(struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } @@ -159,6 +159,7 @@ struct vlan_dev_priv { #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif + unsigned int nest_level; }; static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 175273f38cb1..44ebd5c2cd4a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -169,6 +169,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; + vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 733ec283ed1b..019efb79708f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -493,48 +493,10 @@ static void vlan_dev_change_rx_flags(struct net_device *dev, int change) } } -static int vlan_calculate_locking_subclass(struct net_device *real_dev) -{ - int subclass = 0; - - while (is_vlan_dev(real_dev)) { - subclass++; - real_dev = vlan_dev_priv(real_dev)->real_dev; - } - - return subclass; -} - -static void vlan_dev_mc_sync(struct net_device *to, struct net_device *from) -{ - int err = 0, subclass; - - subclass = vlan_calculate_locking_subclass(to); - - spin_lock_nested(&to->addr_list_lock, subclass); - err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); - if (!err) - __dev_set_rx_mode(to); - spin_unlock(&to->addr_list_lock); -} - -static void vlan_dev_uc_sync(struct net_device *to, struct net_device *from) -{ - int err = 0, subclass; - - subclass = vlan_calculate_locking_subclass(to); - - spin_lock_nested(&to->addr_list_lock, subclass); - err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); - if (!err) - __dev_set_rx_mode(to); - spin_unlock(&to->addr_list_lock); -} - static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) { - vlan_dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); - vlan_dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); + dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); + dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } /* @@ -562,6 +524,11 @@ static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass); } +static int vlan_dev_get_lock_subclass(struct net_device *dev) +{ + return vlan_dev_priv(dev)->nest_level; +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .rebuild = vlan_dev_rebuild_header, @@ -597,7 +564,6 @@ static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - int subclass = 0; netif_carrier_off(dev); @@ -646,8 +612,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - subclass = vlan_calculate_locking_subclass(dev); - vlan_dev_set_lockdep_class(dev, subclass); + vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev)); vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan_dev_priv(dev)->vlan_pcpu_stats) @@ -819,6 +784,7 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, }; void vlan_setup(struct net_device *dev) diff --git a/net/core/dev.c b/net/core/dev.c index 6ee3ac25ed72..2b872bfbd172 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5287,7 +5287,6 @@ void __dev_set_rx_mode(struct net_device *dev) if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } -EXPORT_SYMBOL(__dev_set_rx_mode); void dev_set_rx_mode(struct net_device *dev) { -- cgit v1.2.3 From c674ac30c549596295eb0a5af7f4714c0b905b6f Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:04:56 -0400 Subject: macvlan: Fix lockdep warnings with stacked macvlan devices Macvlan devices try to avoid stacking, but that's not always successfull or even desired. As an example, the following configuration is perefectly legal and valid: eth0 <--- macvlan0 <---- vlan0.10 <--- macvlan1 However, this configuration produces the following lockdep trace: [ 115.620418] ====================================================== [ 115.620477] [ INFO: possible circular locking dependency detected ] [ 115.620516] 3.15.0-rc1+ #24 Not tainted [ 115.620540] ------------------------------------------------------- [ 115.620577] ip/1704 is trying to acquire lock: [ 115.620604] (&vlan_netdev_addr_lock_key/1){+.....}, at: [] dev_uc_sync+0x3c/0x80 [ 115.620686] but task is already holding lock: [ 115.620723] (&macvlan_netdev_addr_lock_key){+.....}, at: [] dev_set_rx_mode+0x1e/0x40 [ 115.620795] which lock already depends on the new lock. [ 115.620853] the existing dependency chain (in reverse order) is: [ 115.620894] -> #1 (&macvlan_netdev_addr_lock_key){+.....}: [ 115.620935] [] lock_acquire+0xa2/0x130 [ 115.620974] [] _raw_spin_lock_nested+0x37/0x50 [ 115.621019] [] vlan_dev_set_rx_mode+0x53/0x110 [8021q] [ 115.621066] [] __dev_set_rx_mode+0x57/0xa0 [ 115.621105] [] dev_set_rx_mode+0x26/0x40 [ 115.621143] [] __dev_open+0xde/0x140 [ 115.621174] [] __dev_change_flags+0x9d/0x170 [ 115.621174] [] dev_change_flags+0x29/0x60 [ 115.621174] [] do_setlink+0x321/0x9a0 [ 115.621174] [] rtnl_newlink+0x51f/0x730 [ 115.621174] [] rtnetlink_rcv_msg+0x95/0x250 [ 115.621174] [] netlink_rcv_skb+0xa9/0xc0 [ 115.621174] [] rtnetlink_rcv+0x2a/0x40 [ 115.621174] [] netlink_unicast+0xf0/0x1c0 [ 115.621174] [] netlink_sendmsg+0x2ff/0x740 [ 115.621174] [] sock_sendmsg+0x8b/0xc0 [ 115.621174] [] ___sys_sendmsg+0x369/0x380 [ 115.621174] [] __sys_sendmsg+0x42/0x80 [ 115.621174] [] SyS_sendmsg+0x12/0x20 [ 115.621174] [] system_call_fastpath+0x16/0x1b [ 115.621174] -> #0 (&vlan_netdev_addr_lock_key/1){+.....}: [ 115.621174] [] __lock_acquire+0x1773/0x1a60 [ 115.621174] [] lock_acquire+0xa2/0x130 [ 115.621174] [] _raw_spin_lock_nested+0x37/0x50 [ 115.621174] [] dev_uc_sync+0x3c/0x80 [ 115.621174] [] macvlan_set_mac_lists+0xca/0x110 [macvlan] [ 115.621174] [] __dev_set_rx_mode+0x57/0xa0 [ 115.621174] [] dev_set_rx_mode+0x26/0x40 [ 115.621174] [] __dev_open+0xde/0x140 [ 115.621174] [] __dev_change_flags+0x9d/0x170 [ 115.621174] [] dev_change_flags+0x29/0x60 [ 115.621174] [] do_setlink+0x321/0x9a0 [ 115.621174] [] rtnl_newlink+0x51f/0x730 [ 115.621174] [] rtnetlink_rcv_msg+0x95/0x250 [ 115.621174] [] netlink_rcv_skb+0xa9/0xc0 [ 115.621174] [] rtnetlink_rcv+0x2a/0x40 [ 115.621174] [] netlink_unicast+0xf0/0x1c0 [ 115.621174] [] netlink_sendmsg+0x2ff/0x740 [ 115.621174] [] sock_sendmsg+0x8b/0xc0 [ 115.621174] [] ___sys_sendmsg+0x369/0x380 [ 115.621174] [] __sys_sendmsg+0x42/0x80 [ 115.621174] [] SyS_sendmsg+0x12/0x20 [ 115.621174] [] system_call_fastpath+0x16/0x1b [ 115.621174] other info that might help us debug this: [ 115.621174] Possible unsafe locking scenario: [ 115.621174] CPU0 CPU1 [ 115.621174] ---- ---- [ 115.621174] lock(&macvlan_netdev_addr_lock_key); [ 115.621174] lock(&vlan_netdev_addr_lock_key/1); [ 115.621174] lock(&macvlan_netdev_addr_lock_key); [ 115.621174] lock(&vlan_netdev_addr_lock_key/1); [ 115.621174] *** DEADLOCK *** [ 115.621174] 2 locks held by ip/1704: [ 115.621174] #0: (rtnl_mutex){+.+.+.}, at: [] rtnetlink_rcv+0x1b/0x40 [ 115.621174] #1: (&macvlan_netdev_addr_lock_key){+.....}, at: [] dev_set_rx_mode+0x1e/0x40 [ 115.621174] stack backtrace: [ 115.621174] CPU: 3 PID: 1704 Comm: ip Not tainted 3.15.0-rc1+ #24 [ 115.621174] Hardware name: Hewlett-Packard HP xw8400 Workstation/0A08h, BIOS 786D5 v02.38 10/25/2010 [ 115.621174] ffffffff82339ae0 ffff880465f79568 ffffffff816ee20c ffffffff82339ae0 [ 115.621174] ffff880465f795a8 ffffffff816e9e1b ffff880465f79600 ffff880465b019c8 [ 115.621174] 0000000000000001 0000000000000002 ffff880465b019c8 ffff880465b01230 [ 115.621174] Call Trace: [ 115.621174] [] dump_stack+0x4d/0x66 [ 115.621174] [] print_circular_bug+0x200/0x20e [ 115.621174] [] __lock_acquire+0x1773/0x1a60 [ 115.621174] [] ? trace_hardirqs_on_caller+0xb2/0x1d0 [ 115.621174] [] lock_acquire+0xa2/0x130 [ 115.621174] [] ? dev_uc_sync+0x3c/0x80 [ 115.621174] [] _raw_spin_lock_nested+0x37/0x50 [ 115.621174] [] ? dev_uc_sync+0x3c/0x80 [ 115.621174] [] dev_uc_sync+0x3c/0x80 [ 115.621174] [] macvlan_set_mac_lists+0xca/0x110 [macvlan] [ 115.621174] [] __dev_set_rx_mode+0x57/0xa0 [ 115.621174] [] dev_set_rx_mode+0x26/0x40 [ 115.621174] [] __dev_open+0xde/0x140 [ 115.621174] [] __dev_change_flags+0x9d/0x170 [ 115.621174] [] dev_change_flags+0x29/0x60 [ 115.621174] [] ? mem_cgroup_bad_page_check+0x21/0x30 [ 115.621174] [] do_setlink+0x321/0x9a0 [ 115.621174] [] ? __lock_acquire+0x37c/0x1a60 [ 115.621174] [] rtnl_newlink+0x51f/0x730 [ 115.621174] [] ? rtnl_newlink+0xe9/0x730 [ 115.621174] [] rtnetlink_rcv_msg+0x95/0x250 [ 115.621174] [] ? trace_hardirqs_on+0xd/0x10 [ 115.621174] [] ? rtnetlink_rcv+0x1b/0x40 [ 115.621174] [] ? rtnetlink_rcv+0x40/0x40 [ 115.621174] [] netlink_rcv_skb+0xa9/0xc0 [ 115.621174] [] rtnetlink_rcv+0x2a/0x40 [ 115.621174] [] netlink_unicast+0xf0/0x1c0 [ 115.621174] [] netlink_sendmsg+0x2ff/0x740 [ 115.621174] [] sock_sendmsg+0x8b/0xc0 [ 115.621174] [] ? might_fault+0x5f/0xb0 [ 115.621174] [] ? might_fault+0xa8/0xb0 [ 115.621174] [] ? might_fault+0x5f/0xb0 [ 115.621174] [] ? verify_iovec+0x5e/0xe0 [ 115.621174] [] ___sys_sendmsg+0x369/0x380 [ 115.621174] [] ? __do_page_fault+0x11d/0x570 [ 115.621174] [] ? up_read+0x1f/0x40 [ 115.621174] [] ? __do_page_fault+0x214/0x570 [ 115.621174] [] ? mntput_no_expire+0x6b/0x1c0 [ 115.621174] [] ? mntput_no_expire+0x17/0x1c0 [ 115.621174] [] ? mntput+0x24/0x40 [ 115.621174] [] __sys_sendmsg+0x42/0x80 [ 115.621174] [] SyS_sendmsg+0x12/0x20 [ 115.621174] [] system_call_fastpath+0x16/0x1b Fix this by correctly providing macvlan lockdep class. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 12 ++++++++++-- include/linux/if_macvlan.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index c5fb9cf95c12..d53e299ae1d9 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -517,6 +517,11 @@ static struct lock_class_key macvlan_netdev_addr_lock_key; #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) +static int macvlan_get_nest_level(struct net_device *dev) +{ + return ((struct macvlan_dev *)netdev_priv(dev))->nest_level; +} + static void macvlan_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) @@ -527,8 +532,9 @@ static void macvlan_set_lockdep_class_one(struct net_device *dev, static void macvlan_set_lockdep_class(struct net_device *dev) { - lockdep_set_class(&dev->addr_list_lock, - &macvlan_netdev_addr_lock_key); + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &macvlan_netdev_addr_lock_key, + macvlan_get_nest_level(dev)); netdev_for_each_tx_queue(dev, macvlan_set_lockdep_class_one, NULL); } @@ -723,6 +729,7 @@ static const struct net_device_ops macvlan_netdev_ops = { .ndo_fdb_add = macvlan_fdb_add, .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, + .ndo_get_lock_subclass = macvlan_get_nest_level, }; void macvlan_common_setup(struct net_device *dev) @@ -851,6 +858,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, vlan->dev = dev; vlan->port = port; vlan->set_features = MACVLAN_FEATURES; + vlan->nest_level = dev_get_nest_level(lowerdev, netif_is_macvlan) + 1; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 7c8b20b120ea..a9a53b12397b 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -56,6 +56,7 @@ struct macvlan_dev { int numqueues; netdev_features_t tap_features; int minor; + int nest_level; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, -- cgit v1.2.3 From 44a4085538c844e79d6ee6bcf46fabf7c57a9a38 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:20:38 -0400 Subject: bonding: Fix stacked device detection in arp monitoring Prior to commit fbd929f2dce460456807a51e18d623db3db9f077 bonding: support QinQ for bond arp interval the arp monitoring code allowed for proper detection of devices stacked on top of vlans. Since the above commit, the code can still detect a device stacked on top of single vlan, but not a device stacked on top of Q-in-Q configuration. The search will only set the inner vlan tag if the route device is the vlan device. However, this is not always the case, as it is possible to extend the stacked configuration. With this patch it is possible to provision devices on top Q-in-Q vlan configuration that should be used as a source of ARP monitoring information. For example: ip link add link bond0 vlan10 type vlan proto 802.1q id 10 ip link add link vlan10 vlan100 type vlan proto 802.1q id 100 ip link add link vlan100 type macvlan Note: This patch limites the number of stacked VLANs to 2, just like before. The original, however had another issue in that if we had more then 2 levels of VLANs, we would end up generating incorrectly tagged traffic. This is no longer possible. Fixes: fbd929f2dce460456807a51e18d623db3db9f077 (bonding: support QinQ for bond arp interval) CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: Ding Tianhong CC: Patric McHardy Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 134 +++++++++++++++++++--------------------- drivers/net/bonding/bonding.h | 1 + include/linux/if_vlan.h | 6 ++ include/linux/netdevice.h | 9 +++ net/core/dev.c | 26 ++++++++ 5 files changed, 107 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 69aff72c8957..d3a67896d435 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2126,10 +2126,10 @@ static bool bond_has_this_ip(struct bonding *bond, __be32 ip) */ static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, - struct bond_vlan_tag *inner, - struct bond_vlan_tag *outer) + struct bond_vlan_tag *tags) { struct sk_buff *skb; + int i; pr_debug("arp %d on slave %s: dst %pI4 src %pI4\n", arp_op, slave_dev->name, &dest_ip, &src_ip); @@ -2141,21 +2141,26 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, net_err_ratelimited("ARP packet allocation failed\n"); return; } - if (outer->vlan_id) { - if (inner->vlan_id) { - pr_debug("inner tag: proto %X vid %X\n", - ntohs(inner->vlan_proto), inner->vlan_id); - skb = __vlan_put_tag(skb, inner->vlan_proto, - inner->vlan_id); - if (!skb) { - net_err_ratelimited("failed to insert inner VLAN tag\n"); - return; - } - } - pr_debug("outer reg: proto %X vid %X\n", - ntohs(outer->vlan_proto), outer->vlan_id); - skb = vlan_put_tag(skb, outer->vlan_proto, outer->vlan_id); + /* Go through all the tags backwards and add them to the packet */ + for (i = BOND_MAX_VLAN_ENCAP - 1; i > 0; i--) { + if (!tags[i].vlan_id) + continue; + + pr_debug("inner tag: proto %X vid %X\n", + ntohs(tags[i].vlan_proto), tags[i].vlan_id); + skb = __vlan_put_tag(skb, tags[i].vlan_proto, + tags[i].vlan_id); + if (!skb) { + net_err_ratelimited("failed to insert inner VLAN tag\n"); + return; + } + } + /* Set the outer tag */ + if (tags[0].vlan_id) { + pr_debug("outer tag: proto %X vid %X\n", + ntohs(tags[0].vlan_proto), tags[0].vlan_id); + skb = vlan_put_tag(skb, tags[0].vlan_proto, tags[0].vlan_id); if (!skb) { net_err_ratelimited("failed to insert outer VLAN tag\n"); return; @@ -2164,22 +2169,52 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, arp_xmit(skb); } +/* Validate the device path between the @start_dev and the @end_dev. + * The path is valid if the @end_dev is reachable through device + * stacking. + * When the path is validated, collect any vlan information in the + * path. + */ +static bool bond_verify_device_path(struct net_device *start_dev, + struct net_device *end_dev, + struct bond_vlan_tag *tags) +{ + struct net_device *upper; + struct list_head *iter; + int idx; + + if (start_dev == end_dev) + return true; + + netdev_for_each_upper_dev_rcu(start_dev, upper, iter) { + if (bond_verify_device_path(upper, end_dev, tags)) { + if (is_vlan_dev(upper)) { + idx = vlan_get_encap_level(upper); + if (idx >= BOND_MAX_VLAN_ENCAP) + return false; + + tags[idx].vlan_proto = + vlan_dev_vlan_proto(upper); + tags[idx].vlan_id = vlan_dev_vlan_id(upper); + } + return true; + } + } + + return false; +} static void bond_arp_send_all(struct bonding *bond, struct slave *slave) { - struct net_device *upper, *vlan_upper; - struct list_head *iter, *vlan_iter; struct rtable *rt; - struct bond_vlan_tag inner, outer; + struct bond_vlan_tag tags[BOND_MAX_VLAN_ENCAP]; __be32 *targets = bond->params.arp_targets, addr; int i; + bool ret; for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) { pr_debug("basa: target %pI4\n", &targets[i]); - inner.vlan_proto = 0; - inner.vlan_id = 0; - outer.vlan_proto = 0; - outer.vlan_id = 0; + memset(tags, 0, sizeof(tags)); /* Find out through which dev should the packet go */ rt = ip_route_output(dev_net(bond->dev), targets[i], 0, @@ -2192,7 +2227,8 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n", bond->dev->name, &targets[i]); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], 0, &inner, &outer); + bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], + 0, tags); continue; } @@ -2201,52 +2237,12 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) goto found; rcu_read_lock(); - /* first we search only for vlan devices. for every vlan - * found we verify its upper dev list, searching for the - * rt->dst.dev. If found we save the tag of the vlan and - * proceed to send the packet. - */ - netdev_for_each_all_upper_dev_rcu(bond->dev, vlan_upper, - vlan_iter) { - if (!is_vlan_dev(vlan_upper)) - continue; - - if (vlan_upper == rt->dst.dev) { - outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper); - outer.vlan_id = vlan_dev_vlan_id(vlan_upper); - rcu_read_unlock(); - goto found; - } - netdev_for_each_all_upper_dev_rcu(vlan_upper, upper, - iter) { - if (upper == rt->dst.dev) { - /* If the upper dev is a vlan dev too, - * set the vlan tag to inner tag. - */ - if (is_vlan_dev(upper)) { - inner.vlan_proto = vlan_dev_vlan_proto(upper); - inner.vlan_id = vlan_dev_vlan_id(upper); - } - outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper); - outer.vlan_id = vlan_dev_vlan_id(vlan_upper); - rcu_read_unlock(); - goto found; - } - } - } - - /* if the device we're looking for is not on top of any of - * our upper vlans, then just search for any dev that - * matches, and in case it's a vlan - save the id - */ - netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) { - if (upper == rt->dst.dev) { - rcu_read_unlock(); - goto found; - } - } + ret = bond_verify_device_path(bond->dev, rt->dst.dev, tags); rcu_read_unlock(); + if (ret) + goto found; + /* Not our device - skip */ pr_debug("%s: no path to arp_ip_target %pI4 via rt.dev %s\n", bond->dev->name, &targets[i], @@ -2259,7 +2255,7 @@ found: addr = bond_confirm_addr(rt->dst.dev, targets[i], 0); ip_rt_put(rt); bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - addr, &inner, &outer); + addr, tags); } } diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index b8bdd0acc8f3..00bea320e3b5 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -36,6 +36,7 @@ #define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n" +#define BOND_MAX_VLAN_ENCAP 2 #define BOND_MAX_ARP_TARGETS 16 #define BOND_DEFAULT_MIIMON 100 diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 724bde8477b2..c901b13b6f03 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -484,4 +484,10 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, */ skb->protocol = htons(ETH_P_802_2); } + +static inline int vlan_get_encap_level(struct net_device *dev) +{ + BUG_ON(!is_vlan_dev(dev)); + return vlan_dev_priv(dev)->nest_level; +} #endif /* !(_LINUX_IF_VLAN_H_) */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d4b1f1b6b75..b42d07b0390b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3056,9 +3056,18 @@ extern int weight_p; extern int bpf_jit_enable; bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter); struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); +/* iterate through upper list, must be called under RCU read lock */ +#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \ + for (iter = &(dev)->adj_list.upper, \ + updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \ + updev; \ + updev = netdev_upper_get_next_dev_rcu(dev, &(iter))) + /* iterate through upper list, must be called under RCU read lock */ #define netdev_for_each_all_upper_dev_rcu(dev, updev, iter) \ for (iter = &(dev)->all_adj_list.upper, \ diff --git a/net/core/dev.c b/net/core/dev.c index 2b872bfbd172..9abc503b19b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4541,6 +4541,32 @@ void *netdev_adjacent_get_private(struct list_head *adj_list) } EXPORT_SYMBOL(netdev_adjacent_get_private); +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + /** * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device -- cgit v1.2.3 From b69cf53640da2b86439596118cfa95233154ee76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Mar 2014 10:50:33 +0100 Subject: perf: Fix a race between ring_buffer_detach() and ring_buffer_attach() Alexander noticed that we use RCU iteration on rb->event_list but do not use list_{add,del}_rcu() to add,remove entries to that list, nor do we observe proper grace periods when re-using the entries. Merge ring_buffer_detach() into ring_buffer_attach() such that attaching to the NULL buffer is detaching. Furthermore, ensure that between any 'detach' and 'attach' of the same event we observe the required grace period, but only when strictly required. In effect this means that only ioctl(.request = PERF_EVENT_IOC_SET_OUTPUT) will wait for a grace period, while the normal initial attach and final detach will not be delayed. This patch should, I think, do the right thing under all circumstances, the 'normal' cases all should never see the extra grace period, but the two cases: 1) PERF_EVENT_IOC_SET_OUTPUT on an event which already has a ring_buffer set, will now observe the required grace period between removing itself from the old and attaching itself to the new buffer. This case is 'simple' in that both buffers are present in perf_event_set_output() one could think an unconditional synchronize_rcu() would be sufficient; however... 2) an event that has a buffer attached, the buffer is destroyed (munmap) and then the event is attached to a new/different buffer using PERF_EVENT_IOC_SET_OUTPUT. This case is more complex because the buffer destruction does: ring_buffer_attach(.rb = NULL) followed by the ioctl() doing: ring_buffer_attach(.rb = foo); and we still need to observe the grace period between these two calls due to us reusing the event->rb_entry list_head. In order to make 2 happen we use Paul's latest cond_synchronize_rcu() call. Cc: Paul Mackerras Cc: Stephane Eranian Cc: Andi Kleen Cc: "Paul E. McKenney" Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Mike Galbraith Reported-by: Alexander Shishkin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507123526.GD13658@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/perf_event.h | 2 + kernel/events/core.c | 109 ++++++++++++++++++++------------------------- 2 files changed, 51 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3356abcfff18..3ef6ea12806a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -402,6 +402,8 @@ struct perf_event { struct ring_buffer *rb; struct list_head rb_entry; + unsigned long rcu_batches; + int rcu_pending; /* poll related */ wait_queue_head_t waitq; diff --git a/kernel/events/core.c b/kernel/events/core.c index feb1329ca331..440eefc67397 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3192,7 +3192,8 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); static void unaccount_event_cpu(struct perf_event *event, int cpu) { @@ -3252,8 +3253,6 @@ static void free_event(struct perf_event *event) unaccount_event(event); if (event->rb) { - struct ring_buffer *rb; - /* * Can happen when we close an event with re-directed output. * @@ -3261,12 +3260,7 @@ static void free_event(struct perf_event *event) * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); - rb = event->rb; - if (rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* could be last */ - } + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } @@ -3850,28 +3844,47 @@ unlock: static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb) { + struct ring_buffer *old_rb = NULL; unsigned long flags; - if (!list_empty(&event->rb_entry)) - return; + if (event->rb) { + /* + * Should be impossible, we set this when removing + * event->rb_entry and wait/clear when adding event->rb_entry. + */ + WARN_ON_ONCE(event->rcu_pending); - spin_lock_irqsave(&rb->event_lock, flags); - if (list_empty(&event->rb_entry)) - list_add(&event->rb_entry, &rb->event_list); - spin_unlock_irqrestore(&rb->event_lock, flags); -} + old_rb = event->rb; + event->rcu_batches = get_state_synchronize_rcu(); + event->rcu_pending = 1; -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) -{ - unsigned long flags; + spin_lock_irqsave(&old_rb->event_lock, flags); + list_del_rcu(&event->rb_entry); + spin_unlock_irqrestore(&old_rb->event_lock, flags); + } - if (list_empty(&event->rb_entry)) - return; + if (event->rcu_pending && rb) { + cond_synchronize_rcu(event->rcu_batches); + event->rcu_pending = 0; + } - spin_lock_irqsave(&rb->event_lock, flags); - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - spin_unlock_irqrestore(&rb->event_lock, flags); + if (rb) { + spin_lock_irqsave(&rb->event_lock, flags); + list_add_rcu(&event->rb_entry, &rb->event_list); + spin_unlock_irqrestore(&rb->event_lock, flags); + } + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } } static void ring_buffer_wakeup(struct perf_event *event) @@ -3940,7 +3953,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); @@ -3948,18 +3961,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) - return; + goto out_put; - /* Detach current event from the buffer. */ - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) { - ring_buffer_put(rb); /* can't be last */ - return; - } + if (atomic_read(&rb->mmap_count)) + goto out_put; /* * No other mmap()s, detach from all other events that might redirect @@ -3989,11 +3998,9 @@ again: * still restart the iteration to make sure we're not now * iterating the wrong list. */ - if (event->rb == rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* can't be last, we still have one */ - } + if (event->rb == rb) + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); put_event(event); @@ -4018,6 +4025,7 @@ again: vma->vm_mm->pinned_vm -= mmap_locked; free_uid(mmap_user); +out_put: ring_buffer_put(rb); /* could be last */ } @@ -4135,7 +4143,6 @@ again: vma->vm_mm->pinned_vm += extra; ring_buffer_attach(event, rb); - rcu_assign_pointer(event->rb, rb); perf_event_init_userpage(event); perf_event_update_userpage(event); @@ -6934,7 +6941,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL, *old_rb = NULL; + struct ring_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6962,8 +6969,6 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; - old_rb = event->rb; - if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6971,23 +6976,7 @@ set: goto unlock; } - if (old_rb) - ring_buffer_detach(event, old_rb); - - if (rb) - ring_buffer_attach(event, rb); - - rcu_assign_pointer(event->rb, rb); - - if (old_rb) { - ring_buffer_put(old_rb); - /* - * Since we detached before setting the new rb, so that we - * could attach the new rb, we could have missed a wakeup. - * Provide it now. - */ - wake_up_all(&event->waitq); - } + ring_buffer_attach(event, rb); ret = 0; unlock: -- cgit v1.2.3 From e1618d461ca18d40f9c3ef70598abb72e75d27ae Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 20 May 2014 10:59:26 -0400 Subject: vlan: Fix build error wth vlan_get_encap_level() The new function vlan_get_encap_level() uses vlan_dev_priv() which is only conditionally avaialble when VLAN support is enabled. Make vlan_get_encap_level() conditionally available as well. Fixes: 44a4085538c8 ("bonding: Fix stacked device detection in arp monitoring") Reported-by: Stephen Rothwell CC: Stephen Rothwell Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c901b13b6f03..b2acc4a1b13c 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -198,6 +198,12 @@ extern void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev); extern bool vlan_uses_dev(const struct net_device *dev); + +static inline int vlan_get_encap_level(struct net_device *dev) +{ + BUG_ON(!is_vlan_dev(dev)); + return vlan_dev_priv(dev)->nest_level; +} #else static inline struct net_device * __vlan_find_dev_deep(struct net_device *real_dev, @@ -264,6 +270,11 @@ static inline bool vlan_uses_dev(const struct net_device *dev) { return false; } +static inline int vlan_get_encap_level(struct net_device *dev) +{ + BUG(); + return 0; +} #endif static inline bool vlan_hw_offload_capable(netdev_features_t features, @@ -485,9 +496,4 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, skb->protocol = htons(ETH_P_802_2); } -static inline int vlan_get_encap_level(struct net_device *dev) -{ - BUG_ON(!is_vlan_dev(dev)); - return vlan_dev_priv(dev)->nest_level; -} #endif /* !(_LINUX_IF_VLAN_H_) */ -- cgit v1.2.3 From a8246fedacadaab18b23b280ea3cf916ef5fc30e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 8 May 2014 16:56:12 +0200 Subject: dmaengine: omap: hide filter_fn for built-in drivers It is not possible to reference the omap_dma_filter_fn filter function from a built-in driver if the dmaengine driver itself is a loadable module, which is a valid configuration otherwise. This provides only the dummy alternative if the function is referenced by a built-in driver to allow a successful build. The filter function is only required by ATAGS based platforms, which will continue to be broken after this change for the bogus configuration. When booting from DT, with the dma channels correctly listed there, it will work fine. Signed-off-by: Arnd Bergmann Acked-by: Tony Lindgren Cc: Russell King Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Signed-off-by: Vinod Koul --- include/linux/omap-dma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 41a13e70f41f..7944cdc27bed 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -10,7 +10,7 @@ struct dma_chan; -#if defined(CONFIG_DMA_OMAP) || defined(CONFIG_DMA_OMAP_MODULE) +#if defined(CONFIG_DMA_OMAP) || (defined(CONFIG_DMA_OMAP_MODULE) && defined(MODULE)) bool omap_dma_filter_fn(struct dma_chan *, void *); #else static inline bool omap_dma_filter_fn(struct dma_chan *c, void *d) -- cgit v1.2.3 From c1f43dd9c20d85e66c4d77e284f64ac114abe3f8 Mon Sep 17 00:00:00 2001 From: Xuelin Shi Date: Wed, 21 May 2014 14:02:37 -0700 Subject: dmaengine: fix dmaengine_unmap failure The count which is used to get_unmap_data maybe not the same as the count computed in dmaengine_unmap which causes to free data in a wrong pool. This patch fixes this issue by keeping the map count with unmap_data structure and use this count to get the pool. Cc: Signed-off-by: Xuelin Shi Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 2 ++ include/linux/dmaengine.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index a886713937fd..d5d30ed863ce 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1009,6 +1009,7 @@ static void dmaengine_unmap(struct kref *kref) dma_unmap_page(dev, unmap->addr[i], unmap->len, DMA_BIDIRECTIONAL); } + cnt = unmap->map_cnt; mempool_free(unmap, __get_unmap_pool(cnt)->pool); } @@ -1074,6 +1075,7 @@ dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) memset(unmap, 0, sizeof(*unmap)); kref_init(&unmap->kref); unmap->dev = dev; + unmap->map_cnt = nr; return unmap; } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8300fb87b84a..72cb0ddb9678 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -429,6 +429,7 @@ typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); typedef void (*dma_async_tx_callback)(void *dma_async_param); struct dmaengine_unmap_data { + u8 map_cnt; u8 to_cnt; u8 from_cnt; u8 bidi_cnt; -- cgit v1.2.3 From ad0f614e4723db8cead439cf414108cbf975b224 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Thu, 22 May 2014 11:54:20 -0700 Subject: wait: swap EXIT_ZOMBIE(Z) and EXIT_DEAD(X) chars in TASK_STATE_TO_CHAR_STR In commit ad86622b478e ("wait: swap EXIT_ZOMBIE and EXIT_DEAD to hide EXIT_TRACE from user-space") the order of task state definitions were changed: EXIT_DEAD and EXIT_ZOMBIE were swapped. Though the charterers for the states in TASK_STATE_TO_CHAR_STR string were not updated. This patch synchronizes the string to the order of definitions. Signed-off-by: Masatake YAMATO Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 25f54c79f757..21fbdae61b9e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -220,7 +220,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #define TASK_PARKED 512 #define TASK_STATE_MAX 1024 -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; -- cgit v1.2.3 From fbebf59778600488147744cdf7d7c20d22531025 Mon Sep 17 00:00:00 2001 From: srinik Date: Thu, 15 May 2014 11:28:44 +0100 Subject: ARM: 8057/1: amba: Add Qualcomm vendor ID. This patch adds Qualcomm amba vendor Id to the list. This ID is used in mmci driver. The ID selected in same lines like 0x41 is "A" for ARM, 0x51 is "Q" for Qualcomm. As there are no physical register on Qcom SOC for amba vendor id, this is a fake ID assigned based on "Q" prefix from Qualcomm. Signed-off-by: Srinivas Kandagatla Acked-by: Linus Walleij Signed-off-by: Russell King --- include/linux/amba/bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 63b5eff0a80f..fdd7e1b61f60 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -47,6 +47,7 @@ struct amba_driver { enum amba_vendor { AMBA_VENDOR_ARM = 0x41, AMBA_VENDOR_ST = 0x80, + AMBA_VENDOR_QCOM = 0x51, }; extern struct bus_type amba_bustype; -- cgit v1.2.3