From d476059e77d1af48453a58f9de1e36f2eaff6450 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 2 Mar 2015 00:11:09 -0600 Subject: net: Kill dev_rebuild_header Now that there are no more users kill dev_rebuild_header and all of it's implementations. This is long overdue. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5897b4ea5a3f..2007f3b44d05 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -261,7 +261,6 @@ struct header_ops { unsigned short type, const void *daddr, const void *saddr, unsigned int len); int (*parse)(const struct sk_buff *skb, unsigned char *haddr); - int (*rebuild)(struct sk_buff *skb); int (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void (*cache_update)(struct hh_cache *hh, const struct net_device *dev, @@ -1346,7 +1345,7 @@ enum netdev_priv_flags { * if one wants to override the ndo_*() functions * @ethtool_ops: Management operations * @fwd_ops: Management operations - * @header_ops: Includes callbacks for creating,parsing,rebuilding,etc + * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * * @flags: Interface flags (a la BSD) @@ -2399,15 +2398,6 @@ static inline int dev_parse_header(const struct sk_buff *skb, return dev->header_ops->parse(skb, haddr); } -static inline int dev_rebuild_header(struct sk_buff *skb) -{ - const struct net_device *dev = skb->dev; - - if (!dev->header_ops || !dev->header_ops->rebuild) - return 0; - return dev->header_ops->rebuild(skb); -} - typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len); int register_gifconf(unsigned int family, gifconf_func_t *gifconf); static inline int unregister_gifconf(unsigned int family) -- cgit v1.2.3 From 4586f1bb911ce219a4bc1c2a9d6eee2e058b2b51 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Thu, 5 Mar 2015 21:21:14 -0800 Subject: netdevice: add IPv4 fib add/del ops Add two new ndo ops for IPv4 fib offload support, add and del. Add uses modifiy semantics if fib entry already offloaded. Drivers implementing the new ndo ops will return err<0 if programming device fails, for example if device's tables are full. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 625c8d71511b..45413784a3b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -768,6 +768,8 @@ struct netdev_phys_item_id { typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); +struct fib_info; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1031,6 +1033,14 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state); * Called to notify switch device port of bridge port STP * state change. + * int (*ndo_sw_parent_fib_ipv4_add)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 tb_id); + * Called to add/modify IPv4 route to switch device. + * int (*ndo_sw_parent_fib_ipv4_del)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 tb_id); + * Called to delete IPv4 route from switch device. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1192,6 +1202,18 @@ struct net_device_ops { struct netdev_phys_item_id *psid); int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state); + int (*ndo_switch_fib_ipv4_add)(struct net_device *dev, + __be32 dst, + int dst_len, + struct fib_info *fi, + u8 tos, u8 type, + u32 tb_id); + int (*ndo_switch_fib_ipv4_del)(struct net_device *dev, + __be32 dst, + int dst_len, + struct fib_info *fi, + u8 tos, u8 type, + u32 tb_id); #endif }; -- cgit v1.2.3 From f8f2147150de303e814c0452075d467734d3544b Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Mon, 9 Mar 2015 13:59:09 -0700 Subject: switchdev: add netlink flags to IPv4 FIB add op Pass in the netlink flags (NLM_F_*) into switchdev driver for IPv4 FIB add op to allow driver to 1) optimize hardware updates, 2) handle ip route prepend and append commands correctly. Suggested-by: Jamal Hadi Salim Suggested-by: Roopa Prabhu Signed-off-by: Scott Feldman Reviewed-by: Simon Horman Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 3 ++- include/linux/netdevice.h | 3 ++- include/net/switchdev.h | 6 ++++-- net/ipv4/fib_trie.c | 5 ++++- net/switchdev/switchdev.c | 7 +++++-- 5 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 65e140315a58..223348d8cc07 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4152,7 +4152,8 @@ static int rocker_port_switch_port_stp_update(struct net_device *dev, u8 state) static int rocker_port_switch_fib_ipv4_add(struct net_device *dev, __be32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + u8 tos, u8 type, + u32 nlflags, u32 tb_id) { struct rocker_port *rocker_port = netdev_priv(dev); int flags = 0; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 45413784a3b1..1354ae83efc8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1035,7 +1035,7 @@ struct fib_info; * state change. * int (*ndo_sw_parent_fib_ipv4_add)(struct net_device *dev, __be32 dst, * int dst_len, struct fib_info *fi, - * u8 tos, u8 type, u32 tb_id); + * u8 tos, u8 type, u32 nlflags, u32 tb_id); * Called to add/modify IPv4 route to switch device. * int (*ndo_sw_parent_fib_ipv4_del)(struct net_device *dev, __be32 dst, * int dst_len, struct fib_info *fi, @@ -1207,6 +1207,7 @@ struct net_device_ops { int dst_len, struct fib_info *fi, u8 tos, u8 type, + u32 nlflags, u32 tb_id); int (*ndo_switch_fib_ipv4_del)(struct net_device *dev, __be32 dst, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 933fac410a7a..1a9382febcc3 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -1,6 +1,7 @@ /* * include/net/switchdev.h - Switch device API * Copyright (c) 2014 Jiri Pirko + * Copyright (c) 2014-2015 Scott Feldman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -52,7 +53,7 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id); + u8 tos, u8 type, u32 nlflags, u32 tb_id); int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, u8 tos, u8 type, u32 tb_id); void netdev_switch_fib_ipv4_abort(struct fib_info *fi); @@ -117,7 +118,8 @@ static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device * static inline int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + u8 tos, u8 type, + u32 nlflags, u32 tb_id) { return 0; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 90955455884e..fcfa9825a816 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1155,6 +1155,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) err = netdev_switch_fib_ipv4_add(key, plen, fi, new_fa->fa_tos, cfg->fc_type, + cfg->fc_nlflags, tb->tb_id); if (err) { netdev_switch_fib_ipv4_abort(fi); @@ -1201,7 +1202,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) /* (Optionally) offload fib entry to switch hardware. */ err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, - cfg->fc_type, tb->tb_id); + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); if (err) { netdev_switch_fib_ipv4_abort(fi); goto out_free_new_fa; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index aba6aa2656d8..8cf42a69baf4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1,6 +1,7 @@ /* * net/switchdev/switchdev.c - Switch device API * Copyright (c) 2014 Jiri Pirko + * Copyright (c) 2014-2015 Scott Feldman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -294,12 +295,13 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) * @fi: route FIB info structure * @tos: route TOS * @type: route type + * @nlflags: netlink flags passed in (NLM_F_*) * @tb_id: route table ID * * Add IPv4 route entry to switch device. */ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + u8 tos, u8 type, u32 nlflags, u32 tb_id) { struct net_device *dev; const struct net_device_ops *ops; @@ -324,7 +326,8 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (ops->ndo_switch_fib_ipv4_add) { err = ops->ndo_switch_fib_ipv4_add(dev, htonl(dst), dst_len, - fi, tos, type, tb_id); + fi, tos, type, nlflags, + tb_id); if (!err) fi->fib_flags |= RTNH_F_EXTERNAL; } -- cgit v1.2.3 From efd7ef1c1929d7a0329d4349252863c04d6f1729 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 11 Mar 2015 23:04:08 -0500 Subject: net: Kill hold_net release_net hold_net and release_net were an idea that turned out to be useless. The code has been disabled since 2008. Kill the code it is long past due. Signed-off-by: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +-- include/net/fib_rules.h | 9 +-------- include/net/net_namespace.h | 29 ----------------------------- include/net/sock.h | 2 +- net/core/dev.c | 2 -- net/core/fib_rules.c | 17 +++-------------- net/core/neighbour.c | 9 ++------- net/core/net_namespace.c | 11 ----------- net/core/sock.c | 1 - net/ipv4/fib_semantics.c | 3 +-- net/ipv4/inet_hashtables.c | 3 +-- net/ipv4/inet_timewait_sock.c | 3 +-- net/ipv6/addrlabel.c | 5 +---- net/ipv6/ip6_flowlabel.c | 3 +-- net/openvswitch/datapath.c | 4 +--- 15 files changed, 14 insertions(+), 90 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1354ae83efc8..cede40d9cac9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1864,8 +1864,7 @@ static inline void dev_net_set(struct net_device *dev, struct net *net) { #ifdef CONFIG_NET_NS - release_net(dev->nd_net); - dev->nd_net = hold_net(net); + dev->nd_net = net; #endif } diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 88d2ae526961..6d67383a5114 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -95,17 +95,10 @@ static inline void fib_rule_get(struct fib_rule *rule) atomic_inc(&rule->refcnt); } -static inline void fib_rule_put_rcu(struct rcu_head *head) -{ - struct fib_rule *rule = container_of(head, struct fib_rule, rcu); - release_net(rule->fr_net); - kfree(rule); -} - static inline void fib_rule_put(struct fib_rule *rule) { if (atomic_dec_and_test(&rule->refcnt)) - call_rcu(&rule->rcu, fib_rule_put_rcu); + kfree_rcu(rule, rcu); } static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index e086f4030dd2..fab51ceeabf3 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -49,11 +49,6 @@ struct net { atomic_t count; /* To decided when the network * namespace should be shut down. */ -#ifdef NETNS_REFCNT_DEBUG - atomic_t use_count; /* To track references we - * destroy on demand - */ -#endif spinlock_t rules_mod_lock; atomic64_t cookie_gen; @@ -236,30 +231,6 @@ int net_eq(const struct net *net1, const struct net *net2) #endif -#ifdef NETNS_REFCNT_DEBUG -static inline struct net *hold_net(struct net *net) -{ - if (net) - atomic_inc(&net->use_count); - return net; -} - -static inline void release_net(struct net *net) -{ - if (net) - atomic_dec(&net->use_count); -} -#else -static inline struct net *hold_net(struct net *net) -{ - return net; -} - -static inline void release_net(struct net *net) -{ -} -#endif - #ifdef CONFIG_NET_NS static inline void write_pnet(struct net **pnet, struct net *net) diff --git a/include/net/sock.h b/include/net/sock.h index d996c633bec2..95b2c1c220f9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2204,7 +2204,7 @@ static inline void sk_change_net(struct sock *sk, struct net *net) if (!net_eq(current_net, net)) { put_net(current_net); - sock_net_set(sk, hold_net(net)); + sock_net_set(sk, net); } } diff --git a/net/core/dev.c b/net/core/dev.c index 962ee9d71964..39fe369b46ad 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6841,8 +6841,6 @@ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - release_net(dev_net(dev)); - netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index b55677fed1c8..68ea6950cad1 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -31,7 +31,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->pref = pref; r->table = table; r->flags = flags; - r->fr_net = hold_net(ops->fro_net); + r->fr_net = ops->fro_net; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -116,7 +116,6 @@ static int __fib_rules_register(struct fib_rules_ops *ops) if (ops->family == o->family) goto errout; - hold_net(net); list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: @@ -160,15 +159,6 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) } } -static void fib_rules_put_rcu(struct rcu_head *head) -{ - struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); - struct net *net = ops->fro_net; - - release_net(net); - kfree(ops); -} - void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; @@ -178,7 +168,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops) fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); - call_rcu(&ops->rcu, fib_rules_put_rcu); + kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); @@ -303,7 +293,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) err = -ENOMEM; goto errout; } - rule->fr_net = hold_net(net); + rule->fr_net = net; if (tb[FRA_PRIORITY]) rule->pref = nla_get_u32(tb[FRA_PRIORITY]); @@ -423,7 +413,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) return 0; errout_free: - release_net(rule->fr_net); kfree(rule); errout: rules_ops_put(ops); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ad07990e943d..0e8b32efc031 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -591,7 +591,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (!n) goto out; - write_pnet(&n->net, hold_net(net)); + write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; if (dev) @@ -600,7 +600,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { if (dev) dev_put(dev); - release_net(net); kfree(n); n = NULL; goto out; @@ -634,7 +633,6 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, tbl->pdestructor(n); if (n->dev) dev_put(n->dev); - release_net(pneigh_net(n)); kfree(n); return 0; } @@ -657,7 +655,6 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); - release_net(pneigh_net(n)); kfree(n); continue; } @@ -1428,11 +1425,10 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); dev_hold(dev); p->dev = dev; - write_pnet(&p->net, hold_net(net)); + write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { - release_net(net); dev_put(dev); kfree(p); return NULL; @@ -1472,7 +1468,6 @@ EXPORT_SYMBOL(neigh_parms_release); static void neigh_parms_destroy(struct neigh_parms *parms) { - release_net(neigh_parms_net(parms)); kfree(parms); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb5290b8c428..e5e96b0f6717 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -236,10 +236,6 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->user_ns = user_ns; idr_init(&net->netns_ids); -#ifdef NETNS_REFCNT_DEBUG - atomic_set(&net->use_count, 0); -#endif - list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) @@ -294,13 +290,6 @@ out_free: static void net_free(struct net *net) { -#ifdef NETNS_REFCNT_DEBUG - if (unlikely(atomic_read(&net->use_count) != 0)) { - pr_emerg("network namespace not free! Usage: %d\n", - atomic_read(&net->use_count)); - return; - } -#endif kfree(rcu_access_pointer(net->gen)); kmem_cache_free(net_cachep, net); } diff --git a/net/core/sock.c b/net/core/sock.c index a9a9c2ff9260..c8842f279f7a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1455,7 +1455,6 @@ void sk_release_kernel(struct sock *sk) sock_hold(sk); sock_release(sk->sk_socket); - release_net(sock_net(sk)); sock_net_set(sk, get_net(&init_net)); sock_put(sk); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c6d267442dac..66c1e4fbf884 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -213,7 +213,6 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - release_net(fi->fib_net); if (fi->fib_metrics != (u32 *) dst_default_metrics) kfree(fi->fib_metrics); kfree(fi); @@ -814,7 +813,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else fi->fib_metrics = (u32 *) dst_default_metrics; - fi->fib_net = hold_net(net); + fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9111a4e22155..f6a12b97d12b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -61,7 +61,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb != NULL) { - write_pnet(&tb->ib_net, hold_net(net)); + write_pnet(&tb->ib_net, net); tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; @@ -79,7 +79,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); - release_net(ib_net(tb)); kmem_cache_free(cachep, tb); } } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2bd980526631..86ebf020925b 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -98,7 +98,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw) #ifdef SOCK_REFCNT_DEBUG pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif - release_net(twsk_net(tw)); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } @@ -196,7 +195,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); - twsk_net_set(tw, hold_net(sock_net(sk))); + twsk_net_set(tw, sock_net(sk)); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index e43e79d0a612..59c793040498 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -129,9 +129,6 @@ static const __net_initconst struct ip6addrlbl_init_table /* Object management */ static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) { -#ifdef CONFIG_NET_NS - release_net(p->lbl_net); -#endif kfree(p); } @@ -241,7 +238,7 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->label = label; INIT_HLIST_NODE(&newp->list); #ifdef CONFIG_NET_NS - newp->lbl_net = hold_net(net); + newp->lbl_net = net; #endif atomic_set(&newp->refcnt, 1); return newp; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index f45d6db50a45..457303886fd4 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -100,7 +100,6 @@ static void fl_free(struct ip6_flowlabel *fl) if (fl) { if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); - release_net(fl->fl_net); kfree(fl->opt); kfree_rcu(fl, rcu); } @@ -403,7 +402,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, } } - fl->fl_net = hold_net(net); + fl->fl_net = net; fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 5bae7243c577..096c6276e6b9 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -203,7 +203,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu) ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); - release_net(ovs_dp_get_net(dp)); kfree(dp->ports); kfree(dp); } @@ -1501,7 +1500,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (dp == NULL) goto err_free_reply; - ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); + ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); @@ -1575,7 +1574,6 @@ err_destroy_percpu: err_destroy_table: ovs_flow_tbl_destroy(&dp->table); err_free_dp: - release_net(ovs_dp_get_net(dp)); kfree(dp); err_free_reply: kfree_skb(reply); -- cgit v1.2.3 From 0c5c9fb55106333e773de8c9dd321fa8240caeb3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 11 Mar 2015 23:06:44 -0500 Subject: net: Introduce possible_net_t Having to say > #ifdef CONFIG_NET_NS > struct net *net; > #endif in structures is a little bit wordy and a little bit error prone. Instead it is possible to say: > typedef struct { > #ifdef CONFIG_NET_NS > struct net *net; > #endif > } possible_net_t; And then in a header say: > possible_net_t net; Which is cleaner and easier to use and easier to test, as the possible_net_t is always there no matter what the compile options. Further this allows read_pnet and write_pnet to be functions in all cases which is better at catching typos. This change adds possible_net_t, updates the definitions of read_pnet and write_pnet, updates optional struct net * variables that write_pnet uses on to have the type possible_net_t, and finally fixes up the b0rked users of read_pnet and write_pnet. Signed-off-by: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++------ include/net/cfg80211.h | 4 +--- include/net/genetlink.h | 4 +--- include/net/inet_hashtables.h | 4 +--- include/net/ip_vs.h | 8 ++++---- include/net/neighbour.h | 8 ++------ include/net/net_namespace.h | 23 +++++++++++++---------- include/net/netfilter/nf_conntrack.h | 5 ++--- include/net/sock.h | 4 +--- include/net/xfrm.h | 8 ++------ net/9p/trans_fd.c | 4 ++-- net/ipv4/ipmr.c | 4 +--- net/ipv6/addrlabel.c | 8 ++------ net/ipv6/ip6mr.c | 4 +--- net/openvswitch/datapath.h | 4 +--- net/packet/internal.h | 4 +--- 16 files changed, 37 insertions(+), 67 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cede40d9cac9..ddab1a2a07a0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1721,9 +1721,7 @@ struct net_device { struct netpoll_info __rcu *npinfo; #endif -#ifdef CONFIG_NET_NS - struct net *nd_net; -#endif + possible_net_t nd_net; /* mid-layer private */ union { @@ -1863,9 +1861,7 @@ struct net *dev_net(const struct net_device *dev) static inline void dev_net_set(struct net_device *dev, struct net *net) { -#ifdef CONFIG_NET_NS - dev->nd_net = net; -#endif + write_pnet(&dev->nd_net, net); } static inline bool netdev_uses_dsa(struct net_device *dev) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 64e09e1e8099..f977abec07f6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3183,10 +3183,8 @@ struct wiphy { const struct ieee80211_ht_cap *ht_capa_mod_mask; const struct ieee80211_vht_cap *vht_capa_mod_mask; -#ifdef CONFIG_NET_NS /* the network namespace this phy lives in currently */ - struct net *_net; -#endif + possible_net_t _net; #ifdef CONFIG_CFG80211_WEXT const struct iw_handler_def *wext; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 0574abd3db86..a9af1cc8c1bc 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -92,9 +92,7 @@ struct genl_info { struct genlmsghdr * genlhdr; void * userhdr; struct nlattr ** attrs; -#ifdef CONFIG_NET_NS - struct net * _net; -#endif + possible_net_t _net; void * user_ptr[2]; struct sock * dst_sk; }; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index dd1950a7e273..bcd64756e5fe 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -76,9 +76,7 @@ struct inet_ehash_bucket { * ports are created in O(1) time? I thought so. ;-) -DaveM */ struct inet_bind_bucket { -#ifdef CONFIG_NET_NS - struct net *ib_net; -#endif + possible_net_t ib_net; unsigned short port; signed char fastreuse; signed char fastreuseport; diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 20fd23398537..4e3731ee4eac 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -47,13 +47,13 @@ static inline struct net *skb_net(const struct sk_buff *skb) * Start with the most likely hit * End with BUG */ - if (likely(skb->dev && skb->dev->nd_net)) + if (likely(skb->dev && dev_net(skb->dev))) return dev_net(skb->dev); if (skb_dst(skb) && skb_dst(skb)->dev) return dev_net(skb_dst(skb)->dev); WARN(skb->sk, "Maybe skb_sknet should be used in %s() at line:%d\n", __func__, __LINE__); - if (likely(skb->sk && skb->sk->sk_net)) + if (likely(skb->sk && sock_net(skb->sk))) return sock_net(skb->sk); pr_err("There is no net ptr to find in the skb in %s() line:%d\n", __func__, __LINE__); @@ -71,11 +71,11 @@ static inline struct net *skb_sknet(const struct sk_buff *skb) #ifdef CONFIG_NET_NS #ifdef CONFIG_IP_VS_DEBUG /* Start with the most likely hit */ - if (likely(skb->sk && skb->sk->sk_net)) + if (likely(skb->sk && sock_net(skb->sk))) return sock_net(skb->sk); WARN(skb->dev, "Maybe skb_net should be used instead in %s() line:%d\n", __func__, __LINE__); - if (likely(skb->dev && skb->dev->nd_net)) + if (likely(skb->dev && dev_net(skb->dev))) return dev_net(skb->dev); pr_err("There is no net ptr to find in the skb in %s() line:%d\n", __func__, __LINE__); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index d48b8ec8b5f4..e7bdf5170802 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -65,9 +65,7 @@ enum { }; struct neigh_parms { -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; struct net_device *dev; struct list_head list; int (*neigh_setup)(struct neighbour *); @@ -167,9 +165,7 @@ struct neigh_ops { struct pneigh_entry { struct pneigh_entry *next; -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; struct net_device *dev; u8 flags; u8 key[0]; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fab51ceeabf3..f733656404de 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -231,24 +231,27 @@ int net_eq(const struct net *net1, const struct net *net2) #endif +typedef struct { #ifdef CONFIG_NET_NS + struct net *net; +#endif +} possible_net_t; -static inline void write_pnet(struct net **pnet, struct net *net) +static inline void write_pnet(possible_net_t *pnet, struct net *net) { - *pnet = net; +#ifdef CONFIG_NET_NS + pnet->net = net; +#endif } -static inline struct net *read_pnet(struct net * const *pnet) +static inline struct net *read_pnet(const possible_net_t *pnet) { - return *pnet; -} - +#ifdef CONFIG_NET_NS + return pnet->net; #else - -#define write_pnet(pnet, net) do { (void)(net);} while (0) -#define read_pnet(pnet) (&init_net) - + return &init_net; #endif +} #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 74f271a172dd..095433b8a8b0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -95,9 +95,8 @@ struct nf_conn { /* Timer function; drops refcnt when it goes off. */ struct timer_list timeout; -#ifdef CONFIG_NET_NS - struct net *ct_net; -#endif + possible_net_t ct_net; + /* all members below initialized via memset */ u8 __nfct_init_offset[0]; diff --git a/include/net/sock.h b/include/net/sock.h index 95b2c1c220f9..9411c3421dd3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -190,9 +190,7 @@ struct sock_common { struct hlist_nulls_node skc_portaddr_node; }; struct proto *skc_prot; -#ifdef CONFIG_NET_NS - struct net *skc_net; -#endif + possible_net_t skc_net; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr skc_v6_daddr; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dc4865e90fe4..d0ac7d7be8a7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -126,9 +126,7 @@ struct xfrm_state_walk { /* Full description of state of transformer. */ struct xfrm_state { -#ifdef CONFIG_NET_NS - struct net *xs_net; -#endif + possible_net_t xs_net; union { struct hlist_node gclist; struct hlist_node bydst; @@ -522,9 +520,7 @@ struct xfrm_policy_queue { }; struct xfrm_policy { -#ifdef CONFIG_NET_NS - struct net *xp_net; -#endif + possible_net_t xp_net; struct hlist_node bydst; struct hlist_node byidx; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 80d08f6664cb..3e3d82d8ff70 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -940,7 +940,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) sin_server.sin_family = AF_INET; sin_server.sin_addr.s_addr = in_aton(addr); sin_server.sin_port = htons(opts.port); - err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_INET, + err = __sock_create(current->nsproxy->net_ns, PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket, 1); if (err) { pr_err("%s (%d): problem creating socket\n", @@ -988,7 +988,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) sun_server.sun_family = PF_UNIX; strcpy(sun_server.sun_path, addr); - err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_UNIX, + err = __sock_create(current->nsproxy->net_ns, PF_UNIX, SOCK_STREAM, 0, &csocket, 1); if (err < 0) { pr_err("%s (%d): problem creating socket\n", diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9d78427652d2..5b188832800f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -73,9 +73,7 @@ struct mr_table { struct list_head list; -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 59c793040498..3cc50e2d3bf5 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -29,9 +29,7 @@ * Policy Table */ struct ip6addrlbl_entry { -#ifdef CONFIG_NET_NS - struct net *lbl_net; -#endif + possible_net_t lbl_net; struct in6_addr prefix; int prefixlen; int ifindex; @@ -237,9 +235,7 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); -#ifdef CONFIG_NET_NS - newp->lbl_net = net; -#endif + write_pnet(&newp->lbl_net, net); atomic_set(&newp->refcnt, 1); return newp; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 34b682617f50..4b9315aa273e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -56,9 +56,7 @@ struct mr6_table { struct list_head list; -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; u32 id; struct sock *mroute6_sk; struct timer_list ipmr_expire_timer; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 3ece94563079..4ec4a480b147 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -84,10 +84,8 @@ struct datapath { /* Stats. */ struct dp_stats_percpu __percpu *stats_percpu; -#ifdef CONFIG_NET_NS /* Network namespace ref. */ - struct net *net; -#endif + possible_net_t net; u32 user_features; }; diff --git a/net/packet/internal.h b/net/packet/internal.h index cdddf6a30399..fe6e20caea1d 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -74,9 +74,7 @@ extern struct mutex fanout_mutex; #define PACKET_FANOUT_MAX 256 struct packet_fanout { -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; unsigned int num_members; u16 id; u8 type; -- cgit v1.2.3 From 4170604feec780d00e7511c24fa0f6e5c2e4ed75 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 15 Mar 2015 21:07:14 -0700 Subject: switchdev: add swdev ops As discussed at netconf, introduce swdev_ops as first step to move switchdev ops from ndo to swdev. This will keep switchdev from cluttering up ndo ops space. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/net/switchdev.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ddab1a2a07a0..9e8a2a933c68 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1577,6 +1577,9 @@ struct net_device { const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; const struct forwarding_accel_ops *fwd_ops; +#ifdef CONFIG_NET_SWITCHDEV + const struct swdev_ops *swdev_ops; +#endif const struct header_ops *header_ops; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 1a9382febcc3..e5de53f92482 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -14,6 +14,44 @@ #include #include +struct fib_info; + +/** + * struct switchdev_ops - switchdev operations + * + * int (*swdev_parent_id_get)(struct net_device *dev, + * struct netdev_phys_item_id *psid); + * Called to get an ID of the switch chip this port is part of. + * If driver implements this, it indicates that it represents a port + * of a switch chip. + * + * int (*swdev_port_stp_update)(struct net_device *dev, u8 state); + * Called to notify switch device port of bridge port STP + * state change. + * + * int (*swdev_fib_ipv4_add)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 nlflags, u32 tb_id); + * Called to add/modify IPv4 route to switch device. + * + * int (*swdev_fib_ipv4_del)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 tb_id); + * Called to delete IPv4 route from switch device. + */ +struct swdev_ops { + int (*swdev_parent_id_get)(struct net_device *dev, + struct netdev_phys_item_id *psid); + int (*swdev_port_stp_update)(struct net_device *dev, u8 state); + int (*swdev_fib_ipv4_add)(struct net_device *dev, __be32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, + u32 tb_id); + int (*swdev_fib_ipv4_del)(struct net_device *dev, __be32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id); +}; + enum netdev_switch_notifier_type { NETDEV_SWITCH_FDB_ADD = 1, NETDEV_SWITCH_FDB_DEL, -- cgit v1.2.3 From 812a1c3ff3ee9d5100e0e71edb06681014e84a9b Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 15 Mar 2015 21:07:16 -0700 Subject: netdev: remove ndo ops for switchdev Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e8a2a933c68..dd1d069758be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -768,8 +768,6 @@ struct netdev_phys_item_id { typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); -struct fib_info; - /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1024,23 +1022,6 @@ struct fib_info; * be otherwise expressed by feature flags. The check is called with * the set of features that the stack has calculated and it returns * those the driver believes to be appropriate. - * - * int (*ndo_switch_parent_id_get)(struct net_device *dev, - * struct netdev_phys_item_id *psid); - * Called to get an ID of the switch chip this port is part of. - * If driver implements this, it indicates that it represents a port - * of a switch chip. - * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state); - * Called to notify switch device port of bridge port STP - * state change. - * int (*ndo_sw_parent_fib_ipv4_add)(struct net_device *dev, __be32 dst, - * int dst_len, struct fib_info *fi, - * u8 tos, u8 type, u32 nlflags, u32 tb_id); - * Called to add/modify IPv4 route to switch device. - * int (*ndo_sw_parent_fib_ipv4_del)(struct net_device *dev, __be32 dst, - * int dst_len, struct fib_info *fi, - * u8 tos, u8 type, u32 tb_id); - * Called to delete IPv4 route from switch device. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1197,25 +1178,6 @@ struct net_device_ops { netdev_features_t (*ndo_features_check) (struct sk_buff *skb, struct net_device *dev, netdev_features_t features); -#ifdef CONFIG_NET_SWITCHDEV - int (*ndo_switch_parent_id_get)(struct net_device *dev, - struct netdev_phys_item_id *psid); - int (*ndo_switch_port_stp_update)(struct net_device *dev, - u8 state); - int (*ndo_switch_fib_ipv4_add)(struct net_device *dev, - __be32 dst, - int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 nlflags, - u32 tb_id); - int (*ndo_switch_fib_ipv4_del)(struct net_device *dev, - __be32 dst, - int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 tb_id); -#endif }; /** -- cgit v1.2.3 From 822b3b2ebfff8e9b3d006086c527738a7ca00cd0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Mar 2015 14:57:33 +0200 Subject: net: Add max rate tx queue attribute This adds a tx_maxrate attribute to the tx queue sysfs entry allowing for max-rate limiting. Along with DCB-ETS and BQL this provides another knob to tune queue performance. The limit units are Mbps. By default it is disabled. To disable the rate limitation after it has been set for a queue, it should be set to zero. Signed-off-by: John Fastabend Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net-queues | 8 +++ Documentation/networking/scaling.txt | 9 ++++ include/linux/netdevice.h | 8 +++ net/core/net-sysfs.c | 67 +++++++++++++++++++----- 4 files changed, 80 insertions(+), 12 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/ABI/testing/sysfs-class-net-queues b/Documentation/ABI/testing/sysfs-class-net-queues index 5e9aeb91d355..0c0df91b1516 100644 --- a/Documentation/ABI/testing/sysfs-class-net-queues +++ b/Documentation/ABI/testing/sysfs-class-net-queues @@ -24,6 +24,14 @@ Description: Indicates the number of transmit timeout events seen by this network interface transmit queue. +What: /sys/class//queues/tx-/tx_maxrate +Date: March 2015 +KernelVersion: 4.1 +Contact: netdev@vger.kernel.org +Description: + A Mbps max-rate set for the queue, a value of zero means disabled, + default is disabled. + What: /sys/class//queues/tx-/xps_cpus Date: November 2010 KernelVersion: 2.6.38 diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt index 99ca40e8e810..cbfac0949635 100644 --- a/Documentation/networking/scaling.txt +++ b/Documentation/networking/scaling.txt @@ -421,6 +421,15 @@ best CPUs to share a given queue are probably those that share the cache with the CPU that processes transmit completions for that queue (transmit interrupts). +Per TX Queue rate limitation: +============================= + +These are rate-limitation mechanisms implemented by HW, where currently +a max-rate attribute is supported, by setting a Mbps value to + +/sys/class/net//queues/tx-/tx_maxrate + +A value of zero means disabled, and this is the default. Further Information =================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd1d069758be..76c5de4978a8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -587,6 +587,7 @@ struct netdev_queue { #ifdef CONFIG_BQL struct dql dql; #endif + unsigned long tx_maxrate; } ____cacheline_aligned_in_smp; static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) @@ -1022,6 +1023,10 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * be otherwise expressed by feature flags. The check is called with * the set of features that the stack has calculated and it returns * those the driver believes to be appropriate. + * int (*ndo_set_tx_maxrate)(struct net_device *dev, + * int queue_index, u32 maxrate); + * Called when a user wants to set a max-rate limitation of specific + * TX queue. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1178,6 +1183,9 @@ struct net_device_ops { netdev_features_t (*ndo_features_check) (struct sk_buff *skb, struct net_device *dev, netdev_features_t features); + int (*ndo_set_tx_maxrate)(struct net_device *dev, + int queue_index, + u32 maxrate); }; /** diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index cf30620a88e1..7e58bd7ec232 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -951,6 +951,60 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, return sprintf(buf, "%lu", trans_timeout); } +#ifdef CONFIG_XPS +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + +static ssize_t show_tx_maxrate(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + return sprintf(buf, "%lu\n", queue->tx_maxrate); +} + +static ssize_t set_tx_maxrate(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + int err, index = get_netdev_queue_index(queue); + u32 rate = 0; + + err = kstrtou32(buf, 10, &rate); + if (err < 0) + return err; + + if (!rtnl_trylock()) + return restart_syscall(); + + err = -EOPNOTSUPP; + if (dev->netdev_ops->ndo_set_tx_maxrate) + err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + + rtnl_unlock(); + if (!err) { + queue->tx_maxrate = rate; + return len; + } + return err; +} + +static struct netdev_queue_attribute queue_tx_maxrate = + __ATTR(tx_maxrate, S_IRUGO | S_IWUSR, + show_tx_maxrate, set_tx_maxrate); +#endif + static struct netdev_queue_attribute queue_trans_timeout = __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); @@ -1065,18 +1119,6 @@ static struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static unsigned int get_netdev_queue_index(struct netdev_queue *queue) -{ - struct net_device *dev = queue->dev; - unsigned int i; - - i = queue - dev->_tx; - BUG_ON(i >= dev->num_tx_queues); - - return i; -} - - static ssize_t show_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) { @@ -1153,6 +1195,7 @@ static struct attribute *netdev_queue_default_attrs[] = { &queue_trans_timeout.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &queue_tx_maxrate.attr, #endif NULL }; -- cgit v1.2.3 From db24a9044ee191c397dcd1c6574f56d67d7c8df5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Mar 2015 20:23:15 -0600 Subject: net: add support for phys_port_name Similar to port id allow netdevices to specify port names and export the name via sysfs. Drivers can implement the netdevice operation to assist udev in having sane default names for the devices using the rule: $ cat /etc/udev/rules.d/80-net-setup-link.rules SUBSYSTEM=="net", ACTION=="add", ATTR{phys_port_name}!="", NAME="$attr{phys_port_name}" Use of phys_name versus phys_id was suggested-by Jiri Pirko. Signed-off-by: David Ahern Acked-by: Jiri Pirko Acked-by: Scott Feldman Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 8 ++++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/if_link.h | 1 + net/core/dev.c | 18 ++++++++++++++++++ net/core/net-sysfs.c | 23 +++++++++++++++++++++++ net/core/rtnetlink.c | 21 +++++++++++++++++++++ 6 files changed, 75 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index beb8ec4dabbc..5ecfd72ba684 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -188,6 +188,14 @@ Description: Indicates the interface unique physical port identifier within the NIC, as a string. +What: /sys/class/net//phys_port_name +Date: March 2015 +KernelVersion: 4.0 +Contact: netdev@vger.kernel.org +Description: + Indicates the interface physical port name within the NIC, + as a string. + What: /sys/class/net//speed Date: October 2009 KernelVersion: 2.6.33 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 76c5de4978a8..ec8f9b5f6500 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1164,6 +1164,8 @@ struct net_device_ops { bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); + int (*ndo_get_phys_port_name)(struct net_device *dev, + char *name, size_t len); void (*ndo_add_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); @@ -2947,6 +2949,8 @@ int dev_set_mac_address(struct net_device *, struct sockaddr *); int dev_change_carrier(struct net_device *, bool new_carrier); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 756436e1ce89..7158fd00a109 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -147,6 +147,7 @@ enum { IFLA_CARRIER_CHANGES, IFLA_PHYS_SWITCH_ID, IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 39fe369b46ad..a1f24151db5b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5911,6 +5911,24 @@ int dev_get_phys_port_id(struct net_device *dev, } EXPORT_SYMBOL(dev_get_phys_port_id); +/** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_name) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_name(dev, name, len); +} +EXPORT_SYMBOL(dev_get_phys_port_name); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7e58bd7ec232..cc5cf689809c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -418,6 +418,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + char name[IFNAMSIZ]; + + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sprintf(buf, "%s\n", name); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_name); + static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -465,6 +487,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, NULL, }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 25b4b5d23485..6abe634c666c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -982,6 +982,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) +{ + char name[IFNAMSIZ]; + int err; + + err = dev_get_phys_port_name(dev, name, sizeof(name)); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; @@ -1072,6 +1090,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_port_name_fill(skb, dev)) + goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; -- cgit v1.2.3 From 99c4a26a159b28fa46a3e746a9b41b297e73d261 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 18 Mar 2015 22:52:33 -0400 Subject: net: Fix high overhead of vlan sub-device teardown. When a networking device is taken down that has a non-trivial number of VLAN devices configured under it, we eat a full synchronize_net() for every such VLAN device. This is because of the call chain: NETDEV_DOWN notifier --> vlan_device_event() --> dev_change_flags() --> __dev_change_flags() --> __dev_close() --> __dev_close_many() --> dev_deactivate_many() --> synchronize_net() This is kind of rediculous because we already have infrastructure for batching doing operation X to a list of net devices so that we only incur one sync. So make use of that by exporting dev_close_many() and adjusting it's interfaace so that the caller can fully manage the batch list. Use this in vlan_device_event() and all the overhead goes away. Reported-by: Salam Noureddine Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/8021q/vlan.c | 16 +++++++++++++--- net/core/dev.c | 10 ++++++---- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec8f9b5f6500..76951c5fbedf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2156,6 +2156,7 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev); int dev_close(struct net_device *dev); +int dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 64c6bed4a3d3..98a30a5b8664 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -413,7 +413,10 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan_transfer_features(dev, vlandev); break; - case NETDEV_DOWN: + case NETDEV_DOWN: { + struct net_device *tmp; + LIST_HEAD(close_list); + if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); @@ -425,11 +428,18 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) - dev_change_flags(vlandev, flgs & ~IFF_UP); + list_add(&vlandev->close_list, &close_list); + } + + dev_close_many(&close_list, false); + + list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { netif_stacked_transfer_operstate(dev, vlandev); + list_del_init(&vlandev->close_list); } + list_del(&close_list); break; - + } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { diff --git a/net/core/dev.c b/net/core/dev.c index a1f24151db5b..5d43e010ef87 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1385,7 +1385,7 @@ static int __dev_close(struct net_device *dev) return retval; } -static int dev_close_many(struct list_head *head) +int dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1399,11 +1399,13 @@ static int dev_close_many(struct list_head *head) list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); - list_del_init(&dev->close_list); + if (unlink) + list_del_init(&dev->close_list); } return 0; } +EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. @@ -1420,7 +1422,7 @@ int dev_close(struct net_device *dev) LIST_HEAD(single); list_add(&dev->close_list, &single); - dev_close_many(&single); + dev_close_many(&single, true); list_del(&single); } return 0; @@ -5986,7 +5988,7 @@ static void rollback_registered_many(struct list_head *head) /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); - dev_close_many(&close_head); + dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ -- cgit v1.2.3 From 0117ec1970c5fa9c566045e7df8db76acc8f150e Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 23 Mar 2015 18:40:02 +0100 Subject: net: remove never used forwarding_accel_ops pointer from net_device Cc: John Fastabend Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ae69e7df867..08c4ab37189f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1342,7 +1342,6 @@ enum netdev_priv_flags { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @ethtool_ops: Management operations - * @fwd_ops: Management operations * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * @@ -1551,7 +1550,6 @@ struct net_device { #endif const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; - const struct forwarding_accel_ops *fwd_ops; #ifdef CONFIG_NET_SWITCHDEV const struct swdev_ops *swdev_ops; #endif -- cgit v1.2.3 From e38f30256b36700aa63aa709dc091bf6eb69c257 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 27 Mar 2015 14:31:13 +0900 Subject: net: Introduce passthru_features_check As there are a number of (especially virtual) devices that don't need the multiple vlan check, introduce passthru_features_check() for convenience. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 08c4ab37189f..967bb4c8caf1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3657,6 +3657,9 @@ void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); +netdev_features_t passthru_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features); netdev_features_t netif_skb_features(struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) diff --git a/net/core/dev.c b/net/core/dev.c index cb46badbef5a..3a06003ecafd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2562,6 +2562,14 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, return features; } +netdev_features_t passthru_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return features; +} +EXPORT_SYMBOL(passthru_features_check); + static netdev_features_t dflt_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) -- cgit v1.2.3 From a54acb3a6f853e8394c4cb7b6a4d93c88f13eefd Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 2 Apr 2015 17:07:00 +0200 Subject: dev: introduce dev_get_iflink() The goal of this patch is to prepare the removal of the iflink field. It introduces a new ndo function, which will be implemented by virtual interfaces. There is no functional change into this patch. All readers of iflink field now call dev_get_iflink(). Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_core.c | 2 +- include/linux/netdevice.h | 4 ++++ net/batman-adv/hard-interface.c | 5 +++-- net/bridge/br_netlink.c | 4 ++-- net/core/dev.c | 21 +++++++++++++++++++-- net/core/link_watch.c | 4 ++-- net/core/net-sysfs.c | 10 +++++++++- net/core/rtnetlink.c | 8 ++++---- net/ipv4/ipmr.c | 2 +- net/ipv6/addrconf.c | 4 ++-- net/ipv6/ip6mr.c | 2 +- 11 files changed, 48 insertions(+), 18 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 2a175006028b..131bde98188d 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -330,7 +330,7 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb) struct rtable *rt; int err, ret = NET_XMIT_DROP; struct flowi4 fl4 = { - .flowi4_oif = dev->iflink, + .flowi4_oif = dev_get_iflink(dev), .flowi4_tos = RT_TOS(ip4h->tos), .flowi4_flags = FLOWI_FLAG_ANYSRC, .daddr = ip4h->daddr, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 967bb4c8caf1..788eb7a622ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1030,6 +1030,8 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * int queue_index, u32 maxrate); * Called when a user wants to set a max-rate limitation of specific * TX queue. + * int (*ndo_get_iflink)(const struct net_device *dev); + * Called to get the iflink value of this device. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1191,6 +1193,7 @@ struct net_device_ops { int (*ndo_set_tx_maxrate)(struct net_device *dev, int queue_index, u32 maxrate); + int (*ndo_get_iflink)(const struct net_device *dev); }; /** @@ -2149,6 +2152,7 @@ void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); +int dev_get_iflink(const struct net_device *dev); struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index fbda6b54baff..baf1f9843f2c 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -83,11 +83,12 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return true; /* no more parents..stop recursion */ - if (net_dev->iflink == 0 || net_dev->iflink == net_dev->ifindex) + if (dev_get_iflink(net_dev) == 0 || + dev_get_iflink(net_dev) == net_dev->ifindex) return false; /* recurse over the parent device */ - parent_dev = __dev_get_by_index(&init_net, net_dev->iflink); + parent_dev = __dev_get_by_index(&init_net, dev_get_iflink(net_dev)); /* if we got a NULL parent_dev there is something broken.. */ if (WARN(!parent_dev, "Cannot find parent device")) return false; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e1115a224a95..0e4ddb81610d 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -305,8 +305,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink))) + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; if (event == RTM_NEWLINK && port) { diff --git a/net/core/dev.c b/net/core/dev.c index 65492b0354c0..77172d085760 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -659,6 +659,23 @@ __setup("netdev=", netdev_boot_setup); *******************************************************************************/ +/** + * dev_get_iflink - get 'iflink' value of a interface + * @dev: targeted interface + * + * Indicates the ifindex the interface is linked to. + * Physical interfaces have the same 'ifindex' and 'iflink' values. + */ + +int dev_get_iflink(const struct net_device *dev) +{ + if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) + return dev->netdev_ops->ndo_get_iflink(dev); + + return dev->iflink; +} +EXPORT_SYMBOL(dev_get_iflink); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -6345,7 +6362,7 @@ int register_netdevice(struct net_device *dev) else if (__dev_get_by_index(net, dev->ifindex)) goto err_uninit; - if (dev->iflink == -1) + if (dev_get_iflink(dev) == -1) dev->iflink = dev->ifindex; /* Transfer changeable features to wanted_features and enable @@ -7061,7 +7078,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) { - int iflink = (dev->iflink == dev->ifindex); + int iflink = (dev_get_iflink(dev) == dev->ifindex); dev->ifindex = dev_new_index(net); if (iflink) dev->iflink = dev->ifindex; diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 49a9e3e06c08..982861607f88 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -40,7 +40,7 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { if (!netif_carrier_ok(dev)) - return (dev->ifindex != dev->iflink ? + return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); if (netif_dormant(dev)) @@ -89,7 +89,7 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (!netif_running(dev)) return false; - if (dev->ifindex != dev->iflink) + if (dev->ifindex != dev_get_iflink(dev)) return true; if (dev->priv_flags & IFF_TEAM_PORT) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index cc5cf689809c..4238d6da5c60 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -109,11 +109,19 @@ NETDEVICE_SHOW_RO(dev_id, fmt_hex); NETDEVICE_SHOW_RO(dev_port, fmt_dec); NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); NETDEVICE_SHOW_RO(addr_len, fmt_dec); -NETDEVICE_SHOW_RO(iflink, fmt_dec); NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); +static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, dev_get_iflink(ndev)); +} +static DEVICE_ATTR_RO(iflink); + static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { return sprintf(buf, fmt_dec, dev->name_assign_type); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b96ac2109c82..ee0186cdd5cf 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1055,8 +1055,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink)) || + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || (upper_dev && nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || @@ -2863,8 +2863,8 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink))) + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b4a545d24adb..eec68b0c3bc8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -801,7 +801,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->pkt_out = 0; v->link = dev->ifindex; if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) - v->link = dev->iflink; + v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5c9e94cb1b2c..37b70e82bff8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4858,8 +4858,8 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink))) + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; protoinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protoinfo) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index caf6b99374e6..18a5ab286420 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -992,7 +992,7 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, v->pkt_out = 0; v->link = dev->ifindex; if (v->flags & MIFF_REGISTER) - v->link = dev->iflink; + v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); -- cgit v1.2.3 From 7a66bbc96ce9ad8261fa5f7f6ae65370eb6866ee Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 2 Apr 2015 17:07:09 +0200 Subject: net: remove iflink field from struct net_device Now that all users of iflink have the ndo_get_iflink handler available, it's possible to remove this field. By default, dev_get_iflink() returns the ifindex of the interface. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +-- net/core/dev.c | 13 ++----------- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 788eb7a622ad..846a1f5bc9db 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1538,7 +1538,7 @@ struct net_device { netdev_features_t mpls_features; int ifindex; - int iflink; + int group; struct net_device_stats stats; @@ -1741,7 +1741,6 @@ struct net_device { #endif struct phy_device *phydev; struct lock_class_key *qdisc_tx_busylock; - int group; struct pm_qos_request pm_qos_req; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/core/dev.c b/net/core/dev.c index 77172d085760..3be107e0bc93 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -672,7 +672,7 @@ int dev_get_iflink(const struct net_device *dev) if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); - return dev->iflink; + return dev->ifindex; } EXPORT_SYMBOL(dev_get_iflink); @@ -6331,8 +6331,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - dev->iflink = -1; - ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; @@ -6362,9 +6360,6 @@ int register_netdevice(struct net_device *dev) else if (__dev_get_by_index(net, dev->ifindex)) goto err_uninit; - if (dev_get_iflink(dev) == -1) - dev->iflink = dev->ifindex; - /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ @@ -7077,12 +7072,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_net_set(dev, net); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) { - int iflink = (dev_get_iflink(dev) == dev->ifindex); + if (__dev_get_by_index(net, dev->ifindex)) dev->ifindex = dev_new_index(net); - if (iflink) - dev->iflink = dev->ifindex; - } /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); -- cgit v1.2.3 From e79d8429aac95a5cbe4c235795c7cd554c91f924 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 3 Apr 2015 22:17:17 +1030 Subject: netdevice: document NETDEV_TX_BUSY deprecation. This paraphrases DaveM (and steals some of his words) explaining why a device shouldn't return NETDEV_TX_BUSY, even though it looks so inviting to driver authors. See http://www.spinics.net/lists/netdev/msg322350.html Inspired-by: David Miller Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 846a1f5bc9db..a710d22b174f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -795,7 +795,10 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, * struct net_device *dev); * Called when a packet needs to be transmitted. - * Must return NETDEV_TX_OK , NETDEV_TX_BUSY. + * Returns NETDEV_TX_OK. Can return NETDEV_TX_BUSY, but you should stop + * the queue before that can happen; it's for obsolete devices and weird + * corner cases, but the stack really does a non-trivial amount + * of useless work if you return NETDEV_TX_BUSY. * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * -- cgit v1.2.3 From f60e5990d9c1424af9dbca60a23ba2a1c7c1ce90 Mon Sep 17 00:00:00 2001 From: "hannes@stressinduktion.org" Date: Wed, 1 Apr 2015 17:07:44 +0200 Subject: ipv6: protect skb->sk accesses from recursive dereference inside the stack We should not consult skb->sk for output decisions in xmit recursion levels > 0 in the stack. Otherwise local socket settings could influence the result of e.g. tunnel encapsulation process. ipv6 does not conform with this in three places: 1) ip6_fragment: we do consult ipv6_npinfo for frag_size 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should loop the packet back to the local socket 3) ip6_skb_dst_mtu could query the settings from the user socket and force a wrong MTU Furthermore: In sk_mc_loop we could potentially land in WARN_ON(1) if we use a PF_PACKET socket ontop of an IPv6-backed vxlan device. Reuse xmit_recursion as we are currently only interested in protecting tunnel devices. Cc: Jiri Pirko Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip.h | 16 ---------------- include/net/ip6_route.h | 3 ++- include/net/sock.h | 2 ++ net/core/dev.c | 4 +++- net/core/sock.c | 19 +++++++++++++++++++ net/ipv6/ip6_output.c | 3 ++- 7 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dcf6ec27739b..278738873703 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2185,6 +2185,12 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +DECLARE_PER_CPU(int, xmit_recursion); +static inline int dev_recursion_level(void) +{ + return this_cpu_read(xmit_recursion); +} + struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/include/net/ip.h b/include/net/ip.h index 025c61c0dffb..6cc1eafb153a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -453,22 +453,6 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif -static inline int sk_mc_loop(struct sock *sk) -{ - if (!sk) - return 1; - switch (sk->sk_family) { - case AF_INET: - return inet_sk(sk)->mc_loop; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - return inet6_sk(sk)->mc_loop; -#endif - } - WARN_ON(1); - return 1; -} - bool ip_call_ra_chain(struct sk_buff *skb); /* diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1d09b46c1e48..eda131d179d9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -174,7 +174,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); static inline int ip6_skb_dst_mtu(struct sk_buff *skb) { - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ? skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); diff --git a/include/net/sock.h b/include/net/sock.h index ab186b1d31ff..e4079c28e6b8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1762,6 +1762,8 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); +bool sk_mc_loop(struct sock *sk); + static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); diff --git a/net/core/dev.c b/net/core/dev.c index 962ee9d71964..45109b70664e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2848,7 +2848,9 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); +DEFINE_PER_CPU(int, xmit_recursion); +EXPORT_SYMBOL(xmit_recursion); + #define RECURSION_LIMIT 10 /** diff --git a/net/core/sock.c b/net/core/sock.c index 78e89eb7eb70..71e3e5f1eaa0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) sock_reset_flag(sk, bit); } +bool sk_mc_loop(struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + switch (sk->sk_family) { + case AF_INET: + return inet_sk(sk)->mc_loop; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_sk(sk)->mc_loop; +#endif + } + WARN_ON(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e80b61b51ff..36cf0ab685a0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; -- cgit v1.2.3