diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/dev.c | 138 | ||||
-rw-r--r-- | net/core/neighbour.c | 2 | ||||
-rw-r--r-- | net/core/net-traces.c | 4 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 274 | ||||
-rw-r--r-- | net/core/skbuff.c | 3 |
5 files changed, 268 insertions, 153 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index ab9b8d0d115e..5df6cbce727c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -96,6 +96,7 @@ #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/sock.h> +#include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dst.h> @@ -182,8 +183,8 @@ EXPORT_SYMBOL(dev_base_lock); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; -static DEFINE_HASHTABLE(napi_hash, 8); +static unsigned int napi_gen_id = NR_CPUS; +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; @@ -2403,17 +2404,20 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, + name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); @@ -3018,7 +3022,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, int queue_index = 0; #ifdef CONFIG_XPS - if (skb->sender_cpu == 0) + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif @@ -4350,6 +4356,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); @@ -4383,7 +4390,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - napi->skb = skb; + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } } return skb; } @@ -4658,7 +4668,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -4669,43 +4679,101 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } -EXPORT_SYMBOL_GPL(napi_by_id); -void napi_hash_add(struct napi_struct *napi) +#if defined(CONFIG_NET_RX_BUSY_POLL) +#define BUSY_POLL_BUDGET 8 +bool sk_busy_loop(struct sock *sk, int nonblock) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + int (*busy_poll)(struct napi_struct *dev); + struct napi_struct *napi; + int rc = false; - spin_lock(&napi_hash_lock); + rcu_read_lock(); - /* 0 is not a valid id, we also skip an id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + /* Note: ndo_busy_poll method is optional in linux-4.5 */ + busy_poll = napi->dev->netdev_ops->ndo_busy_poll; + + do { + rc = 0; + local_bh_disable(); + if (busy_poll) { + rc = busy_poll(napi); + } else if (napi_schedule_prep(napi)) { + void *have = netpoll_poll_lock(napi); + + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi); + if (rc == BUSY_POLL_BUDGET) { + napi_complete_done(napi, rc); + napi_schedule(napi); + } + } + netpoll_poll_unlock(have); } + if (rc > 0) + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + local_bh_enable(); - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ - spin_unlock(&napi_hash_lock); - } + cpu_relax(); + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + !need_resched() && !busy_loop_timeout(end_time)); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock(); + return rc; +} +EXPORT_SYMBOL(sk_busy_loop); + +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +void napi_hash_add(struct napi_struct *napi) +{ + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || + test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + return; + + spin_lock(&napi_hash_lock); + + /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + do { + if (unlikely(++napi_gen_id < NR_CPUS + 1)) + napi_gen_id = NR_CPUS + 1; + } while (napi_by_id(napi_gen_id)); + napi->napi_id = napi_gen_id; + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); } EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -void napi_hash_del(struct napi_struct *napi) +bool napi_hash_del(struct napi_struct *napi) { + bool rcu_sync_needed = false; + spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { + rcu_sync_needed = true; hlist_del_rcu(&napi->napi_hash_node); - + } spin_unlock(&napi_hash_lock); + return rcu_sync_needed; } EXPORT_SYMBOL_GPL(napi_hash_del); @@ -4741,6 +4809,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); @@ -4760,8 +4829,12 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +/* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { + might_sleep(); + if (napi_hash_del(napi)) + synchronize_net(); list_del_init(&napi->dev_list); napi_free_frags(napi); @@ -6426,11 +6499,16 @@ int __netdev_update_features(struct net_device *dev) if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); + /* return non-0 since some features might have changed and + * it's better to fire a spurious notification than miss it + */ return -1; } @@ -7156,11 +7234,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs); * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. * If this is the last reference then it will be freed. + * Must be called in process context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + might_sleep(); netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1aa8437ed6c4..e6af42da28d9 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -857,7 +857,7 @@ static void neigh_probe(struct neighbour *neigh) struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) - skb = skb_copy(skb, GFP_ATOMIC); + skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index adef015b2f41..92da5e4ceb4f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -32,6 +32,10 @@ #include <trace/events/sock.h> #include <trace/events/udp.h> #include <trace/events/fib.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <trace/events/fib6.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 504bd17b7456..34ba7a08876d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1045,15 +1045,156 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, + struct net_device *dev) +{ + const struct rtnl_link_stats64 *stats; + struct rtnl_link_stats64 temp; + struct nlattr *attr; + + stats = dev_get_stats(dev, &temp); + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats(nla_data(attr), stats); + + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats64(nla_data(attr), stats); + + return 0; +} + +static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, + struct net_device *dev, + int vfs_num, + struct nlattr *vfinfo) +{ + struct ifla_vf_rss_query_en vf_rss_query_en; + struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_stats vf_stats; + struct ifla_vf_trust vf_trust; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; + struct nlattr *vf, *vfstats; + struct ifla_vf_mac vf_mac; + struct ifla_vf_info ivi; + + /* Not all SR-IOV capable drivers support the + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. + */ + ivi.spoofchk = -1; + ivi.rss_query_en = -1; + ivi.trusted = -1; + memset(ivi.mac, 0, sizeof(ivi.mac)); + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; + if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) + return 0; + + vf_mac.vf = + vf_vlan.vf = + vf_rate.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = + vf_linkstate.vf = + vf_rss_query_en.vf = + vf_trust.vf = ivi.vf; + + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; + vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; + vf_trust.setting = ivi.trusted; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || + nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate) || + nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en) || + nla_put(skb, IFLA_VF_TRUST, + sizeof(vf_trust), &vf_trust)) + return -EMSGSIZE; + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start(skb, IFLA_VF_STATS); + if (!vfstats) { + nla_nest_cancel(skb, vf); + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast) || + nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast)) + return -EMSGSIZE; + nla_nest_end(skb, vfstats); + nla_nest_end(skb, vf); + return 0; +} + +static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +{ + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats; - struct nlattr *attr, *af_spec; + struct nlattr *af_spec; struct rtnl_af_ops *af_ops; struct net_device *upper_dev = netdev_master_upper_dev_get(dev); @@ -1096,18 +1237,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; - if (1) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) - goto nla_put_failure; - } + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure; if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || @@ -1124,128 +1255,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); - if (attr == NULL) - goto nla_put_failure; - - stats = dev_get_stats(dev, &temp); - copy_rtnl_link_stats(nla_data(attr), stats); - - attr = nla_reserve(skb, IFLA_STATS64, - sizeof(struct rtnl_link_stats64)); - if (attr == NULL) + if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - copy_rtnl_link_stats64(nla_data(attr), stats); if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent - && (ext_filter_mask & RTEXT_FILTER_VF)) { + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && + ext_filter_mask & RTEXT_FILTER_VF) { int i; - - struct nlattr *vfinfo, *vf, *vfstats; + struct nlattr *vfinfo; int num_vfs = dev_num_vf(dev->dev.parent); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); if (!vfinfo) goto nla_put_failure; for (i = 0; i < num_vfs; i++) { - struct ifla_vf_info ivi; - struct ifla_vf_mac vf_mac; - struct ifla_vf_vlan vf_vlan; - struct ifla_vf_rate vf_rate; - struct ifla_vf_tx_rate vf_tx_rate; - struct ifla_vf_spoofchk vf_spoofchk; - struct ifla_vf_link_state vf_linkstate; - struct ifla_vf_rss_query_en vf_rss_query_en; - struct ifla_vf_stats vf_stats; - struct ifla_vf_trust vf_trust; - - /* - * Not all SR-IOV capable drivers support the - * spoofcheck and "RSS query enable" query. Preset to - * -1 so the user space tool can detect that the driver - * didn't report anything. - */ - ivi.spoofchk = -1; - ivi.rss_query_en = -1; - ivi.trusted = -1; - memset(ivi.mac, 0, sizeof(ivi.mac)); - /* The default value for VF link state is "auto" - * IFLA_VF_LINK_STATE_AUTO which equals zero - */ - ivi.linkstate = 0; - if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) - break; - vf_mac.vf = - vf_vlan.vf = - vf_rate.vf = - vf_tx_rate.vf = - vf_spoofchk.vf = - vf_linkstate.vf = - vf_rss_query_en.vf = - vf_trust.vf = ivi.vf; - - memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); - vf_vlan.vlan = ivi.vlan; - vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.max_tx_rate; - vf_rate.min_tx_rate = ivi.min_tx_rate; - vf_rate.max_tx_rate = ivi.max_tx_rate; - vf_spoofchk.setting = ivi.spoofchk; - vf_linkstate.link_state = ivi.linkstate; - vf_rss_query_en.setting = ivi.rss_query_en; - vf_trust.setting = ivi.trusted; - vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || - nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || - nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), - &vf_rate) || - nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), - &vf_tx_rate) || - nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk) || - nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate) || - nla_put(skb, IFLA_VF_RSS_QUERY_EN, - sizeof(vf_rss_query_en), - &vf_rss_query_en) || - nla_put(skb, IFLA_VF_TRUST, - sizeof(vf_trust), &vf_trust)) + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) goto nla_put_failure; - memset(&vf_stats, 0, sizeof(vf_stats)); - if (dev->netdev_ops->ndo_get_vf_stats) - dev->netdev_ops->ndo_get_vf_stats(dev, i, - &vf_stats); - vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast) || - nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast)) - goto nla_put_failure; - nla_nest_end(skb, vfstats); - nla_nest_end(skb, vf); } + nla_nest_end(skb, vfinfo); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index aa41e6dd6429..152b9c70e252 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4268,7 +4268,8 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) return NULL; } - memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len, + 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; } |