From fc766e4c4965915ab52a1d1fa3c7a7b3e7bc07f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Oct 2010 03:09:24 +0000 Subject: decnet: RCU conversion and get rid of dev_base_lock While tracking dev_base_lock users, I found decnet used it in dnet_select_source(), but for a wrong purpose: Writers only hold RTNL, not dev_base_lock, so readers must use RCU if they cannot use RTNL. Adds an rcu_head in struct dn_ifaddr and handle proper RCU management. Adds __rcu annotation in dn_route as well. Signed-off-by: Eric Dumazet Acked-by: Steven Whitehouse Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8fd2c23a1b9..578debb801f4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -951,7 +951,7 @@ struct net_device { #endif void *atalk_ptr; /* AppleTalk link */ struct in_device __rcu *ip_ptr; /* IPv4 specific data */ - void *dn_ptr; /* DECnet specific data */ + struct dn_dev __rcu *dn_ptr; /* DECnet specific data */ struct inet6_dev __rcu *ip6_ptr; /* IPv6 specific data */ void *ec_ptr; /* Econet specific data */ void *ax25_ptr; /* AX.25 specific data */ -- cgit v1.2.3 From 58e998c6d23988490162cef0784b19ea274d90bb Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 29 Oct 2010 12:14:55 +0000 Subject: offloading: Force software GSO for multiple vlan tags. We currently use vlan_features to check for TSO support if there is a vlan tag. However, it's quite likely that the NIC is not able to do TSO when there is an arbitrary number of tags. Therefore if there is more than one tag (in-band or out-of-band), fall back to software emulation. Signed-off-by: Jesse Gross CC: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++---- net/core/dev.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 578debb801f4..6e4cfbc53d4c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2239,6 +2239,8 @@ unsigned long netdev_fix_features(unsigned long features, const char *name); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); +int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev); + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; @@ -2254,10 +2256,7 @@ static inline int skb_gso_ok(struct sk_buff *skb, int features) static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { if (skb_is_gso(skb)) { - int features = dev->features; - - if (skb->protocol == htons(ETH_P_8021Q) || skb->vlan_tci) - features &= dev->vlan_features; + int features = netif_get_vlan_features(skb, dev); return (!skb_gso_ok(skb, features) || unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); diff --git a/net/core/dev.c b/net/core/dev.c index 368930a988e3..8b500c3e0297 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1968,6 +1968,22 @@ static inline void skb_orphan_try(struct sk_buff *skb) } } +int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev) +{ + __be16 protocol = skb->protocol; + + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else if (!skb->vlan_tci) + return dev->features; + + if (protocol != htons(ETH_P_8021Q)) + return dev->features & dev->vlan_features; + else + return 0; +} + /* * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or -- cgit v1.2.3 From fe8222406c8277a21172479d3a8283d31c209028 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 9 Nov 2010 10:47:38 +0000 Subject: net: Simplify RX queue allocation This patch move RX queue allocation to alloc_netdev_mq and freeing of the queues to free_netdev (symmetric to TX queue allocation). Each kobject RX queue takes a reference to the queue's device so that the device can't be freed before all the kobjects have been released-- this obviates the need for reference counts specific to RX queues. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +-- net/core/dev.c | 19 ++++++++++--------- net/core/net-sysfs.c | 7 ++----- 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6e4cfbc53d4c..fccb11f879e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -592,8 +592,7 @@ struct netdev_rx_queue { struct rps_map __rcu *rps_map; struct rps_dev_flow_table __rcu *rps_flow_table; struct kobject kobj; - struct netdev_rx_queue *first; - atomic_t count; + struct net_device *dev; } ____cacheline_aligned_in_smp; #endif /* CONFIG_RPS */ diff --git a/net/core/dev.c b/net/core/dev.c index 75490670e0a9..8725d168d1f5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5051,12 +5051,8 @@ static int netif_alloc_rx_queues(struct net_device *dev) } dev->_rx = rx; - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ for (i = 0; i < count; i++) - rx[i].first = rx; + rx[i].dev = dev; #endif return 0; } @@ -5132,10 +5128,6 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; - ret = netif_alloc_rx_queues(dev); - if (ret) - goto out; - netdev_init_queues(dev); /* Init, if this function is available */ @@ -5601,6 +5593,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, #ifdef CONFIG_RPS dev->num_rx_queues = queue_count; dev->real_num_rx_queues = queue_count; + if (netif_alloc_rx_queues(dev)) + goto free_pcpu; #endif dev->gso_max_size = GSO_MAX_SIZE; @@ -5618,6 +5612,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, free_pcpu: free_percpu(dev->pcpu_refcnt); kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + free_p: kfree(p); return NULL; @@ -5639,6 +5637,9 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif kfree(rcu_dereference_raw(dev->ingress_queue)); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a5ff5a89f376..3ba526b56fe3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -706,7 +706,6 @@ static struct attribute *rx_queue_default_attrs[] = { static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); - struct netdev_rx_queue *first = queue->first; struct rps_map *map; struct rps_dev_flow_table *flow_table; @@ -719,8 +718,7 @@ static void rx_queue_release(struct kobject *kobj) if (flow_table) call_rcu(&flow_table->rcu, rps_dev_flow_table_release); - if (atomic_dec_and_test(&first->count)) - kfree(first); + dev_put(queue->dev); } static struct kobj_type rx_queue_ktype = { @@ -732,7 +730,6 @@ static struct kobj_type rx_queue_ktype = { static int rx_queue_add_kobject(struct net_device *net, int index) { struct netdev_rx_queue *queue = net->_rx + index; - struct netdev_rx_queue *first = queue->first; struct kobject *kobj = &queue->kobj; int error = 0; @@ -745,7 +742,7 @@ static int rx_queue_add_kobject(struct net_device *net, int index) } kobject_uevent(kobj, KOBJ_ADD); - atomic_inc(&first->count); + dev_hold(queue->dev); return error; } -- cgit v1.2.3 From 61391cde9eefac5cfcf6d214aa80c77e58b1626b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 15 Nov 2010 06:38:12 +0000 Subject: netdev: add rcu annotations to receive handler hook Suggested by Eric's bridge RCU changes. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fccb11f879e5..b45c1b8b1d19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -994,8 +994,8 @@ struct net_device { unsigned int real_num_rx_queues; #endif - rx_handler_func_t *rx_handler; - void *rx_handler_data; + rx_handler_func_t __rcu *rx_handler; + void __rcu *rx_handler_data; struct netdev_queue __rcu *ingress_queue; -- cgit v1.2.3 From 1d24eb4815d1e0e8b451ecc546645f8ef1176d4f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 21 Nov 2010 13:17:27 +0000 Subject: xps: Transmit Packet Steering This patch implements transmit packet steering (XPS) for multiqueue devices. XPS selects a transmit queue during packet transmission based on configuration. This is done by mapping the CPU transmitting the packet to a queue. This is the transmit side analogue to RPS-- where RPS is selecting a CPU based on receive queue, XPS selects a queue based on the CPU (previously there was an XPS patch from Eric Dumazet, but that might more appropriately be called transmit completion steering). Each transmit queue can be associated with a number of CPUs which will use the queue to send packets. This is configured as a CPU mask on a per queue basis in: /sys/class/net/eth/queues/tx-/xps_cpus The mappings are stored per device in an inverted data structure that maps CPUs to queues. In the netdevice structure this is an array of num_possible_cpu structures where each structure holds and array of queue_indexes for queues which that CPU can use. The benefits of XPS are improved locality in the per queue data structures. Also, transmit completions are more likely to be done nearer to the sending thread, so this should promote locality back to the socket on free (e.g. UDP). The benefits of XPS are dependent on cache hierarchy, application load, and other factors. XPS would nominally be configured so that a queue would only be shared by CPUs which are sharing a cache, the degenerative configuration woud be that each CPU has it's own queue. Below are some benchmark results which show the potential benfit of this patch. The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp. bnx2x on 16 core AMD XPS (16 queues, 1 TX queue per CPU) 1234K at 100% CPU No XPS (16 queues) 996K at 100% CPU Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 30 ++++ net/core/dev.c | 53 ++++++- net/core/net-sysfs.c | 369 +++++++++++++++++++++++++++++++++++++++++++++- net/core/net-sysfs.h | 3 + 4 files changed, 447 insertions(+), 8 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b45c1b8b1d19..badf9285fe0d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -503,6 +503,10 @@ struct netdev_queue { struct Qdisc *qdisc; unsigned long state; struct Qdisc *qdisc_sleeping; +#ifdef CONFIG_RPS + struct kobject kobj; +#endif + /* * write mostly part */ @@ -529,6 +533,30 @@ struct rps_map { }; #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16))) +/* + * This structure holds an XPS map which can be of variable length. The + * map is an array of queues. + */ +struct xps_map { + unsigned int len; + unsigned int alloc_len; + struct rcu_head rcu; + u16 queues[0]; +}; +#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16))) +#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map)) \ + / sizeof(u16)) + +/* + * This structure holds all XPS maps for device. Maps are indexed by CPU. + */ +struct xps_dev_maps { + struct rcu_head rcu; + struct xps_map *cpu_map[0]; +}; +#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ + (nr_cpu_ids * sizeof(struct xps_map *))) + /* * The rps_dev_flow structure contains the mapping of a flow to a CPU and the * tail pointer for that CPU's input queue at the time of last enqueue. @@ -1016,6 +1044,8 @@ struct net_device { unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; + struct xps_dev_maps *xps_maps; + /* These may be needed for future network-power-down code. */ /* diff --git a/net/core/dev.c b/net/core/dev.c index 7b17674a29ec..c852f0038a08 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1557,12 +1557,16 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { + int rc; + if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); if (txq < dev->real_num_tx_queues) qdisc_reset_all_tx_gt(dev, txq); } @@ -2142,6 +2146,44 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) return queue_index; } +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_RPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->rxhash; + hash = jhash_1word(hash, hashrnd); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { @@ -2161,7 +2203,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, queue_index >= dev->real_num_tx_queues) { int old_index = queue_index; - queue_index = skb_tx_hash(dev, skb); + queue_index = get_xps_queue(dev, skb); + if (queue_index < 0) + queue_index = skb_tx_hash(dev, skb); if (queue_index != old_index && sk) { struct dst_entry *dst = @@ -5066,6 +5110,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; + int i; BUG_ON(count < 1); @@ -5076,6 +5121,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return -ENOMEM; } dev->_tx = tx; + + for (i = 0; i < count; i++) + tx[i].dev = dev; + return 0; } @@ -5083,8 +5132,6 @@ static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { - queue->dev = dev; - /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7abeb7ceaa4c..68dbbfdee274 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -772,18 +772,377 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) return error; } -static int rx_queue_register_kobjects(struct net_device *net) +/* + * netdev_queue sysfs structures and functions. + */ +struct netdev_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_netdev_queue_attr(_attr) container_of(_attr, \ + struct netdev_queue_attribute, attr) + +#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) + +static ssize_t netdev_queue_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t netdev_queue_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops netdev_queue_sysfs_ops = { + .show = netdev_queue_attr_show, + .store = netdev_queue_attr_store, +}; + +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) { + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + + +static ssize_t show_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + cpumask_var_t mask; + unsigned long index; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + for_each_possible_cpu(i) { + struct xps_map *map = + rcu_dereference(dev_maps->cpu_map[i]); + if (map) { + int j; + for (j = 0; j < map->len; j++) { + if (map->queues[j] == index) { + cpumask_set_cpu(i, mask); + break; + } + } + } + } + } + rcu_read_unlock(); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + free_cpumask_var(mask); + return -EINVAL; + } + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static void xps_map_release(struct rcu_head *rcu) +{ + struct xps_map *map = container_of(rcu, struct xps_map, rcu); + + kfree(map); +} + +static void xps_dev_maps_release(struct rcu_head *rcu) +{ + struct xps_dev_maps *dev_maps = + container_of(rcu, struct xps_dev_maps, rcu); + + kfree(dev_maps); +} + +static DEFINE_MUTEX(xps_map_mutex); + +static ssize_t store_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + cpumask_var_t mask; + int err, i, cpu, pos, map_len, alloc_len, need_set; + unsigned long index; + struct xps_map *map, *new_map; + struct xps_dev_maps *dev_maps, *new_dev_maps; + int nonempty = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + new_dev_maps = kzalloc(max_t(unsigned, + XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); + if (!new_dev_maps) { + free_cpumask_var(mask); + return -ENOMEM; + } + + mutex_lock(&xps_map_mutex); + + dev_maps = dev->xps_maps; + + for_each_possible_cpu(cpu) { + new_map = map = dev_maps ? dev_maps->cpu_map[cpu] : NULL; + + if (map) { + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + map_len = map->len; + alloc_len = map->alloc_len; + } else + pos = map_len = alloc_len = 0; + + need_set = cpu_isset(cpu, *mask) && cpu_online(cpu); + + if (need_set && pos >= map_len) { + /* Need to add queue to this CPU's map */ + if (map_len >= alloc_len) { + alloc_len = alloc_len ? + 2 * alloc_len : XPS_MIN_MAP_ALLOC; + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), + GFP_KERNEL); + if (!new_map) + goto error; + new_map->alloc_len = alloc_len; + for (i = 0; i < map_len; i++) + new_map->queues[i] = map->queues[i]; + new_map->len = map_len; + } + new_map->queues[new_map->len++] = index; + } else if (!need_set && pos < map_len) { + /* Need to remove queue from this CPU's map */ + if (map_len > 1) + new_map->queues[pos] = + new_map->queues[--new_map->len]; + else + new_map = NULL; + } + new_dev_maps->cpu_map[cpu] = new_map; + } + + /* Cleanup old maps */ + for_each_possible_cpu(cpu) { + map = dev_maps ? dev_maps->cpu_map[cpu] : NULL; + if (map && new_dev_maps->cpu_map[cpu] != map) + call_rcu(&map->rcu, xps_map_release); + if (new_dev_maps->cpu_map[cpu]) + nonempty = 1; + } + + if (nonempty) + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + else { + kfree(new_dev_maps); + rcu_assign_pointer(dev->xps_maps, NULL); + } + + if (dev_maps) + call_rcu(&dev_maps->rcu, xps_dev_maps_release); + + mutex_unlock(&xps_map_mutex); + + free_cpumask_var(mask); + return len; + +error: + mutex_unlock(&xps_map_mutex); + + if (new_dev_maps) + for_each_possible_cpu(i) + kfree(new_dev_maps->cpu_map[i]); + kfree(new_dev_maps); + free_cpumask_var(mask); + return -ENOMEM; +} + +static struct netdev_queue_attribute xps_cpus_attribute = + __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); + +static struct attribute *netdev_queue_default_attrs[] = { + &xps_cpus_attribute.attr, + NULL +}; + +static void netdev_queue_release(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + struct xps_map *map; + unsigned long index; + int i, pos, nonempty = 0; + + index = get_netdev_queue_index(queue); + + mutex_lock(&xps_map_mutex); + dev_maps = dev->xps_maps; + + if (dev_maps) { + for_each_possible_cpu(i) { + map = dev_maps->cpu_map[i]; + if (!map) + continue; + + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + + if (pos < map->len) { + if (map->len > 1) + map->queues[pos] = + map->queues[--map->len]; + else { + RCU_INIT_POINTER(dev_maps->cpu_map[i], + NULL); + call_rcu(&map->rcu, xps_map_release); + map = NULL; + } + } + if (map) + nonempty = 1; + } + + if (!nonempty) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + call_rcu(&dev_maps->rcu, xps_dev_maps_release); + } + } + + mutex_unlock(&xps_map_mutex); + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type netdev_queue_ktype = { + .sysfs_ops = &netdev_queue_sysfs_ops, + .release = netdev_queue_release, + .default_attrs = netdev_queue_default_attrs, +}; + +static int netdev_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_queue *queue = net->_tx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, + "tx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return error; +} + +int +netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +{ + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = netdev_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) + kobject_put(&net->_tx[i].kobj); + + return error; +} + +static int register_queue_kobjects(struct net_device *net) +{ + int error = 0, txq = 0, rxq = 0; + net->queues_kset = kset_create_and_add("queues", NULL, &net->dev.kobj); if (!net->queues_kset) return -ENOMEM; - return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues); + + error = net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues); + if (error) + goto error; + rxq = net->real_num_rx_queues; + + error = netdev_queue_update_kobjects(net, 0, + net->real_num_tx_queues); + if (error) + goto error; + txq = net->real_num_tx_queues; + + return 0; + +error: + netdev_queue_update_kobjects(net, txq, 0); + net_rx_queue_update_kobjects(net, rxq, 0); + return error; } -static void rx_queue_remove_kobjects(struct net_device *net) +static void remove_queue_kobjects(struct net_device *net) { net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0); + netdev_queue_update_kobjects(net, net->real_num_tx_queues, 0); kset_unregister(net->queues_kset); } #endif /* CONFIG_RPS */ @@ -886,7 +1245,7 @@ void netdev_unregister_kobject(struct net_device * net) kobject_get(&dev->kobj); #ifdef CONFIG_RPS - rx_queue_remove_kobjects(net); + remove_queue_kobjects(net); #endif device_del(dev); @@ -927,7 +1286,7 @@ int netdev_register_kobject(struct net_device *net) return error; #ifdef CONFIG_RPS - error = rx_queue_register_kobjects(net); + error = register_queue_kobjects(net); if (error) { device_del(dev); return error; diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 778e1571548d..25ec2ee57df7 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -6,6 +6,9 @@ int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); #ifdef CONFIG_RPS int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); +int netdev_queue_update_kobjects(struct net_device *net, + int old_num, int new_num); + #endif #endif -- cgit v1.2.3 From 5a0d2268d259886f0c87131639d19eb4a67b4532 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Nov 2010 10:42:02 +0000 Subject: net: add netif_tx_queue_frozen_or_stopped When testing struct netdev_queue state against FROZEN bit, we also test XOFF bit. We can test both bits at once and save some cycles. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++-- net/core/netpoll.c | 3 +-- net/core/pktgen.c | 2 +- net/sched/sch_generic.c | 8 +++----- net/sched/sch_teql.c | 3 +-- 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index badf9285fe0d..7c6ae2f4b9ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -493,6 +493,8 @@ static inline void napi_synchronize(const struct napi_struct *n) enum netdev_queue_state_t { __QUEUE_STATE_XOFF, __QUEUE_STATE_FROZEN, +#define QUEUE_STATE_XOFF_OR_FROZEN ((1 << __QUEUE_STATE_XOFF) | \ + (1 << __QUEUE_STATE_FROZEN)) }; struct netdev_queue { @@ -1629,9 +1631,9 @@ static inline int netif_queue_stopped(const struct net_device *dev) return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0)); } -static inline int netif_tx_queue_frozen(const struct netdev_queue *dev_queue) +static inline int netif_tx_queue_frozen_or_stopped(const struct netdev_queue *dev_queue) { - return test_bit(__QUEUE_STATE_FROZEN, &dev_queue->state); + return dev_queue->state & QUEUE_STATE_XOFF_OR_FROZEN; } /** diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4e98ffac3af0..ee38acb6d463 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -76,8 +76,7 @@ static void queue_process(struct work_struct *work) local_irq_save(flags); __netif_tx_lock(txq, smp_processor_id()); - if (netif_tx_queue_stopped(txq) || - netif_tx_queue_frozen(txq) || + if (netif_tx_queue_frozen_or_stopped(txq) || ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); __netif_tx_unlock(txq); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 2e57830cbeb2..2953b2abc971 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3527,7 +3527,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) __netif_tx_lock_bh(txq); - if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) { + if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5dbb3cd96e59..7f0bd8952646 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -60,8 +60,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q) /* check the reason of requeuing without tx lock first */ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - if (!netif_tx_queue_stopped(txq) && - !netif_tx_queue_frozen(txq)) { + if (!netif_tx_queue_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; } else @@ -122,7 +121,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, spin_unlock(root_lock); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq)) + if (!netif_tx_queue_frozen_or_stopped(txq)) ret = dev_hard_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); @@ -144,8 +143,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, ret = dev_requeue_skb(skb, q); } - if (ret && (netif_tx_queue_stopped(txq) || - netif_tx_queue_frozen(txq))) + if (ret && netif_tx_queue_frozen_or_stopped(txq)) ret = 0; return ret; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 401af9596709..106479a7c94a 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -309,8 +309,7 @@ restart: if (__netif_tx_trylock(slave_txq)) { unsigned int length = qdisc_pkt_len(skb); - if (!netif_tx_queue_stopped(slave_txq) && - !netif_tx_queue_frozen(slave_txq) && + if (!netif_tx_queue_frozen_or_stopped(slave_txq) && slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { txq_trans_update(slave_txq); __netif_tx_unlock(slave_txq); -- cgit v1.2.3 From bf26414510103448ad3dc069c7422462f03ea3d7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 26 Nov 2010 08:36:09 +0000 Subject: xps: Add CONFIG_XPS This patch adds XPS_CONFIG option to enable and disable XPS. This is done in the same manner as RPS_CONFIG. This is also fixes build failure in XPS code when SMP is not enabled. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 52 +++++++++++++++++++++++++---------------------- net/Kconfig | 5 +++++ net/core/dev.c | 9 +++++--- net/core/net-sysfs.c | 47 ++++++++++++++++++++++++++++++------------ net/core/net-sysfs.h | 3 --- 5 files changed, 73 insertions(+), 43 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7c6ae2f4b9ab..9ae4544f0cf0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -535,30 +535,6 @@ struct rps_map { }; #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16))) -/* - * This structure holds an XPS map which can be of variable length. The - * map is an array of queues. - */ -struct xps_map { - unsigned int len; - unsigned int alloc_len; - struct rcu_head rcu; - u16 queues[0]; -}; -#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16))) -#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map)) \ - / sizeof(u16)) - -/* - * This structure holds all XPS maps for device. Maps are indexed by CPU. - */ -struct xps_dev_maps { - struct rcu_head rcu; - struct xps_map *cpu_map[0]; -}; -#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ - (nr_cpu_ids * sizeof(struct xps_map *))) - /* * The rps_dev_flow structure contains the mapping of a flow to a CPU and the * tail pointer for that CPU's input queue at the time of last enqueue. @@ -626,6 +602,32 @@ struct netdev_rx_queue { } ____cacheline_aligned_in_smp; #endif /* CONFIG_RPS */ +#ifdef CONFIG_XPS +/* + * This structure holds an XPS map which can be of variable length. The + * map is an array of queues. + */ +struct xps_map { + unsigned int len; + unsigned int alloc_len; + struct rcu_head rcu; + u16 queues[0]; +}; +#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + (_num * sizeof(u16))) +#define XPS_MIN_MAP_ALLOC ((L1_CACHE_BYTES - sizeof(struct xps_map)) \ + / sizeof(u16)) + +/* + * This structure holds all XPS maps for device. Maps are indexed by CPU. + */ +struct xps_dev_maps { + struct rcu_head rcu; + struct xps_map *cpu_map[0]; +}; +#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ + (nr_cpu_ids * sizeof(struct xps_map *))) +#endif /* CONFIG_XPS */ + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1046,7 +1048,9 @@ struct net_device { unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; +#ifdef CONFIG_XPS struct xps_dev_maps *xps_maps; +#endif /* These may be needed for future network-power-down code. */ diff --git a/net/Kconfig b/net/Kconfig index 55fd82e9ffd9..126c2af0fc1f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -220,6 +220,11 @@ config RPS depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS default y +config XPS + boolean + depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS + default y + menu "Network testing" config NET_PKTGEN diff --git a/net/core/dev.c b/net/core/dev.c index c852f0038a08..3259d2c323a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1567,6 +1567,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); + if (rc) + return rc; + if (txq < dev->real_num_tx_queues) qdisc_reset_all_tx_gt(dev, txq); } @@ -2148,7 +2151,7 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { -#ifdef CONFIG_RPS +#ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct xps_map *map; int queue_index = -1; @@ -5085,9 +5088,9 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); +#ifdef CONFIG_RPS static int netif_alloc_rx_queues(struct net_device *dev) { -#ifdef CONFIG_RPS unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; @@ -5102,9 +5105,9 @@ static int netif_alloc_rx_queues(struct net_device *dev) for (i = 0; i < count; i++) rx[i].dev = dev; -#endif return 0; } +#endif static int netif_alloc_netdev_queues(struct net_device *dev) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 68dbbfdee274..99c11294623f 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -751,10 +751,12 @@ static int rx_queue_add_kobject(struct net_device *net, int index) return error; } +#endif /* CONFIG_RPS */ int net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { +#ifdef CONFIG_RPS int i; int error = 0; @@ -770,8 +772,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) kobject_put(&net->_rx[i].kobj); return error; +#else + return 0; +#endif } +#ifdef CONFIG_XPS /* * netdev_queue sysfs structures and functions. */ @@ -1090,10 +1096,12 @@ static int netdev_queue_add_kobject(struct net_device *net, int index) return error; } +#endif /* CONFIG_XPS */ int netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { +#ifdef CONFIG_XPS int i; int error = 0; @@ -1109,27 +1117,36 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) kobject_put(&net->_tx[i].kobj); return error; +#else + return 0; +#endif } static int register_queue_kobjects(struct net_device *net) { - int error = 0, txq = 0, rxq = 0; + int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) net->queues_kset = kset_create_and_add("queues", NULL, &net->dev.kobj); if (!net->queues_kset) return -ENOMEM; +#endif + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; - error = net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues); + error = net_rx_queue_update_kobjects(net, 0, real_rx); if (error) goto error; - rxq = net->real_num_rx_queues; + rxq = real_rx; - error = netdev_queue_update_kobjects(net, 0, - net->real_num_tx_queues); + error = netdev_queue_update_kobjects(net, 0, real_tx); if (error) goto error; - txq = net->real_num_tx_queues; + txq = real_tx; return 0; @@ -1141,11 +1158,19 @@ error: static void remove_queue_kobjects(struct net_device *net) { - net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0); - netdev_queue_update_kobjects(net, net->real_num_tx_queues, 0); + int real_rx = 0, real_tx = 0; + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + net_rx_queue_update_kobjects(net, real_rx, 0); + netdev_queue_update_kobjects(net, real_tx, 0); +#if defined(CONFIG_RPS) || defined(CONFIG_XPS) kset_unregister(net->queues_kset); +#endif } -#endif /* CONFIG_RPS */ static const void *net_current_ns(void) { @@ -1244,9 +1269,7 @@ void netdev_unregister_kobject(struct net_device * net) kobject_get(&dev->kobj); -#ifdef CONFIG_RPS remove_queue_kobjects(net); -#endif device_del(dev); } @@ -1285,13 +1308,11 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; -#ifdef CONFIG_RPS error = register_queue_kobjects(net); if (error) { device_del(dev); return error; } -#endif return error; } diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 25ec2ee57df7..bd7751ec1c4d 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -4,11 +4,8 @@ int netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); -#ifdef CONFIG_RPS int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); int netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num); #endif - -#endif -- cgit v1.2.3 From a41778694806ac1ccd4b1dafed1abef8d5ba98ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Nov 2010 21:43:02 +0000 Subject: xps: add __rcu annotations Avoid sparse warnings : add __rcu annotations and use rcu_dereference_protected() where necessary. Signed-off-by: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/net-sysfs.c | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9ae4544f0cf0..4b0c7f3aa32b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -622,7 +622,7 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; - struct xps_map *cpu_map[0]; + struct xps_map __rcu *cpu_map[0]; }; #define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * sizeof(struct xps_map *))) @@ -1049,7 +1049,7 @@ struct net_device { spinlock_t tx_global_lock; #ifdef CONFIG_XPS - struct xps_dev_maps *xps_maps; + struct xps_dev_maps __rcu *xps_maps; #endif /* These may be needed for future network-power-down code. */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 35ef42fa0cf3..f85cee3d869e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -899,6 +899,8 @@ static void xps_dev_maps_release(struct rcu_head *rcu) } static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static ssize_t store_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, @@ -935,11 +937,12 @@ static ssize_t store_xps_map(struct netdev_queue *queue, mutex_lock(&xps_map_mutex); - dev_maps = dev->xps_maps; + dev_maps = xmap_dereference(dev->xps_maps); for_each_possible_cpu(cpu) { - new_map = map = dev_maps ? dev_maps->cpu_map[cpu] : NULL; - + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + new_map = map; if (map) { for (pos = 0; pos < map->len; pos++) if (map->queues[pos] == index) @@ -975,13 +978,14 @@ static ssize_t store_xps_map(struct netdev_queue *queue, else new_map = NULL; } - new_dev_maps->cpu_map[cpu] = new_map; + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); } /* Cleanup old maps */ for_each_possible_cpu(cpu) { - map = dev_maps ? dev_maps->cpu_map[cpu] : NULL; - if (map && new_dev_maps->cpu_map[cpu] != map) + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) call_rcu(&map->rcu, xps_map_release); if (new_dev_maps->cpu_map[cpu]) nonempty = 1; @@ -1007,7 +1011,9 @@ error: if (new_dev_maps) for_each_possible_cpu(i) - kfree(new_dev_maps->cpu_map[i]); + kfree(rcu_dereference_protected( + new_dev_maps->cpu_map[i], + 1)); kfree(new_dev_maps); free_cpumask_var(mask); return -ENOMEM; @@ -1033,11 +1039,11 @@ static void netdev_queue_release(struct kobject *kobj) index = get_netdev_queue_index(queue); mutex_lock(&xps_map_mutex); - dev_maps = dev->xps_maps; + dev_maps = xmap_dereference(dev->xps_maps); if (dev_maps) { for_each_possible_cpu(i) { - map = dev_maps->cpu_map[i]; + map = xmap_dereference(dev_maps->cpu_map[i]); if (!map) continue; -- cgit v1.2.3 From f2cd2d3e9b3ef960612e362f0ad129d735452df2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Nov 2010 08:14:37 +0000 Subject: net sched: use xps information for qdisc NUMA affinity Allocate qdisc memory according to NUMA properties of cpus included in xps map. To be effective, qdisc should be (re)setup after changes of /sys/class/net/eth/queues/tx-/xps_cpus I added a numa_node field in struct netdev_queue, containing NUMA node if all cpus included in xps_cpus share same node, else -1. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++++++++++++++- net/core/dev.c | 5 +++-- net/core/net-sysfs.c | 12 +++++++++++- net/sched/sch_generic.c | 4 +++- 4 files changed, 36 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4b0c7f3aa32b..a9ac5dc26e3c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -508,7 +508,9 @@ struct netdev_queue { #ifdef CONFIG_RPS struct kobject kobj; #endif - +#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) + int numa_node; +#endif /* * write mostly part */ @@ -523,6 +525,22 @@ struct netdev_queue { u64 tx_dropped; } ____cacheline_aligned_in_smp; +static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) +{ +#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) + return q->numa_node; +#else + return -1; +#endif +} + +static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node) +{ +#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) + q->numa_node = node; +#endif +} + #ifdef CONFIG_RPS /* * This structure holds an RPS map which can be of variable length. The diff --git a/net/core/dev.c b/net/core/dev.c index 3259d2c323a6..cd2437495428 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5125,9 +5125,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev) } dev->_tx = tx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { + netdev_queue_numa_node_write(&tx[i], -1); tx[i].dev = dev; - + } return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f85cee3d869e..85e8b5326dd6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -913,6 +913,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue, struct xps_map *map, *new_map; struct xps_dev_maps *dev_maps, *new_dev_maps; int nonempty = 0; + int numa_node = -2; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -953,7 +954,14 @@ static ssize_t store_xps_map(struct netdev_queue *queue, pos = map_len = alloc_len = 0; need_set = cpu_isset(cpu, *mask) && cpu_online(cpu); - +#ifdef CONFIG_NUMA + if (need_set) { + if (numa_node == -2) + numa_node = cpu_to_node(cpu); + else if (numa_node != cpu_to_node(cpu)) + numa_node = -1; + } +#endif if (need_set && pos >= map_len) { /* Need to add queue to this CPU's map */ if (map_len >= alloc_len) { @@ -1001,6 +1009,8 @@ static ssize_t store_xps_map(struct netdev_queue *queue, if (dev_maps) call_rcu(&dev_maps->rcu, xps_dev_maps_release); + netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : -1); + mutex_unlock(&xps_map_mutex); free_cpumask_var(mask); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7f0bd8952646..0918834ee4a1 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -553,7 +553,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, size = QDISC_ALIGN(sizeof(*sch)); size += ops->priv_size + (QDISC_ALIGNTO - 1); - p = kzalloc(size, GFP_KERNEL); + p = kzalloc_node(size, GFP_KERNEL, + netdev_queue_numa_node_read(dev_queue)); + if (!p) goto errout; sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); -- cgit v1.2.3 From 941666c2e3e0f9f6a1cb5808d02352d445bd702c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Dec 2010 01:23:53 +0000 Subject: net: RCU conversion of dev_getbyhwaddr() and arp_ioctl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Le dimanche 05 décembre 2010 à 09:19 +0100, Eric Dumazet a écrit : > Hmm.. > > If somebody can explain why RTNL is held in arp_ioctl() (and therefore > in arp_req_delete()), we might first remove RTNL use in arp_ioctl() so > that your patch can be applied. > > Right now it is not good, because RTNL wont be necessarly held when you > are going to call arp_invalidate() ? While doing this analysis, I found a refcount bug in llc, I'll send a patch for net-2.6 Meanwhile, here is the patch for net-next-2.6 Your patch then can be applied after mine. Thanks [PATCH] net: RCU conversion of dev_getbyhwaddr() and arp_ioctl() dev_getbyhwaddr() was called under RTNL. Rename it to dev_getbyhwaddr_rcu() and change all its caller to now use RCU locking instead of RTNL. Change arp_ioctl() to use RCU instead of RTNL locking. Note: this fix a dev refcount bug in llc Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- net/core/dev.c | 17 +++++++---------- net/ieee802154/af_ieee802154.c | 6 +++--- net/ipv4/arp.c | 17 +++++++++-------- net/llc/af_llc.c | 11 ++++++----- 5 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9ac5dc26e3c..d31bc3c94717 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1360,7 +1360,8 @@ static inline struct net_device *first_net_device(struct net *net) extern int netdev_boot_setup_check(struct net_device *dev); extern unsigned long netdev_boot_base(const char *prefix, int unit); -extern struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *hwaddr); +extern struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *hwaddr); extern struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); extern struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type); extern void dev_add_pack(struct packet_type *pt); diff --git a/net/core/dev.c b/net/core/dev.c index ee605c0867e7..822b15b8d11c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -743,34 +743,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** - * dev_getbyhwaddr - find a device by its hardware address + * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold the - * rtnl semaphore. The returned device has not had its ref count increased + * is not found or a pointer to the device. The caller must hold RCU + * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * - * BUGS: - * If the API was consistent this would be __dev_get_by_hwaddr */ -struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) { struct net_device *dev; - ASSERT_RTNL(); - - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } -EXPORT_SYMBOL(dev_getbyhwaddr); +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) { diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c index 93c91b633a56..6df6ecf49708 100644 --- a/net/ieee802154/af_ieee802154.c +++ b/net/ieee802154/af_ieee802154.c @@ -52,11 +52,11 @@ struct net_device *ieee802154_get_dev(struct net *net, switch (addr->addr_type) { case IEEE802154_ADDR_LONG: - rtnl_lock(); - dev = dev_getbyhwaddr(net, ARPHRD_IEEE802154, addr->hwaddr); + rcu_read_lock(); + dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, addr->hwaddr); if (dev) dev_hold(dev); - rtnl_unlock(); + rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: if (addr->pan_id == 0xffff || diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 10af759f2630..a2fc7b961dbc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1017,13 +1017,14 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); + if (__in_dev_get_rcu(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on); return 0; } return -ENXIO; } +/* must be called with rcu_read_lock() */ static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { @@ -1033,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask && mask != htonl(0xFFFFFFFF)) return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, + dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; @@ -1225,10 +1226,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!(r.arp_flags & ATF_NETMASK)) ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = htonl(0xFFFFFFFFUL); - rtnl_lock(); + rcu_read_lock(); if (r.arp_dev[0]) { err = -ENODEV; - dev = __dev_get_by_name(net, r.arp_dev); + dev = dev_get_by_name_rcu(net, r.arp_dev); if (dev == NULL) goto out; @@ -1252,12 +1253,12 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; case SIOCGARP: err = arp_req_get(&r, dev); - if (!err && copy_to_user(arg, &r, sizeof(r))) - err = -EFAULT; break; } out: - rtnl_unlock(); + rcu_read_unlock(); + if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; return err; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 582612998211..dfd3a648a551 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -316,9 +316,9 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) if (unlikely(addr->sllc_family != AF_LLC)) goto out; rc = -ENODEV; - rtnl_lock(); + rcu_read_lock(); if (sk->sk_bound_dev_if) { - llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (llc->dev) { if (!addr->sllc_arphrd) addr->sllc_arphrd = llc->dev->type; @@ -329,14 +329,15 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) !llc_mac_match(addr->sllc_mac, llc->dev->dev_addr)) { rc = -EINVAL; - dev_put(llc->dev); llc->dev = NULL; } } } else - llc->dev = dev_getbyhwaddr(&init_net, addr->sllc_arphrd, + llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); - rtnl_unlock(); + if (llc->dev) + dev_hold(llc->dev); + rcu_read_unlock(); if (!llc->dev) goto out; if (!addr->sllc_sap) { -- cgit v1.2.3 From a3d22a68d752ccc1a01bb0a64dd70b7a98bf9e23 Mon Sep 17 00:00:00 2001 From: Vladislav Zolotarov Date: Mon, 13 Dec 2010 06:27:10 +0000 Subject: bnx2x: Take the distribution range definition out of skb_tx_hash() Move the calcualation of the Tx hash for a given hash range into a separate function and define the skb_tx_hash(), which calculates a Tx hash for a [0; dev->real_num_tx_queues - 1] hash values range, using this function (__skb_tx_hash()). Signed-off-by: Vladislav Zolotarov Signed-off-by: Eilon Greenstein Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ include/linux/skbuff.h | 5 +++-- net/core/dev.c | 15 ++++++++++----- 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d31bc3c94717..445e6825f8eb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1747,6 +1747,16 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) __netif_schedule(txq->qdisc); } +/* + * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used + * as a distribution range limit for the returned value. + */ +static inline u16 skb_tx_hash(const struct net_device *dev, + const struct sk_buff *skb) +{ + return __skb_tx_hash(dev, skb, dev->real_num_tx_queues); +} + /** * netif_is_multiqueue - test if device has multiple transmit queues * @dev: network device diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 19f37a6ee6c4..4c4bec6316d9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2165,8 +2165,9 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) return skb->queue_mapping != 0; } -extern u16 skb_tx_hash(const struct net_device *dev, - const struct sk_buff *skb); +extern u16 __skb_tx_hash(const struct net_device *dev, + const struct sk_buff *skb, + unsigned int num_tx_queues); #ifdef CONFIG_XFRM static inline struct sec_path *skb_sec_path(struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index d28b3a023bb2..b25dd087f06a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2112,14 +2112,19 @@ out: static u32 hashrnd __read_mostly; -u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) { u32 hash; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); - while (unlikely(hash >= dev->real_num_tx_queues)) - hash -= dev->real_num_tx_queues; + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; return hash; } @@ -2129,9 +2134,9 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) hash = (__force u16) skb->protocol ^ skb->rxhash; hash = jhash_1word(hash, hashrnd); - return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); + return (u16) (((u64) hash * num_tx_queues) >> 32); } -EXPORT_SYMBOL(skb_tx_hash); +EXPORT_SYMBOL(__skb_tx_hash); static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) { -- cgit v1.2.3 From b236da6931e2482bfe44a7865dd4e7bb036f3496 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 14 Dec 2010 03:09:15 +0000 Subject: net: use NUMA_NO_NODE instead of the magic number -1 Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 2 +- net/core/net-sysfs.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 445e6825f8eb..cc916c5c3279 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -530,7 +530,7 @@ static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) return q->numa_node; #else - return -1; + return NUMA_NO_NODE; #endif } diff --git a/net/core/dev.c b/net/core/dev.c index b25dd087f06a..7ac26d2b9722 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5121,7 +5121,7 @@ static void netdev_init_one_queue(struct net_device *dev, spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; - netdev_queue_numa_node_write(queue, -1); + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 85e8b5326dd6..e23c01be5a5b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1009,7 +1009,8 @@ static ssize_t store_xps_map(struct netdev_queue *queue, if (dev_maps) call_rcu(&dev_maps->rcu, xps_dev_maps_release); - netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : -1); + netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : + NUMA_NO_NODE); mutex_unlock(&xps_map_mutex); -- cgit v1.2.3 From 68763c890eb2a60f9b50a061502f94e0cf20fdfe Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Sun, 2 Jan 2011 22:54:09 +0000 Subject: trivial: Fix typo fault in netdevice.h Signed-off-by: Michal Simek Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc916c5c3279..0f6b1c965815 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -732,7 +732,7 @@ struct xps_dev_maps { * neither operation. * * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp); - * If device support VLAN receive accleration + * If device support VLAN receive acceleration * (ie. dev->features & NETIF_F_HW_VLAN_RX), then this function is called * when vlan groups for the device changes. Note: grp is NULL * if no vlan's groups are being used. -- cgit v1.2.3 From f01a5236bd4b140198fbcc550f085e8361fd73fa Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Sun, 9 Jan 2011 06:23:31 +0000 Subject: net offloading: Generalize netif_get_vlan_features(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit netif_get_vlan_features() is currently only used by netif_needs_gso(), so it only concerns itself with GSO features. However, several other places also should take into account the contents of the packet when deciding whether to offload to hardware. This generalizes the function to return features about all of the various forms of offloading. Since offloads tend to be linked together, this avoids duplicating the logic in each location (i.e. the scatter/gather code also needs the checksum logic). Suggested-by: Michał Mirosław Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0f6b1c965815..d4dac09a5ad2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2303,7 +2303,7 @@ unsigned long netdev_fix_features(unsigned long features, const char *name); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev); +int netif_skb_features(struct sk_buff *skb); static inline int net_gso_ok(int features, int gso_type) { @@ -2320,7 +2320,7 @@ static inline int skb_gso_ok(struct sk_buff *skb, int features) static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { if (skb_is_gso(skb)) { - int features = netif_get_vlan_features(skb, dev); + int features = netif_skb_features(skb); return (!skb_gso_ok(skb, features) || unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); diff --git a/net/core/dev.c b/net/core/dev.c index d8befd06da04..a51dfd7b56fb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2017,22 +2017,41 @@ static inline void skb_orphan_try(struct sk_buff *skb) } } -int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev) +static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features) +{ + if (!can_checksum_protocol(protocol, features)) { + features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_SG; + } else if (illegal_highdma(skb->dev, skb)) { + features &= ~NETIF_F_SG; + } + + return features; +} + +int netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; + int features = skb->dev->features; if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; - } else if (!skb->vlan_tci) - return dev->features; + } else if (!vlan_tx_tag_present(skb)) { + return harmonize_features(skb, protocol, features); + } - if (protocol != htons(ETH_P_8021Q)) - return dev->features & dev->vlan_features; - else - return 0; + features &= skb->dev->vlan_features; + + if (protocol != htons(ETH_P_8021Q)) { + return harmonize_features(skb, protocol, features); + } else { + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM; + return harmonize_features(skb, protocol, features); + } } -EXPORT_SYMBOL(netif_get_vlan_features); +EXPORT_SYMBOL(netif_skb_features); /* * Returns true if either: -- cgit v1.2.3 From fc741216db156994c554ac31c1151fe0e00d8f0e Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Sun, 9 Jan 2011 06:23:32 +0000 Subject: net offloading: Pass features into netif_needs_gso(). Now that there is a single function that can compute the device features relevant to a packet, we don't want to run it for each offload. This converts netif_needs_gso() to take the features of the device, rather than computing them itself. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 2 +- include/linux/netdevice.h | 12 +++--------- net/core/dev.c | 8 ++++++-- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index cdbeec9f83ea..546de5749824 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -488,7 +488,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!netif_carrier_ok(dev) || (frags > 1 && !xennet_can_sg(dev)) || - netif_needs_gso(dev, skb))) { + netif_needs_gso(skb, netif_skb_features(skb)))) { spin_unlock_irq(&np->tx_lock); goto drop; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d4dac09a5ad2..de2bfe6da359 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2317,16 +2317,10 @@ static inline int skb_gso_ok(struct sk_buff *skb, int features) (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); } -static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) +static inline int netif_needs_gso(struct sk_buff *skb, int features) { - if (skb_is_gso(skb)) { - int features = netif_skb_features(skb); - - return (!skb_gso_ok(skb, features) || - unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); - } - - return 0; + return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || + unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); } static inline void netif_set_gso_max_size(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index a51dfd7b56fb..1444ed3861a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2086,6 +2086,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int rc = NETDEV_TX_OK; if (likely(!skb->next)) { + int features; + /* * If device doesnt need skb->dst, release it right now while * its hot in this cpu cache @@ -2098,8 +2100,10 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_orphan_try(skb); + features = netif_skb_features(skb); + if (vlan_tx_tag_present(skb) && - !(dev->features & NETIF_F_HW_VLAN_TX)) { + !(features & NETIF_F_HW_VLAN_TX)) { skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); if (unlikely(!skb)) goto out; @@ -2107,7 +2111,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - if (netif_needs_gso(dev, skb)) { + if (netif_needs_gso(skb, features)) { if (unlikely(dev_gso_segment(skb))) goto out_kfree_skb; if (skb->next) -- cgit v1.2.3 From 36909ea43814cba34f7c921e99cba33d770a54e1 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 9 Jan 2011 19:36:31 +0000 Subject: net: Add alloc_netdev_mqs function Added alloc_netdev_mqs function which allows the number of transmit and receive queues to be specified independenty. alloc_netdev_mq was changed to a macro to call the new function. Also added alloc_etherdev_mqs with same purpose. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 4 +++- include/linux/netdevice.h | 10 +++++++--- net/core/dev.c | 32 +++++++++++++++++++++----------- net/ethernet/eth.c | 12 +++++++----- 4 files changed, 38 insertions(+), 20 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index f16a01081e15..bec8b82889bf 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -48,8 +48,10 @@ extern int eth_validate_addr(struct net_device *dev); -extern struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count); +extern struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, + unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) +#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index de2bfe6da359..be4957cf6511 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2191,11 +2191,15 @@ static inline void netif_addr_unlock_bh(struct net_device *dev) extern void ether_setup(struct net_device *dev); /* Support for loadable net-drivers */ -extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, +extern struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, void (*setup)(struct net_device *), - unsigned int queue_count); + unsigned int txqs, unsigned int rxqs); #define alloc_netdev(sizeof_priv, name, setup) \ - alloc_netdev_mq(sizeof_priv, name, setup, 1) + alloc_netdev_mqs(sizeof_priv, name, setup, 1, 1) + +#define alloc_netdev_mq(sizeof_priv, name, setup, count) \ + alloc_netdev_mqs(sizeof_priv, name, setup, count, count) + extern int register_netdev(struct net_device *dev); extern void unregister_netdev(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index 3fe443be4b15..3295b94884ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5617,18 +5617,20 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) } /** - * alloc_netdev_mq - allocate network device + * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device - * @queue_count: the number of subqueues to allocate + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subquue structs - * for each queue on the device at the end of the netdevice. + * for each queue on the device. */ -struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, - void (*setup)(struct net_device *), unsigned int queue_count) +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t alloc_size; @@ -5636,12 +5638,20 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); - if (queue_count < 1) { + if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device " "with zero queues.\n"); return NULL; } +#ifdef CONFIG_RPS + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero RX queues.\n"); + return NULL; + } +#endif + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ @@ -5672,14 +5682,14 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->num_tx_queues = queue_count; - dev->real_num_tx_queues = queue_count; + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_pcpu; #ifdef CONFIG_RPS - dev->num_rx_queues = queue_count; - dev->real_num_rx_queues = queue_count; + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_pcpu; #endif @@ -5707,7 +5717,7 @@ free_p: kfree(p); return NULL; } -EXPORT_SYMBOL(alloc_netdev_mq); +EXPORT_SYMBOL(alloc_netdev_mqs); /** * free_netdev - free network device diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f00ef2f1d814..f9d7ac924f15 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -347,10 +347,11 @@ void ether_setup(struct net_device *dev) EXPORT_SYMBOL(ether_setup); /** - * alloc_etherdev_mq - Allocates and sets up an Ethernet device + * alloc_etherdev_mqs - Allocates and sets up an Ethernet device * @sizeof_priv: Size of additional driver-private structure to be allocated * for this Ethernet device - * @queue_count: The number of queues this device has. + * @txqs: The number of TX queues this device has. + * @txqs: The number of RX queues this device has. * * Fill in the fields of the device structure with Ethernet-generic * values. Basically does everything except registering the device. @@ -360,11 +361,12 @@ EXPORT_SYMBOL(ether_setup); * this private data area. */ -struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count) +struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, + unsigned int rxqs) { - return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count); + return alloc_netdev_mqs(sizeof_priv, "eth%d", ether_setup, txqs, rxqs); } -EXPORT_SYMBOL(alloc_etherdev_mq); +EXPORT_SYMBOL(alloc_etherdev_mqs); static size_t _format_mac_addr(char *buf, int buflen, const unsigned char *addr, int len) -- cgit v1.2.3 From 1ac9ad1394fa542ac7ae0dc943ee3cda678799fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Jan 2011 12:13:14 +0000 Subject: net: remove dev_txq_stats_fold() After recent changes, (percpu stats on vlan/tunnels...), we dont need anymore per struct netdev_queue tx_bytes/tx_packets/tx_dropped counters. Only remaining users are ixgbe, sch_teql, gianfar & macvlan : 1) ixgbe can be converted to use existing tx_ring counters. 2) macvlan incremented txq->tx_dropped, it can use the dev->stats.tx_dropped counter. 3) sch_teql : almost revert ab35cd4b8f42 (Use net_device internal stats) Now we have ndo_get_stats64(), use it, even for "unsigned long" fields (No need to bring back a struct net_device_stats) 4) gianfar adds a stats structure per tx queue to hold tx_bytes/tx_packets This removes a lockdep warning (and possible lockup) in rndis gadget, calling dev_get_stats() from hard IRQ context. Ref: http://www.spinics.net/lists/netdev/msg149202.html Reported-by: Neil Jones Signed-off-by: Eric Dumazet CC: Jarek Poplawski CC: Alexander Duyck CC: Jeff Kirsher CC: Sandeep Gopalpet CC: Michal Nazarewicz Signed-off-by: David S. Miller --- drivers/net/gianfar.c | 10 ++++------ drivers/net/gianfar.h | 10 ++++++++++ drivers/net/ixgbe/ixgbe_main.c | 23 ++++++++++++++++------- drivers/net/macvtap.c | 2 +- include/linux/netdevice.h | 5 ----- net/core/dev.c | 29 ----------------------------- net/sched/sch_teql.c | 26 +++++++++++++++++++++----- 7 files changed, 52 insertions(+), 53 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index 45c4b7bfcf39..f1d4b450e797 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -433,7 +433,6 @@ static void gfar_init_mac(struct net_device *ndev) static struct net_device_stats *gfar_get_stats(struct net_device *dev) { struct gfar_private *priv = netdev_priv(dev); - struct netdev_queue *txq; unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0; unsigned long tx_packets = 0, tx_bytes = 0; int i = 0; @@ -449,9 +448,8 @@ static struct net_device_stats *gfar_get_stats(struct net_device *dev) dev->stats.rx_dropped = rx_dropped; for (i = 0; i < priv->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; + tx_bytes += priv->tx_queue[i]->stats.tx_bytes; + tx_packets += priv->tx_queue[i]->stats.tx_packets; } dev->stats.tx_bytes = tx_bytes; @@ -2108,8 +2106,8 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) } /* Update transmit stats */ - txq->tx_bytes += skb->len; - txq->tx_packets ++; + tx_queue->stats.tx_bytes += skb->len; + tx_queue->stats.tx_packets++; txbdp = txbdp_start = tx_queue->cur_tx; lstatus = txbdp->lstatus; diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h index 68984eb88ae0..54de4135e932 100644 --- a/drivers/net/gianfar.h +++ b/drivers/net/gianfar.h @@ -907,12 +907,21 @@ enum { MQ_MG_MODE }; +/* + * Per TX queue stats + */ +struct tx_q_stats { + unsigned long tx_packets; + unsigned long tx_bytes; +}; + /** * struct gfar_priv_tx_q - per tx queue structure * @txlock: per queue tx spin lock * @tx_skbuff:skb pointers * @skb_curtx: to be used skb pointer * @skb_dirtytx:the last used skb pointer + * @stats: bytes/packets stats * @qindex: index of this queue * @dev: back pointer to the dev structure * @grp: back pointer to the group to which this queue belongs @@ -934,6 +943,7 @@ struct gfar_priv_tx_q { struct txbd8 *tx_bd_base; struct txbd8 *cur_tx; struct txbd8 *dirty_tx; + struct tx_q_stats stats; struct net_device *dev; struct gfar_priv_grp *grp; u16 skb_curtx; diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index a060610a42db..602078b84892 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -6667,8 +6667,6 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct ixgbe_adapter *adapter, struct ixgbe_ring *tx_ring) { - struct net_device *netdev = tx_ring->netdev; - struct netdev_queue *txq; unsigned int first; unsigned int tx_flags = 0; u8 hdr_len = 0; @@ -6765,9 +6763,6 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, /* add the ATR filter if ATR is on */ if (test_bit(__IXGBE_TX_FDIR_INIT_DONE, &tx_ring->state)) ixgbe_atr(tx_ring, skb, tx_flags, protocol); - txq = netdev_get_tx_queue(netdev, tx_ring->queue_index); - txq->tx_bytes += skb->len; - txq->tx_packets++; ixgbe_tx_queue(tx_ring, tx_flags, count, skb->len, hdr_len); ixgbe_maybe_stop_tx(tx_ring, DESC_NEEDED); @@ -6925,8 +6920,6 @@ static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev, struct ixgbe_adapter *adapter = netdev_priv(netdev); int i; - /* accurate rx/tx bytes/packets stats */ - dev_txq_stats_fold(netdev, stats); rcu_read_lock(); for (i = 0; i < adapter->num_rx_queues; i++) { struct ixgbe_ring *ring = ACCESS_ONCE(adapter->rx_ring[i]); @@ -6943,6 +6936,22 @@ static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev, stats->rx_bytes += bytes; } } + + for (i = 0; i < adapter->num_tx_queues; i++) { + struct ixgbe_ring *ring = ACCESS_ONCE(adapter->tx_ring[i]); + u64 bytes, packets; + unsigned int start; + + if (ring) { + do { + start = u64_stats_fetch_begin_bh(&ring->syncp); + packets = ring->stats.packets; + bytes = ring->stats.bytes; + } while (u64_stats_fetch_retry_bh(&ring->syncp, start)); + stats->tx_packets += packets; + stats->tx_bytes += bytes; + } + } rcu_read_unlock(); /* following stats updated by ixgbe_watchdog_task() */ stats->multicast = netdev->stats.multicast; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 21845affea13..5933621ac3ff 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -585,7 +585,7 @@ err: rcu_read_lock_bh(); vlan = rcu_dereference(q->vlan); if (vlan) - netdev_get_tx_queue(vlan->dev, 0)->tx_dropped++; + vlan->dev->stats.tx_dropped++; rcu_read_unlock_bh(); return err; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be4957cf6511..d971346b0340 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -520,9 +520,6 @@ struct netdev_queue { * please use this field instead of dev->trans_start */ unsigned long trans_start; - u64 tx_bytes; - u64 tx_packets; - u64 tx_dropped; } ____cacheline_aligned_in_smp; static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) @@ -2265,8 +2262,6 @@ extern void dev_load(struct net *net, const char *name); extern void dev_mcast_init(void); extern struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage); -extern void dev_txq_stats_fold(const struct net_device *dev, - struct rtnl_link_stats64 *stats); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; diff --git a/net/core/dev.c b/net/core/dev.c index a3ef808b5e36..83507c265e48 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5523,34 +5523,6 @@ void netdev_run_todo(void) } } -/** - * dev_txq_stats_fold - fold tx_queues stats - * @dev: device to get statistics from - * @stats: struct rtnl_link_stats64 to hold results - */ -void dev_txq_stats_fold(const struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0; - unsigned int i; - struct netdev_queue *txq; - - for (i = 0; i < dev->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - spin_lock_bh(&txq->_xmit_lock); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; - tx_dropped += txq->tx_dropped; - spin_unlock_bh(&txq->_xmit_lock); - } - if (tx_bytes || tx_packets || tx_dropped) { - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_packets; - stats->tx_dropped = tx_dropped; - } -} -EXPORT_SYMBOL(dev_txq_stats_fold); - /* Convert net_device_stats to rtnl_link_stats64. They have the same * fields in the same order, with only the type differing. */ @@ -5594,7 +5566,6 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else { netdev_stats_to_stats64(storage, &dev->stats); - dev_txq_stats_fold(dev, storage); } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); return storage; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index af9360d1f6eb..84ce48eadff4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -59,6 +59,10 @@ struct teql_master struct net_device *dev; struct Qdisc *slaves; struct list_head master_list; + unsigned long tx_bytes; + unsigned long tx_packets; + unsigned long tx_errors; + unsigned long tx_dropped; }; struct teql_sched_data @@ -274,7 +278,6 @@ static inline int teql_resolve(struct sk_buff *skb, static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev) { struct teql_master *master = netdev_priv(dev); - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); struct Qdisc *start, *q; int busy; int nores; @@ -314,8 +317,8 @@ restart: __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); - txq->tx_packets++; - txq->tx_bytes += length; + master->tx_packets++; + master->tx_bytes += length; return NETDEV_TX_OK; } __netif_tx_unlock(slave_txq); @@ -342,10 +345,10 @@ restart: netif_stop_queue(dev); return NETDEV_TX_BUSY; } - dev->stats.tx_errors++; + master->tx_errors++; drop: - txq->tx_dropped++; + master->tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -398,6 +401,18 @@ static int teql_master_close(struct net_device *dev) return 0; } +static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct teql_master *m = netdev_priv(dev); + + stats->tx_packets = m->tx_packets; + stats->tx_bytes = m->tx_bytes; + stats->tx_errors = m->tx_errors; + stats->tx_dropped = m->tx_dropped; + return stats; +} + static int teql_master_mtu(struct net_device *dev, int new_mtu) { struct teql_master *m = netdev_priv(dev); @@ -422,6 +437,7 @@ static const struct net_device_ops teql_netdev_ops = { .ndo_open = teql_master_open, .ndo_stop = teql_master_close, .ndo_start_xmit = teql_master_xmit, + .ndo_get_stats64 = teql_master_stats64, .ndo_change_mtu = teql_master_mtu, }; -- cgit v1.2.3 From cbda10fa97d72c7a1923be4426171aa90e8c6dab Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 13 Jan 2011 23:38:30 +0000 Subject: net_device: add support for network device groups Net devices can now be grouped, enabling simpler manipulation from userspace. This patch adds a group field to the net_device structure, as well as rtnetlink support to query and modify it. Signed-off-by: Vlad Dogaru Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/linux/if_link.h | 1 + include/linux/netdevice.h | 7 +++++++ net/core/dev.c | 12 ++++++++++++ net/core/rtnetlink.c | 6 ++++++ 4 files changed, 26 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 6485d2a89bec..f4a2e6b1b864 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -135,6 +135,7 @@ enum { IFLA_VF_PORTS, IFLA_PORT_SELF, IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ __IFLA_MAX }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d971346b0340..68a4627b74f5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -75,6 +75,9 @@ struct wireless_dev; #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ +/* Initial net device group. All devices belong to group 0 by default. */ +#define INIT_NETDEV_GROUP 0 + /* * Transmit return codes: transmit return codes originate from three different * namespaces: @@ -1153,6 +1156,9 @@ struct net_device { /* phy device may attach itself for hardware timestamping */ struct phy_device *phydev; + + /* group the device belongs to */ + int group; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -1844,6 +1850,7 @@ extern int dev_set_alias(struct net_device *, const char *, size_t); extern int dev_change_net_namespace(struct net_device *, struct net *, const char *); extern int dev_set_mtu(struct net_device *, int); +extern void dev_set_group(struct net_device *, int); extern int dev_set_mac_address(struct net_device *, struct sockaddr *); extern int dev_hard_start_xmit(struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 7741507429f4..2b85d4ae981f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4571,6 +4571,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) } EXPORT_SYMBOL(dev_set_mtu); +/** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + /** * dev_set_mac_address - Change Media Access Control Address * @dev: device @@ -5678,6 +5689,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); + dev->group = INIT_NETDEV_GROUP; return dev; free_pcpu: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a5f7535aab5b..09062b07bf7f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -868,6 +868,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, netif_running(dev) ? dev->operstate : IF_OPER_DOWN); NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + NLA_PUT_U32(skb, IFLA_GROUP, dev->group); if (dev->ifindex != dev->iflink) NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); @@ -1265,6 +1266,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, modified = 1; } + if (tb[IFLA_GROUP]) { + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + modified = 1; + } + /* * Interface selected by interface index but interface * name provided implies that a name change has been -- cgit v1.2.3 From 4f57c087de9b46182545676d2c594120a20f2e58 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jan 2011 08:06:04 +0000 Subject: net: implement mechanism for HW based QOS This patch provides a mechanism for lower layer devices to steer traffic using skb->priority to tx queues. This allows for hardware based QOS schemes to use the default qdisc without incurring the penalties related to global state and the qdisc lock. While reliably receiving skbs on the correct tx ring to avoid head of line blocking resulting from shuffling in the LLD. Finally, all the goodness from txq caching and xps/rps can still be leveraged. Many drivers and hardware exist with the ability to implement QOS schemes in the hardware but currently these drivers tend to rely on firmware to reroute specific traffic, a driver specific select_queue or the queue_mapping action in the qdisc. By using select_queue for this drivers need to be updated for each and every traffic type and we lose the goodness of much of the upstream work. Firmware solutions are inherently inflexible. And finally if admins are expected to build a qdisc and filter rules to steer traffic this requires knowledge of how the hardware is currently configured. The number of tx queues and the queue offsets may change depending on resources. Also this approach incurs all the overhead of a qdisc with filters. With the mechanism in this patch users can set skb priority using expected methods ie setsockopt() or the stack can set the priority directly. Then the skb will be steered to the correct tx queues aligned with hardware QOS traffic classes. In the normal case with single traffic class and all queues in this class everything works as is until the LLD enables multiple tcs. To steer the skb we mask out the lower 4 bits of the priority and allow the hardware to configure upto 15 distinct classes of traffic. This is expected to be sufficient for most applications at any rate it is more then the 8021Q spec designates and is equal to the number of prio bands currently implemented in the default qdisc. This in conjunction with a userspace application such as lldpad can be used to implement 8021Q transmission selection algorithms one of these algorithms being the extended transmission selection algorithm currently being used for DCB. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/netdevice.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/dev.c | 55 +++++++++++++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 68a4627b74f5..371fa8839d51 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -646,6 +646,14 @@ struct xps_dev_maps { (nr_cpu_ids * sizeof(struct xps_map *))) #endif /* CONFIG_XPS */ +#define TC_MAX_QUEUE 16 +#define TC_BITMASK 15 +/* HW offloaded queuing disciplines txq count and offset maps */ +struct netdev_tc_txq { + u16 count; + u16 offset; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -756,6 +764,11 @@ struct xps_dev_maps { * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); + * int (*ndo_setup_tc)(struct net_device *dev, u8 tc) + * Called to setup 'tc' number of traffic classes in the net device. This + * is always called from the stack with the rtnl lock held and netif tx + * queues stopped. This allows the netdevice to perform queue management + * safely. */ #define HAVE_NET_DEVICE_OPS struct net_device_ops { @@ -814,6 +827,7 @@ struct net_device_ops { struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); + int (*ndo_setup_tc)(struct net_device *dev, u8 tc); #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); @@ -1146,6 +1160,9 @@ struct net_device { /* Data Center Bridging netlink ops */ const struct dcbnl_rtnl_ops *dcbnl_ops; #endif + u8 num_tc; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; + u8 prio_tc_map[TC_BITMASK + 1]; #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) /* max exchange id for FCoE LRO by ddp */ @@ -1164,6 +1181,57 @@ struct net_device { #define NETDEV_ALIGN 32 +static inline +int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio) +{ + return dev->prio_tc_map[prio & TC_BITMASK]; +} + +static inline +int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) +{ + if (tc >= dev->num_tc) + return -EINVAL; + + dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK; + return 0; +} + +static inline +void netdev_reset_tc(struct net_device *dev) +{ + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} + +static inline +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} + +static inline +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + + dev->num_tc = num_tc; + return 0; +} + +static inline +int netdev_get_num_tc(struct net_device *dev) +{ + return dev->num_tc; +} + static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) diff --git a/net/core/dev.c b/net/core/dev.c index 2b85d4ae981f..8b1d886ed23b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1593,6 +1593,48 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_unlock(); } +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues changed " + "invalidating tc mappings. Priority " + "traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues " + "changed. Priority %i to tc " + "mapping %i is no longer valid " + "setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -1612,6 +1654,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (rc) return rc; + if (dev->num_tc) + netif_setup_tc(dev, txq); + if (txq < dev->real_num_tx_queues) qdisc_reset_all_tx_gt(dev, txq); } @@ -2161,6 +2206,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, unsigned int num_tx_queues) { u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); @@ -2169,13 +2216,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, return hash; } + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else hash = (__force u16) skb->protocol ^ skb->rxhash; hash = jhash_1word(hash, hashrnd); - return (u16) (((u64) hash * num_tx_queues) >> 32); + return (u16) (((u64) hash * qcount) >> 32) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); -- cgit v1.2.3 From c445477d74ab3779d1386ab797fbb9b628eb9f64 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 19 Jan 2011 11:03:53 +0000 Subject: net: RPS: Enable hardware acceleration of RFS Allow drivers for multiqueue hardware with flow filter tables to accelerate RFS. The driver must: 1. Set net_device::rx_cpu_rmap to a cpu_rmap of the RX completion IRQs (in queue order). This will provide a mapping from CPUs to the queues for which completions are handled nearest to them. 2. Implement net_device_ops::ndo_rx_flow_steer. This operation adds or replaces a filter steering the given flow to the given RX queue, if possible. 3. Periodically remove filters for which rps_may_expire_flow() returns true. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 33 ++++++++++++++-- net/Kconfig | 6 +++ net/core/dev.c | 97 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 127 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 371fa8839d51..a335f2022690 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -554,14 +554,16 @@ struct rps_map { #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16))) /* - * The rps_dev_flow structure contains the mapping of a flow to a CPU and the - * tail pointer for that CPU's input queue at the time of last enqueue. + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the + * tail pointer for that CPU's input queue at the time of last enqueue, and + * a hardware filter index. */ struct rps_dev_flow { u16 cpu; - u16 fill; + u16 filter; unsigned int last_qtail; }; +#define RPS_NO_FILTER 0xffff /* * The rps_dev_flow_table structure contains a table of flow mappings. @@ -611,6 +613,11 @@ static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table, extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; +#ifdef CONFIG_RFS_ACCEL +extern bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id); +#endif + /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { struct rps_map __rcu *rps_map; @@ -769,6 +776,13 @@ struct netdev_tc_txq { * is always called from the stack with the rtnl lock held and netif tx * queues stopped. This allows the netdevice to perform queue management * safely. + * + * RFS acceleration. + * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, + * u16 rxq_index, u32 flow_id); + * Set hardware filter for RFS. rxq_index is the target queue index; + * flow_id is a flow ID to be passed to rps_may_expire_flow() later. + * Return the filter ID on success, or a negative error code. */ #define HAVE_NET_DEVICE_OPS struct net_device_ops { @@ -842,6 +856,12 @@ struct net_device_ops { int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); #endif +#ifdef CONFIG_RFS_ACCEL + int (*ndo_rx_flow_steer)(struct net_device *dev, + const struct sk_buff *skb, + u16 rxq_index, + u32 flow_id); +#endif }; /* @@ -1056,6 +1076,13 @@ struct net_device { /* Number of RX queues currently active in device */ unsigned int real_num_rx_queues; + +#ifdef CONFIG_RFS_ACCEL + /* CPU reverse-mapping for RX completion interrupts, indexed + * by RX queue number. Assigned by driver. This must only be + * set if the ndo_rx_flow_steer operation is defined. */ + struct cpu_rmap *rx_cpu_rmap; +#endif #endif rx_handler_func_t __rcu *rx_handler; diff --git a/net/Kconfig b/net/Kconfig index 72840626284b..79cabf1ee68b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -221,6 +221,12 @@ config RPS depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS default y +config RFS_ACCEL + boolean + depends on RPS && GENERIC_HARDIRQS + select CPU_RMAP + default y + config XPS boolean depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS diff --git a/net/core/dev.c b/net/core/dev.c index d162ba8d622d..aa761472f9e2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2588,6 +2589,53 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow *rflow, u16 next_cpu) +{ + u16 tcpu; + + tcpu = rflow->cpu = next_cpu; + if (tcpu != RPS_NO_CPU) { +#ifdef CONFIG_RFS_ACCEL + struct netdev_rx_queue *rxqueue; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *old_rflow; + u32 flow_id; + u16 rxq_index; + int rc; + + /* Should we steer this flow to a different hardware queue? */ + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap) + goto out; + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); + if (rxq_index == skb_get_rx_queue(skb)) + goto out; + + rxqueue = dev->_rx + rxq_index; + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (!flow_table) + goto out; + flow_id = skb->rxhash & flow_table->mask; + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, + rxq_index, flow_id); + if (rc < 0) + goto out; + old_rflow = rflow; + rflow = &flow_table->flows[flow_id]; + rflow->cpu = next_cpu; + rflow->filter = rc; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = RPS_NO_FILTER; + out: +#endif + rflow->last_qtail = + per_cpu(softnet_data, tcpu).input_queue_head; + } + + return rflow; +} + /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. @@ -2658,12 +2706,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) { - tcpu = rflow->cpu = next_cpu; - if (tcpu != RPS_NO_CPU) - rflow->last_qtail = per_cpu(softnet_data, - tcpu).input_queue_head; - } + rflow->last_qtail)) >= 0)) + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; @@ -2684,6 +2729,46 @@ done: return cpu; } +#ifdef CONFIG_RFS_ACCEL + +/** + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() + * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. + */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *rflow; + bool expire = true; + int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = ACCESS_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + ((int)(per_cpu(softnet_data, cpu).input_queue_head - + rflow->last_qtail) < + (int)(10 * flow_table->mask))) + expire = false; + } + rcu_read_unlock(); + return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ + /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { -- cgit v1.2.3 From 04ed3e741d0f133e02bed7fa5c98edba128f90e7 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Mon, 24 Jan 2011 15:32:47 -0800 Subject: net: change netdev->features to u32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quoting Ben Hutchings: we presumably won't be defining features that can only be enabled on 64-bit architectures. Occurences found by `grep -r` on net/, drivers/net, include/ [ Move features and vlan_features next to each other in struct netdev, as per Eric Dumazet's suggestion -DaveM ] Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- drivers/net/bnx2.c | 2 +- drivers/net/bonding/bond_main.c | 4 ++-- drivers/net/myri10ge/myri10ge.c | 4 ++-- drivers/net/sfc/ethtool.c | 4 ++-- drivers/net/sfc/net_driver.h | 2 +- drivers/net/tun.c | 2 +- include/linux/netdevice.h | 24 ++++++++++++------------ include/linux/skbuff.h | 2 +- include/net/protocol.h | 4 ++-- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/8021q/vlan.c | 2 +- net/bridge/br_if.c | 2 +- net/bridge/br_private.h | 2 +- net/core/dev.c | 15 +++++++-------- net/core/ethtool.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/skbuff.c | 4 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/udp.c | 2 +- 23 files changed, 45 insertions(+), 46 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index df99edf3464a..cab96fa4cd3a 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -8312,7 +8312,7 @@ static const struct net_device_ops bnx2_netdev_ops = { #endif }; -static void inline vlan_features_add(struct net_device *dev, unsigned long flags) +static void inline vlan_features_add(struct net_device *dev, u32 flags) { dev->vlan_features |= flags; } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 163e0b06eaa5..7047b406b8ba 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1372,8 +1372,8 @@ static int bond_compute_features(struct bonding *bond) { struct slave *slave; struct net_device *bond_dev = bond->dev; - unsigned long features = bond_dev->features; - unsigned long vlan_features = 0; + u32 features = bond_dev->features; + u32 vlan_features = 0; unsigned short max_hard_header_len = max((u16)ETH_HLEN, bond_dev->hard_header_len); int i; diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index ea5cfe2c3a04..a7f2eed9a08a 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -253,7 +253,7 @@ struct myri10ge_priv { unsigned long serial_number; int vendor_specific_offset; int fw_multicast_support; - unsigned long features; + u32 features; u32 max_tso6; u32 read_dma; u32 write_dma; @@ -1776,7 +1776,7 @@ static int myri10ge_set_rx_csum(struct net_device *netdev, u32 csum_enabled) static int myri10ge_set_tso(struct net_device *netdev, u32 tso_enabled) { struct myri10ge_priv *mgp = netdev_priv(netdev); - unsigned long flags = mgp->features & (NETIF_F_TSO6 | NETIF_F_TSO); + u32 flags = mgp->features & (NETIF_F_TSO6 | NETIF_F_TSO); if (tso_enabled) netdev->features |= flags; diff --git a/drivers/net/sfc/ethtool.c b/drivers/net/sfc/ethtool.c index 0e8bb19ed60d..713969accdbd 100644 --- a/drivers/net/sfc/ethtool.c +++ b/drivers/net/sfc/ethtool.c @@ -502,7 +502,7 @@ static void efx_ethtool_get_stats(struct net_device *net_dev, static int efx_ethtool_set_tso(struct net_device *net_dev, u32 enable) { struct efx_nic *efx __attribute__ ((unused)) = netdev_priv(net_dev); - unsigned long features; + u32 features; features = NETIF_F_TSO; if (efx->type->offload_features & NETIF_F_V6_CSUM) @@ -519,7 +519,7 @@ static int efx_ethtool_set_tso(struct net_device *net_dev, u32 enable) static int efx_ethtool_set_tx_csum(struct net_device *net_dev, u32 enable) { struct efx_nic *efx = netdev_priv(net_dev); - unsigned long features = efx->type->offload_features & NETIF_F_ALL_CSUM; + u32 features = efx->type->offload_features & NETIF_F_ALL_CSUM; if (enable) net_dev->features |= features; diff --git a/drivers/net/sfc/net_driver.h b/drivers/net/sfc/net_driver.h index 28df8665256a..c65270241d2d 100644 --- a/drivers/net/sfc/net_driver.h +++ b/drivers/net/sfc/net_driver.h @@ -906,7 +906,7 @@ struct efx_nic_type { unsigned int phys_addr_channels; unsigned int tx_dc_base; unsigned int rx_dc_base; - unsigned long offload_features; + u32 offload_features; u32 reset_world_flags; }; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b100bd50a0d7..55786a0efc41 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1142,7 +1142,7 @@ static int tun_get_iff(struct net *net, struct tun_struct *tun, * privs required. */ static int set_offload(struct net_device *dev, unsigned long arg) { - unsigned int old_features, features; + u32 old_features, features; old_features = dev->features; /* Unset features, set them as we chew on the arg. */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a335f2022690..0de3c59720fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -914,7 +914,11 @@ struct net_device { struct list_head unreg_list; /* Net device features */ - unsigned long features; + u32 features; + + /* VLAN feature mask */ + u32 vlan_features; + #define NETIF_F_SG 1 /* Scatter/gather IO. */ #define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */ #define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */ @@ -1176,9 +1180,6 @@ struct net_device { /* rtnetlink link ops */ const struct rtnl_link_ops *rtnl_link_ops; - /* VLAN feature mask */ - unsigned long vlan_features; - /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; @@ -1401,7 +1402,7 @@ struct packet_type { struct packet_type *, struct net_device *); struct sk_buff *(*gso_segment)(struct sk_buff *skb, - int features); + u32 features); int (*gso_send_check)(struct sk_buff *skb); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); @@ -2370,7 +2371,7 @@ extern int netdev_tstamp_prequeue; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); extern int skb_checksum_help(struct sk_buff *skb); -extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features); +extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features); #ifdef CONFIG_BUG extern void netdev_rx_csum_fault(struct net_device *dev); #else @@ -2397,22 +2398,21 @@ extern char *netdev_drivername(const struct net_device *dev, char *buffer, int l extern void linkwatch_run_queue(void); -unsigned long netdev_increment_features(unsigned long all, unsigned long one, - unsigned long mask); -unsigned long netdev_fix_features(unsigned long features, const char *name); +u32 netdev_increment_features(u32 all, u32 one, u32 mask); +u32 netdev_fix_features(u32 features, const char *name); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -int netif_skb_features(struct sk_buff *skb); +u32 netif_skb_features(struct sk_buff *skb); -static inline int net_gso_ok(int features, int gso_type) +static inline int net_gso_ok(u32 features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; return (features & feature) == feature; } -static inline int skb_gso_ok(struct sk_buff *skb, int features) +static inline int skb_gso_ok(struct sk_buff *skb, u32 features) { return net_gso_ok(features, skb_shinfo(skb)->gso_type) && (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6e946da9d1d6..31f02d0b46a7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1877,7 +1877,7 @@ extern void skb_split(struct sk_buff *skb, extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); -extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); +extern struct sk_buff *skb_segment(struct sk_buff *skb, u32 features); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/include/net/protocol.h b/include/net/protocol.h index dc07495bce4c..6f7eb800974a 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -38,7 +38,7 @@ struct net_protocol { void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); struct sk_buff *(*gso_segment)(struct sk_buff *skb, - int features); + u32 features); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb); @@ -57,7 +57,7 @@ struct inet6_protocol { int (*gso_send_check)(struct sk_buff *skb); struct sk_buff *(*gso_segment)(struct sk_buff *skb, - int features); + u32 features); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb); diff --git a/include/net/tcp.h b/include/net/tcp.h index 38509f047382..917911165e3b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1404,7 +1404,7 @@ extern struct request_sock_ops tcp6_request_sock_ops; extern void tcp_v4_destroy_sock(struct sock *sk); extern int tcp_v4_gso_send_check(struct sk_buff *skb); -extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features); +extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features); extern struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb); extern struct sk_buff **tcp4_gro_receive(struct sk_buff **head, diff --git a/include/net/udp.h b/include/net/udp.h index bb967dd59bf7..e82f3a8c0f8f 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -245,5 +245,5 @@ extern void udp4_proc_exit(void); extern void udp_init(void); extern int udp4_ufo_send_check(struct sk_buff *skb); -extern struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features); +extern struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features); #endif /* _UDP_H */ diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 6e64f7c6a2e9..7850412f52b7 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -327,7 +327,7 @@ static void vlan_sync_address(struct net_device *dev, static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { - unsigned long old_features = vlandev->features; + u32 old_features = vlandev->features; vlandev->features &= ~dev->vlan_features; vlandev->features |= dev->features & dev->vlan_features; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index d9d1e2bac1d6..52ce4a30f8b3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -365,7 +365,7 @@ int br_min_mtu(const struct net_bridge *br) void br_features_recompute(struct net_bridge *br) { struct net_bridge_port *p; - unsigned long features, mask; + u32 features, mask; features = mask = br->feature_mask; if (list_empty(&br->port_list)) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 84aac7734bfc..9f22898c5359 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -182,7 +182,7 @@ struct net_bridge struct br_cpu_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; - unsigned long feature_mask; + u32 feature_mask; #ifdef CONFIG_BRIDGE_NETFILTER struct rtable fake_rtable; bool nf_call_iptables; diff --git a/net/core/dev.c b/net/core/dev.c index ad3741898584..7103f89fde0c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1858,7 +1858,7 @@ EXPORT_SYMBOL(skb_checksum_help); * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) +struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; @@ -2046,7 +2046,7 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) protocol == htons(ETH_P_FCOE))); } -static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features) +static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) { if (!can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; @@ -2058,10 +2058,10 @@ static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features return features; } -int netif_skb_features(struct sk_buff *skb) +u32 netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - int features = skb->dev->features; + u32 features = skb->dev->features; if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2106,7 +2106,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int rc = NETDEV_TX_OK; if (likely(!skb->next)) { - int features; + u32 features; /* * If device doesnt need skb->dst, release it right now while @@ -5213,7 +5213,7 @@ static void rollback_registered(struct net_device *dev) rollback_registered_many(&single); } -unsigned long netdev_fix_features(unsigned long features, const char *name) +u32 netdev_fix_features(u32 features, const char *name) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && @@ -6143,8 +6143,7 @@ static int dev_cpu_callback(struct notifier_block *nfb, * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ -unsigned long netdev_increment_features(unsigned long all, unsigned long one, - unsigned long mask) +u32 netdev_increment_features(u32 all, u32 one, u32 mask) { /* If device needs checksumming, downgrade to it. */ if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17741782a345..bd1af99e1122 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1458,7 +1458,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) void __user *useraddr = ifr->ifr_data; u32 ethcmd; int rc; - unsigned long old_features; + u32 old_features; if (!dev || !netif_device_present(dev)) return -ENODEV; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e23c01be5a5b..81367ccf3306 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -99,7 +99,7 @@ NETDEVICE_SHOW(addr_assign_type, fmt_dec); NETDEVICE_SHOW(addr_len, fmt_dec); NETDEVICE_SHOW(iflink, fmt_dec); NETDEVICE_SHOW(ifindex, fmt_dec); -NETDEVICE_SHOW(features, fmt_long_hex); +NETDEVICE_SHOW(features, fmt_hex); NETDEVICE_SHOW(type, fmt_dec); NETDEVICE_SHOW(link_mode, fmt_dec); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d31bb36ae0dc..436c4c439240 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2497,7 +2497,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum); * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ -struct sk_buff *skb_segment(struct sk_buff *skb, int features) +struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; @@ -2507,7 +2507,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) unsigned int offset = doffset; unsigned int headroom; unsigned int len; - int sg = features & NETIF_F_SG; + int sg = !!(features & NETIF_F_SG); int nfrags = skb_shinfo(skb)->nr_frags; int err = -ENOMEM; int i = 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2b61107df6c..e5e2d9d64abb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1215,7 +1215,7 @@ out: return err; } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct iphdr *iph; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6c11eece262c..f9867d2dbef4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct tcphdr *th; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8157b17959ee..d37baaa1dbe3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2199,7 +2199,7 @@ int udp4_ufo_send_check(struct sk_buff *skb) return 0; } -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 978e80e2c4a8..3194aa909872 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -772,7 +772,7 @@ out: return err; } -static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9a009c66c8a3..a419a787eb69 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1299,7 +1299,7 @@ static int udp6_ufo_send_check(struct sk_buff *skb) return 0; } -static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, int features) +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; -- cgit v1.2.3 From acd1130e8793fb150fb522da8ec51675839eb4b1 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Mon, 24 Jan 2011 15:45:15 -0800 Subject: net: reduce and unify printk level in netdev_fix_features() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce printk() levels to KERN_INFO in netdev_fix_features() as this will be used by ethtool and might spam dmesg unnecessarily. This converts the function to use netdev_info() instead of plain printk(). As a side effect, bonding and bridge devices will now log dropped features on every slave device change. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 ++-- include/linux/netdevice.h | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 33 ++++++++++++--------------------- 4 files changed, 16 insertions(+), 25 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 7047b406b8ba..1df9f0ea9184 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1400,8 +1400,8 @@ static int bond_compute_features(struct bonding *bond) done: features |= (bond_dev->features & BOND_VLAN_FEATURES); - bond_dev->features = netdev_fix_features(features, NULL); - bond_dev->vlan_features = netdev_fix_features(vlan_features, NULL); + bond_dev->features = netdev_fix_features(bond_dev, features); + bond_dev->vlan_features = netdev_fix_features(bond_dev, vlan_features); bond_dev->hard_header_len = max_hard_header_len; return 0; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0de3c59720fa..8858422c5c5d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2399,7 +2399,7 @@ extern char *netdev_drivername(const struct net_device *dev, char *buffer, int l extern void linkwatch_run_queue(void); u32 netdev_increment_features(u32 all, u32 one, u32 mask); -u32 netdev_fix_features(u32 features, const char *name); +u32 netdev_fix_features(struct net_device *dev, u32 features); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 52ce4a30f8b3..2a6801d8b728 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -379,7 +379,7 @@ void br_features_recompute(struct net_bridge *br) } done: - br->dev->features = netdev_fix_features(features, NULL); + br->dev->features = netdev_fix_features(br->dev, features); } /* called with RTNL */ diff --git a/net/core/dev.c b/net/core/dev.c index 7103f89fde0c..1b4c07fe295f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5213,58 +5213,49 @@ static void rollback_registered(struct net_device *dev) rollback_registered_many(&single); } -u32 netdev_fix_features(u32 features, const char *name) +u32 netdev_fix_features(struct net_device *dev, u32 features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - if (name) - printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", - name); + netdev_info(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } if ((features & NETIF_F_NO_CSUM) && (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - if (name) - printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", - name); + netdev_info(dev, "mixed no checksumming and other settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); } /* Fix illegal SG+CSUM combinations. */ if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " - "checksum feature.\n", name); + netdev_info(dev, + "Dropping NETIF_F_SG since no checksum feature.\n"); features &= ~NETIF_F_SG; } /* TSO requires that SG is present as well. */ if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { - if (name) - printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " - "SG feature.\n", name); + netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n"); features &= ~NETIF_F_TSO; } + /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { /* maybe split UFO into V4 and V6? */ if (!((features & NETIF_F_GEN_CSUM) || (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no checksum offload features.\n", - name); + netdev_info(dev, + "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; } if (!(features & NETIF_F_SG)) { - if (name) - printk(KERN_ERR "%s: Dropping NETIF_F_UFO " - "since no NETIF_F_SG feature.\n", name); + netdev_info(dev, + "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); features &= ~NETIF_F_UFO; } } @@ -5407,7 +5398,7 @@ int register_netdevice(struct net_device *dev) if (dev->iflink == -1) dev->iflink = dev->ifindex; - dev->features = netdev_fix_features(dev->features, dev->name); + dev->features = netdev_fix_features(dev, dev->features); /* Enable software GSO if SG is supported. */ if (dev->features & NETIF_F_SG) -- cgit v1.2.3 From ccf434380d1a67df2dcb9113206b77d0cb0a1cef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2011 18:08:02 +0000 Subject: net: fix dev_seq_next() Commit c6d14c84566d (net: Introduce for_each_netdev_rcu() iterator) added a race in dev_seq_next(). The rcu_dereference() call should be done _before_ testing the end of list, or we might return a wrong net_device if a concurrent thread changes net_device list under us. Note : discovered thanks to a sparse warning : net/core/dev.c:3919:9: error: incompatible types in comparison expression (different address spaces) Signed-off-by: Eric Dumazet CC: Paul E. McKenney Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 ++++++++- net/core/dev.c | 11 +++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8858422c5c5d..c7d707452228 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1447,7 +1447,7 @@ static inline struct net_device *next_net_device_rcu(struct net_device *dev) struct net *net; net = dev_net(dev); - lh = rcu_dereference(dev->dev_list.next); + lh = rcu_dereference(list_next_rcu(&dev->dev_list)); return lh == &net->dev_base_head ? NULL : net_device_entry(lh); } @@ -1457,6 +1457,13 @@ static inline struct net_device *first_net_device(struct net *net) net_device_entry(net->dev_base_head.next); } +static inline struct net_device *first_net_device_rcu(struct net *net) +{ + struct list_head *lh = rcu_dereference(list_next_rcu(&net->dev_base_head)); + + return lh == &net->dev_base_head ? NULL : net_device_entry(lh); +} + extern int netdev_boot_setup_check(struct net_device *dev); extern unsigned long netdev_boot_base(const char *prefix, int unit); extern struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, diff --git a/net/core/dev.c b/net/core/dev.c index 1b4c07fe295f..ddd5df2b61d4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4051,12 +4051,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = (v == SEQ_START_TOKEN) ? - first_net_device(seq_file_net(seq)) : - next_net_device((struct net_device *)v); + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + dev = first_net_device_rcu(seq_file_net(seq)); + else + dev = next_net_device_rcu(dev); ++*pos; - return rcu_dereference(dev); + return dev; } void dev_seq_stop(struct seq_file *seq, void *v) -- cgit v1.2.3 From d59cfde2fb960b5970ccb5a38cea25d38b37a8e8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 12 Feb 2011 00:46:06 +0000 Subject: net: remove the unnecessary dance around skb_bond_should_drop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to check (master) twice and to drive in and out the header file. Signed-off-by: Jiri Pirko Reviewed-by: Nicolas de Pesloüan Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ----------- net/core/dev.c | 6 +++--- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c7d707452228..5a5baeaaa50f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2437,17 +2437,6 @@ static inline void netif_set_gso_max_size(struct net_device *dev, dev->gso_max_size = size; } -extern int __skb_bond_should_drop(struct sk_buff *skb, - struct net_device *master); - -static inline int skb_bond_should_drop(struct sk_buff *skb, - struct net_device *master) -{ - if (master) - return __skb_bond_should_drop(skb, master); - return 0; -} - extern struct pernet_operations __net_initdata loopback_net_ops; static inline int dev_ethtool_get_settings(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 6392ea0a5910..d874fd1baf49 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3105,7 +3105,8 @@ static inline void skb_bond_set_mac_by_master(struct sk_buff *skb, * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and * ARP on active-backup slaves with arp_validate enabled. */ -int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) +static int __skb_bond_should_drop(struct sk_buff *skb, + struct net_device *master) { struct net_device *dev = skb->dev; @@ -3139,7 +3140,6 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master) } return 0; } -EXPORT_SYMBOL(__skb_bond_should_drop); static int __netif_receive_skb(struct sk_buff *skb) { @@ -3177,7 +3177,7 @@ static int __netif_receive_skb(struct sk_buff *skb) if (skb->deliver_no_wcard) null_or_orig = orig_dev; else if (master) { - if (skb_bond_should_drop(skb, master)) { + if (__skb_bond_should_drop(skb, master)) { skb->deliver_no_wcard = 1; null_or_orig = orig_dev; /* deliver only exact match */ } else -- cgit v1.2.3 From 1765a575334f1a232c1478accdee5c7d19f4b3e3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 12 Feb 2011 06:48:36 +0000 Subject: net: make dev->master general dev->master is now tightly connected to bonding driver. This patch makes this pointer more general and ready to be used by others. - netdev_set_master() - bond specifics moved to new function netdev_set_bond_master() - introduced netif_is_bond_slave() to check if device is a bonding slave Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/infiniband/hw/nes/nes.c | 3 ++- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/net/bonding/bond_main.c | 10 ++++---- drivers/net/cxgb3/cxgb3_offload.c | 3 ++- include/linux/netdevice.h | 7 ++++++ net/core/dev.c | 49 ++++++++++++++++++++++++++++---------- 6 files changed, 54 insertions(+), 20 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 3b4ec3238ceb..3d7f3664b67b 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -153,7 +153,8 @@ static int nes_inetaddr_event(struct notifier_block *notifier, nesdev, nesdev->netdev[0]->name); netdev = nesdev->netdev[0]; nesvnic = netdev_priv(netdev); - is_bonded = (netdev->master == event_netdev); + is_bonded = netif_is_bond_slave(netdev) && + (netdev->master == event_netdev); if ((netdev == event_netdev) || is_bonded) { if (nesvnic->rdma_enabled == 0) { nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since" diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 009ec814d517..ec3aa11c36cb 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1118,7 +1118,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpi return rc; } - if (nesvnic->netdev->master) + if (netif_is_bond_slave(netdev)) netdev = nesvnic->netdev->master; else netdev = nesvnic->netdev; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1df9f0ea9184..9f877878d636 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1594,9 +1594,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) } } - res = netdev_set_master(slave_dev, bond_dev); + res = netdev_set_bond_master(slave_dev, bond_dev); if (res) { - pr_debug("Error %d calling netdev_set_master\n", res); + pr_debug("Error %d calling netdev_set_bond_master\n", res); goto err_restore_mac; } /* open the slave since the application closed it */ @@ -1812,7 +1812,7 @@ err_close: dev_close(slave_dev); err_unset_master: - netdev_set_master(slave_dev, NULL); + netdev_set_bond_master(slave_dev, NULL); err_restore_mac: if (!bond->params.fail_over_mac) { @@ -1992,7 +1992,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev) netif_addr_unlock_bh(bond_dev); } - netdev_set_master(slave_dev, NULL); + netdev_set_bond_master(slave_dev, NULL); #ifdef CONFIG_NET_POLL_CONTROLLER read_lock_bh(&bond->lock); @@ -2114,7 +2114,7 @@ static int bond_release_all(struct net_device *bond_dev) netif_addr_unlock_bh(bond_dev); } - netdev_set_master(slave_dev, NULL); + netdev_set_bond_master(slave_dev, NULL); /* close slave before restoring its mac address */ dev_close(slave_dev); diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c index 7ea94b5205f8..862804f32b6e 100644 --- a/drivers/net/cxgb3/cxgb3_offload.c +++ b/drivers/net/cxgb3/cxgb3_offload.c @@ -186,9 +186,10 @@ static struct net_device *get_iff_from_mac(struct adapter *adapter, dev = NULL; if (grp) dev = vlan_group_get_device(grp, vlan); - } else + } else if (netif_is_bond_slave(dev)) { while (dev->master) dev = dev->master; + } return dev; } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a5baeaaa50f..5a42b1003767 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2377,6 +2377,8 @@ extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); +extern int netdev_set_bond_master(struct net_device *dev, + struct net_device *master); extern int skb_checksum_help(struct sk_buff *skb); extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features); #ifdef CONFIG_BUG @@ -2437,6 +2439,11 @@ static inline void netif_set_gso_max_size(struct net_device *dev, dev->gso_max_size = size; } +static inline int netif_is_bond_slave(struct net_device *dev) +{ + return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; +} + extern struct pernet_operations __net_initdata loopback_net_ops; static inline int dev_ethtool_get_settings(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index d874fd1baf49..a4132766d363 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3146,7 +3146,6 @@ static int __netif_receive_skb(struct sk_buff *skb) struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; - struct net_device *master; struct net_device *null_or_orig; struct net_device *orig_or_bond; int ret = NET_RX_DROP; @@ -3173,15 +3172,19 @@ static int __netif_receive_skb(struct sk_buff *skb) */ null_or_orig = NULL; orig_dev = skb->dev; - master = ACCESS_ONCE(orig_dev->master); if (skb->deliver_no_wcard) null_or_orig = orig_dev; - else if (master) { - if (__skb_bond_should_drop(skb, master)) { - skb->deliver_no_wcard = 1; - null_or_orig = orig_dev; /* deliver only exact match */ - } else - skb->dev = master; + else if (netif_is_bond_slave(orig_dev)) { + struct net_device *bond_master = ACCESS_ONCE(orig_dev->master); + + if (likely(bond_master)) { + if (__skb_bond_should_drop(skb, bond_master)) { + skb->deliver_no_wcard = 1; + /* deliver only exact match */ + null_or_orig = orig_dev; + } else + skb->dev = bond_master; + } } __this_cpu_inc(softnet_data.processed); @@ -4346,15 +4349,14 @@ static int __init dev_proc_init(void) /** - * netdev_set_master - set up master/slave pair + * netdev_set_master - set up master pointer * @slave: slave device * @master: new master device * * Changes the master device of the slave. Pass %NULL to break the * bonding. The caller must hold the RTNL semaphore. On a failure * a negative errno code is returned. On success the reference counts - * are adjusted, %RTM_NEWLINK is sent to the routing socket and the - * function returns zero. + * are adjusted and the function returns zero. */ int netdev_set_master(struct net_device *slave, struct net_device *master) { @@ -4374,6 +4376,29 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) synchronize_net(); dev_put(old); } + return 0; +} +EXPORT_SYMBOL(netdev_set_master); + +/** + * netdev_set_bond_master - set up bonding master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success %RTM_NEWLINK is sent + * to the routing socket and the function returns zero. + */ +int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +{ + int err; + + ASSERT_RTNL(); + + err = netdev_set_master(slave, master); + if (err) + return err; if (master) slave->flags |= IFF_SLAVE; else @@ -4382,7 +4407,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); return 0; } -EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_set_bond_master); static void dev_change_rx_flags(struct net_device *dev, int flags) { -- cgit v1.2.3 From fbaec0ea54f7d9131891ff98744e82c073ce03b1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 13 Feb 2011 10:15:37 +0000 Subject: rtnetlink: implement setting of master device This patch allows userspace to enslave/release slave devices via netlink interface using IFLA_MASTER. This introduces generic way to add/remove underling devices. Signed-off-by: Jiri Pirko Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 ++++++++++++ net/core/rtnetlink.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a42b1003767..d08ef6538579 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -783,6 +783,14 @@ struct netdev_tc_txq { * Set hardware filter for RFS. rxq_index is the target queue index; * flow_id is a flow ID to be passed to rps_may_expire_flow() later. * Return the filter ID on success, or a negative error code. + * + * Slave management functions (for bridge, bonding, etc). User should + * call netdev_set_master() to set dev->master properly. + * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev); + * Called to make another netdev an underling. + * + * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + * Called to release previously enslaved netdev. */ #define HAVE_NET_DEVICE_OPS struct net_device_ops { @@ -862,6 +870,10 @@ struct net_device_ops { u16 rxq_index, u32 flow_id); #endif + int (*ndo_add_slave)(struct net_device *dev, + struct net_device *slave_dev); + int (*ndo_del_slave)(struct net_device *dev, + struct net_device *slave_dev); }; /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index da0fe457c858..49f7ea5b4c75 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1036,6 +1036,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, + [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, @@ -1178,6 +1179,41 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) return err; } +static int do_set_master(struct net_device *dev, int ifindex) +{ + struct net_device *master_dev; + const struct net_device_ops *ops; + int err; + + if (dev->master) { + if (dev->master->ifindex == ifindex) + return 0; + ops = dev->master->netdev_ops; + if (ops->ndo_del_slave) { + err = ops->ndo_del_slave(dev->master, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + + if (ifindex) { + master_dev = __dev_get_by_index(dev_net(dev), ifindex); + if (!master_dev) + return -EINVAL; + ops = master_dev->netdev_ops; + if (ops->ndo_add_slave) { + err = ops->ndo_add_slave(master_dev, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + return 0; +} + static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { @@ -1301,6 +1337,13 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, goto errout; } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + if (err) + goto errout; + modified = 1; + } + if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); -- cgit v1.2.3 From 212b573f5552c60265da721ff9ce32e3462a2cdd Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Tue, 15 Feb 2011 16:59:16 +0000 Subject: ethtool: enable GSO and GRO by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d08ef6538579..168e3ad14daf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -984,6 +984,9 @@ struct net_device { NETIF_F_SG | NETIF_F_HIGHDMA | \ NETIF_F_FRAGLIST) + /* changeable features with no special hardware requirements */ +#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) + /* Interface index. Unique device identifier */ int ifindex; int iflink; diff --git a/net/core/dev.c b/net/core/dev.c index 4580460ebecd..8686f6ffe7f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5274,6 +5274,12 @@ u32 netdev_fix_features(struct net_device *dev, u32 features) features &= ~NETIF_F_TSO; } + /* Software GSO depends on SG. */ + if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { + netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + features &= ~NETIF_F_GSO; + } + /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { /* maybe split UFO into V4 and V6? */ @@ -5430,11 +5436,15 @@ int register_netdevice(struct net_device *dev) if (dev->iflink == -1) dev->iflink = dev->ifindex; - dev->features = netdev_fix_features(dev, dev->features); + /* Enable software offloads by default - will be stripped in + * netdev_fix_features() if not supported. */ + dev->features |= NETIF_F_SOFT_FEATURES; - /* Enable software GSO if SG is supported. */ - if (dev->features & NETIF_F_SG) - dev->features |= NETIF_F_GSO; + /* Avoid warning from netdev_fix_features() for GSO without SG */ + if (!(dev->features & NETIF_F_SG)) + dev->features &= ~NETIF_F_GSO; + + dev->features = netdev_fix_features(dev, dev->features); /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, * vlan_dev_init() will do the dev->features check, so these features -- cgit v1.2.3 From 0a417704777ed29d0e8c72b7274a328e61248e75 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Tue, 15 Feb 2011 16:59:17 +0000 Subject: ethtool: factorize get/set_one_feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to enable GRO even if RX csum is disabled. GRO will not be used for packets without hardware checksum anyway. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 + net/core/ethtool.c | 274 ++++++++++++++++++++++------------------------ 2 files changed, 138 insertions(+), 142 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 168e3ad14daf..dede3fdbb4be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -976,6 +976,12 @@ struct net_device { #define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM) #define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) +#define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) + +#define NETIF_F_ALL_TX_OFFLOADS (NETIF_F_ALL_CSUM | NETIF_F_SG | \ + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ + NETIF_F_SCTP_CSUM | NETIF_F_FCOE_CRC) + /* * If one device supports one of these features, then enable them * for all in netdev_increment_features. diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 85aaeab862a8..c3fb8f90de6d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -191,6 +191,109 @@ static void __ethtool_get_strings(struct net_device *dev, ops->get_strings(dev, stringset, data); } +static u32 ethtool_get_feature_mask(u32 eth_cmd) +{ + /* feature masks of legacy discrete ethtool ops */ + + switch (eth_cmd) { + case ETHTOOL_GTXCSUM: + case ETHTOOL_STXCSUM: + return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + case ETHTOOL_GSG: + case ETHTOOL_SSG: + return NETIF_F_SG; + case ETHTOOL_GTSO: + case ETHTOOL_STSO: + return NETIF_F_ALL_TSO; + case ETHTOOL_GUFO: + case ETHTOOL_SUFO: + return NETIF_F_UFO; + case ETHTOOL_GGSO: + case ETHTOOL_SGSO: + return NETIF_F_GSO; + case ETHTOOL_GGRO: + case ETHTOOL_SGRO: + return NETIF_F_GRO; + default: + BUG(); + } +} + +static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops) + return NULL; + + switch (ethcmd) { + case ETHTOOL_GTXCSUM: + return ops->get_tx_csum; + case ETHTOOL_SSG: + return ops->get_sg; + case ETHTOOL_STSO: + return ops->get_tso; + case ETHTOOL_SUFO: + return ops->get_ufo; + default: + return NULL; + } +} + +static int ethtool_get_one_feature(struct net_device *dev, + char __user *useraddr, u32 ethcmd) +{ + struct ethtool_value edata = { + .cmd = ethcmd, + .data = !!(dev->features & ethtool_get_feature_mask(ethcmd)), + }; + u32 (*actor)(struct net_device *); + + actor = __ethtool_get_one_feature_actor(dev, ethcmd); + if (actor) + edata.data = actor(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int __ethtool_set_tx_csum(struct net_device *dev, u32 data); +static int __ethtool_set_sg(struct net_device *dev, u32 data); +static int __ethtool_set_tso(struct net_device *dev, u32 data); +static int __ethtool_set_ufo(struct net_device *dev, u32 data); + +static int ethtool_set_one_feature(struct net_device *dev, + void __user *useraddr, u32 ethcmd) +{ + struct ethtool_value edata; + u32 mask; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + switch (ethcmd) { + case ETHTOOL_STXCSUM: + return __ethtool_set_tx_csum(dev, edata.data); + case ETHTOOL_SSG: + return __ethtool_set_sg(dev, edata.data); + case ETHTOOL_STSO: + return __ethtool_set_tso(dev, edata.data); + case ETHTOOL_SUFO: + return __ethtool_set_ufo(dev, edata.data); + case ETHTOOL_SGSO: + case ETHTOOL_SGRO: + mask = ethtool_get_feature_mask(ethcmd); + if (edata.data) + dev->features |= mask; + else + dev->features &= ~mask; + return 0; + default: + return -EOPNOTSUPP; + } +} + static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; @@ -1107,6 +1210,9 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data) { int err; + if (data && !(dev->features & NETIF_F_ALL_CSUM)) + return -EINVAL; + if (!data && dev->ethtool_ops->set_tso) { err = dev->ethtool_ops->set_tso(dev, 0); if (err) @@ -1121,24 +1227,20 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data) return dev->ethtool_ops->set_sg(dev, data); } -static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_tx_csum(struct net_device *dev, u32 data) { - struct ethtool_value edata; int err; if (!dev->ethtool_ops->set_tx_csum) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (!edata.data && dev->ethtool_ops->set_sg) { + if (!data && dev->ethtool_ops->set_sg) { err = __ethtool_set_sg(dev, 0); if (err) return err; } - return dev->ethtool_ops->set_tx_csum(dev, edata.data); + return dev->ethtool_ops->set_tx_csum(dev, data); } static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) @@ -1157,108 +1259,28 @@ static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_rx_csum(dev, edata.data); } -static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (!dev->ethtool_ops->set_sg) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (edata.data && - !(dev->features & NETIF_F_ALL_CSUM)) - return -EINVAL; - - return __ethtool_set_sg(dev, edata.data); -} - -static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_tso(struct net_device *dev, u32 data) { - struct ethtool_value edata; - if (!dev->ethtool_ops->set_tso) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (edata.data && !(dev->features & NETIF_F_SG)) + if (data && !(dev->features & NETIF_F_SG)) return -EINVAL; - return dev->ethtool_ops->set_tso(dev, edata.data); + return dev->ethtool_ops->set_tso(dev, data); } -static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_ufo(struct net_device *dev, u32 data) { - struct ethtool_value edata; - if (!dev->ethtool_ops->set_ufo) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - if (edata.data && !(dev->features & NETIF_F_SG)) + if (data && !(dev->features & NETIF_F_SG)) return -EINVAL; - if (edata.data && !((dev->features & NETIF_F_GEN_CSUM) || + if (data && !((dev->features & NETIF_F_GEN_CSUM) || (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) return -EINVAL; - return dev->ethtool_ops->set_ufo(dev, edata.data); -} - -static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GGSO }; - - edata.data = dev->features & NETIF_F_GSO; - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_gso(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - if (edata.data) - dev->features |= NETIF_F_GSO; - else - dev->features &= ~NETIF_F_GSO; - return 0; -} - -static int ethtool_get_gro(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GGRO }; - - edata.data = dev->features & NETIF_F_GRO; - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_gro(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (edata.data) { - u32 rxcsum = dev->ethtool_ops->get_rx_csum ? - dev->ethtool_ops->get_rx_csum(dev) : - ethtool_op_get_rx_csum(dev); - - if (!rxcsum) - return -EINVAL; - dev->features |= NETIF_F_GRO; - } else - dev->features &= ~NETIF_F_GRO; - - return 0; + return dev->ethtool_ops->set_ufo(dev, data); } static int ethtool_self_test(struct net_device *dev, char __user *useraddr) @@ -1590,33 +1612,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXCSUM: rc = ethtool_set_rx_csum(dev, useraddr); break; - case ETHTOOL_GTXCSUM: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_tx_csum ? - dev->ethtool_ops->get_tx_csum : - ethtool_op_get_tx_csum)); - break; - case ETHTOOL_STXCSUM: - rc = ethtool_set_tx_csum(dev, useraddr); - break; - case ETHTOOL_GSG: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_sg ? - dev->ethtool_ops->get_sg : - ethtool_op_get_sg)); - break; - case ETHTOOL_SSG: - rc = ethtool_set_sg(dev, useraddr); - break; - case ETHTOOL_GTSO: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_tso ? - dev->ethtool_ops->get_tso : - ethtool_op_get_tso)); - break; - case ETHTOOL_STSO: - rc = ethtool_set_tso(dev, useraddr); - break; case ETHTOOL_TEST: rc = ethtool_self_test(dev, useraddr); break; @@ -1632,21 +1627,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPERMADDR: rc = ethtool_get_perm_addr(dev, useraddr); break; - case ETHTOOL_GUFO: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_ufo ? - dev->ethtool_ops->get_ufo : - ethtool_op_get_ufo)); - break; - case ETHTOOL_SUFO: - rc = ethtool_set_ufo(dev, useraddr); - break; - case ETHTOOL_GGSO: - rc = ethtool_get_gso(dev, useraddr); - break; - case ETHTOOL_SGSO: - rc = ethtool_set_gso(dev, useraddr); - break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, (dev->ethtool_ops->get_flags ? @@ -1677,12 +1657,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; - case ETHTOOL_GGRO: - rc = ethtool_get_gro(dev, useraddr); - break; - case ETHTOOL_SGRO: - rc = ethtool_set_gro(dev, useraddr); - break; case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, useraddr); break; @@ -1704,6 +1678,22 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; + case ETHTOOL_GTXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GTSO: + case ETHTOOL_GUFO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + rc = ethtool_get_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_STXCSUM: + case ETHTOOL_SSG: + case ETHTOOL_STSO: + case ETHTOOL_SUFO: + case ETHTOOL_SGSO: + case ETHTOOL_SGRO: + rc = ethtool_set_one_feature(dev, useraddr, ethcmd); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From 5455c6998d34dc983a8693500e4dffefc3682dc5 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Tue, 15 Feb 2011 16:59:17 +0000 Subject: net: Introduce new feature setting ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces a new framework to handle device features setting. It consists of: - new fields in struct net_device: + hw_features - features that hw/driver supports toggling + wanted_features - features that user wants enabled, when possible - new netdev_ops: + feat = ndo_fix_features(dev, feat) - API checking constraints for enabling features or their combinations + ndo_set_features(dev) - API updating hardware state to match changed dev->features - new ethtool commands: + ETHTOOL_GFEATURES/ETHTOOL_SFEATURES: get/set dev->wanted_features and trigger device reconfiguration if resulting dev->features changed + ETHTOOL_GSTRINGS(ETH_SS_FEATURES): get feature bits names (meaning) Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/ethtool.h | 85 +++++++++++++++++++++++++++++++ include/linux/netdevice.h | 37 +++++++++++++- net/core/dev.c | 46 ++++++++++++++--- net/core/ethtool.c | 125 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 283 insertions(+), 10 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 1908929204a9..806e716bb4fb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -251,6 +251,7 @@ enum ethtool_stringset { ETH_SS_STATS, ETH_SS_PRIV_FLAGS, ETH_SS_NTUPLE_FILTERS, + ETH_SS_FEATURES, }; /* for passing string sets for data tagging */ @@ -523,6 +524,87 @@ struct ethtool_flash { char data[ETHTOOL_FLASH_MAX_FILENAME]; }; +/* for returning and changing feature sets */ + +/** + * struct ethtool_get_features_block - block with state of 32 features + * @available: mask of changeable features + * @requested: mask of features requested to be enabled if possible + * @active: mask of currently enabled features + * @never_changed: mask of features not changeable for any device + */ +struct ethtool_get_features_block { + __u32 available; + __u32 requested; + __u32 active; + __u32 never_changed; +}; + +/** + * struct ethtool_gfeatures - command to get state of device's features + * @cmd: command number = %ETHTOOL_GFEATURES + * @size: in: number of elements in the features[] array; + * out: number of elements in features[] needed to hold all features + * @features: state of features + */ +struct ethtool_gfeatures { + __u32 cmd; + __u32 size; + struct ethtool_get_features_block features[0]; +}; + +/** + * struct ethtool_set_features_block - block with request for 32 features + * @valid: mask of features to be changed + * @requested: values of features to be changed + */ +struct ethtool_set_features_block { + __u32 valid; + __u32 requested; +}; + +/** + * struct ethtool_sfeatures - command to request change in device's features + * @cmd: command number = %ETHTOOL_SFEATURES + * @size: array size of the features[] array + * @features: feature change masks + */ +struct ethtool_sfeatures { + __u32 cmd; + __u32 size; + struct ethtool_set_features_block features[0]; +}; + +/* + * %ETHTOOL_SFEATURES changes features present in features[].valid to the + * values of corresponding bits in features[].requested. Bits in .requested + * not set in .valid or not changeable are ignored. + * + * Returns %EINVAL when .valid contains undefined or never-changable bits + * or size is not equal to required number of features words (32-bit blocks). + * Returns >= 0 if request was completed; bits set in the value mean: + * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not + * changeable (not present in %ETHTOOL_GFEATURES' features[].available) + * those bits were ignored. + * %ETHTOOL_F_WISH - some or all changes requested were recorded but the + * resulting state of bits masked by .valid is not equal to .requested. + * Probably there are other device-specific constraints on some features + * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered + * here as though ignored bits were cleared. + * + * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of + * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands + * for ETH_SS_FEATURES string set. First entry in the table corresponds to least + * significant bit in features[0] fields. Empty strings mark undefined features. + */ +enum ethtool_sfeatures_retval_bits { + ETHTOOL_F_UNSUPPORTED__BIT, + ETHTOOL_F_WISH__BIT, +}; + +#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) +#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) + #ifdef __KERNEL__ #include @@ -744,6 +826,9 @@ struct ethtool_ops { #define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ #define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ +#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ +#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ + /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET #define SPARC_ETH_SSET ETHTOOL_SSET diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dede3fdbb4be..85f67e225f60 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -791,6 +791,18 @@ struct netdev_tc_txq { * * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. + * + * Feature/offload setting functions. + * u32 (*ndo_fix_features)(struct net_device *dev, u32 features); + * Adjusts the requested feature flags according to device-specific + * constraints, and returns the resulting flags. Must not modify + * the device state. + * + * int (*ndo_set_features)(struct net_device *dev, u32 features); + * Called to update device configuration to new features. Passed + * feature set might be less than what was returned by ndo_fix_features()). + * Must return >0 or -errno if it changed dev->features itself. + * */ #define HAVE_NET_DEVICE_OPS struct net_device_ops { @@ -874,6 +886,10 @@ struct net_device_ops { struct net_device *slave_dev); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + u32 (*ndo_fix_features)(struct net_device *dev, + u32 features); + int (*ndo_set_features)(struct net_device *dev, + u32 features); }; /* @@ -925,12 +941,18 @@ struct net_device { struct list_head napi_list; struct list_head unreg_list; - /* Net device features */ + /* currently active device features */ u32 features; - + /* user-changeable features */ + u32 hw_features; + /* user-requested features */ + u32 wanted_features; /* VLAN feature mask */ u32 vlan_features; + /* Net device feature bits; if you change something, + * also update netdev_features_strings[] in ethtool.c */ + #define NETIF_F_SG 1 /* Scatter/gather IO. */ #define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */ #define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */ @@ -966,6 +988,12 @@ struct net_device { #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) #define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT) + /* Features valid for ethtool to change */ + /* = all defined minus driver/device-class-related */ +#define NETIF_F_NEVER_CHANGE (NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \ + NETIF_F_LLTX | NETIF_F_NETNS_LOCAL) +#define NETIF_F_ETHTOOL_BITS (0x1f3fffff & ~NETIF_F_NEVER_CHANGE) + /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ NETIF_F_TSO6 | NETIF_F_UFO) @@ -2428,8 +2456,13 @@ extern char *netdev_drivername(const struct net_device *dev, char *buffer, int l extern void linkwatch_run_queue(void); +static inline u32 netdev_get_wanted_features(struct net_device *dev) +{ + return (dev->features & ~dev->hw_features) | dev->wanted_features; +} u32 netdev_increment_features(u32 all, u32 one, u32 mask); u32 netdev_fix_features(struct net_device *dev, u32 features); +void netdev_update_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index 8686f6ffe7f0..4f6943928fe8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5302,6 +5302,37 @@ u32 netdev_fix_features(struct net_device *dev, u32 features) } EXPORT_SYMBOL(netdev_fix_features); +void netdev_update_features(struct net_device *dev) +{ + u32 features; + int err = 0; + + features = netdev_get_wanted_features(dev); + + if (dev->netdev_ops->ndo_fix_features) + features = dev->netdev_ops->ndo_fix_features(dev, features); + + /* driver might be less strict about feature dependencies */ + features = netdev_fix_features(dev, features); + + if (dev->features == features) + return; + + netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n", + dev->features, features); + + if (dev->netdev_ops->ndo_set_features) + err = dev->netdev_ops->ndo_set_features(dev, features); + + if (!err) + dev->features = features; + else if (err < 0) + netdev_err(dev, + "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", + err, features, dev->features); +} +EXPORT_SYMBOL(netdev_update_features); + /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from @@ -5436,15 +5467,18 @@ int register_netdevice(struct net_device *dev) if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Enable software offloads by default - will be stripped in - * netdev_fix_features() if not supported. */ - dev->features |= NETIF_F_SOFT_FEATURES; + /* Transfer changeable features to wanted_features and enable + * software offloads (GSO and GRO). + */ + dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->wanted_features = (dev->features & dev->hw_features) + | NETIF_F_SOFT_FEATURES; /* Avoid warning from netdev_fix_features() for GSO without SG */ - if (!(dev->features & NETIF_F_SG)) - dev->features &= ~NETIF_F_GSO; + if (!(dev->wanted_features & NETIF_F_SG)) + dev->wanted_features &= ~NETIF_F_GSO; - dev->features = netdev_fix_features(dev, dev->features); + netdev_update_features(dev); /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, * vlan_dev_init() will do the dev->features check, so these features diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c3fb8f90de6d..95773960dc77 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -172,10 +172,120 @@ EXPORT_SYMBOL(ethtool_ntuple_flush); /* Handlers for each ethtool command */ +#define ETHTOOL_DEV_FEATURE_WORDS 1 + +static int ethtool_get_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gfeatures cmd = { + .cmd = ETHTOOL_GFEATURES, + .size = ETHTOOL_DEV_FEATURE_WORDS, + }; + struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = { + { + .available = dev->hw_features, + .requested = dev->wanted_features, + .active = dev->features, + .never_changed = NETIF_F_NEVER_CHANGE, + }, + }; + u32 __user *sizeaddr; + u32 copy_size; + + sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); + if (get_user(copy_size, sizeaddr)) + return -EFAULT; + + if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) + copy_size = ETHTOOL_DEV_FEATURE_WORDS; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_sfeatures cmd; + struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; + int ret = 0; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + + if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) + return -EINVAL; + + if (copy_from_user(features, useraddr, sizeof(features))) + return -EFAULT; + + if (features[0].valid & ~NETIF_F_ETHTOOL_BITS) + return -EINVAL; + + if (features[0].valid & ~dev->hw_features) { + features[0].valid &= dev->hw_features; + ret |= ETHTOOL_F_UNSUPPORTED; + } + + dev->wanted_features &= ~features[0].valid; + dev->wanted_features |= features[0].valid & features[0].requested; + netdev_update_features(dev); + + if ((dev->wanted_features ^ dev->features) & features[0].valid) + ret |= ETHTOOL_F_WISH; + + return ret; +} + +static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = { + /* NETIF_F_SG */ "tx-scatter-gather", + /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4", + /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded", + /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic", + /* NETIF_F_IPV6_CSUM */ "tx_checksum-ipv6", + /* NETIF_F_HIGHDMA */ "highdma", + /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist", + /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert", + + /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse", + /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter", + /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged", + /* NETIF_F_GSO */ "tx-generic-segmentation", + /* NETIF_F_LLTX */ "tx-lockless", + /* NETIF_F_NETNS_LOCAL */ "netns-local", + /* NETIF_F_GRO */ "rx-gro", + /* NETIF_F_LRO */ "rx-lro", + + /* NETIF_F_TSO */ "tx-tcp-segmentation", + /* NETIF_F_UFO */ "tx-udp-fragmentation", + /* NETIF_F_GSO_ROBUST */ "tx-gso-robust", + /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation", + /* NETIF_F_TSO6 */ "tx-tcp6-segmentation", + /* NETIF_F_FSO */ "tx-fcoe-segmentation", + "", + "", + + /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc", + /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp", + /* NETIF_F_FCOE_MTU */ "fcoe-mtu", + /* NETIF_F_NTUPLE */ "rx-ntuple-filter", + /* NETIF_F_RXHASH */ "rx-hashing", + "", + "", + "", +}; + static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; + if (sset == ETH_SS_FEATURES) + return ARRAY_SIZE(netdev_features_strings); + if (ops && ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -187,8 +297,12 @@ static void __ethtool_get_strings(struct net_device *dev, { const struct ethtool_ops *ops = dev->ethtool_ops; - /* ops->get_strings is valid because checked earlier */ - ops->get_strings(dev, stringset, data); + if (stringset == ETH_SS_FEATURES) + memcpy(data, netdev_features_strings, + sizeof(netdev_features_strings)); + else + /* ops->get_strings is valid because checked earlier */ + ops->get_strings(dev, stringset, data); } static u32 ethtool_get_feature_mask(u32 eth_cmd) @@ -1533,6 +1647,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GFEATURES: break; default: if (!capable(CAP_NET_ADMIN)) @@ -1678,6 +1793,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; + case ETHTOOL_GFEATURES: + rc = ethtool_get_features(dev, useraddr); + break; + case ETHTOOL_SFEATURES: + rc = ethtool_set_features(dev, useraddr); + break; case ETHTOOL_GTXCSUM: case ETHTOOL_GSG: case ETHTOOL_GTSO: -- cgit v1.2.3 From e83d360d9a7e5d71d55c13e96b19109a2ea23bf0 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Tue, 15 Feb 2011 16:59:18 +0000 Subject: net: introduce NETIF_F_RXCSUM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce NETIF_F_RXCSUM to replace device-private flags for RX checksum offload. Integrate it with ndo_fix_features. ethtool_op_get_rx_csum() is removed altogether as nothing in-tree uses it. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/ethtool.h | 1 - include/linux/netdevice.h | 5 ++++- net/core/ethtool.c | 47 +++++++++++++++++++++++------------------------ 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 806e716bb4fb..54d776c2c1b5 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -625,7 +625,6 @@ struct net_device; /* Some generic methods drivers may use in their ethtool_ops */ u32 ethtool_op_get_link(struct net_device *dev); -u32 ethtool_op_get_rx_csum(struct net_device *dev); u32 ethtool_op_get_tx_csum(struct net_device *dev); int ethtool_op_set_tx_csum(struct net_device *dev, u32 data); int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 85f67e225f60..ffe56c16df8a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -977,6 +977,7 @@ struct net_device { #define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/ #define NETIF_F_NTUPLE (1 << 27) /* N-tuple filters supported */ #define NETIF_F_RXHASH (1 << 28) /* Receive hashing offload */ +#define NETIF_F_RXCSUM (1 << 29) /* Receive checksumming offload */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 @@ -992,7 +993,7 @@ struct net_device { /* = all defined minus driver/device-class-related */ #define NETIF_F_NEVER_CHANGE (NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \ NETIF_F_LLTX | NETIF_F_NETNS_LOCAL) -#define NETIF_F_ETHTOOL_BITS (0x1f3fffff & ~NETIF_F_NEVER_CHANGE) +#define NETIF_F_ETHTOOL_BITS (0x3f3fffff & ~NETIF_F_NEVER_CHANGE) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ @@ -2510,6 +2511,8 @@ static inline int dev_ethtool_get_settings(struct net_device *dev, static inline u32 dev_ethtool_get_rx_csum(struct net_device *dev) { + if (dev->hw_features & NETIF_F_RXCSUM) + return !!(dev->features & NETIF_F_RXCSUM); if (!dev->ethtool_ops || !dev->ethtool_ops->get_rx_csum) return 0; return dev->ethtool_ops->get_rx_csum(dev); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 65b3d50a6df2..66cdc76770ce 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -34,12 +34,6 @@ u32 ethtool_op_get_link(struct net_device *dev) } EXPORT_SYMBOL(ethtool_op_get_link); -u32 ethtool_op_get_rx_csum(struct net_device *dev) -{ - return (dev->features & NETIF_F_ALL_CSUM) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_rx_csum); - u32 ethtool_op_get_tx_csum(struct net_device *dev) { return (dev->features & NETIF_F_ALL_CSUM) != 0; @@ -274,7 +268,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS /* NETIF_F_FCOE_MTU */ "fcoe-mtu", /* NETIF_F_NTUPLE */ "rx-ntuple-filter", /* NETIF_F_RXHASH */ "rx-hashing", - "", + /* NETIF_F_RXCSUM */ "rx-checksum", "", "", }; @@ -313,6 +307,9 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd) case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + case ETHTOOL_GRXCSUM: + case ETHTOOL_SRXCSUM: + return NETIF_F_RXCSUM; case ETHTOOL_GSG: case ETHTOOL_SSG: return NETIF_F_SG; @@ -343,6 +340,8 @@ static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) switch (ethcmd) { case ETHTOOL_GTXCSUM: return ops->get_tx_csum; + case ETHTOOL_GRXCSUM: + return ops->get_rx_csum; case ETHTOOL_SSG: return ops->get_sg; case ETHTOOL_STSO: @@ -354,6 +353,11 @@ static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) } } +static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev) +{ + return !!(dev->features & NETIF_F_ALL_CSUM); +} + static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr, u32 ethcmd) { @@ -369,6 +373,10 @@ static int ethtool_get_one_feature(struct net_device *dev, actor = __ethtool_get_one_feature_actor(dev, ethcmd); + /* bug compatibility with old get_rx_csum */ + if (ethcmd == ETHTOOL_GRXCSUM && !actor) + actor = __ethtool_get_rx_csum_oldbug; + if (actor) edata.data = actor(dev); } @@ -379,6 +387,7 @@ static int ethtool_get_one_feature(struct net_device *dev, } static int __ethtool_set_tx_csum(struct net_device *dev, u32 data); +static int __ethtool_set_rx_csum(struct net_device *dev, u32 data); static int __ethtool_set_sg(struct net_device *dev, u32 data); static int __ethtool_set_tso(struct net_device *dev, u32 data); static int __ethtool_set_ufo(struct net_device *dev, u32 data); @@ -416,6 +425,8 @@ static int ethtool_set_one_feature(struct net_device *dev, switch (ethcmd) { case ETHTOOL_STXCSUM: return __ethtool_set_tx_csum(dev, edata.data); + case ETHTOOL_SRXCSUM: + return __ethtool_set_rx_csum(dev, edata.data); case ETHTOOL_SSG: return __ethtool_set_sg(dev, edata.data); case ETHTOOL_STSO: @@ -1404,20 +1415,15 @@ static int __ethtool_set_tx_csum(struct net_device *dev, u32 data) return dev->ethtool_ops->set_tx_csum(dev, data); } -static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) +static int __ethtool_set_rx_csum(struct net_device *dev, u32 data) { - struct ethtool_value edata; - if (!dev->ethtool_ops->set_rx_csum) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - if (!edata.data && dev->ethtool_ops->set_sg) + if (!data) dev->features &= ~NETIF_F_GRO; - return dev->ethtool_ops->set_rx_csum(dev, edata.data); + return dev->ethtool_ops->set_rx_csum(dev, data); } static int __ethtool_set_tso(struct net_device *dev, u32 data) @@ -1765,15 +1771,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SPAUSEPARAM: rc = ethtool_set_pauseparam(dev, useraddr); break; - case ETHTOOL_GRXCSUM: - rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_rx_csum ? - dev->ethtool_ops->get_rx_csum : - ethtool_op_get_rx_csum)); - break; - case ETHTOOL_SRXCSUM: - rc = ethtool_set_rx_csum(dev, useraddr); - break; case ETHTOOL_TEST: rc = ethtool_self_test(dev, useraddr); break; @@ -1846,6 +1843,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_set_features(dev, useraddr); break; case ETHTOOL_GTXCSUM: + case ETHTOOL_GRXCSUM: case ETHTOOL_GSG: case ETHTOOL_GTSO: case ETHTOOL_GUFO: @@ -1854,6 +1852,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_get_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_STXCSUM: + case ETHTOOL_SRXCSUM: case ETHTOOL_SSG: case ETHTOOL_STSO: case ETHTOOL_SUFO: -- cgit v1.2.3 From 23b41168fc942a4a041325a04ecc1bd17d031a3e Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Sat, 26 Feb 2011 22:39:12 +0000 Subject: netdevice: make initial group visible to userspace INIT_NETDEV_GROUP is needed by userspace, move it outside __KERNEL__ guards. Signed-off-by: Vlad Dogaru Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ffe56c16df8a..8be4056ad251 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -75,9 +75,6 @@ struct wireless_dev; #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ -/* Initial net device group. All devices belong to group 0 by default. */ -#define INIT_NETDEV_GROUP 0 - /* * Transmit return codes: transmit return codes originate from three different * namespaces: @@ -141,6 +138,9 @@ static inline bool dev_xmit_complete(int rc) #define MAX_ADDR_LEN 32 /* Largest hardware address length */ +/* Initial net device group. All devices belong to group 0 by default. */ +#define INIT_NETDEV_GROUP 0 + #ifdef __KERNEL__ /* * Compute the worst case header length according to the protocols -- cgit v1.2.3 From 256ee435b9a9ee9cca69602fe8046b27ca99fbee Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 1 Mar 2011 07:06:12 +0000 Subject: netdevice: Convert printk to pr_info in netif_tx_stop_queue This allows any caller to be prefaced by any specific pr_fmt to better identify which device driver is using this function inappropriately. Add terminating newline. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8be4056ad251..71563e7b2bfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1765,8 +1765,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { if (WARN_ON(!dev_queue)) { - printk(KERN_INFO "netif_stop_queue() cannot be called before " - "register_netdev()"); + pr_info("netif_stop_queue() cannot be called before register_netdev()\n"); return; } set_bit(__QUEUE_STATE_XOFF, &dev_queue->state); -- cgit v1.2.3 From 6247e086188dd2ba5bfd64f9a876fe42b0cf39fb Mon Sep 17 00:00:00 2001 From: Yi Zou Date: Tue, 1 Feb 2011 07:22:06 +0000 Subject: net: add ndo_fcoe_ddp_target() to support FCoE DDP in target mode The Fiber Channel over Ethernet (FCoE) Direct Data Placement (DDP) can also be used for FCoE target, where the DDP used for read I/O on an initiator can be used on an FCoE target to speed up the write I/O to the target from the initiator. The added ndo_fcoe_ddp_target() works in the similar way as the existing ndo_fcoe_ddp_setup() to allow the underlying hardware set up the DDP context accordingly when it gets called from the FCoE target implementation on top the existing Open-FCoE fcoe/libfc protocol stack so without losing the ability to provide DDP for read I/O as an initiator, it can also provide DDP offload to the write I/O coming from the initiator as a target. Signed-off-by: Yi Zou Signed-off-by: Kiran Patil Tested-by: Kavindya Deegala Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 71563e7b2bfb..6bd5d460b7c1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -871,6 +871,10 @@ struct net_device_ops { unsigned int sgc); int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); + int (*ndo_fcoe_ddp_target)(struct net_device *dev, + u16 xid, + struct scatterlist *sgl, + unsigned int sgc); #define NETDEV_FCOE_WWNN 0 #define NETDEV_FCOE_WWPN 1 int (*ndo_fcoe_get_wwn)(struct net_device *dev, -- cgit v1.2.3 From 8909c9ad8ff03611c9c96c9a92656213e4bb495b Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Wed, 2 Mar 2011 00:33:13 +0300 Subject: net: don't allow CAP_NET_ADMIN to load non-netdev kernel modules Since a8f80e8ff94ecba629542d9b4b5f5a8ee3eb565c any process with CAP_NET_ADMIN may load any module from /lib/modules/. This doesn't mean that CAP_NET_ADMIN is a superset of CAP_SYS_MODULE as modules are limited to /lib/modules/**. However, CAP_NET_ADMIN capability shouldn't allow anybody load any module not related to networking. This patch restricts an ability of autoloading modules to netdev modules with explicit aliases. This fixes CVE-2011-1019. Arnd Bergmann suggested to leave untouched the old pre-v2.6.32 behavior of loading netdev modules by name (without any prefix) for processes with CAP_SYS_MODULE to maintain the compatibility with network scripts that use autoloading netdev modules by aliases like "eth0", "wlan0". Currently there are only three users of the feature in the upstream kernel: ipip, ip_gre and sit. root@albatros:~# capsh --drop=$(seq -s, 0 11),$(seq -s, 13 34) -- root@albatros:~# grep Cap /proc/$$/status CapInh: 0000000000000000 CapPrm: fffffff800001000 CapEff: fffffff800001000 CapBnd: fffffff800001000 root@albatros:~# modprobe xfs FATAL: Error inserting xfs (/lib/modules/2.6.38-rc6-00001-g2bf4ca3/kernel/fs/xfs/xfs.ko): Operation not permitted root@albatros:~# lsmod | grep xfs root@albatros:~# ifconfig xfs xfs: error fetching interface information: Device not found root@albatros:~# lsmod | grep xfs root@albatros:~# lsmod | grep sit root@albatros:~# ifconfig sit sit: error fetching interface information: Device not found root@albatros:~# lsmod | grep sit root@albatros:~# ifconfig sit0 sit0 Link encap:IPv6-in-IPv4 NOARP MTU:1480 Metric:1 root@albatros:~# lsmod | grep sit sit 10457 0 tunnel4 2957 1 sit For CAP_SYS_MODULE module loading is still relaxed: root@albatros:~# grep Cap /proc/$$/status CapInh: 0000000000000000 CapPrm: ffffffffffffffff CapEff: ffffffffffffffff CapBnd: ffffffffffffffff root@albatros:~# ifconfig xfs xfs: error fetching interface information: Device not found root@albatros:~# lsmod | grep xfs xfs 745319 0 Reference: https://lkml.org/lkml/2011/2/24/203 Signed-off-by: Vasiliy Kulikov Signed-off-by: Michael Tokarev Acked-by: David S. Miller Acked-by: Kees Cook Signed-off-by: James Morris --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 12 ++++++++++-- net/ipv4/ip_gre.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/sit.c | 2 +- 5 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d971346b0340..71caf7a5e6c6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2392,6 +2392,9 @@ extern int netdev_notice(const struct net_device *dev, const char *format, ...) extern int netdev_info(const struct net_device *dev, const char *format, ...) __attribute__ ((format (printf, 2, 3))); +#define MODULE_ALIAS_NETDEV(device) \ + MODULE_ALIAS("netdev-" device) + #if defined(DEBUG) #define netdev_dbg(__dev, format, args...) \ netdev_printk(KERN_DEBUG, __dev, format, ##args) diff --git a/net/core/dev.c b/net/core/dev.c index 8ae6631abcc2..6561021d22d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,13 +1114,21 @@ EXPORT_SYMBOL(netdev_bonding_change); void dev_load(struct net *net, const char *name) { struct net_device *dev; + int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); - if (!dev && capable(CAP_NET_ADMIN)) - request_module("%s", name); + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_err("Loading kernel module for a network device " +"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s " +"instead\n", name); + } } EXPORT_SYMBOL(dev_load); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6613edfac28c..d1d0e2c256fc 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1765,4 +1765,4 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); -MODULE_ALIAS("gre0"); +MODULE_ALIAS_NETDEV("gre0"); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 988f52fba54a..a5f58e7cbb26 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -913,4 +913,4 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); -MODULE_ALIAS("tunl0"); +MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 8ce38f10a547..d2c16e10f650 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1290,4 +1290,4 @@ static int __init sit_init(void) module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); -MODULE_ALIAS("sit0"); +MODULE_ALIAS_NETDEV("sit0"); -- cgit v1.2.3 From e9bce845c0cee1a492e5cee6a827ae71140fe8b3 Mon Sep 17 00:00:00 2001 From: Yi Zou Date: Wed, 9 Mar 2011 08:48:03 +0000 Subject: net: add proper documentation for previously added net_device_ops for FCoE Add proper documentation for previously added net_device_ops ops for FCoE. Signed-off-by: Yi Zou Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7a071535c4c0..604dbf5051d3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -777,6 +777,42 @@ struct netdev_tc_txq { * queues stopped. This allows the netdevice to perform queue management * safely. * + * Fiber Channel over Ethernet (FCoE) offload functions. + * int (*ndo_fcoe_enable)(struct net_device *dev); + * Called when the FCoE protocol stack wants to start using LLD for FCoE + * so the underlying device can perform whatever needed configuration or + * initialization to support acceleration of FCoE traffic. + * + * int (*ndo_fcoe_disable)(struct net_device *dev); + * Called when the FCoE protocol stack wants to stop using LLD for FCoE + * so the underlying device can perform whatever needed clean-ups to + * stop supporting acceleration of FCoE traffic. + * + * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, + * struct scatterlist *sgl, unsigned int sgc); + * Called when the FCoE Initiator wants to initialize an I/O that + * is a possible candidate for Direct Data Placement (DDP). The LLD can + * perform necessary setup and returns 1 to indicate the device is set up + * successfully to perform DDP on this I/O, otherwise this returns 0. + * + * int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); + * Called when the FCoE Initiator/Target is done with the DDPed I/O as + * indicated by the FC exchange id 'xid', so the underlying device can + * clean up and reuse resources for later DDP requests. + * + * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, + * struct scatterlist *sgl, unsigned int sgc); + * Called when the FCoE Target wants to initialize an I/O that + * is a possible candidate for Direct Data Placement (DDP). The LLD can + * perform necessary setup and returns 1 to indicate the device is set up + * successfully to perform DDP on this I/O, otherwise this returns 0. + * + * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); + * Called when the underlying device wants to override default World Wide + * Name (WWN) generation mechanism in FCoE protocol stack to pass its own + * World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE + * protocol stack to use. + * * RFS acceleration. * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, * u16 rxq_index, u32 flow_id); -- cgit v1.2.3 From 8a4eb5734e8d1dc60a8c28576bbbdfdcc643626d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 12 Mar 2011 03:14:39 +0000 Subject: net: introduce rx_handler results and logic around that MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allows rx_handlers to better signalize what to do next to it's caller. That makes skb->deliver_no_wcard no longer needed. kernel-doc for rx_handler_result is taken from Nicolas' patch. Signed-off-by: Jiri Pirko Reviewed-by: Nicolas de Pesloüan Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 22 +++++++++--------- drivers/net/macvlan.c | 11 ++++----- include/linux/netdevice.h | 50 ++++++++++++++++++++++++++++++++++++++++- include/linux/skbuff.h | 5 +---- net/bridge/br_input.c | 25 ++++++++++++--------- net/bridge/br_private.h | 2 +- net/core/dev.c | 21 +++++++++++------ net/core/skbuff.c | 1 - 8 files changed, 98 insertions(+), 39 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 04119b1e7cdb..27c413aa15da 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1480,20 +1480,23 @@ static bool bond_should_deliver_exact_match(struct sk_buff *skb, return false; } -static struct sk_buff *bond_handle_frame(struct sk_buff *skb) +static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) { + struct sk_buff *skb = *pskb; struct slave *slave; struct net_device *bond_dev; struct bonding *bond; - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(!skb)) - return NULL; - slave = bond_slave_get_rcu(skb->dev); bond_dev = ACCESS_ONCE(slave->dev->master); if (unlikely(!bond_dev)) - return skb; + return RX_HANDLER_PASS; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return RX_HANDLER_CONSUMED; + + *pskb = skb; bond = netdev_priv(bond_dev); @@ -1501,8 +1504,7 @@ static struct sk_buff *bond_handle_frame(struct sk_buff *skb) slave->dev->last_rx = jiffies; if (bond_should_deliver_exact_match(skb, slave, bond)) { - skb->deliver_no_wcard = 1; - return skb; + return RX_HANDLER_EXACT; } skb->dev = bond_dev; @@ -1514,12 +1516,12 @@ static struct sk_buff *bond_handle_frame(struct sk_buff *skb) if (unlikely(skb_cow_head(skb, skb->data - skb_mac_header(skb)))) { kfree_skb(skb); - return NULL; + return RX_HANDLER_CONSUMED; } memcpy(eth_hdr(skb)->h_dest, bond_dev->dev_addr, ETH_ALEN); } - return skb; + return RX_HANDLER_ANOTHER; } /* enslave device to bond device */ diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 497991bd3b64..5b37d3c191e4 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -152,9 +152,10 @@ static void macvlan_broadcast(struct sk_buff *skb, } /* called under rcu_read_lock() from netif_receive_skb */ -static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) +static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { struct macvlan_port *port; + struct sk_buff *skb = *pskb; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; @@ -184,7 +185,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA); - return skb; + return RX_HANDLER_PASS; } if (port->passthru) @@ -192,12 +193,12 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) else vlan = macvlan_hash_lookup(port, eth->h_dest); if (vlan == NULL) - return skb; + return RX_HANDLER_PASS; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); - return NULL; + return RX_HANDLER_CONSUMED; } len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); @@ -211,7 +212,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) out: macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, 0); - return NULL; + return RX_HANDLER_CONSUMED; } static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 604dbf5051d3..5eeb2cd3631c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -390,7 +390,55 @@ enum gro_result { }; typedef enum gro_result gro_result_t; -typedef struct sk_buff *rx_handler_func_t(struct sk_buff *skb); +/* + * enum rx_handler_result - Possible return values for rx_handlers. + * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it + * further. + * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in + * case skb->dev was changed by rx_handler. + * @RX_HANDLER_EXACT: Force exact delivery, no wildcard. + * @RX_HANDLER_PASS: Do nothing, passe the skb as if no rx_handler was called. + * + * rx_handlers are functions called from inside __netif_receive_skb(), to do + * special processing of the skb, prior to delivery to protocol handlers. + * + * Currently, a net_device can only have a single rx_handler registered. Trying + * to register a second rx_handler will return -EBUSY. + * + * To register a rx_handler on a net_device, use netdev_rx_handler_register(). + * To unregister a rx_handler on a net_device, use + * netdev_rx_handler_unregister(). + * + * Upon return, rx_handler is expected to tell __netif_receive_skb() what to + * do with the skb. + * + * If the rx_handler consumed to skb in some way, it should return + * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for + * the skb to be delivered in some other ways. + * + * If the rx_handler changed skb->dev, to divert the skb to another + * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the + * new device will be called if it exists. + * + * If the rx_handler consider the skb should be ignored, it should return + * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that + * are registred on exact device (ptype->dev == skb->dev). + * + * If the rx_handler didn't changed skb->dev, but want the skb to be normally + * delivered, it should return RX_HANDLER_PASS. + * + * A device without a registered rx_handler will behave as if rx_handler + * returned RX_HANDLER_PASS. + */ + +enum rx_handler_result { + RX_HANDLER_CONSUMED, + RX_HANDLER_ANOTHER, + RX_HANDLER_EXACT, + RX_HANDLER_PASS, +}; +typedef enum rx_handler_result rx_handler_result_t; +typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb); extern void __napi_schedule(struct napi_struct *n); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 31f02d0b46a7..24cfa626931e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -388,10 +388,7 @@ struct sk_buff { kmemcheck_bitfield_begin(flags2); __u16 queue_mapping:16; #ifdef CONFIG_IPV6_NDISC_NODETYPE - __u8 ndisc_nodetype:2, - deliver_no_wcard:1; -#else - __u8 deliver_no_wcard:1; + __u8 ndisc_nodetype:2; #endif __u8 ooo_okay:1; kmemcheck_bitfield_end(flags2); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 88e4aa9cb1f9..e2160792e1bc 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -139,21 +139,22 @@ static inline int is_link_local(const unsigned char *dest) * Return NULL if skb is handled * note: already called with rcu_read_lock */ -struct sk_buff *br_handle_frame(struct sk_buff *skb) +rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { struct net_bridge_port *p; + struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; br_should_route_hook_t *rhook; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) - return skb; + return RX_HANDLER_PASS; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) - return NULL; + return RX_HANDLER_CONSUMED; p = br_port_get_rcu(skb->dev); @@ -167,10 +168,12 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb) goto forward; if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, - NULL, br_handle_local_finish)) - return NULL; /* frame consumed by filter */ - else - return skb; /* continue processing */ + NULL, br_handle_local_finish)) { + return RX_HANDLER_CONSUMED; /* consumed by filter */ + } else { + *pskb = skb; + return RX_HANDLER_PASS; /* continue processing */ + } } forward: @@ -178,8 +181,10 @@ forward: case BR_STATE_FORWARDING: rhook = rcu_dereference(br_should_route_hook); if (rhook) { - if ((*rhook)(skb)) - return skb; + if ((*rhook)(skb)) { + *pskb = skb; + return RX_HANDLER_PASS; + } dest = eth_hdr(skb)->h_dest; } /* fall through */ @@ -194,5 +199,5 @@ forward: drop: kfree_skb(skb); } - return NULL; + return RX_HANDLER_CONSUMED; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f7afc364d777..19e2f46ed086 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -379,7 +379,7 @@ extern void br_features_recompute(struct net_bridge *br); /* br_input.c */ extern int br_handle_frame_finish(struct sk_buff *skb); -extern struct sk_buff *br_handle_frame(struct sk_buff *skb); +extern rx_handler_result_t br_handle_frame(struct sk_buff **pskb); /* br_ioctl.c */ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); diff --git a/net/core/dev.c b/net/core/dev.c index 0d39032e9621..0b88eba97dab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3070,6 +3070,8 @@ out: * on a failure. * * The caller must hold the rtnl_mutex. + * + * For a general description of rx_handler, see enum rx_handler_result. */ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, @@ -3129,6 +3131,7 @@ static int __netif_receive_skb(struct sk_buff *skb) rx_handler_func_t *rx_handler; struct net_device *orig_dev; struct net_device *null_or_dev; + bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; @@ -3181,18 +3184,22 @@ ncls: rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { - struct net_device *prev_dev; - if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - prev_dev = skb->dev; - skb = rx_handler(skb); - if (!skb) + switch (rx_handler(&skb)) { + case RX_HANDLER_CONSUMED: goto out; - if (skb->dev != prev_dev) + case RX_HANDLER_ANOTHER: goto another_round; + case RX_HANDLER_EXACT: + deliver_exact = true; + case RX_HANDLER_PASS: + break; + default: + BUG(); + } } if (vlan_tx_tag_present(skb)) { @@ -3210,7 +3217,7 @@ ncls: vlan_on_bond_hook(skb); /* deliver only exact match when indicated */ - null_or_dev = skb->deliver_no_wcard ? skb->dev : NULL; + null_or_dev = deliver_exact ? skb->dev : NULL; type = skb->protocol; list_for_each_entry_rcu(ptype, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1eb526a848ff..801dd08908f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -523,7 +523,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); new->priority = old->priority; - new->deliver_no_wcard = old->deliver_no_wcard; #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif -- cgit v1.2.3 From 4dd5ffe4fc36128dc86568ddeaeae359e6037762 Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Wed, 30 Mar 2011 23:58:08 +0000 Subject: net: Fix dev dev_ethtool_get_rx_csum() for forced NETIF_F_RXCSUM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dev_ethtool_get_rx_csum() won't report rx checksumming when it's not changeable and driver is converted to hw_features and friends. Fix this. (dev->hw_features & NETIF_F_RXCSUM) check is dropped - if the ethtool_ops->get_rx_csum is set, then driver is not coverted, yet. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5eeb2cd3631c..0249fe7e3872 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2598,8 +2598,8 @@ static inline int dev_ethtool_get_settings(struct net_device *dev, static inline u32 dev_ethtool_get_rx_csum(struct net_device *dev) { - if (dev->hw_features & NETIF_F_RXCSUM) - return !!(dev->features & NETIF_F_RXCSUM); + if (dev->features & NETIF_F_RXCSUM) + return 1; if (!dev->ethtool_ops || !dev->ethtool_ops->get_rx_csum) return 0; return dev->ethtool_ops->get_rx_csum(dev); -- cgit v1.2.3