diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 301 | ||||
-rw-r--r-- | net/core/dev.c | 172 | ||||
-rw-r--r-- | net/core/dev_addr_lists.c | 4 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 9 | ||||
-rw-r--r-- | net/core/dst.c | 24 | ||||
-rw-r--r-- | net/core/ethtool.c | 81 | ||||
-rw-r--r-- | net/core/filter.c | 97 | ||||
-rw-r--r-- | net/core/iovec.c | 47 | ||||
-rw-r--r-- | net/core/link_watch.c | 2 | ||||
-rw-r--r-- | net/core/neighbour.c | 279 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 44 | ||||
-rw-r--r-- | net/core/netpoll.c | 4 | ||||
-rw-r--r-- | net/core/pktgen.c | 3 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 154 | ||||
-rw-r--r-- | net/core/scm.c | 3 | ||||
-rw-r--r-- | net/core/skbuff.c | 347 | ||||
-rw-r--r-- | net/core/sock.c | 20 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 21 | ||||
-rw-r--r-- | net/core/utils.c | 3 |
19 files changed, 985 insertions, 630 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index fdbc9a81d4c2..df493d68330c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -49,6 +49,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/uio.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -309,16 +310,14 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) EXPORT_SYMBOL(skb_kill_datagram); /** - * skb_copy_datagram_iovec - Copy a datagram to an iovec. + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from - * @to: io vector to copy to + * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec - * - * Note: the iovec is modified during the copy. */ -int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, - struct iovec *to, int len) +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -330,8 +329,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_toiovec(to, skb->data + offset, copy)) - goto fault; + if (copy_to_iter(skb->data + offset, copy, to) != copy) + goto short_copy; if ((len -= copy) == 0) return 0; offset += copy; @@ -346,18 +345,12 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - if (copy > len) copy = len; - vaddr = kmap(page); - err = memcpy_toiovec(to, vaddr + frag->page_offset + - offset - start, copy); - kunmap(page); - if (err) - goto fault; + if (copy_page_to_iter(skb_frag_page(frag), + frag->page_offset + offset - + start, copy, to) != copy) + goto short_copy; if (!(len -= copy)) return 0; offset += copy; @@ -374,9 +367,8 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_iovec(frag_iter, - offset - start, - to, copy)) + if (skb_copy_datagram_iter(frag_iter, offset - start, + to, copy)) goto fault; if ((len -= copy) == 0) return 0; @@ -387,113 +379,33 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, if (!len) return 0; + /* This is not really a user copy fault, but rather someone + * gave us a bogus length on the skb. We should probably + * print a warning here as it may indicate a kernel bug. + */ + fault: return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_iovec); -/** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to - * @len: amount of data to copy from buffer to iovec - * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. - */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; +short_copy: + if (iov_iter_count(to)) + goto fault; - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; + return 0; } -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); +EXPORT_SYMBOL(skb_copy_datagram_iter); /** - * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. + * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to - * @from: io vector to copy to - * @from_offset: offset in the io vector to start copying from + * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. */ -int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - const struct iovec *from, int from_offset, +int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) { int start = skb_headlen(skb); @@ -504,13 +416,11 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_fromiovecend(skb->data + offset, from, from_offset, - copy)) + if (copy_from_iter(skb->data + offset, copy, from) != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - from_offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ @@ -522,24 +432,19 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); + size_t copied; if (copy > len) copy = len; - vaddr = kmap(page); - err = memcpy_fromiovecend(vaddr + frag->page_offset + - offset - start, - from, from_offset, copy); - kunmap(page); - if (err) + copied = copy_page_from_iter(skb_frag_page(frag), + frag->page_offset + offset - start, + copy, from); + if (copied != copy) goto fault; if (!(len -= copy)) return 0; offset += copy; - from_offset += copy; } start = end; } @@ -553,16 +458,13 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_from_iovec(frag_iter, - offset - start, - from, - from_offset, - copy)) + if (skb_copy_datagram_from_iter(frag_iter, + offset - start, + from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - from_offset += copy; } start = end; } @@ -572,101 +474,82 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, fault: return -EFAULT; } -EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +EXPORT_SYMBOL(skb_copy_datagram_from_iter); /** - * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter * @skb: buffer to copy - * @from: io vector to copy from - * @offset: offset in the io vector to start copying from - * @count: amount of vectors to copy to buffer from + * @from: the source to copy from * * The function will first copy up to headlen, and then pin the userspace * pages and build frags through them. * * Returns 0, -EFAULT or -EMSGSIZE. - * Note: the iovec is not modified during the copy */ -int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, - int offset, size_t count) +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) { - int len = iov_length(from, count) - offset; + int len = iov_iter_count(from); int copy = min_t(int, skb_headlen(skb), len); - int size; - int i = 0; + int frag = 0; /* copy up to skb headlen */ - if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - if (len == copy) - return 0; - - offset += copy; - while (count--) { - struct page *page[MAX_SKB_FRAGS]; - int num_pages; - unsigned long base; + while (iov_iter_count(from)) { + struct page *pages[MAX_SKB_FRAGS]; + size_t start; + ssize_t copied; unsigned long truesize; + int n = 0; - /* Skip over from offset and copied */ - if (offset >= from->iov_len) { - offset -= from->iov_len; - ++from; - continue; - } - len = from->iov_len - offset; - base = (unsigned long)from->iov_base + offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - if (i + size > MAX_SKB_FRAGS) + if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; - num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if (num_pages != size) { - release_pages(&page[i], num_pages, 0); + + copied = iov_iter_get_pages(from, pages, ~0U, + MAX_SKB_FRAGS - frag, &start); + if (copied < 0) return -EFAULT; - } - truesize = size * PAGE_SIZE; - skb->data_len += len; - skb->len += len; + + iov_iter_advance(from, copied); + + truesize = PAGE_ALIGN(copied + start); + skb->data_len += copied; + skb->len += copied; skb->truesize += truesize; atomic_add(truesize, &skb->sk->sk_wmem_alloc); - while (len) { - int off = base & ~PAGE_MASK; - int size = min_t(int, len, PAGE_SIZE - off); - skb_fill_page_desc(skb, i, page[i], off, size); - base += size; - len -= size; - i++; + while (copied) { + int size = min_t(int, copied, PAGE_SIZE - start); + skb_fill_page_desc(skb, frag++, pages[n], start, size); + start = 0; + copied -= size; + n++; } - offset = 0; - ++from; } return 0; } -EXPORT_SYMBOL(zerocopy_sg_from_iovec); +EXPORT_SYMBOL(zerocopy_sg_from_iter); static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, - u8 __user *to, int len, + struct iov_iter *to, int len, __wsum *csump) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; + int n; /* Copy header. */ if (copy > 0) { - int err = 0; if (copy > len) copy = len; - *csump = csum_and_copy_to_user(skb->data + offset, to, copy, - *csump, &err); - if (err) + n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); + if (n != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - to += copy; pos = copy; } @@ -678,26 +561,22 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - __wsum csum2; - int err = 0; - u8 *vaddr; + __wsum csum2 = 0; struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); if (copy > len) copy = len; - vaddr = kmap(page); - csum2 = csum_and_copy_to_user(vaddr + - frag->page_offset + - offset - start, - to, copy, 0, &err); + n = csum_and_copy_to_iter(vaddr + frag->page_offset + + offset - start, copy, + &csum2, to); kunmap(page); - if (err) + if (n != copy) goto fault; *csump = csum_block_add(*csump, csum2, pos); if (!(len -= copy)) return 0; offset += copy; - to += copy; pos += copy; } start = end; @@ -722,7 +601,6 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, if ((len -= copy) == 0) return 0; offset += copy; - to += copy; pos += copy; } start = end; @@ -775,20 +653,19 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) EXPORT_SYMBOL(__skb_checksum_complete); /** - * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec. + * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length - * @iov: io vector + * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. - * -EFAULT - fault during copy. Beware, in this case iovec - * can be modified! + * -EFAULT - fault during copy. */ -int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, - int hlen, struct iovec *iov) +int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, + int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; @@ -796,28 +673,20 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, if (!chunk) return 0; - /* Skip filled elements. - * Pretty silly, look at memcpy_toiovec, though 8) - */ - while (!iov->iov_len) - iov++; - - if (iov->iov_len < chunk) { + if (iov_iter_count(&msg->msg_iter) < chunk) { if (__skb_checksum_complete(skb)) goto csum_error; - if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); - if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; if (csum_fold(csum)) goto csum_error; if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); - iov->iov_len -= chunk; - iov->iov_base += chunk; } return 0; csum_error: @@ -825,7 +694,7 @@ csum_error: fault: return -EFAULT; } -EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); +EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll diff --git a/net/core/dev.c b/net/core/dev.c index 3acff0974560..f411c28d0a66 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> +#include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> @@ -133,6 +134,7 @@ #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> +#include <linux/hrtimer.h> #include "net-sysfs.h" @@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - - /* the same for macvlan devices */ - if (netif_is_macvlan(dev)) - dev = macvlan_dev_real_dev(dev); + struct net_device *lower_dev; + struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); + + netdev_for_each_lower_dev(dev, lower_dev, iter) + dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); @@ -2530,7 +2527,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; @@ -2647,12 +2644,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (skb) - skb->vlan_tci = 0; - } + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); return skb; } @@ -3304,7 +3297,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, rps_lock(sd); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { - if (skb_queue_len(&sd->input_pkt_queue)) { + if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); @@ -4179,7 +4172,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; } return skb; @@ -4316,20 +4309,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) local_irq_enable(); } +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); -#ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ - if (sd->rps_ipi_list) { + if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } -#endif + napi->weight = weight_p; local_irq_disable(); while (1) { @@ -4356,7 +4357,6 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - list_del(&napi->poll_list); napi->state = 0; rps_unlock(sd); @@ -4376,7 +4376,8 @@ static int process_backlog(struct napi_struct *napi, int quota) * __napi_schedule - schedule for receive * @n: entry to schedule * - * The entry's receive function will be scheduled to run + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { @@ -4388,18 +4389,29 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); - list_del(&n->poll_list); + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +void napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4410,12 +4422,28 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); + if (n->gro_list) { + unsigned long timeout = 0; + + if (work_done) + timeout = n->dev->gro_flush_timeout; + + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + else + napi_gro_flush(n, false); + } + if (likely(list_empty(&n->poll_list))) { + WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + } else { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); + } } -EXPORT_SYMBOL(napi_complete); +EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ struct napi_struct *napi_by_id(unsigned int napi_id) @@ -4469,10 +4497,23 @@ void napi_hash_del(struct napi_struct *napi) } EXPORT_SYMBOL_GPL(napi_hash_del); +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + if (napi->gro_list) + napi_schedule(napi); + + return HRTIMER_NORESTART; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; @@ -4491,6 +4532,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, } EXPORT_SYMBOL(netif_napi_add); +void napi_disable(struct napi_struct *n) +{ + might_sleep(); + set_bit(NAPI_STATE_DISABLE, &n->state); + + while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + + hrtimer_cancel(&n->timer); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable); + void netif_napi_del(struct napi_struct *napi) { list_del_init(&napi->dev_list); @@ -4507,29 +4562,28 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; + LIST_HEAD(list); + LIST_HEAD(repoll); void *have; local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); - while (!list_empty(&sd->poll_list)) { + while (!list_empty(&list)) { struct napi_struct *n; int work, weight; - /* If softirq window is exhuasted then punt. + /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; - local_irq_enable(); - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + n = list_first_entry(&list, struct napi_struct, poll_list); + list_del_init(&n->poll_list); have = netpoll_poll_lock(n); @@ -4551,8 +4605,6 @@ static void net_rx_action(struct softirq_action *h) budget -= work; - local_irq_disable(); - /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can @@ -4560,32 +4612,40 @@ static void net_rx_action(struct softirq_action *h) */ if (unlikely(work == weight)) { if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); napi_complete(n); - local_irq_disable(); } else { if (n->gro_list) { /* flush too old packets * If HZ < 1000, flush all packets. */ - local_irq_enable(); napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); } - list_move_tail(&n->poll_list, &sd->poll_list); + list_add_tail(&n->poll_list, &repoll); } } netpoll_poll_unlock(have); } + + if (!sd_has_rps_ipi_waiting(sd) && + list_empty(&list) && + list_empty(&repoll)) + return; out: + local_irq_disable(); + + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + net_rps_action_and_irq_enable(sd); return; softnet_break: sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } @@ -5786,7 +5846,7 @@ EXPORT_SYMBOL(dev_change_carrier); * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; @@ -5865,6 +5925,8 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + /* Shutdown queueing discipline. */ dev_shutdown(dev); @@ -5874,6 +5936,11 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + GFP_KERNEL); + /* * Flush the unicast and multicast chains */ @@ -5883,9 +5950,8 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index b6b230600b97..c0548d268e1a 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,8 +278,8 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** - * __hw_addr_unsync_dev - Remove synchonized addresses from device - * @list: address list to remove syncronized addresses from + * __hw_addr_unsync_dev - Remove synchronized addresses from device + * @list: address list to remove synchronized addresses from * @dev: device to sync * @unsync: function to call if address should be removed * diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 72e899a3efda..b94b1d293506 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -142,10 +142,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm case SIOCGIFHWADDR: if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + memset(ifr->ifr_hwaddr.sa_data, 0, + sizeof(ifr->ifr_hwaddr.sa_data)); else memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); ifr->ifr_hwaddr.sa_family = dev->type; return 0; @@ -265,7 +267,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return 0; diff --git a/net/core/dst.c b/net/core/dst.c index a028409ee438..e956ce6d1378 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); -/** - * __skb_dst_set_noref - sets skb dst, without a reference - * @skb: buffer - * @dst: dst entry - * @force: if force is set, use noref version even for DST_NOCACHE entries - * - * Sets skb dst, assuming a reference was not taken on dst - * skb_dst_drop() should not dst_release() this dst - */ -void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) -{ - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); - /* If dst not in cache, we must take a reference, because - * dst_release() will destroy dst as soon as its refcount becomes zero - */ - if (unlikely((dst->flags & DST_NOCACHE) && !force)) { - dst_hold(dst); - skb_dst_set(skb, dst); - } else { - skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; - } -} -EXPORT_SYMBOL(__skb_dst_set_noref); - /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 06dfb293e5aa..550892cd6b3f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/sched.h> +#include <linux/net.h> /* * Some useful ethtool_ops methods that're device independent. @@ -84,7 +85,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", @@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; +static const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); + if (sset == ETH_SS_RSS_HASH_FUNCS) + return ARRAY_SIZE(rss_hash_func_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev, if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); + else if (stringset == ETH_SS_RSS_HASH_FUNCS) + memcpy(data, rss_hash_func_strings, + sizeof(rss_hash_func_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -574,6 +586,16 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, return 0; } +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +EXPORT_SYMBOL(netdev_rss_key_fill); + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { @@ -608,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); if (ret) goto out; @@ -669,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, goto out; } - ret = ops->set_rxfh(dev, indir, NULL); + ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); out: kfree(indir); @@ -687,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, u32 total_size; u32 indir_bytes; u32 *indir = NULL; + u8 dev_hfunc = 0; u8 *hkey = NULL; u8 *rss_config; - if (!(dev->ethtool_ops->get_rxfh_indir_size || - dev->ethtool_ops->get_rxfh_key_size) || - !dev->ethtool_ops->get_rxfh) + if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -700,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; - if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; rxfh.indir_size = dev_indir_size; @@ -717,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; - /* If the user buffer size is 0, this is just a query for the - * device table size and key size. Otherwise, if the User size is - * not equal to device table size or key size it's an error. - */ - if (!user_indir_size && !user_key_size) - return 0; - if ((user_indir_size && (user_indir_size != dev_indir_size)) || (user_key_size && (user_key_size != dev_key_size))) return -EINVAL; @@ -740,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); - if (!ret) { - if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh, rss_config[0]), - rss_config, total_size)) - ret = -EFAULT; - } + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (ret) + goto out; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), + &dev_hfunc, sizeof(rxfh.hfunc))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) { + ret = -EFAULT; + } +out: kfree(rss_config); return ret; @@ -766,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); - if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || - !ops->get_rxnfc || !ops->set_rxfh) + if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; - /* If either indir or hash key is valid, proceed further. - * It is not valid to request that both be unchanged. + /* If either indir, hash key or function is valid, proceed further. + * Must request at least one change: indir size, hash key or function. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && - rxfh.key_size == 0)) + rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) @@ -835,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey); + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); out: kfree(rss_config); diff --git a/net/core/filter.c b/net/core/filter.c index 647b12265e18..ec9baea10c16 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -44,6 +44,7 @@ #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> +#include <linux/bpf.h> /** * sk_filter - run a packet through a socket filter @@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - bpf_release_orig_filter(prog); - bpf_prog_free(prog); + if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + } else { + bpf_release_orig_filter(prog); + bpf_prog_free(prog); + } } static void __sk_filter_release(struct sk_filter *fp) @@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); +#ifdef CONFIG_BPF_SYSCALL +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + struct bpf_prog *prog; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get(ufd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + /* valid fd, but invalid program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + bpf_prog_put(prog); + return -ENOMEM; + } + fp->prog = prog; + + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + +/* allow socket filters to call + * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() + */ +static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: + return NULL; + } +} + +static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* skb fields cannot be accessed yet */ + return false; +} + +static struct bpf_verifier_ops sock_filter_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, +}; + +static struct bpf_prog_type_list tl = { + .ops = &sock_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, +}; + +static int __init register_sock_filter_ops(void) +{ + bpf_register_prog_type(&tl); + return 0; +} +late_initcall(register_sock_filter_ops); +#else +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + return -EOPNOTSUPP; +} +#endif int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/iovec.c b/net/core/iovec.c index e1ec45ab1e63..dcbe98b3726a 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -28,53 +28,6 @@ #include <net/sock.h> /* - * Verify iovec. The caller must ensure that the iovec is big enough - * to hold the message iovec. - * - * Save time not doing access_ok. copy_*_user will make this work - * in any case. - */ - -int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode) -{ - int size, ct, err; - - if (m->msg_name && m->msg_namelen) { - if (mode == VERIFY_READ) { - void __user *namep; - namep = (void __user __force *) m->msg_name; - err = move_addr_to_kernel(namep, m->msg_namelen, - address); - if (err < 0) - return err; - } - m->msg_name = address; - } else { - m->msg_name = NULL; - m->msg_namelen = 0; - } - - size = m->msg_iovlen * sizeof(struct iovec); - if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) - return -EFAULT; - - m->msg_iov = iov; - err = 0; - - for (ct = 0; ct < m->msg_iovlen; ct++) { - size_t len = iov[ct].iov_len; - - if (len > INT_MAX - err) { - len = INT_MAX - err; - iov[ct].iov_len = len; - } - err += len; - } - - return err; -} - -/* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram * Calls to csum_partial but the last must be in 32 bit chunks diff --git a/net/core/link_watch.c b/net/core/link_watch.c index bd0767e6b2b3..49a9e3e06c08 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -21,7 +21,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/bitops.h> -#include <asm/types.h> +#include <linux/types.h> enum lw_bits { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef31fef25e5a..8e38f17288d3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -56,7 +56,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags); static void neigh_update_notify(struct neighbour *neigh); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); -static struct neigh_table *neigh_tables; #ifdef CONFIG_PROC_FS static const struct file_operations neigh_stat_seq_fops; #endif @@ -87,13 +86,8 @@ static const struct file_operations neigh_stat_seq_fops; the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. - - The last lock is neigh_tbl_lock. It is pure SMP lock, protecting - list of neighbour tables. This list is used only in process context, */ -static DEFINE_RWLOCK(neigh_tbl_lock); - static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); @@ -773,7 +767,7 @@ static void neigh_periodic_work(struct work_struct *work) if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; tbl->last_rand = jiffies; - for (p = &tbl->parms; p; p = p->next) + list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } @@ -1446,7 +1440,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, { struct neigh_parms *p; - for (p = &tbl->parms; p; p = p->next) { + list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; @@ -1481,8 +1475,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, } write_lock_bh(&tbl->lock); - p->next = tbl->parms.next; - tbl->parms.next = p; + list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); @@ -1501,24 +1494,15 @@ static void neigh_rcu_free_parms(struct rcu_head *head) void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { - struct neigh_parms **p; - if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); - for (p = &tbl->parms.next; *p; p = &(*p)->next) { - if (*p == parms) { - *p = parms->next; - parms->dead = 1; - write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); - call_rcu(&parms->rcu_head, neigh_rcu_free_parms); - return; - } - } + list_del(&parms->list); + parms->dead = 1; write_unlock_bh(&tbl->lock); - neigh_dbg(1, "%s: not found\n", __func__); + if (parms->dev) + dev_put(parms->dev); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -1530,11 +1514,15 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static void neigh_table_init_no_netlink(struct neigh_table *tbl) +static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; + +void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; + INIT_LIST_HEAD(&tbl->parms_list); + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); atomic_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = @@ -1574,34 +1562,14 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; -} - -void neigh_table_init(struct neigh_table *tbl) -{ - struct neigh_table *tmp; - - neigh_table_init_no_netlink(tbl); - write_lock(&neigh_tbl_lock); - for (tmp = neigh_tables; tmp; tmp = tmp->next) { - if (tmp->family == tbl->family) - break; - } - tbl->next = neigh_tables; - neigh_tables = tbl; - write_unlock(&neigh_tbl_lock); - if (unlikely(tmp)) { - pr_err("Registering multiple tables for family %d\n", - tbl->family); - dump_stack(); - } + neigh_tables[index] = tbl; } EXPORT_SYMBOL(neigh_table_init); -int neigh_table_clear(struct neigh_table *tbl) +int neigh_table_clear(int index, struct neigh_table *tbl) { - struct neigh_table **tp; - + neigh_tables[index] = NULL; /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); @@ -1609,14 +1577,6 @@ int neigh_table_clear(struct neigh_table *tbl) neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); - write_lock(&neigh_tbl_lock); - for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { - if (*tp == tbl) { - *tp = tbl->next; - break; - } - } - write_unlock(&neigh_tbl_lock); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); @@ -1634,12 +1594,32 @@ int neigh_table_clear(struct neigh_table *tbl) } EXPORT_SYMBOL(neigh_table_clear); +static struct neigh_table *neigh_find_table(int family) +{ + struct neigh_table *tbl = NULL; + + switch (family) { + case AF_INET: + tbl = neigh_tables[NEIGH_ARP_TABLE]; + break; + case AF_INET6: + tbl = neigh_tables[NEIGH_ND_TABLE]; + break; + case AF_DECnet: + tbl = neigh_tables[NEIGH_DN_TABLE]; + break; + } + + return tbl; +} + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; + struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; @@ -1660,39 +1640,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) } } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - struct neighbour *neigh; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); - - if (nla_len(dst_attr) < tbl->key_len) - goto out; - - if (ndm->ndm_flags & NTF_PROXY) { - err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out; - } + if (nla_len(dst_attr) < tbl->key_len) + goto out; - if (dev == NULL) - goto out; + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } - neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); - if (neigh == NULL) { - err = -ENOENT; - goto out; - } + if (dev == NULL) + goto out; - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); - neigh_release(neigh); + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; goto out; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); out: return err; @@ -1700,11 +1672,14 @@ out: static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) { + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; + struct neighbour *neigh; + void *dst, *lladdr; int err; ASSERT_RTNL(); @@ -1728,70 +1703,60 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; - struct neighbour *neigh; - void *dst, *lladdr; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); + if (nla_len(tb[NDA_DST]) < tbl->key_len) + goto out; + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; - if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out; - dst = nla_data(tb[NDA_DST]); - lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; - if (ndm->ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; + err = -ENOBUFS; + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm->ndm_flags; + err = 0; + } + goto out; + } - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm->ndm_flags; - err = 0; - } + if (dev == NULL) + goto out; + + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; goto out; } - if (dev == NULL) + neigh = __neigh_lookup_errno(tbl, dst, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); goto out; - - neigh = neigh_lookup(tbl, dst, dev); - if (neigh == NULL) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - err = -ENOENT; - goto out; - } - - neigh = __neigh_lookup_errno(tbl, dst, dev); - if (IS_ERR(neigh)) { - err = PTR_ERR(neigh); - goto out; - } - } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { - err = -EEXIST; - neigh_release(neigh); - goto out; - } - - if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) - flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - if (ndm->ndm_flags & NTF_USE) { - neigh_event_send(neigh, NULL); - err = 0; - } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); - neigh_release(neigh); - goto out; + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + neigh_release(neigh); + out: return err; } @@ -1990,7 +1955,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; - int err; + bool found = false; + int err, tidx; err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy); @@ -2003,19 +1969,21 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) } ndtmsg = nlmsg_data(nlh); - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { + + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { + tbl = neigh_tables[tidx]; + if (!tbl) + continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; - - if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { + found = true; break; + } } - if (tbl == NULL) { - err = -ENOENT; - goto errout_locked; - } + if (!found) + return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and @@ -2126,8 +2094,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) errout_tbl_lock: write_unlock_bh(&tbl->lock); -errout_locked: - read_unlock(&neigh_tbl_lock); errout: return err; } @@ -2142,10 +2108,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; + tbl = neigh_tables[tidx]; + if (!tbl) + continue; + if (tidx < tbl_skip || (family && tbl->family != family)) continue; @@ -2154,7 +2123,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) NLM_F_MULTI) <= 0) break; - for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + nidx = 0; + p = list_next_entry(&tbl->parms, list); + list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; @@ -2174,7 +2145,6 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: - read_unlock(&neigh_tbl_lock); cb->args[0] = tidx; cb->args[1] = nidx; @@ -2357,7 +2327,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) int proxy = 0; int err; - read_lock(&neigh_tbl_lock); family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is @@ -2369,8 +2338,11 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tbl = neigh_tables, t = 0; tbl; - tbl = tbl->next, t++) { + for (t = 0; t < NEIGH_NR_TABLES; t++) { + tbl = neigh_tables[t]; + + if (!tbl) + continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) @@ -2383,7 +2355,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } - read_unlock(&neigh_tbl_lock); cb->args[0] = t; return skb->len; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9dd06699b09c..999341244434 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -12,6 +12,7 @@ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> +#include <net/switchdev.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/nsproxy.h> @@ -325,6 +326,23 @@ static ssize_t tx_queue_len_store(struct device *dev, } NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) +{ + dev->gro_flush_timeout = val; + return 0; +} + +static ssize_t gro_flush_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); +} +NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -387,7 +405,7 @@ static ssize_t phys_port_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) @@ -399,6 +417,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_switch_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_item_id ppid; + + ret = netdev_switch_parent_id_get(netdev, &ppid); + if (!ret) + ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_switch_id); + static struct attribute *net_class_attrs[] = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -422,7 +462,9 @@ static struct attribute *net_class_attrs[] = { &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + &dev_attr_phys_switch_id.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e6645b4f330a..e0ad5d16c9c5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -79,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this @@ -88,7 +87,6 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, */ goto out; } - skb->vlan_tci = 0; } status = netdev_start_xmit(skb, dev, txq, false); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 443256bdcddc..da934fc3faa8 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3728,8 +3728,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race * with proc_create_data() */ - if (pkt_dev->entry) - proc_remove(pkt_dev->entry); + proc_remove(pkt_dev->entry); /* And update the thread if_list */ _rem_dev_from_if_list(t, pkt_dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 88e8de3b59b0..d06107d36ec8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -36,6 +36,7 @@ #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> +#include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> @@ -43,6 +44,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> +#include <net/switchdev.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> @@ -868,7 +870,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ - + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -952,7 +955,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { @@ -967,6 +970,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_item_id psid; + + err = netdev_switch_parent_id_get(dev, &psid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1039,6 +1060,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1196,8 +1220,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, - [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ + [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2221,8 +2246,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned int change, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2240,11 +2265,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); - return; + return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return NULL; +} + +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +{ + struct net *net = dev_net(dev); + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + struct sk_buff *skb; + + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + if (skb) + rtmsg_ifinfo_send(skb, dev, flags); } EXPORT_SYMBOL(rtmsg_ifinfo); @@ -2313,7 +2355,7 @@ errout: int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, + const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; @@ -2339,6 +2381,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_add); +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) +{ + u16 vid = 0; + + if (vlan_attr) { + if (nla_len(vlan_attr) != sizeof(u16)) { + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n"); + return -EINVAL; + } + + vid = nla_get_u16(vlan_attr); + + if (!vid || vid >= VLAN_VID_MASK) { + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n", + vid); + return -EINVAL; + } + } + *p_vid = vid; + return 0; +} + static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -2346,6 +2410,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; + u16 vid; int err; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); @@ -2371,6 +2436,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) addr = nla_data(tb[NDA_LLADDR]); + err = fdb_vid_parse(tb[NDA_VLAN], &vid); + if (err) + return err; + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ @@ -2379,7 +2448,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; - err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); + err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, + nlh->nlmsg_flags); if (err) goto out; else @@ -2390,9 +2460,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) if ((ndm->ndm_flags & NTF_SELF)) { if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, + vid, nlh->nlmsg_flags); else - err = ndo_dflt_fdb_add(ndm, tb, dev, addr, + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err) { @@ -2410,7 +2481,7 @@ out: int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr) + const unsigned char *addr, u16 vid) { int err = -EINVAL; @@ -2439,6 +2510,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) struct net_device *dev; int err = -EINVAL; __u8 *addr; + u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; @@ -2466,6 +2538,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) addr = nla_data(tb[NDA_LLADDR]); + err = fdb_vid_parse(tb[NDA_VLAN], &vid); + if (err) + return err; + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ @@ -2475,7 +2551,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) const struct net_device_ops *ops = br_dev->netdev_ops; if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); if (err) goto out; @@ -2486,9 +2562,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { if (dev->netdev_ops->ndo_fdb_del) - err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); + err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr, + vid); else - err = ndo_dflt_fdb_del(ndm, tb, dev, addr); + err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); @@ -2628,12 +2705,22 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, + unsigned int attrnum, unsigned int flag) +{ + if (mask & flag) + return nla_put_u8(skb, attrnum, !!(flags & flag)); + return 0; +} + int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u16 mode) + struct net_device *dev, u16 mode, + u32 flags, u32 mask) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; + struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); @@ -2665,13 +2752,46 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, if (!br_afspec) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || - nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } + + if (mode != BRIDGE_MODE_UNDEF) { + if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); + protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); + if (!protinfo) + goto nla_put_failure; + + if (brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_FAST_LEAVE, + BR_MULTICAST_FAST_LEAVE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING, BR_LEARNING) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROXYARP, BR_PROXYARP)) { + nla_nest_cancel(skb, protinfo); + goto nla_put_failure; + } + + nla_nest_end(skb, protinfo); + return nlmsg_end(skb, nlh); nla_put_failure: nlmsg_cancel(skb, nlh); diff --git a/net/core/scm.c b/net/core/scm.c index b442e7e25e60..3b6899b7d810 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) struct cmsghdr *cmsg; int err; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) - { + for_each_cmsghdr(cmsg, msg) { err = -EINVAL; /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 32e31c299631..ae13ef6b3ea7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; atomic_set(&fclones->fclone_ref, 1); - fclones->skb2.fclone = SKB_FCLONE_FREE; + fclones->skb2.fclone = SKB_FCLONE_CLONE; fclones->skb2.pfmemalloc = pfmemalloc; } out: @@ -336,59 +336,85 @@ struct netdev_alloc_cache { unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); -static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, + gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; - void *data = NULL; - int order; - unsigned long flags; + const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER; + struct page *page = NULL; + gfp_t gfp = gfp_mask; + + if (order) { + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); + nc->frag.size = PAGE_SIZE << (page ? order : 0); + } - local_irq_save(flags); - nc = this_cpu_ptr(&netdev_alloc_cache); - if (unlikely(!nc->frag.page)) { + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->frag.page = page; + + return page; +} + +static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache, + unsigned int fragsz, gfp_t gfp_mask) +{ + struct netdev_alloc_cache *nc = this_cpu_ptr(cache); + struct page *page = nc->frag.page; + unsigned int size; + int offset; + + if (unlikely(!page)) { refill: - for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { - gfp_t gfp = gfp_mask; + page = __page_frag_refill(nc, gfp_mask); + if (!page) + return NULL; + + /* if size can vary use frag.size else just use PAGE_SIZE */ + size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; - nc->frag.page = alloc_pages(gfp, order); - if (likely(nc->frag.page)) - break; - if (--order < 0) - goto end; - } - nc->frag.size = PAGE_SIZE << order; /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1, - &nc->frag.page->_count); - nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; - nc->frag.offset = 0; + atomic_add(size - 1, &page->_count); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + nc->frag.offset = size; } - if (nc->frag.offset + fragsz > nc->frag.size) { - if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) { - if (!atomic_sub_and_test(nc->pagecnt_bias, - &nc->frag.page->_count)) - goto refill; - /* OK, page count is 0, we can safely set it */ - atomic_set(&nc->frag.page->_count, - NETDEV_PAGECNT_MAX_BIAS); - } else { - atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias, - &nc->frag.page->_count); - } - nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; - nc->frag.offset = 0; + offset = nc->frag.offset - fragsz; + if (unlikely(offset < 0)) { + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + goto refill; + + /* if size can vary use frag.size else just use PAGE_SIZE */ + size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; + + /* OK, page count is 0, we can safely set it */ + atomic_set(&page->_count, size); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + offset = size - fragsz; } - data = page_address(nc->frag.page) + nc->frag.offset; - nc->frag.offset += fragsz; nc->pagecnt_bias--; -end: + nc->frag.offset = offset; + + return page_address(page) + offset; +} + +static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ + unsigned long flags; + void *data; + + local_irq_save(flags); + data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -406,11 +432,25 @@ void *netdev_alloc_frag(unsigned int fragsz) } EXPORT_SYMBOL(netdev_alloc_frag); +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ + return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask); +} + +void *napi_alloc_frag(unsigned int fragsz) +{ + return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +} +EXPORT_SYMBOL(napi_alloc_frag); + /** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on + * __alloc_rx_skb - allocate an skbuff for rx * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb + * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case we have to fallback to __alloc_skb() + * If SKB_ALLOC_NAPI is set, page fragment will be allocated + * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate @@ -419,11 +459,11 @@ EXPORT_SYMBOL(netdev_alloc_frag); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) +static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, + int flags) { struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + + unsigned int fragsz = SKB_DATA_ALIGN(length) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { @@ -432,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __netdev_alloc_frag(fragsz, gfp_mask); + data = (flags & SKB_ALLOC_NAPI) ? + __napi_alloc_frag(fragsz, gfp_mask) : + __netdev_alloc_frag(fragsz, gfp_mask); if (likely(data)) { skb = build_skb(data, fragsz); @@ -440,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, put_page(virt_to_head_page(data)); } } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, + skb = __alloc_skb(length, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); } + return skb; +} + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has NET_SKB_PAD headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD; + skb = __alloc_rx_skb(length, gfp_mask, 0); + if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; } + return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); +/** + * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * @napi: napi instance this buffer was allocated for + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages + * + * Allocate a new sk_buff for use in NAPI receive. This buffer will + * attempt to allocate the head from a special reserved region used + * only for NAPI Rx allocation. By doing this we can save several + * CPU cycles by avoiding having to disable and re-enable IRQs. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD + NET_IP_ALIGN; + skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + } + + return skb; +} +EXPORT_SYMBOL(__napi_alloc_skb); + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { @@ -541,26 +638,27 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); - break; + return; case SKB_FCLONE_ORIG: fclones = container_of(skb, struct sk_buff_fclones, skb1); - if (atomic_dec_and_test(&fclones->fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, fclones); - break; - case SKB_FCLONE_CLONE: - fclones = container_of(skb, struct sk_buff_fclones, skb2); - - /* The clone portion is available for - * fast-cloning again. + /* We usually free the clone (TX completion) before original skb + * This test would have no chance to be true for the clone, + * while here, branch prediction will be good. */ - skb->fclone = SKB_FCLONE_FREE; + if (atomic_read(&fclones->fclone_ref) == 1) + goto fastpath; + break; - if (atomic_dec_and_test(&fclones->fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, fclones); + default: /* SKB_FCLONE_CLONE */ + fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } + if (!atomic_dec_and_test(&fclones->fclone_ref)) + return; +fastpath: + kmem_cache_free(skbuff_fclone_cache, fclones); } static void skb_release_head_state(struct sk_buff *skb) @@ -872,15 +970,15 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct sk_buff_fclones *fclones = container_of(skb, struct sk_buff_fclones, skb1); - struct sk_buff *n = &fclones->skb2; + struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_FREE) { - n->fclone = SKB_FCLONE_CLONE; - atomic_inc(&fclones->fclone_ref); + atomic_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + atomic_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -3002,7 +3100,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg) { + if (!sg && !nskb->remcsum_offload) { nskb->ip_summed = CHECKSUM_NONE; nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), @@ -3074,7 +3172,7 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum) { + if (!csum && !nskb->remcsum_offload) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; @@ -3088,6 +3186,16 @@ perform_csum_check: * (see validate_xmit_skb_list() for example) */ segs->prev = tail; + + /* Following permits correct backpressure, for protocols + * using skb_set_owner_w(). + * Idea is to tranfert ownership from head_skb to last segment. + */ + if (head_skb->destructor == sock_wfree) { + swap(tail->truesize, head_skb->truesize); + swap(tail->destructor, head_skb->destructor); + swap(tail->sk, head_skb->sk); + } return segs; err: @@ -4130,6 +4238,113 @@ err_free: } EXPORT_SYMBOL(skb_vlan_untag); +int skb_ensure_writable(struct sk_buff *skb, int write_len) +{ + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable); + +/* remove VLAN header from packet and update csum accordingly. */ +static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +{ + struct vlan_hdr *vhdr; + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + __skb_push(skb, offset); + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + goto pull; + + skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *vlan_tci = ntohs(vhdr->h_vlan_TCI); + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); + + skb_reset_mac_len(skb); +pull: + __skb_pull(skb, offset); + + return err; +} + +int skb_vlan_pop(struct sk_buff *skb) +{ + u16 vlan_tci; + __be16 vlan_proto; + int err; + + if (likely(vlan_tx_tag_present(skb))) { + skb->vlan_tci = 0; + } else { + if (unlikely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __skb_vlan_pop(skb, &vlan_tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + vlan_proto = skb->protocol; + err = __skb_vlan_pop(skb, &vlan_tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_pop); + +int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +{ + if (vlan_tx_tag_present(skb)) { + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + /* __vlan_insert_tag expect skb->data pointing to mac header. + * So change skb->data before calling it and change back to + * original position later + */ + __skb_push(skb, offset); + err = __vlan_insert_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (err) + return err; + skb->protocol = skb->vlan_proto; + skb->mac_len += VLAN_HLEN; + __skb_pull(skb, offset); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(skb->data + + (2 * ETH_ALEN), VLAN_HLEN, 0)); + } + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_push); + /** * alloc_skb_with_frags - allocate skb with page frags * diff --git a/net/core/sock.c b/net/core/sock.c index 15e0c67b1069..9a56b2000c3f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -888,6 +888,19 @@ set_rcvbuf: } break; + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -1213,6 +1226,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_max_pacing_rate; break; + case SO_INCOMING_CPU: + v.val = sk->sk_incoming_cpu; + break; + default: return -ENOPROTOOPT; } @@ -1517,6 +1534,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -2457,7 +2475,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cf9cd13509a7..31baba2a71ce 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int net_msg_warn; /* Unused, but still a sysctl */ + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -215,6 +217,18 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif +static int proc_do_rss_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table fake_table; + char buf[NETDEV_RSS_KEY_LEN * 3]; + + snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); + fake_table.data = buf; + fake_table.maxlen = sizeof(buf); + return proc_dostring(&fake_table, write, buffer, lenp, ppos); +} + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -263,6 +277,13 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "netdev_rss_key", + .data = &netdev_rss_key, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_do_rss_key, + }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", diff --git a/net/core/utils.c b/net/core/utils.c index efc76dd9dcd1..7b803884c162 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -33,9 +33,6 @@ #include <asm/byteorder.h> #include <asm/uaccess.h> -int net_msg_warn __read_mostly = 1; -EXPORT_SYMBOL(net_msg_warn); - DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); /* * All net warning printk()s should be guarded by this function. |