diff options
Diffstat (limited to 'net')
35 files changed, 579 insertions, 580 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 0814a560e5f3..dd3bf582e6f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5925,6 +5925,8 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + /* Shutdown queueing discipline. */ dev_shutdown(dev); @@ -5934,6 +5936,11 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + GFP_KERNEL); + /* * Flush the unicast and multicast chains */ @@ -5943,9 +5950,8 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); diff --git a/net/core/dst.c b/net/core/dst.c index a028409ee438..e956ce6d1378 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); -/** - * __skb_dst_set_noref - sets skb dst, without a reference - * @skb: buffer - * @dst: dst entry - * @force: if force is set, use noref version even for DST_NOCACHE entries - * - * Sets skb dst, assuming a reference was not taken on dst - * skb_dst_drop() should not dst_release() this dst - */ -void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) -{ - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); - /* If dst not in cache, we must take a reference, because - * dst_release() will destroy dst as soon as its refcount becomes zero - */ - if (unlikely((dst->flags & DST_NOCACHE) && !force)) { - dst_hold(dst); - skb_dst_set(skb, dst); - } else { - skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; - } -} -EXPORT_SYMBOL(__skb_dst_set_noref); - /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 715f51f321e9..550892cd6b3f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; +static const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); + if (sset == ETH_SS_RSS_HASH_FUNCS) + return ARRAY_SIZE(rss_hash_func_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev, if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); + else if (stringset == ETH_SS_RSS_HASH_FUNCS) + memcpy(data, rss_hash_func_strings, + sizeof(rss_hash_func_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -618,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); if (ret) goto out; @@ -679,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, goto out; } - ret = ops->set_rxfh(dev, indir, NULL); + ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); out: kfree(indir); @@ -697,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, u32 total_size; u32 indir_bytes; u32 *indir = NULL; + u8 dev_hfunc = 0; u8 *hkey = NULL; u8 *rss_config; - if (!(dev->ethtool_ops->get_rxfh_indir_size || - dev->ethtool_ops->get_rxfh_key_size) || - !dev->ethtool_ops->get_rxfh) + if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -710,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; - if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; rxfh.indir_size = dev_indir_size; @@ -727,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; - /* If the user buffer size is 0, this is just a query for the - * device table size and key size. Otherwise, if the User size is - * not equal to device table size or key size it's an error. - */ - if (!user_indir_size && !user_key_size) - return 0; - if ((user_indir_size && (user_indir_size != dev_indir_size)) || (user_key_size && (user_key_size != dev_key_size))) return -EINVAL; @@ -750,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); - if (!ret) { - if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh, rss_config[0]), - rss_config, total_size)) - ret = -EFAULT; - } + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (ret) + goto out; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), + &dev_hfunc, sizeof(rxfh.hfunc))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) { + ret = -EFAULT; + } +out: kfree(rss_config); return ret; @@ -776,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); - if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || - !ops->get_rxnfc || !ops->set_rxfh) + if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; - /* If either indir or hash key is valid, proceed further. - * It is not valid to request that both be unchanged. + /* If either indir, hash key or function is valid, proceed further. + * Must request at least one change: indir size, hash key or function. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && - rxfh.key_size == 0)) + rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) @@ -845,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey); + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); out: kfree(rss_config); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 61cb7e7cc3c7..a9be2c161702 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2245,8 +2245,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned int change, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2264,11 +2264,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); - return; + return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return NULL; +} + +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +{ + struct net *net = dev_net(dev); + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + struct sk_buff *skb; + + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + if (skb) + rtmsg_ifinfo_send(skb, dev, flags); } EXPORT_SYMBOL(rtmsg_ifinfo); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 92116dfe827c..7a338fb55cc4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; atomic_set(&fclones->fclone_ref, 1); - fclones->skb2.fclone = SKB_FCLONE_FREE; + fclones->skb2.fclone = SKB_FCLONE_CLONE; fclones->skb2.pfmemalloc = pfmemalloc; } out: @@ -541,26 +541,27 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); - break; + return; case SKB_FCLONE_ORIG: fclones = container_of(skb, struct sk_buff_fclones, skb1); - if (atomic_dec_and_test(&fclones->fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, fclones); - break; - - case SKB_FCLONE_CLONE: - fclones = container_of(skb, struct sk_buff_fclones, skb2); - /* The clone portion is available for - * fast-cloning again. + /* We usually free the clone (TX completion) before original skb + * This test would have no chance to be true for the clone, + * while here, branch prediction will be good. */ - skb->fclone = SKB_FCLONE_FREE; + if (atomic_read(&fclones->fclone_ref) == 1) + goto fastpath; + break; - if (atomic_dec_and_test(&fclones->fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, fclones); + default: /* SKB_FCLONE_CLONE */ + fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } + if (!atomic_dec_and_test(&fclones->fclone_ref)) + return; +fastpath: + kmem_cache_free(skbuff_fclone_cache, fclones); } static void skb_release_head_state(struct sk_buff *skb) @@ -872,15 +873,15 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct sk_buff_fclones *fclones = container_of(skb, struct sk_buff_fclones, skb1); - struct sk_buff *n = &fclones->skb2; + struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_FREE) { - n->fclone = SKB_FCLONE_CLONE; - atomic_inc(&fclones->fclone_ref); + atomic_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + atomic_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b7826575d215..640f26c6a9fe 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -399,6 +399,22 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, + const struct sk_buff *skb, + int ee_origin) +{ + struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + + if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || + (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + (!skb->dev)) + return false; + + info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; + info->ipi_ifindex = skb->dev->ifindex; + return true; +} + /* * Handle MSG_ERRQUEUE */ @@ -414,6 +430,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int err; int copied; + WARN_ON_ONCE(sk->sk_family == AF_INET6); + err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) @@ -444,7 +462,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; sin->sin_family = AF_UNSPEC; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; @@ -1049,7 +1069,7 @@ e_inval: } /** - * ipv4_pktinfo_prepare - transfert some info from rtable to skb + * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6513ade8d6dc..8f9cd200ce20 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -288,6 +288,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), + SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), + SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), + SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), + SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dc13a3657e8e..427aee33ffc0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal, old_size_goal; - - xmit_size_goal = mss_now; - - if (large_allowed && sk_can_gso(sk)) { - u32 gso_size, hlen; - - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - /* Goal is to send at least one packet per ms, - * not one big TSO packet every 100 ms. - * This preserves ACK clocking and is consistent - * with tcp_tso_should_defer() heuristic. - */ - gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); - gso_size = max_t(u32, gso_size, - sysctl_tcp_min_tso_segs * mss_now); - - xmit_size_goal = min_t(u32, gso_size, - sk->sk_gso_max_size - 1 - hlen); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - - /* We try hard to avoid divides here */ - old_size_goal = tp->xmit_size_goal_segs * mss_now; - - if (likely(old_size_goal <= xmit_size_goal && - old_size_goal + mss_now > xmit_size_goal)) { - xmit_size_goal = old_size_goal; - } else { - tp->xmit_size_goal_segs = - min_t(u16, xmit_size_goal / mss_now, - sk->sk_gso_max_segs); - xmit_size_goal = tp->xmit_size_goal_segs * mss_now; - } + u32 new_size_goal, size_goal, hlen; + + if (!large_allowed || !sk_can_gso(sk)) + return mss_now; + + /* Maybe we should/could use sk->sk_prot->max_header here ? */ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + new_size_goal = sk->sk_gso_max_size - 1 - hlen; + new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); + + /* We try hard to avoid divides here */ + size_goal = tp->gso_segs * mss_now; + if (unlikely(new_size_goal < size_goal || + new_size_goal >= size_goal + mss_now)) { + tp->gso_segs = min_t(u16, new_size_goal / mss_now, + sk->sk_gso_max_segs); + size_goal = tp->gso_segs * mss_now; } - return max(xmit_size_goal, mss_now); + return max(size_goal, mss_now); } static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 20de0118c98e..6b6002416a73 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -363,16 +363,28 @@ static void hystart_update(struct sock *sk, u32 delay) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!(ca->found & hystart_detect)) { + if (ca->found & hystart_detect) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { ca->found |= HYSTART_ACK_TRAIN; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } + } + if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { if (ca->curr_rtt == 0 || ca->curr_rtt > delay) @@ -381,15 +393,16 @@ static void hystart_update(struct sock *sk, u32 delay) ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + - HYSTART_DELAY_THRESH(ca->delay_min>>4)) + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found |= HYSTART_DELAY; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } - /* - * Either one of two conditions are met, - * we exit from slow start immediately. - */ - if (ca->found & hystart_detect) - tp->snd_ssthresh = tp->snd_cwnd; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f5bd4bd3f7e6..f37ecf53ee8a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1524,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } + +/* Return how many segs we'd like on a TSO packet, + * to send one TSO packet per ms + */ +static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +{ + u32 bytes, segs; + + bytes = min(sk->sk_pacing_rate >> 10, + sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. + */ + segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + + return min_t(u32, segs, sk->sk_gso_max_segs); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -1731,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited) + bool *is_cwnd_limited, u32 max_segs) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1761,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= min_t(unsigned int, sk->sk_gso_max_size, - tp->xmit_size_goal_segs * tp->mss_cache)) + if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1959,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int cwnd_quota; int result; bool is_cwnd_limited = false; + u32 max_segs; sent_pkts = 0; @@ -1972,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } + max_segs = tcp_tso_autosize(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2004,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } else { if (!push_one && - tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + tcp_tso_should_defer(sk, skb, &is_cwnd_limited, + max_segs)) break; } + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, + min_t(unsigned int, + cwnd_quota, + max_segs), + nonagle); + + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; + /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * This allows for : @@ -2018,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ - limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, - sk->sk_pacing_rate >> 10); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); @@ -2032,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - limit = mss_now; - if (tso_segs > 1 && !tcp_urg_mode(tp)) - limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - sk->sk_gso_max_segs), - nonagle); - - if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) - break; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b2d606833ce4..dd8e00634563 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -336,38 +336,45 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } -static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, - unsigned short hnum, - __be16 sport, __be32 daddr, __be16 dport, int dif) +static inline int compute_score(struct sock *sk, struct net *net, + __be32 saddr, unsigned short hnum, __be16 sport, + __be32 daddr, __be16 dport, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + ipv6_only_sock(sk)) + return -1; - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_rcv_saddr) { - if (inet->inet_rcv_saddr != daddr) - return -1; - score += 4; - } - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + score = (sk->sk_family == PF_INET) ? 2 : 1; + inet = inet_sk(sk); + + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 4; + } + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 4; } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 4; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; + } + return score; } @@ -378,33 +385,39 @@ static inline int compute_score2(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif) { - int score = -1; + int score; + struct inet_sock *inet; + + if (!net_eq(sock_net(sk), net) || + ipv6_only_sock(sk)) + return -1; - if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + inet = inet_sk(sk); - if (inet->inet_rcv_saddr != daddr) + if (inet->inet_rcv_saddr != daddr || + inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET) ? 2 : 1; + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) return -1; - if (inet->inet_num != hnum) + score += 4; + } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; + score += 4; + } - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; } + return score; } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc1139687fd7..2464a00e36ab 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,6 +325,16 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } +static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb) +{ + int ifindex = skb->dev ? skb->dev->ifindex : -1; + + if (skb->protocol == htons(ETH_P_IPV6)) + IP6CB(skb)->iif = ifindex; + else + PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex; +} + /* * Handle MSG_ERRQUEUE */ @@ -388,8 +398,12 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; - if (np->rxopt.all) + if (np->rxopt.all) { + if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && + serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) + ip6_datagram_prepare_pktinfo_errqueue(skb); ip6_datagram_recv_common_ctl(sk, msg, skb); + } if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) @@ -491,7 +505,10 @@ void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } - put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + + if (src_info.ipi6_ifindex >= 0) + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, + sizeof(src_info), &src_info); } } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 16a7e81e3f99..ace10d0b3aac 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -95,6 +95,7 @@ vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, unsigned int hash = HASH(remote, local); struct ip6_tnl *t; struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct in6_addr any; for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && @@ -102,6 +103,22 @@ vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, (t->dev->flags & IFF_UP)) return t; } + + memset(&any, 0, sizeof(any)); + hash = HASH(&any, local); + for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + (t->dev->flags & IFF_UP)) + return t; + } + + hash = HASH(remote, &any); + for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e1a9583bb419..66980d8d98d1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -110,12 +110,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } - opt = xchg(&inet6_sk(sk)->opt, opt); - } else { - spin_lock(&sk->sk_dst_lock); - opt = xchg(&inet6_sk(sk)->opt, opt); - spin_unlock(&sk->sk_dst_lock); } + opt = xchg(&inet6_sk(sk)->opt, opt); sk_dst_reset(sk); return opt; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7cfb5d745a2d..7f96432292ce 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -148,72 +148,85 @@ static inline int compute_score(struct sock *sk, struct net *net, const struct in6_addr *daddr, __be16 dport, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - sk->sk_family == PF_INET6) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6) + return -1; - score = 0; - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) - return -1; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score++; - } + score = 0; + inet = inet_sk(sk); + + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; } + + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + score++; + } + + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) + return -1; + score++; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + return score; } #define SCORE2_MAX (1 + 1 + 1) static inline int compute_score2(struct sock *sk, struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, unsigned short hnum, - int dif) + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - sk->sk_family == PF_INET6) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6) + return -1; - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + + score = 0; + inet = inet_sk(sk); + + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; - score = 0; - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score++; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) - return -1; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score++; - } + score++; } + + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) + return -1; + score++; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + return score; } - /* called with read_rcu_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 1f933136155a..3aedbda7658a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -343,7 +343,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, skb_dst_drop(skb); if (noref) { if (!local) - skb_dst_set_noref_force(skb, &rt->dst); + skb_dst_set_noref(skb, &rt->dst); else skb_dst_set(skb, dst_clone(&rt->dst)); } else @@ -487,7 +487,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, skb_dst_drop(skb); if (noref) { if (!local) - skb_dst_set_noref_force(skb, &rt->dst); + skb_dst_set_noref(skb, &rt->dst); else skb_dst_set(skb, dst_clone(&rt->dst)); } else diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index e771a46933e5..9584526c0778 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -600,7 +600,7 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, fl.saddr = tun_key->ipv4_src; fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb_mark; - fl.flowi4_proto = IPPROTO_GRE; + fl.flowi4_proto = ipproto; rt = ip_route_output_key(net, &fl); if (IS_ERR(rt)) diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index cd61280941e5..5aed341406c2 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -72,10 +72,6 @@ static unsigned long basic_get(struct tcf_proto *tp, u32 handle) return l; } -static void basic_put(struct tcf_proto *tp, unsigned long f) -{ -} - static int basic_init(struct tcf_proto *tp) { struct basic_head *head; @@ -113,18 +109,12 @@ static void basic_destroy(struct tcf_proto *tp) static int basic_delete(struct tcf_proto *tp, unsigned long arg) { - struct basic_head *head = rtnl_dereference(tp->root); - struct basic_filter *t, *f = (struct basic_filter *) arg; - - list_for_each_entry(t, &head->flist, link) - if (t == f) { - list_del_rcu(&t->link); - tcf_unbind_filter(tp, &t->res); - call_rcu(&t->rcu, basic_delete_filter); - return 0; - } + struct basic_filter *f = (struct basic_filter *) arg; - return -ENOENT; + list_del_rcu(&f->link); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, basic_delete_filter); + return 0; } static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { @@ -188,10 +178,9 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - err = -ENOBUFS; fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); - if (fnew == NULL) - goto errout; + if (!fnew) + return -ENOBUFS; tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; @@ -293,7 +282,6 @@ static struct tcf_proto_ops cls_basic_ops __read_mostly = { .init = basic_init, .destroy = basic_destroy, .get = basic_get, - .put = basic_put, .change = basic_change, .delete = basic_delete, .walk = basic_walk, diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index eed49d1d0878..84c8219c3e1c 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -109,19 +109,12 @@ static void __cls_bpf_delete_prog(struct rcu_head *rcu) static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { - struct cls_bpf_head *head = rtnl_dereference(tp->root); - struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; + struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; - list_for_each_entry(prog, &head->plist, link) { - if (prog == todel) { - list_del_rcu(&prog->link); - tcf_unbind_filter(tp, &prog->res); - call_rcu(&prog->rcu, __cls_bpf_delete_prog); - return 0; - } - } - - return -ENOENT; + list_del_rcu(&prog->link); + tcf_unbind_filter(tp, &prog->res); + call_rcu(&prog->rcu, __cls_bpf_delete_prog); + return 0; } static void cls_bpf_destroy(struct tcf_proto *tp) @@ -148,7 +141,7 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) if (head == NULL) return 0UL; - list_for_each_entry_rcu(prog, &head->plist, link) { + list_for_each_entry(prog, &head->plist, link) { if (prog->handle == handle) { ret = (unsigned long) prog; break; @@ -158,10 +151,6 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static void cls_bpf_put(struct tcf_proto *tp, unsigned long f) -{ -} - static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, struct cls_bpf_prog *prog, unsigned long base, struct nlattr **tb, @@ -344,7 +333,7 @@ static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; - list_for_each_entry_rcu(prog, &head->plist, link) { + list_for_each_entry(prog, &head->plist, link) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, (unsigned long) prog, arg) < 0) { @@ -363,7 +352,6 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .init = cls_bpf_init, .destroy = cls_bpf_destroy, .get = cls_bpf_get, - .put = cls_bpf_put, .change = cls_bpf_change, .delete = cls_bpf_delete, .walk = cls_bpf_walk, diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index d61a801222c1..741bfa7debb2 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -67,10 +67,6 @@ static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle) return 0UL; } -static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f) -{ -} - static int cls_cgroup_init(struct tcf_proto *tp) { return 0; @@ -117,11 +113,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, return -ENOBUFS; tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); - if (head) - new->handle = head->handle; - else - new->handle = handle; - + new->handle = handle; new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], cgroup_policy); @@ -217,7 +209,6 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { .classify = cls_cgroup_classify, .destroy = cls_cgroup_destroy, .get = cls_cgroup_get, - .put = cls_cgroup_put, .delete = cls_cgroup_delete, .walk = cls_cgroup_walk, .dump = cls_cgroup_dump, diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 4ac515f2a6ce..8e227180cabb 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -426,10 +426,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, goto err2; /* Copy fold into fnew */ - fnew->handle = fold->handle; - fnew->keymask = fold->keymask; fnew->tp = fold->tp; - fnew->handle = fold->handle; fnew->nkeys = fold->nkeys; fnew->keymask = fold->keymask; @@ -578,16 +575,12 @@ static unsigned long flow_get(struct tcf_proto *tp, u32 handle) struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; - list_for_each_entry_rcu(f, &head->filters, list) + list_for_each_entry(f, &head->filters, list) if (f->handle == handle) return (unsigned long)f; return 0; } -static void flow_put(struct tcf_proto *tp, unsigned long f) -{ -} - static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { @@ -654,7 +647,7 @@ static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; - list_for_each_entry_rcu(f, &head->filters, list) { + list_for_each_entry(f, &head->filters, list) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, (unsigned long)f, arg) < 0) { @@ -674,7 +667,6 @@ static struct tcf_proto_ops cls_flow_ops __read_mostly = { .change = flow_change, .delete = flow_delete, .get = flow_get, - .put = flow_put, .dump = flow_dump, .walk = flow_walk, .owner = THIS_MODULE, diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index dbfdfd1f1a9f..23fda2ac0d19 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -111,10 +111,6 @@ static unsigned long fw_get(struct tcf_proto *tp, u32 handle) return 0; } -static void fw_put(struct tcf_proto *tp, unsigned long f) -{ -} - static int fw_init(struct tcf_proto *tp) { return 0; @@ -411,7 +407,6 @@ static struct tcf_proto_ops cls_fw_ops __read_mostly = { .init = fw_init, .destroy = fw_destroy, .get = fw_get, - .put = fw_put, .change = fw_change, .delete = fw_delete, .walk = fw_walk, diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 109a329b7198..098a27360b91 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -256,10 +256,6 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) return 0; } -static void route4_put(struct tcf_proto *tp, unsigned long f) -{ -} - static int route4_init(struct tcf_proto *tp) { return 0; @@ -649,7 +645,6 @@ static struct tcf_proto_ops cls_route4_ops __read_mostly = { .init = route4_init, .destroy = route4_destroy, .get = route4_get, - .put = route4_put, .change = route4_change, .delete = route4_delete, .walk = route4_walk, diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 6bb55f277a5a..b7af3623a26a 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -271,10 +271,6 @@ static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) return 0; } -static void rsvp_put(struct tcf_proto *tp, unsigned long f) -{ -} - static int rsvp_init(struct tcf_proto *tp) { struct rsvp_head *data; @@ -708,7 +704,6 @@ static struct tcf_proto_ops RSVP_OPS __read_mostly = { .init = rsvp_init, .destroy = rsvp_destroy, .get = rsvp_get, - .put = rsvp_put, .change = rsvp_change, .delete = rsvp_delete, .walk = rsvp_walk, diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 30f10fb07f4a..0d9d8911a621 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -116,13 +116,6 @@ static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL; } - -static void tcindex_put(struct tcf_proto *tp, unsigned long f) -{ - pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f); -} - - static int tcindex_init(struct tcf_proto *tp) { struct tcindex_data *p; @@ -560,7 +553,6 @@ static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { .init = tcindex_init, .destroy = tcindex_destroy, .get = tcindex_get, - .put = tcindex_put, .change = tcindex_change, .delete = tcindex_delete, .walk = tcindex_walk, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 0472909bb014..09487afbfd51 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -299,10 +299,6 @@ static unsigned long u32_get(struct tcf_proto *tp, u32 handle) return (unsigned long)u32_lookup_key(ht, handle); } -static void u32_put(struct tcf_proto *tp, unsigned long f) -{ -} - static u32 gen_new_htid(struct tc_u_common *tp_c) { int i = 0x800; @@ -1021,7 +1017,6 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .init = u32_init, .destroy = u32_destroy, .get = u32_get, - .put = u32_put, .change = u32_change, .delete = u32_delete, .walk = u32_walk, diff --git a/net/socket.c b/net/socket.c index ee3ee39eefa5..f676ac4a3701 100644 --- a/net/socket.c +++ b/net/socket.c @@ -651,7 +651,8 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size); } -int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +static int do_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t size, bool nosec) { struct kiocb iocb; struct sock_iocb siocb; @@ -659,25 +660,22 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) init_sync_kiocb(&iocb, NULL); iocb.private = &siocb; - ret = __sock_sendmsg(&iocb, sock, msg, size); + ret = nosec ? __sock_sendmsg_nosec(&iocb, sock, msg, size) : + __sock_sendmsg(&iocb, sock, msg, size); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&iocb); return ret; } + +int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + return do_sock_sendmsg(sock, msg, size, false); +} EXPORT_SYMBOL(sock_sendmsg); static int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size) { - struct kiocb iocb; - struct sock_iocb siocb; - int ret; - - init_sync_kiocb(&iocb, NULL); - iocb.private = &siocb; - ret = __sock_sendmsg_nosec(&iocb, sock, msg, size); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); - return ret; + return do_sock_sendmsg(sock, msg, size, true); } int kernel_sendmsg(struct socket *sock, struct msghdr *msg, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index f0761c771734..96ceefeb9daf 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -233,8 +233,11 @@ static void bclink_retransmit_pkt(u32 after, u32 to) */ void tipc_bclink_wakeup_users(void) { - while (skb_queue_len(&bclink->link.waiting_sks)) - tipc_sk_rcv(skb_dequeue(&bclink->link.waiting_sks)); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&bclink->link.waiting_sks))) + tipc_sk_rcv(skb); + } /** @@ -950,7 +953,7 @@ int tipc_bclink_init(void) spin_lock_init(&bclink->lock); __skb_queue_head_init(&bcl->outqueue); __skb_queue_head_init(&bcl->deferred_queue); - __skb_queue_head_init(&bcl->waiting_sks); + skb_queue_head_init(&bcl->waiting_sks); bcl->next_out_no = 1; spin_lock_init(&bclink->node.lock); __skb_queue_head_init(&bclink->node.waiting_sks); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 56248db75274..ba6083dca95b 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -38,39 +38,6 @@ #include "link.h" #include "name_distr.h" -/** - * struct publ_list - list of publications made by this node - * @list: circular list of publications - * @list_size: number of entries in list - */ -struct publ_list { - struct list_head list; - u32 size; -}; - -static struct publ_list publ_zone = { - .list = LIST_HEAD_INIT(publ_zone.list), - .size = 0, -}; - -static struct publ_list publ_cluster = { - .list = LIST_HEAD_INIT(publ_cluster.list), - .size = 0, -}; - -static struct publ_list publ_node = { - .list = LIST_HEAD_INIT(publ_node.list), - .size = 0, -}; - -static struct publ_list *publ_lists[] = { - NULL, - &publ_zone, /* publ_lists[TIPC_ZONE_SCOPE] */ - &publ_cluster, /* publ_lists[TIPC_CLUSTER_SCOPE] */ - &publ_node /* publ_lists[TIPC_NODE_SCOPE] */ -}; - - int sysctl_tipc_named_timeout __read_mostly = 2000; /** @@ -146,8 +113,8 @@ struct sk_buff *tipc_named_publish(struct publication *publ) struct sk_buff *buf; struct distr_item *item; - list_add_tail(&publ->local_list, &publ_lists[publ->scope]->list); - publ_lists[publ->scope]->size++; + list_add_tail_rcu(&publ->local_list, + &tipc_nametbl->publ_list[publ->scope]); if (publ->scope == TIPC_NODE_SCOPE) return NULL; @@ -172,7 +139,6 @@ struct sk_buff *tipc_named_withdraw(struct publication *publ) struct distr_item *item; list_del(&publ->local_list); - publ_lists[publ->scope]->size--; if (publ->scope == TIPC_NODE_SCOPE) return NULL; @@ -195,21 +161,17 @@ struct sk_buff *tipc_named_withdraw(struct publication *publ) * @pls: linked list of publication items to be packed into buffer chain */ static void named_distribute(struct sk_buff_head *list, u32 dnode, - struct publ_list *pls) + struct list_head *pls) { struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; - uint dsz = pls->size * ITEM_SIZE; uint msg_dsz = (tipc_node_get_mtu(dnode, 0) / ITEM_SIZE) * ITEM_SIZE; - uint rem = dsz; - uint msg_rem = 0; + uint msg_rem = msg_dsz; - list_for_each_entry(publ, &pls->list, local_list) { + list_for_each_entry(publ, pls, local_list) { /* Prepare next buffer: */ if (!skb) { - msg_rem = min_t(uint, rem, msg_dsz); - rem -= msg_rem; skb = named_prepare_buf(PUBLICATION, msg_rem, dnode); if (!skb) { pr_warn("Bulk publication failure\n"); @@ -227,8 +189,14 @@ static void named_distribute(struct sk_buff_head *list, u32 dnode, if (!msg_rem) { __skb_queue_tail(list, skb); skb = NULL; + msg_rem = msg_dsz; } } + if (skb) { + msg_set_size(buf_msg(skb), INT_H_SIZE + (msg_dsz - msg_rem)); + skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); + __skb_queue_tail(list, skb); + } } /** @@ -240,10 +208,12 @@ void tipc_named_node_up(u32 dnode) __skb_queue_head_init(&head); - read_lock_bh(&tipc_nametbl_lock); - named_distribute(&head, dnode, &publ_cluster); - named_distribute(&head, dnode, &publ_zone); - read_unlock_bh(&tipc_nametbl_lock); + rcu_read_lock(); + named_distribute(&head, dnode, + &tipc_nametbl->publ_list[TIPC_CLUSTER_SCOPE]); + named_distribute(&head, dnode, + &tipc_nametbl->publ_list[TIPC_ZONE_SCOPE]); + rcu_read_unlock(); tipc_link_xmit(&head, dnode, dnode); } @@ -290,12 +260,12 @@ static void tipc_publ_purge(struct publication *publ, u32 addr) { struct publication *p; - write_lock_bh(&tipc_nametbl_lock); + spin_lock_bh(&tipc_nametbl_lock); p = tipc_nametbl_remove_publ(publ->type, publ->lower, publ->node, publ->ref, publ->key); if (p) tipc_publ_unsubscribe(p, addr); - write_unlock_bh(&tipc_nametbl_lock); + spin_unlock_bh(&tipc_nametbl_lock); if (p != publ) { pr_err("Unable to remove publication from failed node\n" @@ -304,7 +274,7 @@ static void tipc_publ_purge(struct publication *publ, u32 addr) publ->key); } - kfree(p); + kfree_rcu(p, rcu); } void tipc_publ_notify(struct list_head *nsub_list, u32 addr) @@ -341,7 +311,7 @@ static bool tipc_update_nametbl(struct distr_item *i, u32 node, u32 dtype) ntohl(i->key)); if (publ) { tipc_publ_unsubscribe(publ, node); - kfree(publ); + kfree_rcu(publ, rcu); return true; } } else { @@ -406,14 +376,14 @@ void tipc_named_rcv(struct sk_buff *buf) u32 count = msg_data_sz(msg) / ITEM_SIZE; u32 node = msg_orignode(msg); - write_lock_bh(&tipc_nametbl_lock); + spin_lock_bh(&tipc_nametbl_lock); while (count--) { if (!tipc_update_nametbl(item, node, msg_type(msg))) tipc_named_add_backlog(item, msg_type(msg), node); item++; } tipc_named_process_backlog(); - write_unlock_bh(&tipc_nametbl_lock); + spin_unlock_bh(&tipc_nametbl_lock); kfree_skb(buf); } @@ -429,11 +399,12 @@ void tipc_named_reinit(void) struct publication *publ; int scope; - write_lock_bh(&tipc_nametbl_lock); + spin_lock_bh(&tipc_nametbl_lock); for (scope = TIPC_ZONE_SCOPE; scope <= TIPC_NODE_SCOPE; scope++) - list_for_each_entry(publ, &publ_lists[scope]->list, local_list) + list_for_each_entry_rcu(publ, &tipc_nametbl->publ_list[scope], + local_list) publ->node = tipc_own_addr; - write_unlock_bh(&tipc_nametbl_lock); + spin_unlock_bh(&tipc_nametbl_lock); } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 772be1cd8bf6..aafa684c4db9 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -2,7 +2,7 @@ * net/tipc/name_table.c: TIPC name table code * * Copyright (c) 2000-2006, 2014, Ericsson AB - * Copyright (c) 2004-2008, 2010-2011, Wind River Systems + * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -92,6 +92,7 @@ struct sub_seq { * @ns_list: links to adjacent name sequences in hash chain * @subscriptions: list of subscriptions for this 'type' * @lock: spinlock controlling access to publication lists of all sub-sequences + * @rcu: RCU callback head used for deferred freeing */ struct name_seq { u32 type; @@ -101,21 +102,11 @@ struct name_seq { struct hlist_node ns_list; struct list_head subscriptions; spinlock_t lock; + struct rcu_head rcu; }; -/** - * struct name_table - table containing all existing port name publications - * @types: pointer to fixed-sized array of name sequence lists, - * accessed via hashing on 'type'; name sequence lists are *not* sorted - * @local_publ_count: number of publications issued by this node - */ -struct name_table { - struct hlist_head *types; - u32 local_publ_count; -}; - -static struct name_table table; -DEFINE_RWLOCK(tipc_nametbl_lock); +struct name_table *tipc_nametbl; +DEFINE_SPINLOCK(tipc_nametbl_lock); static int hash(int x) { @@ -142,9 +133,7 @@ static struct publication *publ_create(u32 type, u32 lower, u32 upper, publ->node = node; publ->ref = port_ref; publ->key = key; - INIT_LIST_HEAD(&publ->local_list); INIT_LIST_HEAD(&publ->pport_list); - INIT_LIST_HEAD(&publ->nodesub_list); return publ; } @@ -179,22 +168,10 @@ static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_hea nseq->alloc = 1; INIT_HLIST_NODE(&nseq->ns_list); INIT_LIST_HEAD(&nseq->subscriptions); - hlist_add_head(&nseq->ns_list, seq_head); + hlist_add_head_rcu(&nseq->ns_list, seq_head); return nseq; } -/* - * nameseq_delete_empty - deletes a name sequence structure if now unused - */ -static void nameseq_delete_empty(struct name_seq *seq) -{ - if (!seq->first_free && list_empty(&seq->subscriptions)) { - hlist_del_init(&seq->ns_list); - kfree(seq->sseqs); - kfree(seq); - } -} - /** * nameseq_find_subseq - find sub-sequence (if any) matching a name instance * @@ -475,8 +452,8 @@ static struct name_seq *nametbl_find_seq(u32 type) struct hlist_head *seq_head; struct name_seq *ns; - seq_head = &table.types[hash(type)]; - hlist_for_each_entry(ns, seq_head, ns_list) { + seq_head = &tipc_nametbl->seq_hlist[hash(type)]; + hlist_for_each_entry_rcu(ns, seq_head, ns_list) { if (ns->type == type) return ns; } @@ -487,7 +464,9 @@ static struct name_seq *nametbl_find_seq(u32 type) struct publication *tipc_nametbl_insert_publ(u32 type, u32 lower, u32 upper, u32 scope, u32 node, u32 port, u32 key) { + struct publication *publ; struct name_seq *seq = nametbl_find_seq(type); + int index = hash(type); if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE) || (lower > upper)) { @@ -497,12 +476,16 @@ struct publication *tipc_nametbl_insert_publ(u32 type, u32 lower, u32 upper, } if (!seq) - seq = tipc_nameseq_create(type, &table.types[hash(type)]); + seq = tipc_nameseq_create(type, + &tipc_nametbl->seq_hlist[index]); if (!seq) return NULL; - return tipc_nameseq_insert_publ(seq, type, lower, upper, + spin_lock_bh(&seq->lock); + publ = tipc_nameseq_insert_publ(seq, type, lower, upper, scope, node, port, key); + spin_unlock_bh(&seq->lock); + return publ; } struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, @@ -514,8 +497,16 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, if (!seq) return NULL; + spin_lock_bh(&seq->lock); publ = tipc_nameseq_remove_publ(seq, lower, node, ref, key); - nameseq_delete_empty(seq); + if (!seq->first_free && list_empty(&seq->subscriptions)) { + hlist_del_init_rcu(&seq->ns_list); + kfree(seq->sseqs); + spin_unlock_bh(&seq->lock); + kfree_rcu(seq, rcu); + return publ; + } + spin_unlock_bh(&seq->lock); return publ; } @@ -544,14 +535,14 @@ u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *destnode) if (!tipc_in_scope(*destnode, tipc_own_addr)) return 0; - read_lock_bh(&tipc_nametbl_lock); + rcu_read_lock(); seq = nametbl_find_seq(type); if (unlikely(!seq)) goto not_found; + spin_lock_bh(&seq->lock); sseq = nameseq_find_subseq(seq, instance); if (unlikely(!sseq)) - goto not_found; - spin_lock_bh(&seq->lock); + goto no_match; info = sseq->info; /* Closest-First Algorithm */ @@ -601,7 +592,7 @@ u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *destnode) no_match: spin_unlock_bh(&seq->lock); not_found: - read_unlock_bh(&tipc_nametbl_lock); + rcu_read_unlock(); *destnode = node; return ref; } @@ -627,13 +618,12 @@ int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, struct name_info *info; int res = 0; - read_lock_bh(&tipc_nametbl_lock); + rcu_read_lock(); seq = nametbl_find_seq(type); if (!seq) goto exit; spin_lock_bh(&seq->lock); - sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); sseq_stop = seq->sseqs + seq->first_free; for (; sseq != sseq_stop; sseq++) { @@ -651,10 +641,9 @@ int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, if (info->cluster_list_size != info->node_list_size) res = 1; } - spin_unlock_bh(&seq->lock); exit: - read_unlock_bh(&tipc_nametbl_lock); + rcu_read_unlock(); return res; } @@ -667,22 +656,23 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, struct publication *publ; struct sk_buff *buf = NULL; - if (table.local_publ_count >= TIPC_MAX_PUBLICATIONS) { + spin_lock_bh(&tipc_nametbl_lock); + if (tipc_nametbl->local_publ_count >= TIPC_MAX_PUBLICATIONS) { pr_warn("Publication failed, local publication limit reached (%u)\n", TIPC_MAX_PUBLICATIONS); + spin_unlock_bh(&tipc_nametbl_lock); return NULL; } - write_lock_bh(&tipc_nametbl_lock); publ = tipc_nametbl_insert_publ(type, lower, upper, scope, tipc_own_addr, port_ref, key); if (likely(publ)) { - table.local_publ_count++; + tipc_nametbl->local_publ_count++; buf = tipc_named_publish(publ); /* Any pending external events? */ tipc_named_process_backlog(); } - write_unlock_bh(&tipc_nametbl_lock); + spin_unlock_bh(&tipc_nametbl_lock); if (buf) named_cluster_distribute(buf); @@ -695,27 +685,28 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) { struct publication *publ; - struct sk_buff *buf; + struct sk_buff *skb = NULL; - write_lock_bh(&tipc_nametbl_lock); + spin_lock_bh(&tipc_nametbl_lock); publ = tipc_nametbl_remove_publ(type, lower, tipc_own_addr, ref, key); if (likely(publ)) { - table.local_publ_count--; - buf = tipc_named_withdraw(publ); + tipc_nametbl->local_publ_count--; + skb = tipc_named_withdraw(publ); /* Any pending external events? */ tipc_named_process_backlog(); - write_unlock_bh(&tipc_nametbl_lock); list_del_init(&publ->pport_list); - kfree(publ); + kfree_rcu(publ, rcu); + } else { + pr_err("Unable to remove local publication\n" + "(type=%u, lower=%u, ref=%u, key=%u)\n", + type, lower, ref, key); + } + spin_unlock_bh(&tipc_nametbl_lock); - if (buf) - named_cluster_distribute(buf); + if (skb) { + named_cluster_distribute(skb); return 1; } - write_unlock_bh(&tipc_nametbl_lock); - pr_err("Unable to remove local publication\n" - "(type=%u, lower=%u, ref=%u, key=%u)\n", - type, lower, ref, key); return 0; } @@ -725,12 +716,14 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) void tipc_nametbl_subscribe(struct tipc_subscription *s) { u32 type = s->seq.type; + int index = hash(type); struct name_seq *seq; - write_lock_bh(&tipc_nametbl_lock); + spin_lock_bh(&tipc_nametbl_lock); seq = nametbl_find_seq(type); if (!seq) - seq = tipc_nameseq_create(type, &table.types[hash(type)]); + seq = tipc_nameseq_create(type, + &tipc_nametbl->seq_hlist[index]); if (seq) { spin_lock_bh(&seq->lock); tipc_nameseq_subscribe(seq, s); @@ -739,7 +732,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s) pr_warn("Failed to create subscription for {%u,%u,%u}\n", s->seq.type, s->seq.lower, s->seq.upper); } - write_unlock_bh(&tipc_nametbl_lock); + spin_unlock_bh(&tipc_nametbl_lock); } /** @@ -749,18 +742,23 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) { struct name_seq *seq; - write_lock_bh(&tipc_nametbl_lock); + spin_lock_bh(&tipc_nametbl_lock); seq = nametbl_find_seq(s->seq.type); if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&s->nameseq_list); - spin_unlock_bh(&seq->lock); - nameseq_delete_empty(seq); + if (!seq->first_free && list_empty(&seq->subscriptions)) { + hlist_del_init_rcu(&seq->ns_list); + kfree(seq->sseqs); + spin_unlock_bh(&seq->lock); + kfree_rcu(seq, rcu); + } else { + spin_unlock_bh(&seq->lock); + } } - write_unlock_bh(&tipc_nametbl_lock); + spin_unlock_bh(&tipc_nametbl_lock); } - /** * subseq_list - print specified sub-sequence contents into the given buffer */ @@ -882,8 +880,8 @@ static int nametbl_list(char *buf, int len, u32 depth_info, lowbound = 0; upbound = ~0; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { - seq_head = &table.types[i]; - hlist_for_each_entry(seq, seq_head, ns_list) { + seq_head = &tipc_nametbl->seq_hlist[i]; + hlist_for_each_entry_rcu(seq, seq_head, ns_list) { ret += nameseq_list(seq, buf + ret, len - ret, depth, seq->type, lowbound, upbound, i); @@ -898,8 +896,8 @@ static int nametbl_list(char *buf, int len, u32 depth_info, } ret += nametbl_header(buf + ret, len - ret, depth); i = hash(type); - seq_head = &table.types[i]; - hlist_for_each_entry(seq, seq_head, ns_list) { + seq_head = &tipc_nametbl->seq_hlist[i]; + hlist_for_each_entry_rcu(seq, seq_head, ns_list) { if (seq->type == type) { ret += nameseq_list(seq, buf + ret, len - ret, depth, type, @@ -931,11 +929,11 @@ struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space) pb = TLV_DATA(rep_tlv); pb_len = ULTRA_STRING_MAX_LEN; argv = (struct tipc_name_table_query *)TLV_DATA(req_tlv_area); - read_lock_bh(&tipc_nametbl_lock); + rcu_read_lock(); str_len = nametbl_list(pb, pb_len, ntohl(argv->depth), ntohl(argv->type), ntohl(argv->lowbound), ntohl(argv->upbound)); - read_unlock_bh(&tipc_nametbl_lock); + rcu_read_unlock(); str_len += 1; /* for "\0" */ skb_put(buf, TLV_SPACE(str_len)); TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); @@ -945,12 +943,18 @@ struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space) int tipc_nametbl_init(void) { - table.types = kcalloc(TIPC_NAMETBL_SIZE, sizeof(struct hlist_head), - GFP_ATOMIC); - if (!table.types) + int i; + + tipc_nametbl = kzalloc(sizeof(*tipc_nametbl), GFP_ATOMIC); + if (!tipc_nametbl) return -ENOMEM; - table.local_publ_count = 0; + for (i = 0; i < TIPC_NAMETBL_SIZE; i++) + INIT_HLIST_HEAD(&tipc_nametbl->seq_hlist[i]); + + INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_ZONE_SCOPE]); + INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_CLUSTER_SCOPE]); + INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_NODE_SCOPE]); return 0; } @@ -965,17 +969,19 @@ static void tipc_purge_publications(struct name_seq *seq) struct sub_seq *sseq; struct name_info *info; - if (!seq->sseqs) { - nameseq_delete_empty(seq); - return; - } + spin_lock_bh(&seq->lock); sseq = seq->sseqs; info = sseq->info; list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) { tipc_nametbl_remove_publ(publ->type, publ->lower, publ->node, publ->ref, publ->key); - kfree(publ); + kfree_rcu(publ, rcu); } + hlist_del_init_rcu(&seq->ns_list); + kfree(seq->sseqs); + spin_lock_bh(&seq->lock); + + kfree_rcu(seq, rcu); } void tipc_nametbl_stop(void) @@ -983,23 +989,24 @@ void tipc_nametbl_stop(void) u32 i; struct name_seq *seq; struct hlist_head *seq_head; - struct hlist_node *safe; /* Verify name table is empty and purge any lingering * publications, then release the name table */ - write_lock_bh(&tipc_nametbl_lock); + spin_lock_bh(&tipc_nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { - if (hlist_empty(&table.types[i])) + if (hlist_empty(&tipc_nametbl->seq_hlist[i])) continue; - seq_head = &table.types[i]; - hlist_for_each_entry_safe(seq, safe, seq_head, ns_list) { + seq_head = &tipc_nametbl->seq_hlist[i]; + hlist_for_each_entry_rcu(seq, seq_head, ns_list) { tipc_purge_publications(seq); } } - kfree(table.types); - table.types = NULL; - write_unlock_bh(&tipc_nametbl_lock); + spin_unlock_bh(&tipc_nametbl_lock); + + synchronize_net(); + kfree(tipc_nametbl); + } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, @@ -1103,7 +1110,7 @@ static int __tipc_nl_seq_list(struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, u32 *last_publ) { struct hlist_head *seq_head; - struct name_seq *seq; + struct name_seq *seq = NULL; int err; int i; @@ -1113,22 +1120,21 @@ static int __tipc_nl_seq_list(struct tipc_nl_msg *msg, u32 *last_type, i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { - seq_head = &table.types[i]; + seq_head = &tipc_nametbl->seq_hlist[i]; if (*last_type) { seq = nametbl_find_seq(*last_type); if (!seq) return -EPIPE; } else { - seq = hlist_entry_safe((seq_head)->first, - struct name_seq, ns_list); + hlist_for_each_entry_rcu(seq, seq_head, ns_list) + break; if (!seq) continue; } - hlist_for_each_entry_from(seq, ns_list) { + hlist_for_each_entry_from_rcu(seq, ns_list) { spin_lock_bh(&seq->lock); - err = __tipc_nl_subseq_list(msg, seq, last_lower, last_publ); @@ -1160,8 +1166,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; - read_lock_bh(&tipc_nametbl_lock); - + rcu_read_lock(); err = __tipc_nl_seq_list(&msg, &last_type, &last_lower, &last_publ); if (!err) { done = 1; @@ -1174,8 +1179,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) */ cb->prev_seq = 1; } - - read_unlock_bh(&tipc_nametbl_lock); + rcu_read_unlock(); cb->args[0] = last_type; cb->args[1] = last_lower; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index c62877826655..5f0dee92010d 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -43,7 +43,9 @@ struct tipc_port_list; /* * TIPC name types reserved for internal TIPC use (both current and planned) */ -#define TIPC_ZM_SRV 3 /* zone master service name type */ +#define TIPC_ZM_SRV 3 /* zone master service name type */ +#define TIPC_PUBL_SCOPE_NUM (TIPC_NODE_SCOPE + 1) +#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ /** * struct publication - info about a published (name or) name sequence @@ -60,6 +62,7 @@ struct tipc_port_list; * @node_list: adjacent matching name seq publications with >= node scope * @cluster_list: adjacent matching name seq publications with >= cluster scope * @zone_list: adjacent matching name seq publications with >= zone scope + * @rcu: RCU callback head used for deferred freeing * * Note that the node list, cluster list, and zone list are circular lists. */ @@ -77,10 +80,23 @@ struct publication { struct list_head node_list; struct list_head cluster_list; struct list_head zone_list; + struct rcu_head rcu; }; +/** + * struct name_table - table containing all existing port name publications + * @seq_hlist: name sequence hash lists + * @publ_list: pulication lists + * @local_publ_count: number of publications issued by this node + */ +struct name_table { + struct hlist_head seq_hlist[TIPC_NAMETBL_SIZE]; + struct list_head publ_list[TIPC_PUBL_SCOPE_NUM]; + u32 local_publ_count; +}; -extern rwlock_t tipc_nametbl_lock; +extern spinlock_t tipc_nametbl_lock; +extern struct name_table *tipc_nametbl; int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9658d9b63876..4731cad99d1c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -827,39 +827,6 @@ exit: return TIPC_OK; } -/** - * dest_name_check - verify user is permitted to send to specified port name - * @dest: destination address - * @m: descriptor for message to be sent - * - * Prevents restricted configuration commands from being issued by - * unauthorized users. - * - * Returns 0 if permission is granted, otherwise errno - */ -static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) -{ - struct tipc_cfg_msg_hdr hdr; - - if (unlikely(dest->addrtype == TIPC_ADDR_ID)) - return 0; - if (likely(dest->addr.name.name.type >= TIPC_RESERVED_TYPES)) - return 0; - if (likely(dest->addr.name.name.type == TIPC_TOP_SRV)) - return 0; - if (likely(dest->addr.name.name.type != TIPC_CFG_SRV)) - return -EACCES; - - if (!m->msg_iovlen || (m->msg_iov[0].iov_len < sizeof(hdr))) - return -EMSGSIZE; - if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr))) - return -EFAULT; - if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN))) - return -EACCES; - - return 0; -} - static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) { struct sock *sk = sock->sk; @@ -912,7 +879,7 @@ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, struct tipc_name_seq *seq = &dest->addr.nameseq; u32 mtu; long timeo; - int rc = -EINVAL; + int rc; if (unlikely(!dest)) return -EDESTADDRREQ; @@ -945,9 +912,6 @@ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, tsk->conn_instance = dest->addr.name.name.instance; } } - rc = dest_name_check(dest, m); - if (rc) - goto exit; timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 31b5cb232a43..0344206b984f 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -305,7 +305,6 @@ static int subscr_subscribe(struct tipc_subscr *s, kfree(sub); return -EINVAL; } - INIT_LIST_HEAD(&sub->nameseq_list); list_add(&sub->subscription_list, &subscriber->subscription_list); sub->subscriber = subscriber; sub->swap = swap; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 88bf289abdc9..cee479bc655c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -55,6 +55,7 @@ static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(unsigned long arg); +static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); @@ -561,7 +562,7 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_lock(&hash_resize_mutex); total = 0; - for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if (xfrm_bydst_should_resize(net, dir, &total)) xfrm_bydst_resize(net, dir); } @@ -601,7 +602,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) write_lock_bh(&net->xfrm.xfrm_policy_lock); /* reset the bydst and inexact table in all directions */ - for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); hmask = net->xfrm.policy_bydst[dir].hmask; odst = net->xfrm.policy_bydst[dir].table; @@ -779,8 +780,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); - xfrm_pol_hold(policy); - net->xfrm.policy_count[dir]++; + __xfrm_policy_link(policy, dir); atomic_inc(&net->xfrm.flow_cache_genid); /* After previous checking, family can either be AF_INET or AF_INET6 */ @@ -799,7 +799,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); - list_add(&policy->walk.all, &net->xfrm.policy_all); write_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) @@ -1247,17 +1246,10 @@ out: static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); - struct hlist_head *chain = policy_hash_bysel(net, &pol->selector, - pol->family, dir); list_add(&pol->walk.all, &net->xfrm.policy_all); - hlist_add_head(&pol->bydst, chain); - hlist_add_head(&pol->byidx, net->xfrm.policy_byidx+idx_hash(net, pol->index)); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); - - if (xfrm_bydst_should_resize(net, dir, NULL)) - schedule_work(&net->xfrm.policy_hash_work); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, @@ -1265,17 +1257,31 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, { struct net *net = xp_net(pol); - if (hlist_unhashed(&pol->bydst)) + if (list_empty(&pol->walk.all)) return NULL; - hlist_del_init(&pol->bydst); - hlist_del(&pol->byidx); - list_del(&pol->walk.all); + /* Socket policies are not hashed. */ + if (!hlist_unhashed(&pol->bydst)) { + hlist_del(&pol->bydst); + hlist_del(&pol->byidx); + } + + list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } +static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) +{ + __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); +} + +static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) +{ + __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); +} + int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); @@ -1307,7 +1313,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) if (pol) { pol->curlft.add_time = get_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); - __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); + xfrm_sk_policy_link(pol, dir); } if (old_pol) { if (pol) @@ -1316,7 +1322,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. */ - __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir); + xfrm_sk_policy_unlink(old_pol, dir); } write_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -1349,7 +1355,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); write_lock_bh(&net->xfrm.xfrm_policy_lock); - __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir); + xfrm_sk_policy_link(newp, dir); write_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } @@ -1878,7 +1884,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, static void xfrm_policy_queue_process(unsigned long arg) { - int err = 0; struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; @@ -1941,7 +1946,7 @@ static void xfrm_policy_queue_process(unsigned long arg) skb_dst_drop(skb); skb_dst_set(skb, dst); - err = dst_output(skb); + dst_output(skb); } out: @@ -2966,10 +2971,11 @@ static int __net_init xfrm_policy_init(struct net *net) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; - for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; net->xfrm.policy_count[dir] = 0; + net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); htab = &net->xfrm.policy_bydst[dir]; @@ -3021,7 +3027,7 @@ static void xfrm_policy_fini(struct net *net) WARN_ON(!list_empty(&net->xfrm.policy_all)); - for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e812e988c111..8128594ab379 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -824,13 +824,15 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = xfrm_mark_put(skb, &x->mark); if (ret) goto out; - if (x->replay_esn) { + if (x->replay_esn) ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL, xfrm_replay_state_esn_len(x->replay_esn), x->replay_esn); - if (ret) - goto out; - } + else + ret = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay), + &x->replay); + if (ret) + goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -2569,6 +2571,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->tfcpad)); if (x->replay_esn) l += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn)); + else + l += nla_total_size(sizeof(struct xfrm_replay_state)); if (x->security) l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) + x->security->ctx_len); |