summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c12
-rw-r--r--net/ipv4/arp.c1
-rw-r--r--net/ipv4/devinet.c5
-rw-r--r--net/ipv4/fib_semantics.c15
-rw-r--r--net/ipv4/gre_offload.c2
-rw-r--r--net/ipv4/icmp.c15
-rw-r--r--net/ipv4/inet_connection_sock.c21
-rw-r--r--net/ipv4/inet_fragment.c4
-rw-r--r--net/ipv4/inet_timewait_sock.c7
-rw-r--r--net/ipv4/inetpeer.c15
-rw-r--r--net/ipv4/ip_fragment.c5
-rw-r--r--net/ipv4/ip_gre.c18
-rw-r--r--net/ipv4/ip_input.c25
-rw-r--r--net/ipv4/ip_vti.c3
-rw-r--r--net/ipv4/ipmr.c262
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c3
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c3
-rw-r--r--net/ipv4/route.c51
-rw-r--r--net/ipv4/sysctl_net_ipv4.c60
-rw-r--r--net/ipv4/tcp.c83
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_fastopen.c120
-rw-r--r--net/ipv4/tcp_input.c238
-rw-r--r--net/ipv4/tcp_ipv4.c24
-rw-r--r--net/ipv4/tcp_minisocks.c1
-rw-r--r--net/ipv4/tcp_output.c181
-rw-r--r--net/ipv4/tcp_recovery.c52
-rw-r--r--net/ipv4/tcp_timer.c42
-rw-r--r--net/ipv4/tcp_vegas.c2
-rw-r--r--net/ipv4/udp.c36
-rw-r--r--net/ipv4/udp_offload.c2
31 files changed, 834 insertions, 476 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e31108e5ef79..ce4aa827be05 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
- int err;
+ int err, tcp_fastopen;
lock_sock(sk);
@@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog)
* because the socket was in TCP_LISTEN state previously but
* was shutdown() rather than close().
*/
- if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
- (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+ if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
+ (tcp_fastopen & TFO_SERVER_ENABLE) &&
!inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
fastopen_queue_tune(sk, backlog);
- tcp_fastopen_init_key_once(true);
+ tcp_fastopen_init_key_once(sock_net(sk));
}
err = inet_csk_listen_start(sk, backlog);
@@ -826,6 +827,7 @@ int inet_shutdown(struct socket *sock, int how)
err = -ENOTCONN;
/* Hack to wake up other listeners, who can poll for
POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ /* fall through */
default:
sk->sk_shutdown |= how;
if (sk->sk_prot->shutdown)
@@ -839,7 +841,7 @@ int inet_shutdown(struct socket *sock, int how)
case TCP_LISTEN:
if (!(how & RCV_SHUTDOWN))
break;
- /* Fall through */
+ /* fall through */
case TCP_SYN_SENT:
err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7c45b8896709..a8d7c5a9fb05 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1180,6 +1180,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSARP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
+ /* fall through */
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
if (err)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7ce22a2c07ce..e1e2ec0525e6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1522,6 +1522,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (inetdev_valid_mtu(dev->mtu))
break;
/* disable IP when MTU is not enough */
+ /* fall through */
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
@@ -1757,7 +1758,7 @@ static int inet_validate_link_af(const struct net_device *dev,
struct nlattr *a, *tb[IFLA_INET_MAX+1];
int err, rem;
- if (dev && !__in_dev_get_rtnl(dev))
+ if (dev && !__in_dev_get_rcu(dev))
return -EAFNOSUPPORT;
err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL);
@@ -1781,7 +1782,7 @@ static int inet_validate_link_af(const struct net_device *dev,
static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct nlattr *a, *tb[IFLA_INET_MAX+1];
int rem;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 57a5d48acee8..467b3c15395f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi)
atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
} endfor_nexthops(fi);
}
-
-static inline void fib_add_weight(struct fib_info *fi,
- const struct fib_nh *nh)
-{
- fi->fib_weight += nh->nh_weight;
-}
-
#else /* CONFIG_IP_ROUTE_MULTIPATH */
#define fib_rebalance(fi) do { } while (0)
-#define fib_add_weight(fi, nh) do { } while (0)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -774,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
* |
* |-> {local prefix} (terminal node)
*/
-static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
- struct fib_nh *nh, struct netlink_ext_ack *extack)
+static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
+ struct netlink_ext_ack *extack)
{
int err = 0;
struct net *net;
@@ -1258,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
int linkdown = 0;
change_nexthops(fi) {
- err = fib_check_nh(cfg, fi, nexthop_nh, extack);
+ err = fib_check_nh(cfg, nexthop_nh, extack);
if (err != 0)
goto failure;
if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
@@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
- fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
fib_rebalance(fi);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 416bb304a281..1859c473b21a 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
pcsum = (__sum16 *)(greh + 1);
- if (gso_partial) {
+ if (gso_partial && skb_is_gso(skb)) {
unsigned int partial_adj;
/* Adjust checksum to account for the fact that
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 681e33998e03..3c1570d3e22f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto)
}
/*
- * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and
* ICMP_PARAMETERPROB.
*/
@@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb)
if (iph->ihl < 5) /* Mangled header, drop. */
goto out_err;
- if (icmph->type == ICMP_DEST_UNREACH) {
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
switch (icmph->code & 15) {
case ICMP_NET_UNREACH:
case ICMP_HOST_UNREACH:
@@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb)
}
if (icmph->code > NR_ICMP_UNREACH)
goto out;
- } else if (icmph->type == ICMP_PARAMETERPROB)
+ break;
+ case ICMP_PARAMETERPROB:
info = ntohl(icmph->un.gateway) >> 24;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS);
+ if (icmph->code == ICMP_EXC_FRAGTIME)
+ goto out;
+ break;
+ }
/*
* Throw it at our lower layers
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8a91ebbf0c01..5c965ecc96a0 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -494,17 +494,15 @@ EXPORT_SYMBOL(inet_csk_accept);
* to optimize.
*/
void inet_csk_init_xmit_timers(struct sock *sk,
- void (*retransmit_handler)(unsigned long),
- void (*delack_handler)(unsigned long),
- void (*keepalive_handler)(unsigned long))
+ void (*retransmit_handler)(struct timer_list *t),
+ void (*delack_handler)(struct timer_list *t),
+ void (*keepalive_handler)(struct timer_list *t))
{
struct inet_connection_sock *icsk = inet_csk(sk);
- setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
- (unsigned long)sk);
- setup_timer(&icsk->icsk_delack_timer, delack_handler,
- (unsigned long)sk);
- setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
+ timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
+ timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
+ timer_setup(&sk->sk_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);
@@ -676,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
-static void reqsk_timer_handler(unsigned long data)
+static void reqsk_timer_handler(struct timer_list *t)
{
- struct request_sock *req = (struct request_sock *)data;
+ struct request_sock *req = from_timer(req, t, rsk_timer);
struct sock *sk_listener = req->rsk_listener;
struct net *net = sock_net(sk_listener);
struct inet_connection_sock *icsk = inet_csk(sk_listener);
@@ -749,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
req->num_timeout = 0;
req->sk = NULL;
- setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
- (unsigned long)req);
+ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
mod_timer(&req->rsk_timer, jiffies + timeout);
inet_ehash_insert(req_to_sk(req), NULL);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index af74d0433453..7f3ef5c287a1 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
spin_unlock(&hb->chain_lock);
hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
- f->frag_expire((unsigned long) fq);
+ f->frag_expire(&fq->timer);
return evicted;
}
@@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
f->constructor(q, arg);
add_frag_mem_limit(nf, f->qsize);
- setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
+ timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
refcount_set(&q->refcnt, 1);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 5b039159e67a..a4bab81f1462 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -142,9 +142,9 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
-static void tw_timer_handler(unsigned long data)
+static void tw_timer_handler(struct timer_list *t)
{
- struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
+ struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
if (tw->tw_kill)
__NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
@@ -188,8 +188,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
- setup_pinned_timer(&tw->tw_timer, tw_timer_handler,
- (unsigned long)tw);
+ timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e7eb590c86ce..914d56928578 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
break;
}
if (cmp == -1)
- pp = &(*pp)->rb_left;
+ pp = &next->rb_left;
else
- pp = &(*pp)->rb_right;
+ pp = &next->rb_right;
}
*parent_p = parent;
*pp_p = pp;
@@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow);
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
- struct inet_peer *p, *n;
+ struct rb_node *p = rb_first(&base->rb_root);
- rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) {
- inet_putpeer(p);
+ while (p) {
+ struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node);
+
+ p = rb_next(p);
+ rb_erase(&peer->rb_node, &base->rb_root);
+ inet_putpeer(peer);
cond_resched();
}
- base->rb_root = RB_ROOT;
base->total = 0;
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 46408c220d9d..9215654a401f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -190,12 +190,13 @@ static bool frag_expire_skip_icmp(u32 user)
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
-static void ip_expire(unsigned long arg)
+static void ip_expire(struct timer_list *t)
{
+ struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct ipq *qp;
struct net *net;
- qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+ qp = container_of(frag, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags);
rcu_read_lock();
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9cee986ac6b8..c105a315b1a3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
struct ip_tunnel *tunnel;
struct erspanhdr *ershdr;
const struct iphdr *iph;
- __be32 session_id;
__be32 index;
int len;
@@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
/* The original GRE header does not have key field,
* Use ERSPAN 10-bit session ID as key.
*/
- session_id = cpu_to_be32(ntohs(ershdr->session_id));
- tpi->key = session_id;
+ tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
index = ershdr->md.index;
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
tpi->flags | TUNNEL_KEY,
@@ -581,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
if (gre_handle_offloads(skb, false))
goto err_free_rt;
- if (skb->len > dev->mtu) {
- pskb_trim(skb, dev->mtu);
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ pskb_trim(skb, dev->mtu + dev->hard_header_len);
truncate = true;
}
@@ -733,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
if (skb_cow_head(skb, dev->needed_headroom))
goto free_skb;
- if (skb->len > dev->mtu) {
- pskb_trim(skb, dev->mtu);
+ if (skb->len > dev->mtu + dev->hard_header_len) {
+ pskb_trim(skb, dev->mtu + dev->hard_header_len);
truncate = true;
}
@@ -1222,6 +1220,7 @@ static int gre_tap_init(struct net_device *dev)
{
__gre_tunnel_init(dev);
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
return ip_tunnel_init(dev);
}
@@ -1245,13 +1244,16 @@ static int erspan_tunnel_init(struct net_device *dev)
tunnel->tun_hlen = 8;
tunnel->parms.iph.protocol = IPPROTO_GRE;
- t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr);
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
+ sizeof(struct erspanhdr);
+ t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
dev->mtu = ETH_DATA_LEN - t_hlen - 4;
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netif_keep_dst(dev);
return ip_tunnel_init(dev);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fa2dc8f692c6..57fc13c6ab2b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -311,9 +311,10 @@ drop:
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt;
+ int (*edemux)(struct sk_buff *skb);
struct net_device *dev = skb->dev;
- void (*edemux)(struct sk_buff *skb);
+ struct rtable *rt;
+ int err;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
- edemux(skb);
+ err = edemux(skb);
+ if (unlikely(err))
+ goto drop_error;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
@@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
* how the packet travels inside Linux networking.
*/
if (!skb_valid_dst(skb)) {
- int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, dev);
- if (unlikely(err)) {
- if (err == -EXDEV)
- __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
- goto drop;
- }
+ err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, dev);
+ if (unlikely(err))
+ goto drop_error;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
drop:
kfree_skb(skb);
return NET_RX_DROP;
+
+drop_error:
+ if (err == -EXDEV)
+ __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+ goto drop;
}
/*
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 02d70ca99db1..58465c0a8682 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
struct ip_tunnel_parm *parms = &tunnel->parms;
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev; /* Device to other host */
+ int pkt_len = skb->len;
int err;
int mtu;
@@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
- err = skb->len;
+ err = pkt_len;
iptunnel_xmit_stats(dev, err);
return NETDEV_TX_OK;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c9b3e6e069ae..40a43ad294cb 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -67,6 +67,7 @@
#include <net/fib_rules.h>
#include <linux/netconf.h>
#include <net/nexthop.h>
+#include <net/switchdev.h>
struct ipmr_rule {
struct fib_rule common;
@@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net)
fib_rules_unregister(net->ipv4.mr_rules_ops);
rtnl_unlock();
}
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+ return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+ return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
#else
#define ipmr_for_each_table(mrt, net) \
for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
@@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net)
net->ipv4.mrt = NULL;
rtnl_unlock();
}
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+ return 0;
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+ return 0;
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+ return true;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
#endif
static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -587,6 +620,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
}
#endif
+static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
+ struct net *net,
+ enum fib_event_type event_type,
+ struct vif_device *vif,
+ vifi_t vif_index, u32 tb_id)
+{
+ struct vif_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .dev = vif->dev,
+ .vif_index = vif_index,
+ .vif_flags = vif->flags,
+ .tb_id = tb_id,
+ };
+
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_ipmr_vif_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct vif_device *vif,
+ vifi_t vif_index, u32 tb_id)
+{
+ struct vif_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .dev = vif->dev,
+ .vif_index = vif_index,
+ .vif_flags = vif->flags,
+ .tb_id = tb_id,
+ };
+
+ ASSERT_RTNL();
+ net->ipv4.ipmr_seq++;
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
+static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
+ struct net *net,
+ enum fib_event_type event_type,
+ struct mfc_cache *mfc, u32 tb_id)
+{
+ struct mfc_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .mfc = mfc,
+ .tb_id = tb_id
+ };
+
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_ipmr_mfc_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct mfc_cache *mfc, u32 tb_id)
+{
+ struct mfc_entry_notifier_info info = {
+ .info = {
+ .family = RTNL_FAMILY_IPMR,
+ .net = net,
+ },
+ .mfc = mfc,
+ .tb_id = tb_id
+ };
+
+ ASSERT_RTNL();
+ net->ipv4.ipmr_seq++;
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
/**
* vif_delete - Delete a VIF entry
* @notify: Set to 1, if the caller is a notifier_call
@@ -594,6 +703,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
struct list_head *head)
{
+ struct net *net = read_pnet(&mrt->net);
struct vif_device *v;
struct net_device *dev;
struct in_device *in_dev;
@@ -603,6 +713,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
+ if (VIF_EXISTS(mrt, vifi))
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
+ mrt->id);
+
write_lock_bh(&mrt_lock);
dev = v->dev;
v->dev = NULL;
@@ -652,10 +766,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head)
kmem_cache_free(mrt_cachep, c);
}
-static inline void ipmr_cache_free(struct mfc_cache *c)
+void ipmr_cache_free(struct mfc_cache *c)
{
call_rcu(&c->rcu, ipmr_cache_free_rcu);
}
+EXPORT_SYMBOL(ipmr_cache_free);
/* Destroy an unresolved cache entry, killing queued skbs
* and reporting error to netlink readers.
@@ -754,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt,
struct vifctl *vifc, int mrtsock)
{
int vifi = vifc->vifc_vifi;
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+ };
struct vif_device *v = &mrt->vif_table[vifi];
struct net_device *dev;
struct in_device *in_dev;
@@ -828,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
/* Fill in the VIF structures */
+ attr.orig_dev = dev;
+ if (!switchdev_port_attr_get(dev, &attr)) {
+ memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len);
+ v->dev_parent_id.id_len = attr.u.ppid.id_len;
+ } else {
+ v->dev_parent_id.id_len = 0;
+ }
v->rate_limit = vifc->vifc_rate_limit;
v->local = vifc->vifc_lcl_addr.s_addr;
v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -851,6 +976,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
if (vifi+1 > mrt->maxvif)
mrt->maxvif = vifi+1;
write_unlock_bh(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
return 0;
}
@@ -949,6 +1075,7 @@ static struct mfc_cache *ipmr_cache_alloc(void)
if (c) {
c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
c->mfc_un.res.minvif = MAXVIFS;
+ refcount_set(&c->mfc_un.res.refcount, 1);
}
return c;
}
@@ -1150,6 +1277,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
+ struct net *net = read_pnet(&mrt->net);
struct mfc_cache *c;
/* The entries are added/deleted only under RTNL */
@@ -1161,8 +1289,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
return -ENOENT;
rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
list_del_rcu(&c->list);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
+ ipmr_cache_put(c);
return 0;
}
@@ -1189,6 +1318,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
+ mrt->id);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
@@ -1238,6 +1369,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
ipmr_cache_resolve(net, mrt, uc, c);
ipmr_cache_free(uc);
}
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
@@ -1245,6 +1377,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
/* Close the multicast socket, and clear the vif tables etc */
static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
+ struct net *net = read_pnet(&mrt->net);
struct mfc_cache *c, *tmp;
LIST_HEAD(list);
int i;
@@ -1263,8 +1396,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
continue;
rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
list_del_rcu(&c->list);
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c,
+ mrt->id);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
+ ipmr_cache_put(c);
}
if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
@@ -1393,6 +1528,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
case MRT_ADD_MFC:
case MRT_DEL_MFC:
parent = -1;
+ /* fall through */
case MRT_ADD_MFC_PROXY:
case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc)) {
@@ -1724,10 +1860,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
return dst_output(net, sk, skb);
}
+#ifdef CONFIG_NET_SWITCHDEV
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+ int in_vifi, int out_vifi)
+{
+ struct vif_device *out_vif = &mrt->vif_table[out_vifi];
+ struct vif_device *in_vif = &mrt->vif_table[in_vifi];
+
+ if (!skb->offload_mr_fwd_mark)
+ return false;
+ if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
+ return false;
+ return netdev_phys_item_id_same(&out_vif->dev_parent_id,
+ &in_vif->dev_parent_id);
+}
+#else
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+ int in_vifi, int out_vifi)
+{
+ return false;
+}
+#endif
+
/* Processing handlers for ipmr_forward */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *c, int vifi)
+ int in_vifi, struct sk_buff *skb,
+ struct mfc_cache *c, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
@@ -1748,6 +1907,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
goto out_free;
}
+ if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
+ goto out_free;
+
if (vif->flags & VIFF_TUNNEL) {
rt = ip_route_output_ports(net, &fl4, NULL,
vif->remote, vif->local,
@@ -1925,8 +2087,8 @@ forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, skb2, cache,
- psend);
+ ipmr_queue_xmit(net, mrt, true_vifi,
+ skb2, cache, psend);
}
psend = ct;
}
@@ -1937,9 +2099,10 @@ last_forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, skb2, cache, psend);
+ ipmr_queue_xmit(net, mrt, true_vifi, skb2,
+ cache, psend);
} else {
- ipmr_queue_xmit(net, mrt, skb, cache, psend);
+ ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend);
return;
}
}
@@ -2156,6 +2319,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
return -EMSGSIZE;
+ if (c->mfc_flags & MFC_OFFLOAD)
+ rtm->rtm_flags |= RTNH_F_OFFLOAD;
+
if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
return -EMSGSIZE;
@@ -3048,14 +3214,87 @@ static const struct net_protocol pim_protocol = {
};
#endif
+static unsigned int ipmr_seq_read(struct net *net)
+{
+ ASSERT_RTNL();
+
+ return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
+}
+
+static int ipmr_dump(struct net *net, struct notifier_block *nb)
+{
+ struct mr_table *mrt;
+ int err;
+
+ err = ipmr_rules_dump(net, nb);
+ if (err)
+ return err;
+
+ ipmr_for_each_table(mrt, net) {
+ struct vif_device *v = &mrt->vif_table[0];
+ struct mfc_cache *mfc;
+ int vifi;
+
+ /* Notifiy on table VIF entries */
+ read_lock(&mrt_lock);
+ for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
+ if (!v->dev)
+ continue;
+
+ call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
+ v, vifi, mrt->id);
+ }
+ read_unlock(&mrt_lock);
+
+ /* Notify on table MFC entries */
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+ call_ipmr_mfc_entry_notifier(nb, net,
+ FIB_EVENT_ENTRY_ADD, mfc,
+ mrt->id);
+ }
+
+ return 0;
+}
+
+static const struct fib_notifier_ops ipmr_notifier_ops_template = {
+ .family = RTNL_FAMILY_IPMR,
+ .fib_seq_read = ipmr_seq_read,
+ .fib_dump = ipmr_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ipmr_notifier_init(struct net *net)
+{
+ struct fib_notifier_ops *ops;
+
+ net->ipv4.ipmr_seq = 0;
+
+ ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ net->ipv4.ipmr_notifier_ops = ops;
+
+ return 0;
+}
+
+static void __net_exit ipmr_notifier_exit(struct net *net)
+{
+ fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
+ net->ipv4.ipmr_notifier_ops = NULL;
+}
+
/* Setup for IP multicast routing */
static int __net_init ipmr_net_init(struct net *net)
{
int err;
+ err = ipmr_notifier_init(net);
+ if (err)
+ goto ipmr_notifier_fail;
+
err = ipmr_rules_init(net);
if (err < 0)
- goto fail;
+ goto ipmr_rules_fail;
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
@@ -3072,7 +3311,9 @@ proc_cache_fail:
proc_vif_fail:
ipmr_rules_exit(net);
#endif
-fail:
+ipmr_rules_fail:
+ ipmr_notifier_exit(net);
+ipmr_notifier_fail:
return err;
}
@@ -3082,6 +3323,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
remove_proc_entry("ip_mr_cache", net->proc_net);
remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
+ ipmr_notifier_exit(net);
ipmr_rules_exit(net);
}
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 811689e523c3..f75fc6b53115 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
if (synproxy == NULL)
return NF_ACCEPT;
- if (nf_is_loopback_packet(skb))
+ if (nf_is_loopback_packet(skb) ||
+ ip_hdr(skb)->protocol != IPPROTO_TCP)
return NF_ACCEPT;
thoff = ip_hdrlen(skb);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index a0f37b208268..0443ca4120b0 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -276,7 +276,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
else
return NF_ACCEPT;
}
- /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ /* Only ICMPs can be IP_CT_IS_REPLY: */
+ /* fall through */
case IP_CT_NEW:
/* Seen it before? This can happen for loopback, retrans,
* or local packets.
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94d4cd2d5ea4..4306db827374 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, int our)
+int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct in_device *in_dev, u32 *itag)
{
- struct rtable *rth;
- struct in_device *in_dev = __in_dev_get_rcu(dev);
- unsigned int flags = RTCF_MULTICAST;
- u32 itag = 0;
int err;
/* Primary sanity checks. */
-
if (!in_dev)
return -EINVAL;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
skb->protocol != htons(ETH_P_IP))
- goto e_inval;
+ return -EINVAL;
if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
- goto e_inval;
+ return -EINVAL;
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
- goto e_inval;
+ return -EINVAL;
} else {
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
- in_dev, &itag);
+ in_dev, itag);
if (err < 0)
- goto e_err;
+ return err;
}
+ return 0;
+}
+
+/* called in rcu_read_lock() section */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, int our)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ unsigned int flags = RTCF_MULTICAST;
+ struct rtable *rth;
+ u32 itag = 0;
+ int err;
+
+ err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
+ if (err)
+ return err;
+
if (our)
flags |= RTCF_LOCAL;
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
- goto e_nobufs;
+ return -ENOBUFS;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
@@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
skb_dst_set(skb, &rth->dst);
return 0;
-
-e_nobufs:
- return -ENOBUFS;
-e_inval:
- return -EINVAL;
-e_err:
- return err;
}
@@ -2507,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
struct rtable *ort = (struct rtable *) dst_orig;
struct rtable *rt;
- rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
if (rt) {
struct dst_entry *new = &rt->dst;
@@ -3032,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
int __init ip_rt_init(void)
{
- int rc = 0;
int cpu;
ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
@@ -3089,7 +3094,7 @@ int __init ip_rt_init(void)
#endif
register_pernet_subsys(&rt_genid_ops);
register_pernet_subsys(&ipv4_inetpeer_ops);
- return rc;
+ return 0;
}
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0d3c038d7b04..cac8dd309f39 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -251,10 +251,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
-static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
+static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_fastopen);
struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
struct tcp_fastopen_context *ctxt;
int ret;
@@ -265,7 +267,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
return -ENOMEM;
rcu_read_lock();
- ctxt = rcu_dereference(tcp_fastopen_ctx);
+ ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
if (ctxt)
memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
else
@@ -282,12 +284,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
ret = -EINVAL;
goto bad_key;
}
- /* Generate a dummy secret but don't publish it. This
- * is needed so we don't regenerate a new key on the
- * first invocation of tcp_fastopen_cookie_gen
- */
- tcp_fastopen_init_key_once(false);
- tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+ tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH);
}
bad_key:
@@ -358,11 +355,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_fastopen_blackhole_timeout);
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
- tcp_fastopen_active_timeout_reset();
+ atomic_set(&net->ipv4.tfo_active_disable_times, 0);
return ret;
}
@@ -401,27 +400,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_fastopen",
- .data = &sysctl_tcp_fastopen,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tcp_fastopen_key",
- .mode = 0600,
- .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
- .proc_handler = proc_tcp_fastopen_key,
- },
- {
- .procname = "tcp_fastopen_blackhole_timeout_sec",
- .data = &sysctl_tcp_fastopen_blackhole_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_tfo_blackhole_detect_timeout,
- .extra1 = &zero,
- },
- {
.procname = "tcp_abort_on_overflow",
.data = &sysctl_tcp_abort_on_overflow,
.maxlen = sizeof(int),
@@ -1085,6 +1063,28 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "tcp_fastopen",
+ .data = &init_net.ipv4.sysctl_tcp_fastopen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fastopen_key",
+ .mode = 0600,
+ .data = &init_net.ipv4.sysctl_tcp_fastopen,
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ .proc_handler = proc_tcp_fastopen_key,
+ },
+ {
+ .procname = "tcp_fastopen_blackhole_timeout_sec",
+ .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tfo_blackhole_detect_timeout,
+ .extra1 = &zero,
+ },
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
.procname = "fib_multipath_use_neigh",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5091402720ab..3b34850d361f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -413,8 +413,10 @@ void tcp_init_sock(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tp->out_of_order_queue = RB_ROOT;
+ sk->tcp_rtx_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
INIT_LIST_HEAD(&tp->tsq_node);
+ INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -456,8 +458,22 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);
-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
+void tcp_init_transfer(struct sock *sk, int bpf_op)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_mtup_init(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_metrics(sk);
+ tcp_call_bpf(sk, bpf_op);
+ tcp_init_congestion_control(sk);
+ tcp_init_buffer_space(sk);
+}
+
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
+{
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -686,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- if (!tcp_send_head(sk))
- return;
-
skb = tcp_write_queue_tail(sk);
+ if (!skb)
+ return;
if (!(flags & MSG_MORE) || forced_push(tp))
tcp_mark_push(tp, skb);
@@ -869,6 +884,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
* available to the caller, no more, no less.
*/
skb->reserved_tailroom = skb->end - skb->tail - size;
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
__kfree_skb(skb);
@@ -948,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i;
bool can_coalesce;
- if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+ if (!skb || (copy = size_goal - skb->len) <= 0 ||
!tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- skb_queue_empty(&sk->sk_write_queue));
+ tcp_rtx_and_write_queues_empty(sk));
if (!skb)
goto wait_for_memory;
@@ -1027,7 +1043,7 @@ wait_for_memory:
out:
if (copied) {
- tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+ tcp_tx_timestamp(sk, sk->sk_tsflags);
if (!(flags & MSG_SENDPAGE_NOTLAST))
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
@@ -1126,7 +1142,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
@@ -1183,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
goto out_err;
}
- skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+ skb = tcp_write_queue_tail(sk);
uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
@@ -1259,7 +1275,7 @@ restart:
int max = size_goal;
skb = tcp_write_queue_tail(sk);
- if (tcp_send_head(sk)) {
+ if (skb) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb->len;
@@ -1279,7 +1295,7 @@ new_segment:
process_backlog = false;
goto restart;
}
- first_skb = skb_queue_empty(&sk->sk_write_queue);
+ first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg, first_skb),
sk->sk_allocation,
@@ -1404,7 +1420,7 @@ wait_for_memory:
out:
if (copied) {
- tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
+ tcp_tx_timestamp(sk, sockc.tsflags);
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
@@ -1505,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
/* XXX -- need to support SO_PEEK_OFF */
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+ if (err)
+ return err;
+ copied += skb->len;
+ }
+
skb_queue_walk(&sk->sk_write_queue, skb) {
err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
@@ -2304,6 +2327,37 @@ static inline bool tcp_need_reset(int state)
TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+ struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ /* Since we are deleting whole queue, no need to
+ * list_del(&skb->tcp_tsorted_anchor)
+ */
+ tcp_rtx_queue_unlink(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ }
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ tcp_skb_tsorted_anchor_cleanup(skb);
+ sk_wmem_free_skb(sk, skb);
+ }
+ tcp_rtx_queue_purge(sk);
+ INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+ sk_mem_reclaim(sk);
+ tcp_clear_all_retrans_hints(tcp_sk(sk));
+}
+
int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -2362,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
* issue in __tcp_select_window()
*/
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
- tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
dst_release(sk->sk_rx_dst);
@@ -2749,7 +2802,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
- tcp_fastopen_init_key_once(true);
+ tcp_fastopen_init_key_once(net);
fastopen_queue_tune(sk, val);
} else {
@@ -2759,7 +2812,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN_CONNECT:
if (val > 1 || val < 0) {
err = -EINVAL;
- } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
if (sk->sk_state == TCP_CLOSE)
tp->fastopen_connect = val;
else
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 66ac69f7bd19..06fbe102a425 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -389,7 +389,7 @@ static void tcp_cdg_release(struct sock *sk)
kfree(ca->gradients);
}
-struct tcp_congestion_ops tcp_cdg __read_mostly = {
+static struct tcp_congestion_ops tcp_cdg __read_mostly = {
.cong_avoid = tcp_cdg_cong_avoid,
.cwnd_event = tcp_cdg_cwnd_event,
.pkts_acked = tcp_cdg_acked,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index e3c33220c418..7ee4aadcdd71 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -9,15 +9,18 @@
#include <net/inetpeer.h>
#include <net/tcp.h>
-int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
-
-struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
-
-static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
-
-void tcp_fastopen_init_key_once(bool publish)
+void tcp_fastopen_init_key_once(struct net *net)
{
- static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ struct tcp_fastopen_context *ctxt;
+
+ rcu_read_lock();
+ ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+ if (ctxt) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
/* tcp_fastopen_reset_cipher publishes the new context
* atomically, so we allow this race happening here.
@@ -25,8 +28,8 @@ void tcp_fastopen_init_key_once(bool publish)
* All call sites of tcp_fastopen_cookie_gen also check
* for a valid cookie, so this is an acceptable risk.
*/
- if (net_get_random_once(key, sizeof(key)) && publish)
- tcp_fastopen_reset_cipher(key, sizeof(key));
+ get_random_bytes(key, sizeof(key));
+ tcp_fastopen_reset_cipher(net, key, sizeof(key));
}
static void tcp_fastopen_ctx_free(struct rcu_head *head)
@@ -37,7 +40,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
kfree(ctx);
}
-int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+void tcp_fastopen_ctx_destroy(struct net *net)
+{
+ struct tcp_fastopen_context *ctxt;
+
+ spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
+
+ ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
+ lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
+ spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+
+ if (ctxt)
+ call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
+}
+
+int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len)
{
int err;
struct tcp_fastopen_context *ctx, *octx;
@@ -61,26 +79,27 @@ error: kfree(ctx);
}
memcpy(ctx->key, key, len);
- spin_lock(&tcp_fastopen_ctx_lock);
+ spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
- octx = rcu_dereference_protected(tcp_fastopen_ctx,
- lockdep_is_held(&tcp_fastopen_ctx_lock));
- rcu_assign_pointer(tcp_fastopen_ctx, ctx);
- spin_unlock(&tcp_fastopen_ctx_lock);
+ octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
+ lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
+ spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
if (octx)
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
return err;
}
-static bool __tcp_fastopen_cookie_gen(const void *path,
+static bool __tcp_fastopen_cookie_gen(struct net *net,
+ const void *path,
struct tcp_fastopen_cookie *foc)
{
struct tcp_fastopen_context *ctx;
bool ok = false;
rcu_read_lock();
- ctx = rcu_dereference(tcp_fastopen_ctx);
+ ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
if (ctx) {
crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
@@ -96,7 +115,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path,
*
* XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
*/
-static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+static bool tcp_fastopen_cookie_gen(struct net *net,
+ struct request_sock *req,
struct sk_buff *syn,
struct tcp_fastopen_cookie *foc)
{
@@ -104,7 +124,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
const struct iphdr *iph = ip_hdr(syn);
__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
- return __tcp_fastopen_cookie_gen(path, foc);
+ return __tcp_fastopen_cookie_gen(net, path, foc);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -112,13 +132,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
const struct ipv6hdr *ip6h = ipv6_hdr(syn);
struct tcp_fastopen_cookie tmp;
- if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+ if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) {
struct in6_addr *buf = &tmp.addr;
int i;
for (i = 0; i < 4; i++)
buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
- return __tcp_fastopen_cookie_gen(buf, foc);
+ return __tcp_fastopen_cookie_gen(net, buf, foc);
}
}
#endif
@@ -216,12 +236,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
refcount_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
- inet_csk(child)->icsk_af_ops->rebuild_header(child);
- tcp_init_congestion_control(child);
- tcp_mtup_init(child);
- tcp_init_metrics(child);
- tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tcp_init_buffer_space(child);
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
@@ -279,25 +294,26 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct tcp_fastopen_cookie *foc)
{
- struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+ int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
- if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
(syn_data || foc->len >= 0) &&
tcp_fastopen_queue_check(sk))) {
foc->len = -1;
return NULL;
}
- if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+ if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
goto fastopen;
if (foc->len >= 0 && /* Client presents or requests a cookie */
- tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+ tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) &&
foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
foc->len == valid_foc.len &&
!memcmp(foc->val, valid_foc.val, foc->len)) {
@@ -347,7 +363,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
return false;
}
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
cookie->len = -1;
return true;
}
@@ -401,25 +417,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect);
* TFO connection with data exchanges.
*/
-/* Default to 1hr */
-unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60;
-static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0);
-static unsigned long tfo_active_disable_stamp __read_mostly;
-
/* Disable active TFO and record current jiffies and
* tfo_active_disable_times
*/
void tcp_fastopen_active_disable(struct sock *sk)
{
- atomic_inc(&tfo_active_disable_times);
- tfo_active_disable_stamp = jiffies;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE);
-}
+ struct net *net = sock_net(sk);
-/* Reset tfo_active_disable_times to 0 */
-void tcp_fastopen_active_timeout_reset(void)
-{
- atomic_set(&tfo_active_disable_times, 0);
+ atomic_inc(&net->ipv4.tfo_active_disable_times);
+ net->ipv4.tfo_active_disable_stamp = jiffies;
+ NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
}
/* Calculate timeout for tfo active disable
@@ -428,17 +435,18 @@ void tcp_fastopen_active_timeout_reset(void)
*/
bool tcp_fastopen_active_should_disable(struct sock *sk)
{
- int tfo_da_times = atomic_read(&tfo_active_disable_times);
- int multiplier;
+ unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout;
+ int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
unsigned long timeout;
+ int multiplier;
if (!tfo_da_times)
return false;
/* Limit timout to max: 2^6 * initial timeout */
multiplier = 1 << min(tfo_da_times - 1, 6);
- timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ;
- if (time_before(jiffies, tfo_active_disable_stamp + timeout))
+ timeout = multiplier * tfo_bh_timeout * HZ;
+ if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout))
return true;
/* Mark check bit so we can check for successful active TFO
@@ -457,27 +465,25 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct rb_node *p;
- struct sk_buff *skb;
struct dst_entry *dst;
+ struct sk_buff *skb;
if (!tp->syn_fastopen)
return;
if (!tp->data_segs_in) {
- p = rb_first(&tp->out_of_order_queue);
- if (p && !rb_next(p)) {
- skb = rb_entry(p, struct sk_buff, rbnode);
+ skb = skb_rb_first(&tp->out_of_order_queue);
+ if (skb && !skb_rb_next(skb)) {
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
tcp_fastopen_active_disable(sk);
return;
}
}
} else if (tp->syn_fastopen_ch &&
- atomic_read(&tfo_active_disable_times)) {
+ atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
dst = sk_dst_get(sk);
if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
- tcp_fastopen_active_timeout_reset();
+ atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
dst_release(dst);
}
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index db9bb46b5776..b2390bfdc68f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1142,6 +1142,7 @@ struct tcp_sacktag_state {
u64 last_sackt;
struct rate_sample *rate;
int flag;
+ unsigned int mss_now;
};
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
if (pkt_len >= skb->len && !in_sack)
return 0;
- err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+ err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
}
@@ -1288,13 +1290,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
-static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+ struct sk_buff *skb,
struct tcp_sacktag_state *state,
unsigned int pcount, int shifted, int mss,
bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
@@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
- tcp_unlink_write_queue(skb, sk);
- sk_wmem_free_skb(sk, skb);
+ tcp_rtx_queue_unlink_and_free(skb, sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
@@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
goto fallback;
/* Can only happen with delayed DSACK + discard craziness */
- if (unlikely(skb == tcp_write_queue_head(sk)))
+ prev = skb_rb_prev(skb);
+ if (!prev)
goto fallback;
- prev = tcp_write_queue_prev(sk, skb);
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
@@ -1495,18 +1496,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!skb_shift(prev, skb, len))
goto fallback;
- if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+ if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
goto out;
/* Hole filled allows collapsing with the next as well, this is very
* useful when hole on every nth skb pattern happens
*/
- if (prev == tcp_write_queue_tail(sk))
+ skb = skb_rb_next(prev);
+ if (!skb)
goto out;
- skb = tcp_write_queue_next(sk, prev);
if (!skb_can_shift(skb) ||
- (skb == tcp_send_head(sk)) ||
((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
(mss != tcp_skb_seglen(skb)))
goto out;
@@ -1514,7 +1514,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
len = skb->len;
if (skb_shift(prev, skb, len)) {
pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+ tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+ len, mss, 0);
}
out:
@@ -1538,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *tmp;
- tcp_for_write_queue_from(skb, sk) {
+ skb_rbtree_walk_from(skb) {
int in_sack = 0;
bool dup_sack = dup_sack_in;
- if (skb == tcp_send_head(sk))
- break;
-
/* queue is in-order => we can short-circuit the walk early */
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
@@ -1593,6 +1591,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
tcp_skb_pcount(skb),
skb->skb_mstamp);
tcp_rate_skb_delivered(sk, skb, state->rate);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ list_del_init(&skb->tcp_tsorted_anchor);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1604,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
return skb;
}
-/* Avoid all extra work that is being done by sacktag while walking in
- * a normal way
- */
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
+ struct tcp_sacktag_state *state,
+ u32 seq)
+{
+ struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+ struct sk_buff *skb;
+ int unack_bytes;
+
+ while (*p) {
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (before(seq, TCP_SKB_CB(skb)->seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+ p = &parent->rb_right;
+ continue;
+ }
+
+ state->fack_count = 0;
+ unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una;
+ if (state->mss_now && unack_bytes > 0)
+ state->fack_count = unack_bytes / state->mss_now;
+
+ return skb;
+ }
+ return NULL;
+}
+
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
struct tcp_sacktag_state *state,
u32 skip_to_seq)
{
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
-
- if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
- break;
+ if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+ return skb;
- state->fack_count += tcp_skb_pcount(skb);
- }
- return skb;
+ return tcp_sacktag_bsearch(sk, state, skip_to_seq);
}
static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1742,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
}
- skb = tcp_write_queue_head(sk);
+ state->mss_now = tcp_current_mss(sk);
state->fack_count = 0;
+ skb = NULL;
i = 0;
if (!tp->sacked_out) {
@@ -1967,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk)
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
@@ -1976,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk)
}
tcp_clear_all_retrans_hints(tp);
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
-
+ skb_rbtree_walk_from(skb) {
mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
is_reneg);
if (mark_lost)
@@ -2205,20 +2224,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
WARN_ON(packets > tp->packets_out);
- if (tp->lost_skb_hint) {
- skb = tp->lost_skb_hint;
- cnt = tp->lost_cnt_hint;
+ skb = tp->lost_skb_hint;
+ if (skb) {
/* Head already handled? */
- if (mark_head && skb != tcp_write_queue_head(sk))
+ if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
return;
+ cnt = tp->lost_cnt_hint;
} else {
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
cnt = 0;
}
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
+ skb_rbtree_walk_from(skb) {
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
@@ -2242,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
/* If needed, chop off the prefix to mark as lost. */
lost = (packets - oldcnt) * mss;
if (lost < skb->len &&
- tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+ tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ lost, mss, GFP_ATOMIC) < 0)
break;
cnt = packets;
}
@@ -2326,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk)
if (tp->retrans_out)
return true;
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
return true;
@@ -2367,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (unmark_loss) {
struct sk_buff *skb;
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
tp->lost_out = 0;
@@ -2614,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk)
unsigned int mss = tcp_current_mss(sk);
u32 prior_lost = tp->lost_out;
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
+ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
if (tcp_skb_seglen(skb) > mss &&
!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2710,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
* is updated in tcp_ack()). Otherwise fall back to
* the conventional recovery.
*/
- if (tcp_send_head(sk) &&
+ if (!tcp_write_queue_empty(sk) &&
after(tcp_wnd_end(tp), tp->snd_nxt)) {
*rexmit = REXMIT_NEW;
return;
@@ -2802,9 +2816,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- if (WARN_ON(!tp->packets_out && tp->sacked_out))
+ if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
- if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+ if (!tp->sacked_out && tp->fackets_out)
tp->fackets_out = 0;
/* Now state machine starts.
@@ -2871,6 +2885,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
(*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
+ /* fall through */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -3054,8 +3069,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
shinfo = skb_shinfo(skb);
if (!before(shinfo->tskey, prior_snd_una) &&
- before(shinfo->tskey, tcp_sk(sk)->snd_una))
- __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+ before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+ tcp_skb_tsorted_save(skb) {
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+ } tcp_skb_tsorted_restore(skb);
+ }
}
/* Remove acknowledged frames from the retransmission queue. If our packet
@@ -3071,11 +3089,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
+ struct sk_buff *skb, *next;
bool fully_acked = true;
long sack_rtt_us = -1L;
long seq_rtt_us = -1L;
long ca_rtt_us = -1L;
- struct sk_buff *skb;
u32 pkts_acked = 0;
u32 last_in_flight = 0;
bool rtt_update;
@@ -3083,7 +3101,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
first_ackt = 0;
- while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+ for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u8 sacked = scb->sacked;
u32 acked_pcount;
@@ -3101,8 +3119,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
break;
fully_acked = false;
} else {
- /* Speedup tcp_unlink_write_queue() and next loop */
- prefetchw(skb->next);
acked_pcount = tcp_skb_pcount(skb);
}
@@ -3154,12 +3170,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!fully_acked)
break;
- tcp_unlink_write_queue(skb, sk);
- sk_wmem_free_skb(sk, skb);
+ next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
+ tcp_rtx_queue_unlink_and_free(skb, sk);
}
if (!skb)
@@ -3251,12 +3267,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
static void tcp_ack_probe(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *head = tcp_send_head(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Was it a usable window open? */
-
- if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+ if (!head)
+ return;
+ if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3376,7 +3394,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
tp->pred_flags = 0;
tcp_fast_path_check(sk);
- if (tcp_send_head(sk))
+ if (!tcp_write_queue_empty(sk))
tcp_slow_start_after_idle_check(sk);
if (nwin > tp->max_window) {
@@ -3561,8 +3579,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
sack_state.first_sackt = 0;
sack_state.rate = &rs;
- /* We very likely will need to access write queue head. */
- prefetchw(sk->sk_write_queue.next);
+ /* We very likely will need to access rtx queue. */
+ prefetch(sk->tcp_rtx_queue.rb_node);
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3676,8 +3694,7 @@ no_queue:
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
- if (tcp_send_head(sk))
- tcp_ack_probe(sk);
+ tcp_ack_probe(sk);
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@@ -4330,7 +4347,7 @@ static void tcp_ofo_queue(struct sock *sk)
p = rb_first(&tp->out_of_order_queue);
while (p) {
- skb = rb_entry(p, struct sk_buff, rbnode);
+ skb = rb_to_skb(p);
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
@@ -4394,7 +4411,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct rb_node **p, *q, *parent;
+ struct rb_node **p, *parent;
struct sk_buff *skb1;
u32 seq, end_seq;
bool fragstolen;
@@ -4453,7 +4470,7 @@ coalesce_done:
parent = NULL;
while (*p) {
parent = *p;
- skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ skb1 = rb_to_skb(parent);
if (before(seq, TCP_SKB_CB(skb1)->seq)) {
p = &parent->rb_left;
continue;
@@ -4498,9 +4515,7 @@ insert:
merge_right:
/* Remove other segments covered by skb. */
- while ((q = rb_next(&skb->rbnode)) != NULL) {
- skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4515,7 +4530,7 @@ merge_right:
tcp_drop(sk, skb1);
}
/* If there is no skb after us, we are the last_skb ! */
- if (!q)
+ if (!skb1)
tp->ooo_last_skb = skb;
add_sack:
@@ -4701,7 +4716,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
if (list)
return !skb_queue_is_last(list, skb) ? skb->next : NULL;
- return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+ return skb_rb_next(skb);
}
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4722,7 +4737,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
}
/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
@@ -4730,7 +4745,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
while (*p) {
parent = *p;
- skb1 = rb_entry(parent, struct sk_buff, rbnode);
+ skb1 = rb_to_skb(parent);
if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
p = &parent->rb_left;
else
@@ -4849,26 +4864,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *head;
- struct rb_node *p;
u32 start, end;
- p = rb_first(&tp->out_of_order_queue);
- skb = rb_entry_safe(p, struct sk_buff, rbnode);
+ skb = skb_rb_first(&tp->out_of_order_queue);
new_range:
if (!skb) {
- p = rb_last(&tp->out_of_order_queue);
- /* Note: This is possible p is NULL here. We do not
- * use rb_entry_safe(), as ooo_last_skb is valid only
- * if rbtree is not empty.
- */
- tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+ tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
return;
}
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
for (head = skb;;) {
- skb = tcp_skb_next(skb, NULL);
+ skb = skb_rb_next(skb);
/* Range is terminated when we see a gap or when
* we are at the queue end.
@@ -4911,14 +4919,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
do {
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
- tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+ tcp_drop(sk, rb_to_skb(node));
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
break;
node = prev;
} while (node);
- tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+ tp->ooo_last_skb = rb_to_skb(prev);
/* Reset SACK state. A conforming SACK implementation will
* do the same at a timeout based retransmit. When a connection
@@ -5513,20 +5521,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
security_inet_conn_established(sk, skb);
}
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
- tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
- tcp_init_congestion_control(sk);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
*/
tp->lsndtime = tcp_jiffies32;
- tcp_init_buffer_space(sk);
-
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
@@ -5540,7 +5541,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_fastopen_cookie *cookie)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+ struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
@@ -5575,9 +5576,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
- tcp_for_write_queue_from(data, sk) {
- if (data == tcp_send_head(sk) ||
- __tcp_retransmit_skb(sk, data, 1))
+ skb_rbtree_walk_from(data) {
+ if (__tcp_retransmit_skb(sk, data, 1))
break;
}
tcp_rearm_rto(sk);
@@ -5693,7 +5693,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tcp_is_sack(tp) && sysctl_tcp_fack)
tcp_enable_fack(tp);
- tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -5919,15 +5918,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (req) {
inet_csk(sk)->icsk_retransmits = 0;
reqsk_fastopen_remove(sk, req, false);
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmitting any data sooner based on when they
+ * are sent out.
+ */
+ tcp_rearm_rto(sk);
} else {
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
- tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tcp_init_congestion_control(sk);
-
- tcp_mtup_init(sk);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tp->copied_seq = tp->rcv_nxt;
- tcp_init_buffer_space(sk);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5947,19 +5949,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
- if (req) {
- /* Re-arm the timer because data may have been sent out.
- * This is similar to the regular data transmission case
- * when new data has just been ack'ed.
- *
- * (TFO) - we could try to be more aggressive and
- * retransmitting any data sooner based on when they
- * are sent out.
- */
- tcp_rearm_rto(sk);
- } else
- tcp_init_metrics(sk);
-
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
@@ -6056,6 +6045,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
+ /* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9416b5162bc..ecee4ddb24c5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
TCP_TIMEOUT_INIT;
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
BUG_ON(!skb);
tcp_mstamp_refresh(tp);
@@ -1503,23 +1503,23 @@ csum_err:
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
-void tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
if (skb->pkt_type != PACKET_HOST)
- return;
+ return 0;
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
- return;
+ return 0;
iph = ip_hdr(skb);
th = tcp_hdr(skb);
if (th->doff < sizeof(struct tcphdr) / 4)
- return;
+ return 0;
sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
iph->saddr, th->source,
@@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
skb_dst_set_noref(skb, dst);
}
}
+ return 0;
}
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
@@ -1778,8 +1779,9 @@ do_time_wait:
refcounted = false;
goto process;
}
- /* Fall through to ACK */
}
+ /* to ACK */
+ /* fall through */
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
@@ -2472,6 +2474,11 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
+ net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
+ spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
+ net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
+ atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+
return 0;
fail:
tcp_sk_exit(net);
@@ -2481,7 +2488,12 @@ fail:
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
+ struct net *net;
+
inet_twsk_purge(&tcp_hashinfo, AF_INET);
+
+ list_for_each_entry(net, net_exit_list, exit_list)
+ tcp_fastopen_ctx_destroy(net);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f31356d..2341b9f857b6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
INIT_LIST_HEAD(&newtp->tsq_node);
+ INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
tcp_init_wl(newtp, treq->rcv_isn);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..53dc1267c85e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -42,6 +42,8 @@
#include <linux/gfp.h>
#include <linux/module.h>
+#include <trace/events/tcp.h>
+
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;
@@ -66,15 +68,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
/* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
- tcp_advance_send_head(sk, skb);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ __skb_unlink(skb, &sk->sk_write_queue);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -971,6 +975,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
HRTIMER_MODE_ABS_PINNED);
}
+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ skb->skb_mstamp = tp->tcp_mstamp;
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -1003,10 +1013,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
oskb = skb;
- if (unlikely(skb_cloned(skb)))
- skb = pskb_copy(skb, gfp_mask);
- else
- skb = skb_clone(skb, gfp_mask);
+
+ tcp_skb_tsorted_save(oskb) {
+ if (unlikely(skb_cloned(oskb)))
+ skb = pskb_copy(oskb, gfp_mask);
+ else
+ skb = skb_clone(oskb, gfp_mask);
+ } tcp_skb_tsorted_restore(oskb);
+
if (unlikely(!skb))
return -ENOBUFS;
}
@@ -1127,7 +1141,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
err = net_xmit_eval(err);
}
if (!err && oskb) {
- oskb->skb_mstamp = tp->tcp_mstamp;
+ tcp_update_skb_after_send(tp, oskb);
tcp_rate_skb_sent(sk, oskb);
}
return err;
@@ -1239,12 +1253,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
TCP_SKB_CB(skb)->eor = 0;
}
+/* Insert buff after skb on the write or rtx queue of sk. */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+ struct sk_buff *buff,
+ struct sock *sk,
+ enum tcp_queue tcp_queue)
+{
+ if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+ __skb_queue_after(&sk->sk_write_queue, skb, buff);
+ else
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+ struct sk_buff *skb, u32 len,
unsigned int mss_now, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1327,7 +1354,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
/* Link BUFF into the send queue. */
__skb_header_release(buff);
- tcp_insert_write_queue_after(skb, buff, sk);
+ tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+ list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
return 0;
}
@@ -1614,10 +1642,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
* is caused by insufficient sender buffer:
* 1) just sent some data (see tcp_write_xmit)
* 2) not cwnd limited (this else condition)
- * 3) no more data to send (null tcp_send_head )
+ * 3) no more data to send (tcp_write_queue_empty())
* 4) application is hitting buffer limit (SOCK_NOSPACE)
*/
- if (!tcp_send_head(sk) && sk->sk_socket &&
+ if (tcp_write_queue_empty(sk) && sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
(1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1813,7 +1841,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+ struct sk_buff *skb, unsigned int len,
unsigned int mss_now, gfp_t gfp)
{
struct sk_buff *buff;
@@ -1822,7 +1851,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
- return tcp_fragment(sk, skb, len, mss_now, gfp);
+ return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
@@ -1858,7 +1887,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* Link BUFF into the send queue. */
__skb_header_release(buff);
- tcp_insert_write_queue_after(skb, buff, sk);
+ tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
return 0;
}
@@ -1928,8 +1957,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
goto send_now;
}
- head = tcp_write_queue_head(sk);
-
+ /* TODO : use tsorted_sent_queue ? */
+ head = tcp_rtx_queue_head(sk);
+ if (!head)
+ goto send_now;
age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4))
@@ -2147,13 +2178,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
- /* Always send the 1st or 2nd skb in write queue.
+ /* Always send skb if rtx queue is empty.
* No need to wait for TX completion to call us back,
* after softirq/tasklet schedule.
* This helps when TX completions are delayed too much.
*/
- if (skb == sk->sk_write_queue.next ||
- skb->prev == sk->sk_write_queue.next)
+ if (tcp_rtx_queue_empty(sk))
return false;
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2204,7 +2234,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
* it's the "most interesting" or current chrono we are
* tracking and starts busy chrono if we have pending data.
*/
- if (tcp_write_queue_empty(sk))
+ if (tcp_rtx_and_write_queues_empty(sk))
tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
else if (type == tp->chrono_type)
tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2260,7 +2290,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp" is used as a start point for the retransmit timer */
- skb->skb_mstamp = tp->tcp_mstamp;
+ tcp_update_skb_after_send(tp, skb);
goto repair; /* Skip network transmission */
}
@@ -2299,7 +2329,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
nonagle);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+ unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+ skb, limit, mss_now, gfp)))
break;
if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2339,7 +2370,7 @@ repair:
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
- return !tp->packets_out && tcp_send_head(sk);
+ return !tp->packets_out && !tcp_write_queue_empty(sk);
}
bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2363,7 +2394,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
return false;
if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
- tcp_send_head(sk))
+ !tcp_write_queue_empty(sk))
return false;
/* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2416,18 +2447,14 @@ void tcp_send_loss_probe(struct sock *sk)
int mss = tcp_current_mss(sk);
skb = tcp_send_head(sk);
- if (skb) {
- if (tcp_snd_wnd_test(tp, skb, mss)) {
- pcount = tp->packets_out;
- tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
- if (tp->packets_out > pcount)
- goto probe_sent;
- goto rearm_timer;
- }
- skb = tcp_write_queue_prev(sk, skb);
- } else {
- skb = tcp_write_queue_tail(sk);
+ if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+ pcount = tp->packets_out;
+ tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ if (tp->packets_out > pcount)
+ goto probe_sent;
+ goto rearm_timer;
}
+ skb = skb_rb_last(&sk->tcp_rtx_queue);
/* At most one outstanding TLP retransmission. */
if (tp->tlp_high_seq)
@@ -2445,10 +2472,11 @@ void tcp_send_loss_probe(struct sock *sk)
goto rearm_timer;
if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
- if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+ if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+ (pcount - 1) * mss, mss,
GFP_ATOMIC)))
goto rearm_timer;
- skb = tcp_write_queue_next(sk, skb);
+ skb = skb_rb_next(skb);
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2648,7 +2676,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+ struct sk_buff *next_skb = skb_rb_next(skb);
int skb_size, next_skb_size;
skb_size = skb->len;
@@ -2665,8 +2693,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
}
tcp_highest_sack_combine(sk, next_skb, skb);
- tcp_unlink_write_queue(next_skb, sk);
-
if (next_skb->ip_summed == CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -2694,7 +2720,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
tcp_skb_collapse_tstamp(skb, next_skb);
- sk_wmem_free_skb(sk, next_skb);
+ tcp_rtx_queue_unlink_and_free(next_skb, sk);
return true;
}
@@ -2705,8 +2731,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
return false;
if (skb_cloned(skb))
return false;
- if (skb == tcp_send_head(sk))
- return false;
/* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
@@ -2729,7 +2753,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
- tcp_for_write_queue_from_safe(skb, tmp, sk) {
+ skb_rbtree_walk_from_safe(skb, tmp) {
if (!tcp_can_collapse(sk, skb))
break;
@@ -2804,7 +2828,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
len = cur_mss * segs;
if (skb->len > len) {
- if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+ if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+ cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
if (skb_unclone(skb, GFP_ATOMIC))
@@ -2838,17 +2863,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb;
- nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ tcp_skb_tsorted_save(skb) {
+ nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
+ } tcp_skb_tsorted_restore(skb);
+
if (!err)
- skb->skb_mstamp = tp->tcp_mstamp;
+ tcp_update_skb_after_send(tp, skb);
} else {
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
if (likely(!err)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+ trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
@@ -2892,29 +2921,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
void tcp_xmit_retransmit_queue(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb, *rtx_head, *hole = NULL;
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- struct sk_buff *hole = NULL;
u32 max_segs;
int mib_idx;
if (!tp->packets_out)
return;
- if (tp->retransmit_skb_hint) {
- skb = tp->retransmit_skb_hint;
- } else {
- skb = tcp_write_queue_head(sk);
- }
-
+ rtx_head = tcp_rtx_queue_head(sk);
+ skb = tp->retransmit_skb_hint ?: rtx_head;
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
- tcp_for_write_queue_from(skb, sk) {
+ skb_rbtree_walk_from(skb) {
__u8 sacked;
int segs;
- if (skb == tcp_send_head(sk))
- break;
-
if (tcp_pacing_check(sk))
break;
@@ -2959,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
- if (skb == tcp_write_queue_head(sk) &&
+ if (skb == rtx_head &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
@@ -3001,12 +3022,15 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
- if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+ if (!tskb && tcp_under_memory_pressure(sk))
+ tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+ if (tskb) {
coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
- if (!tcp_send_head(sk)) {
+ if (tcp_write_queue_empty(sk)) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
@@ -3023,6 +3047,7 @@ coalesce:
goto coalesce;
return;
}
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3071,20 +3096,24 @@ int tcp_send_synack(struct sock *sk)
{
struct sk_buff *skb;
- skb = tcp_write_queue_head(sk);
+ skb = tcp_rtx_queue_head(sk);
if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
- pr_debug("%s: wrong queue state\n", __func__);
+ pr_err("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
- struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ struct sk_buff *nskb;
+
+ tcp_skb_tsorted_save(skb) {
+ nskb = skb_copy(skb, GFP_ATOMIC);
+ } tcp_skb_tsorted_restore(skb);
if (!nskb)
return -ENOMEM;
- tcp_unlink_write_queue(skb, sk);
+ INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+ tcp_rtx_queue_unlink_and_free(skb, sk);
__skb_header_release(nskb);
- __tcp_add_write_queue_head(sk, nskb);
- sk_wmem_free_skb(sk, skb);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
sk->sk_wmem_queued += nskb->truesize;
sk_mem_charge(sk, nskb->truesize);
skb = nskb;
@@ -3307,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
tcb->end_seq += skb->len;
__skb_header_release(skb);
- __tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
tp->write_seq = tcb->end_seq;
@@ -3385,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
if (!err) {
tp->syn_data = (fo->copied > 0);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
goto done;
}
- /* data was not sent, this is our new send_head */
- sk->sk_send_head = syn_data;
+ /* data was not sent, put it in write_queue */
+ __skb_queue_tail(&sk->sk_write_queue, syn_data);
tp->packets_out -= tcp_skb_pcount(syn_data);
fallback:
@@ -3433,6 +3462,7 @@ int tcp_connect(struct sock *sk)
tp->retrans_stamp = tcp_time_stamp(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
+ tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
/* Send off SYN; include data in Fast Open. */
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3627,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
skb->len > mss) {
seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
- if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+ if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+ skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(skb, mss);
@@ -3657,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk)
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
- if (tp->packets_out || !tcp_send_head(sk)) {
+ if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
icsk->icsk_probes_out = 0;
icsk->icsk_backoff = 0;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 449cd914d58e..cda6074a429a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
+ struct sk_buff *skb, *n;
u32 reo_wnd;
*reo_timeout = 0;
@@ -58,45 +58,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
- tcp_for_write_queue(skb, sk) {
+ list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+ tcp_tsorted_anchor) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ s32 remaining;
- if (skb == tcp_send_head(sk))
- break;
-
- /* Skip ones already (s)acked */
- if (!after(scb->end_seq, tp->snd_una) ||
- scb->sacked & TCPCB_SACKED_ACKED)
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
- tp->rack.end_seq, scb->end_seq)) {
- /* Step 3 in draft-cheng-tcpm-rack-00.txt:
- * A packet is lost if its elapsed time is beyond
- * the recent RTT plus the reordering window.
- */
- u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
- skb->skb_mstamp);
- s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
-
- if (remaining < 0) {
- tcp_rack_mark_skb_lost(sk, skb);
- continue;
- }
-
- /* Skip ones marked lost but not yet retransmitted */
- if ((scb->sacked & TCPCB_LOST) &&
- !(scb->sacked & TCPCB_SACKED_RETRANS))
- continue;
+ if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+ tp->rack.end_seq, scb->end_seq))
+ break;
+ /* A packet is lost if it has not been s/acked beyond
+ * the recent RTT plus the reordering window.
+ */
+ remaining = tp->rack.rtt_us + reo_wnd -
+ tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ if (remaining < 0) {
+ tcp_rack_mark_skb_lost(sk, skb);
+ list_del_init(&skb->tcp_tsorted_anchor);
+ } else {
/* Record maximum wait time (+1 to avoid 0) */
*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-
- } else if (!(scb->sacked & TCPCB_RETRANS)) {
- /* Original data are sent sequentially so stop early
- * b/c the rest are all sent after rack_sent
- */
- break;
}
}
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 655dd8d7f064..804a8d34ce86 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
return false;
start_ts = tcp_sk(sk)->retrans_stamp;
- if (unlikely(!start_ts))
- start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+ if (unlikely(!start_ts)) {
+ struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+ if (!head)
+ return false;
+ start_ts = tcp_skb_timestamp(head);
+ }
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -283,15 +288,17 @@ out:
*
* Returns: Nothing (void)
*/
-static void tcp_delack_timer(unsigned long data)
+static void tcp_delack_timer(struct timer_list *t)
{
- struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk =
+ from_timer(icsk, t, icsk_delack_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
tcp_delack_timer_handler(sk);
} else {
- inet_csk(sk)->icsk_ack.blocked = 1;
+ icsk->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
@@ -304,11 +311,12 @@ static void tcp_delack_timer(unsigned long data)
static void tcp_probe_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
u32 start_ts;
- if (tp->packets_out || !tcp_send_head(sk)) {
+ if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0;
return;
}
@@ -321,9 +329,9 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
- start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+ start_ts = tcp_skb_timestamp(skb);
if (!start_ts)
- tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+ skb->skb_mstamp = tp->tcp_mstamp;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) >
jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -408,7 +416,7 @@ void tcp_retransmit_timer(struct sock *sk)
if (!tp->packets_out)
goto out;
- WARN_ON(tcp_write_queue_empty(sk));
+ WARN_ON(tcp_rtx_queue_empty(sk));
tp->tlp_high_seq = 0;
@@ -441,7 +449,7 @@ void tcp_retransmit_timer(struct sock *sk)
goto out;
}
tcp_enter_loss(sk);
- tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+ tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
@@ -473,7 +481,7 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_enter_loss(sk);
- if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+ if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
* do not backoff.
*/
@@ -570,9 +578,11 @@ out:
sk_mem_reclaim(sk);
}
-static void tcp_write_timer(unsigned long data)
+static void tcp_write_timer(struct timer_list *t)
{
- struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk =
+ from_timer(icsk, t, icsk_retransmit_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
@@ -607,9 +617,9 @@ void tcp_set_keepalive(struct sock *sk, int val)
EXPORT_SYMBOL_GPL(tcp_set_keepalive);
-static void tcp_keepalive_timer (unsigned long data)
+static void tcp_keepalive_timer (struct timer_list *t)
{
- struct sock *sk = (struct sock *) data;
+ struct sock *sk = from_timer(sk, t, sk_timer);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed;
@@ -647,7 +657,7 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_when(tp);
/* It is alive without keepalive 8) */
- if (tp->packets_out || tcp_send_head(sk))
+ if (tp->packets_out || !tcp_write_queue_empty(sk))
goto resched;
elapsed = keepalive_time_elapsed(tp);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 218cfcc77650..ee113ff15fd0 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
{
- return min(tp->snd_ssthresh, tp->snd_cwnd-1);
+ return min(tp->snd_ssthresh, tp->snd_cwnd);
}
static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 784ced0b9150..7c9a6e4a7253 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2220,9 +2220,10 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
return NULL;
}
-void udp_v4_early_demux(struct sk_buff *skb)
+int udp_v4_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ struct in_device *in_dev = NULL;
const struct iphdr *iph;
const struct udphdr *uh;
struct sock *sk = NULL;
@@ -2233,25 +2234,21 @@ void udp_v4_early_demux(struct sk_buff *skb)
/* validate the packet */
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
- return;
+ return 0;
iph = ip_hdr(skb);
uh = udp_hdr(skb);
- if (skb->pkt_type == PACKET_BROADCAST ||
- skb->pkt_type == PACKET_MULTICAST) {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ if (skb->pkt_type == PACKET_MULTICAST) {
+ in_dev = __in_dev_get_rcu(skb->dev);
if (!in_dev)
- return;
+ return 0;
- /* we are supposed to accept bcast packets */
- if (skb->pkt_type == PACKET_MULTICAST) {
- ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
- iph->protocol);
- if (!ours)
- return;
- }
+ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+ iph->protocol);
+ if (!ours)
+ return 0;
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr,
@@ -2262,7 +2259,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
}
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
- return;
+ return 0;
skb->sk = sk;
skb->destructor = sock_efree;
@@ -2271,12 +2268,23 @@ void udp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst) {
+ u32 itag = 0;
+
/* set noref for now.
* any place which wants to hold dst has to call
* dst_hold_safe()
*/
skb_dst_set_noref(skb, dst);
+
+ /* for unconnected multicast sockets we need to validate
+ * the source on each packet
+ */
+ if (!inet_sk(sk)->inet_daddr && in_dev)
+ return ip_mc_validate_source(skb, iph->daddr,
+ iph->saddr, iph->tos,
+ skb->dev, in_dev, &itag);
}
+ return 0;
}
int udp_rcv(struct sk_buff *skb)
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 97658bfc1b58..e360d55be555 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* will be using a length value equal to only one MSS sized
* segment instead of the entire frame.
*/
- if (gso_partial) {
+ if (gso_partial && skb_is_gso(skb)) {
uh->len = htons(skb_shinfo(skb)->gso_size +
SKB_GSO_CB(skb)->data_offset +
skb->head - (unsigned char *)uh);