Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	171
1 file changed, 97 insertions, 74 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 441ae9da3a23..8bd9911fdd16 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -364,7 +364,7 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
  * be sent.
  */
 static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
-				int tcp_header_len)
+			 struct tcphdr *th, int tcp_header_len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -375,7 +375,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 			INET_ECN_xmit(sk);
 			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
 				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
-				tcp_hdr(skb)->cwr = 1;
+				th->cwr = 1;
 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 			}
 		} else if (!tcp_ca_needs_ecn(sk)) {
@@ -383,7 +383,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 			INET_ECN_dontxmit(sk);
 		}
 		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
-			tcp_hdr(skb)->ece = 1;
+			th->ece = 1;
 	}
 }
 
@@ -949,12 +949,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
+	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
-	th = tcp_hdr(skb);
+	th = (struct tcphdr *)skb->data;
 	th->source		= inet->inet_sport;
 	th->dest		= inet->inet_dport;
 	th->seq			= htonl(tcb->seq);
@@ -962,14 +962,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->tcp_flags);
 
-	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-		/* RFC1323: The window in SYN & SYN/ACK segments
-		 * is never scaled.
-		 */
-		th->window	= htons(min(tp->rcv_wnd, 65535U));
-	} else {
-		th->window	= htons(tcp_select_window(sk));
-	}
 	th->check		= 0;
 	th->urg_ptr		= 0;
 
@@ -986,9 +978,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
-	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
-		tcp_ecn_send(sk, skb, tcp_header_size);
-
+	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
+		th->window      = htons(tcp_select_window(sk));
+		tcp_ecn_send(sk, skb, th, tcp_header_size);
+	} else {
+		/* RFC1323: The window in SYN & SYN/ACK segments
+		 * is never scaled.
+		 */
+		th->window	= htons(min(tp->rcv_wnd, 65535U));
+	}
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
@@ -1111,11 +1109,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	tcp_verify_left_out(tp);
 }
 
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->txstamp_ack ||
+		(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
 static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+	if (unlikely(tcp_has_tx_tstamp(skb)) &&
 	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
 		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
 		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
@@ -1123,9 +1127,17 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 		shinfo->tx_flags &= ~tsflags;
 		shinfo2->tx_flags |= tsflags;
 		swap(shinfo->tskey, shinfo2->tskey);
+		TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+		TCP_SKB_CB(skb)->txstamp_ack = 0;
 	}
 }
 
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+	TCP_SKB_CB(skb)->eor = 0;
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
@@ -1171,6 +1183,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->tcp_flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+	tcp_skb_fragment_eor(skb, buff);
 
 	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
 		/* Copy and checksum data tail into the new buffer. */
@@ -1731,6 +1744,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	/* This packet was never sent out yet, so no SACK bits. */
 	TCP_SKB_CB(buff)->sacked = 0;
 
+	tcp_skb_fragment_eor(skb, buff);
+
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
 	tcp_fragment_tstamp(skb, buff);
@@ -2204,14 +2219,13 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 /* Thanks to skb fast clones, we can detect if a prior transmit of
  * a packet is still in a qdisc or driver queue.
  * In this case, there is very little point doing a retransmit !
- * Note: This is called from BH context only.
  */
 static bool skb_still_in_host_queue(const struct sock *sk,
 				    const struct sk_buff *skb)
 {
 	if (unlikely(skb_fclone_busy(sk, skb))) {
-		NET_INC_STATS_BH(sock_net(sk),
-				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+		NET_INC_STATS(sock_net(sk),
+			      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 		return true;
 	}
 	return false;
@@ -2266,14 +2280,14 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	if (__tcp_retransmit_skb(sk, skb))
+	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
 	/* Record snd_nxt for loss detection. */
 	tp->tlp_high_seq = tp->snd_nxt;
 
 probe_sent:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
 	/* Reset s.t. tcp_rearm_rto will restart timer from now */
 	inet_csk(sk)->icsk_pending = 0;
 rearm_timer:
@@ -2444,14 +2458,15 @@ u32 __tcp_select_window(struct sock *sk)
 void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 			     const struct sk_buff *next_skb)
 {
-	const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
-	u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-
-	if (unlikely(tsflags)) {
+	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+		const struct skb_shared_info *next_shinfo =
+			skb_shinfo(next_skb);
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
-		shinfo->tx_flags |= tsflags;
+		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
 		shinfo->tskey = next_shinfo->tskey;
+		TCP_SKB_CB(skb)->txstamp_ack |=
+			TCP_SKB_CB(next_skb)->txstamp_ack;
 	}
 }
 
@@ -2490,6 +2505,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	 * packet counting does not break.
 	 */
 	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
 
 	/* changed transmit queue under us so clear hints */
 	tcp_clear_retrans_hints_partial(tp);
@@ -2541,6 +2557,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 		if (!tcp_can_collapse(sk, skb))
 			break;
 
+		if (!tcp_skb_can_collapse_to(to))
+			break;
+
 		space -= skb->len;
 
 		if (first) {
@@ -2567,17 +2586,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller.  Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss;
-	int err;
+	int diff, len, err;
+
 
-	/* Inconslusive MTU probe */
-	if (icsk->icsk_mtup.probe_size) {
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;
-	}
 
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2610,38 +2629,37 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
 		return -EAGAIN;
 
-	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+	len = cur_mss * segs;
+	if (skb->len > len) {
+		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
-		int oldpcount = tcp_skb_pcount(skb);
+		if (skb_unclone(skb, GFP_ATOMIC))
+			return -ENOMEM;
 
-		if (unlikely(oldpcount > 1)) {
-			if (skb_unclone(skb, GFP_ATOMIC))
-				return -ENOMEM;
-			tcp_init_tso_segs(skb, cur_mss);
-			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-		}
+		diff = tcp_skb_pcount(skb);
+		tcp_set_skb_tso_segs(skb, cur_mss);
+		diff -= tcp_skb_pcount(skb);
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+		if (skb->len < cur_mss)
+			tcp_retrans_try_collapse(sk, skb, cur_mss);
 	}
 
 	/* RFC3168, section 6.1.1.1. ECN fallback */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
-	tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-	/* Make a copy, if the first transmission SKB clone we made
-	 * is still in somebody's hands, else make a clone.
-	 */
-
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
 	 */
 	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
 		     skb_headroom(skb) >= 0xFFFF)) {
-		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
-						   GFP_ATOMIC);
+		struct sk_buff *nskb;
+
+		skb_mstamp_get(&skb->skb_mstamp);
+		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
 		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
 			     -ENOBUFS;
 	} else {
@@ -2649,20 +2667,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (likely(!err)) {
+		segs = tcp_skb_pcount(skb);
+
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		/* Update global TCP statistics. */
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans++;
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+		tp->total_retrans += segs;
 	}
 	return err;
 }
 
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	int err = __tcp_retransmit_skb(sk, skb, segs);
 
 	if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -2678,7 +2698,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);
 
 	} else if (err != -EBUSY) {
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
 
 	if (tp->undo_retrans < 0)
@@ -2753,6 +2773,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	tcp_for_write_queue_from(skb, sk) {
 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+		int segs;
 
 		if (skb == tcp_send_head(sk))
 			break;
@@ -2760,14 +2781,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
 
-		/* Assume this retransmit will generate
-		 * only one packet for congestion window
-		 * calculation purposes.  This works because
-		 * tcp_retransmit_skb() will chop up the
-		 * packet to be MSS sized and all the
-		 * packet counting works out.
-		 */
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+		if (segs <= 0)
 			return;
 
 		if (fwd_rexmitting) {
@@ -2804,10 +2819,10 @@ begin_fwd:
 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_retransmit_skb(sk, skb))
+		if (tcp_retransmit_skb(sk, skb, segs))
 			return;
 
-		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+		NET_INC_STATS(sock_net(sk), mib_idx);
 
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
@@ -2960,7 +2975,7 @@ int tcp_send_synack(struct sock *sk)
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
 				struct tcp_fastopen_cookie *foc,
-				bool attach_req)
+				enum tcp_synack_type synack_type)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -2980,14 +2995,22 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	/* Reserve space for headers. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 
-	if (attach_req) {
+	switch (synack_type) {
+	case TCP_SYNACK_NORMAL:
 		skb_set_owner_w(skb, req_to_sk(req));
-	} else {
+		break;
+	case TCP_SYNACK_COOKIE:
+		/* Under synflood, we do not attach skb to a socket,
+		 * to avoid false sharing.
+		 */
+		break;
+	case TCP_SYNACK_FASTOPEN:
		/* sk is a const pointer, because we want to express multiple
 		 * cpu might call us concurrently.
 		 * sk->sk_wmem_alloc in an atomic, we can promote to rw.
 		 */
 		skb_set_owner_w(skb, (struct sock *)sk);
+		break;
 	}
 	skb_dst_set(skb, dst);
@@ -3015,7 +3038,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
 
-	th = tcp_hdr(skb);
+	th = (struct tcphdr *)skb->data;
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
@@ -3036,7 +3059,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
 	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
 	th->doff = (tcp_header_size >> 2);
-	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
@@ -3532,10 +3555,10 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 	int res;
 
 	tcp_rsk(req)->txhash = net_tx_rndhash();
-	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
+	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
 	if (!res) {
-		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
 	}
 	return res;
 }
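
As a rough illustration of the interface change above (tcp_retransmit_skb() now takes a segment budget), the sketch below models the budgeting in plain user-space C. It is not kernel code: the helper names rexmit_budget and clamp_rexmit_len are invented for the example, and only the arithmetic mirrors the patch, namely "segs = tp->snd_cwnd - tcp_packets_in_flight(tp)" and "len = cur_mss * segs" feeding tcp_fragment().

/* Simplified, hypothetical user-space model of the segment budgeting
 * introduced by this diff; names and structure are illustrative only.
 */
#include <stdio.h>

/* Budget of segments a retransmit may consume, mirroring
 * "segs = tp->snd_cwnd - tcp_packets_in_flight(tp)". */
static int rexmit_budget(unsigned int snd_cwnd, unsigned int in_flight)
{
	return (int)snd_cwnd - (int)in_flight;
}

/* Bytes actually retransmitted from an skb of skb_len bytes, mirroring
 * "len = cur_mss * segs; if (skb->len > len) tcp_fragment(..., len, ...)". */
static unsigned int clamp_rexmit_len(unsigned int skb_len,
				     unsigned int cur_mss, int segs)
{
	unsigned int len = cur_mss * (unsigned int)segs;

	return skb_len > len ? len : skb_len;
}

int main(void)
{
	unsigned int cur_mss = 1460;
	int segs = rexmit_budget(10, 7);	/* cwnd of 10, 7 packets in flight */

	if (segs <= 0)
		return 0;			/* no cwnd room: skip the retransmit */

	/* A 10 * MSS skb is trimmed to the 3-segment budget before sending. */
	printf("retransmit %u bytes (%d segs)\n",
	       clamp_rexmit_len(10 * cur_mss, cur_mss, segs), segs);
	return 0;
}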
