summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c461
1 files changed, 341 insertions, 120 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 257b61789eeb..eb97787be757 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -175,7 +176,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
static void tcp_incr_quickack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+ unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
@@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk)
if (dst_metric(dst, RTAX_REORDERING) &&
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
tp->reordering = dst_metric(dst, RTAX_REORDERING);
}
@@ -937,7 +939,7 @@ static void tcp_init_metrics(struct sock *sk)
tcp_set_rto(sk);
reset:
if (tp->srtt == 0) {
- /* RFC2988bis: We've failed to get a valid RTT sample from
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
* 3WHS. This is most likely due to retransmission,
* including spurious one. Reset the RTO back to 3secs
* from the more aggressive 1sec to avoid more spurious
@@ -947,7 +949,7 @@ reset:
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
}
/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
- * retransmitted. In light of RFC2988bis' more aggressive 1sec
+ * retransmitted. In light of RFC6298 more aggressive 1sec
* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
* retransmission has occurred.
*/
@@ -979,15 +981,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
- printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
- tp->reordering,
- tp->fackets_out,
- tp->sacked_out,
- tp->undo_marker ? tp->undo_retrans : 0);
+ pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+ tp->reordering,
+ tp->fackets_out,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
#endif
tcp_disable_fack(tp);
}
+
+ if (metric > 0)
+ tcp_disable_early_retrans(tp);
}
/* This must be called before lost_out is incremented */
@@ -2339,6 +2344,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay;
+
+ /* Delay early retransmit and entering fast recovery for
+ * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+ * available, or RTO is scheduled to fire first.
+ */
+ if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+ return false;
+
+ delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+ if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+ return false;
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
+ tp->early_retrans_delayed = 1;
+ return true;
+}
+
static inline int tcp_skb_timedout(const struct sock *sk,
const struct sk_buff *skb)
{
@@ -2446,7 +2472,7 @@ static inline int tcp_head_timedout(const struct sock *sk)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static int tcp_time_to_recover(struct sock *sk)
+static int tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
@@ -2492,6 +2518,16 @@ static int tcp_time_to_recover(struct sock *sk)
tcp_is_sack(tp) && !tcp_send_head(sk))
return 1;
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+ * retransmissions due to small network reorderings, we implement
+ * Mitigation A.3 in the RFC and delay the retransmission for a short
+ * interval if appropriate.
+ */
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+ (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ !tcp_may_send_now(sk))
+ return !tcp_pause_early_retransmit(sk, flag);
+
return 0;
}
@@ -2680,22 +2716,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &inet->inet_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &np->daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &np->daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
#endif
}
@@ -3025,6 +3061,38 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
+static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mib_idx;
+
+ if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENORECOVERY;
+ else
+ mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->high_seq = tp->snd_nxt;
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = tp->snd_una;
+ tp->undo_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!ece_ack)
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ TCP_ECN_queue_cwr(tp);
+ }
+
+ tp->bytes_acked = 0;
+ tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tp->snd_cwnd;
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -3044,7 +3112,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
struct tcp_sock *tp = tcp_sk(sk);
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int fast_rexmit = 0, mib_idx;
+ int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -3128,7 +3196,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- if (!tcp_time_to_recover(sk)) {
+ if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
}
@@ -3145,32 +3213,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
}
/* Otherwise enter Recovery state */
-
- if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENORECOVERY;
- else
- mib_idx = LINUX_MIB_TCPSACKRECOVERY;
-
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
- tp->high_seq = tp->snd_nxt;
- tp->prior_ssthresh = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out;
-
- if (icsk->icsk_ca_state < TCP_CA_CWR) {
- if (!(flag & FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- TCP_ECN_queue_cwr(tp);
- }
-
- tp->bytes_acked = 0;
- tp->snd_cwnd_cnt = 0;
- tp->prior_cwnd = tp->snd_cwnd;
- tp->prr_delivered = 0;
- tp->prr_out = 0;
- tcp_set_ca_state(sk, TCP_CA_Recovery);
+ tcp_enter_recovery(sk, (flag & FLAG_ECE));
fast_rexmit = 1;
}
@@ -3252,16 +3295,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
-static void tcp_rearm_rto(struct sock *sk)
+void tcp_rearm_rto(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ u32 rto = inet_csk(sk)->icsk_rto;
+ /* Offset the time elapsed after installing regular RTO */
+ if (tp->early_retrans_delayed) {
+ struct sk_buff *skb = tcp_write_queue_head(sk);
+ const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+ s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+ /* delta may not be positive if the socket is locked
+ * when the delayed ER timer fires and is rescheduled.
+ */
+ if (delta > 0)
+ rto = delta;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX);
}
+ tp->early_retrans_delayed = 0;
+}
+
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_rearm_rto(sk);
+
+ /* Stop if ER is disabled after the delayed ER timer is scheduled */
+ if (!tp->do_early_retrans)
+ return;
+
+ tcp_enter_recovery(sk, false);
+ tcp_update_scoreboard(sk, 1);
+ tcp_xmit_retransmit_queue(sk);
}
/* If we get here, the whole TSO packet has not been acked. */
@@ -3437,18 +3511,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!tp->packets_out && tcp_is_sack(tp)) {
icsk = inet_csk(sk);
if (tp->lost_out) {
- printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, icsk->icsk_ca_state);
+ pr_debug("Leak l=%u %d\n",
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
- printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, icsk->icsk_ca_state);
+ pr_debug("Leak s=%u %d\n",
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
- printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, icsk->icsk_ca_state);
+ pr_debug("Leak r=%u %d\n",
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
@@ -3710,6 +3784,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ if (tp->early_retrans_delayed)
+ tcp_rearm_rto(sk);
+
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3875,10 +3952,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
- if (net_ratelimit())
- pr_info("%s: Illegal window scaling value %d >14 received\n",
- __func__,
- snd_wscale);
+ net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+ __func__,
+ snd_wscale);
snd_wscale = 14;
}
opt_rx->snd_wscale = snd_wscale;
@@ -4434,7 +4510,7 @@ static void tcp_ofo_queue(struct sock *sk)
static int tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, size)) {
@@ -4453,6 +4529,102 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
return 0;
}
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int i, delta, len = from->len;
+
+ *fragstolen = false;
+
+ if (tcp_hdr(from)->fin || skb_cloned(to))
+ return false;
+
+ if (len <= skb_tailroom(to)) {
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ goto merge;
+ }
+
+ if (skb_has_frag_list(to) || skb_has_frag_list(from))
+ return false;
+
+ if (skb_headlen(from) != 0) {
+ struct page *page;
+ unsigned int offset;
+
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ return false;
+
+ if (skb_head_is_locked(from))
+ return false;
+
+ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+
+ skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
+ page, offset, skb_headlen(from));
+ *fragstolen = true;
+ } else {
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+ return false;
+
+ delta = from->truesize -
+ SKB_TRUESIZE(skb_end_pointer(from) - from->head);
+ }
+
+ WARN_ON_ONCE(delta < len);
+
+ memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+ skb_shinfo(from)->frags,
+ skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+ skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+
+ if (!skb_cloned(from))
+ skb_shinfo(from)->nr_frags = 0;
+
+ /* if the skb is cloned this does nothing since we set nr_frags to 0 */
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+ skb_frag_ref(from, i);
+
+ to->truesize += delta;
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ to->len += len;
+ to->data_len += len;
+
+merge:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ return true;
+}
+
+static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+ if (head_stolen)
+ kmem_cache_free(skbuff_head_cache, skb);
+ else
+ __kfree_skb(skb);
+}
+
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4491,23 +4663,13 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
end_seq = TCP_SKB_CB(skb)->end_seq;
if (seq == TCP_SKB_CB(skb1)->end_seq) {
- /* Packets in ofo can stay in queue a long time.
- * Better try to coalesce them right now
- * to avoid future tcp_collapse_ofo_queue(),
- * probably the most expensive function in tcp stack.
- */
- if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPRCVCOALESCE);
- BUG_ON(skb_copy_bits(skb, 0,
- skb_put(skb1, skb->len),
- skb->len));
- TCP_SKB_CB(skb1)->end_seq = end_seq;
- TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
- __kfree_skb(skb);
- skb = NULL;
- } else {
+ bool fragstolen;
+
+ if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ } else {
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
}
if (!tp->rx_opt.num_sacks ||
@@ -4583,12 +4745,65 @@ end:
skb_set_owner_r(skb, sk);
}
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+ bool *fragstolen)
+{
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+ __skb_pull(skb, hdrlen);
+ eaten = (tail &&
+ tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+ tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (!eaten) {
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ }
+ return eaten;
+}
+
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_buff *skb;
+ struct tcphdr *th;
+ bool fragstolen;
+
+ if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
+ goto err;
+
+ skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+ if (!skb)
+ goto err;
+
+ th = (struct tcphdr *)skb_put(skb, sizeof(*th));
+ skb_reset_transport_header(skb);
+ memset(th, 0, sizeof(*th));
+
+ if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+ goto err_free;
+
+ TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+ TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+ if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+ WARN_ON_ONCE(fragstolen); /* should not happen */
+ __kfree_skb(skb);
+ }
+ return size;
+
+err_free:
+ kfree_skb(skb);
+err:
+ return -ENOMEM;
+}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
+ bool fragstolen = false;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
@@ -4633,8 +4848,7 @@ queue_and_out:
tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (skb->len)
@@ -4658,7 +4872,7 @@ queue_and_out:
tcp_fast_path_check(sk);
if (eaten > 0)
- __kfree_skb(skb);
+ kfree_skb_partial(skb, fragstolen);
else if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
return;
@@ -5437,6 +5651,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
} else {
int eaten = 0;
int copied_early = 0;
+ bool fragstolen = false;
if (tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len) {
@@ -5494,10 +5709,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- __skb_pull(skb, tcp_header_len);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
}
tcp_event_data_recv(sk, skb);
@@ -5519,7 +5732,7 @@ no_ack:
else
#endif
if (eaten)
- __kfree_skb(skb);
+ kfree_skb_partial(skb, fragstolen);
else
sk->sk_data_ready(sk, 0);
return 0;
@@ -5563,6 +5776,44 @@ discard:
}
EXPORT_SYMBOL(tcp_rcv_established);
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (skb != NULL)
+ security_inet_conn_established(sk, skb);
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ tcp_init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
@@ -5695,36 +5946,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
}
smp_mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
- security_inet_conn_established(sk, skb);
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
-
- tcp_init_congestion_control(sk);
-
- /* Prevent spurious tcp_cwnd_restart() on first data
- * packet.
- */
- tp->lsndtime = tcp_time_stamp;
-
- tcp_init_buffer_space(sk);
-
- if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
-
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
+ tcp_finish_connect(sk, skb);
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
@@ -5738,8 +5961,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
inet_csk_schedule_ack(sk);
icsk->icsk_ack.lrcvtime = tcp_time_stamp;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(sk);
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);