From ba1a6c7bc0ff33e405f5156dc8f4145437255f1f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 23 Aug 2008 13:28:27 +0200 Subject: dccp: Always generate a Reset in response to option errors RFC4340 states that if a packet is received with an option error (such as a Mandatory Option as the last byte of the option list), the endpoint should repond with a Reset. In the LISTEN and RESPOND states, the endpoint correctly reponds with Reset, while in the REQUEST/OPEN states, packets with option errors are just ignored. The packet sequence is as follows: Case 1: Endpoint A Endpoint B (CLOSED) (CLOSED) <---------------- REQUEST RESPONSE -----------------> (*1) (with invalid option) <---------------- RESET (with Reset Code 5, "Option Error") (*1) currently just ignored, no Reset is sent Case 2: Endpoint A Endpoint B (OPEN) (OPEN) DATA-ACK -----------------> (*2) (with invalid option) <---------------- RESET (with Reset Code 5, "Option Error") (*2) currently just ignored, no Reset is sent This patch fixes the problem, by generating a Reset instead of silently ignoring option errors. Signed-off-by: Wei Yongjun Acked-by: Arnaldo Carvalho de Melo Acked-by: Gerrit Renker --- net/dccp/input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/input.c b/net/dccp/input.c index 803933ab396d..779d0ed9ae94 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -370,7 +370,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, goto discard; if (dccp_parse_options(sk, NULL, skb)) - goto discard; + return 1; if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); @@ -610,7 +610,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Step 8: Process options and mark acknowledgeable */ if (dccp_parse_options(sk, NULL, skb)) - goto discard; + return 1; if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); -- cgit v1.2.3 From 828755cee087e4a34f45d6c9db661ccd0631cc6d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Per-socket initialisation of feature negotiation This provides feature-negotiation initialisation for both DCCP sockets and DCCP request_sockets, to support feature negotiation during connection setup. It also resolves a FIXME regarding the congestion control initialisation. Thanks to Wei Yongjun for help with the IPv6 side of this patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 4 ++++ net/dccp/dccp.h | 3 ++- net/dccp/feat.c | 19 +++++++++++++++++++ net/dccp/feat.h | 1 + net/dccp/input.c | 2 -- net/dccp/ipv4.c | 3 ++- net/dccp/ipv6.c | 3 ++- net/dccp/minisocks.c | 7 ++++++- net/dccp/proto.c | 1 + 9 files changed, 37 insertions(+), 6 deletions(-) (limited to 'net/dccp/input.c') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 3978aff197d9..484b8a1fb023 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -412,6 +412,7 @@ extern void dccp_minisock_init(struct dccp_minisock *dmsk); * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) + * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -421,6 +422,7 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; + struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -498,6 +500,7 @@ struct dccp_ackvec; * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) * @dccps_minisock - associated minisock (accessed via dccp_msk) + * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) @@ -535,6 +538,7 @@ struct dccp_sock { __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; + struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b4bc6e095a0e..ab096c06bba0 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -252,7 +252,8 @@ extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); -extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); +extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, + struct sk_buff const *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 2ec2cd117699..faade82856fe 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -279,6 +279,25 @@ void dccp_feat_list_purge(struct list_head *fn_list) } EXPORT_SYMBOL_GPL(dccp_feat_list_purge); +/* generate @to as full clone of @from - @to must not contain any nodes */ +int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) +{ + struct dccp_feat_entry *entry, *new; + + INIT_LIST_HEAD(to); + list_for_each_entry(entry, from, node) { + new = dccp_feat_clone_entry(entry); + if (new == NULL) + goto cloning_failed; + list_add_tail(&new->node, to); + } + return 0; + +cloning_failed: + dccp_feat_list_purge(to); + return -ENOMEM; +} + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, u8 *val, u8 len, gfp_t gfp) { diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 94203c230c65..7e953fd0a79b 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -95,6 +95,7 @@ extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len); extern void dccp_feat_clean(struct dccp_minisock *dmsk); extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); +extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); extern int dccp_feat_init(struct dccp_minisock *dmsk); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c index 779d0ed9ae94..3070015edc75 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -590,8 +590,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) return 1; - - /* FIXME: do congestion control initialization */ goto discard; } if (dh->dccph_type == DCCP_PKT_RESET) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 882c5c4de69e..0ce84ea89119 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -595,7 +595,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5e1ee0da2c40..33e8a1ea3041 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -424,7 +424,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b2804e2d1b8c..e487133ae079 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -125,6 +125,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; newicsk->icsk_rto = DCCP_TIMEOUT_INIT; + INIT_LIST_HEAD(&newdp->dccps_featneg); if (dccp_feat_clone(sk, newsk)) goto out_free; @@ -304,7 +305,8 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); -void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) +int dccp_reqsk_init(struct request_sock *req, + struct dccp_sock const *dp, struct sk_buff const *skb) { struct dccp_request_sock *dreq = dccp_rsk(req); @@ -312,6 +314,9 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) inet_rsk(req)->acked = 0; req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; + + /* inherit feature negotiation options from listening socket */ + return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d0bd34819761..1cdf4ae99605 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -193,6 +193,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dccp_init_xmit_timers(sk); + INIT_LIST_HEAD(&dp->dccps_featneg); /* * FIXME: We're hardcoding the CCID, and doing this at this point makes * the listening (master) sock get CCID control blocks, which is not -- cgit v1.2.3 From c49b22729f3da7479c4e6c572d53fdd40201d0bd Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Integration of dynamic feature activation - part 3 (client side) This integrates feature-activation in the client, with these details: 1. When dccp_parse_options() fails, the reset code is already set, request_sent _state_process() currently overrides this with `Packet Error', which is not intended - so changed to use the reset code set in dccp_parse_options(); 2. There was a FIXME to change the error code when dccp_ackvec_add() fails. I have looked this up and found that: * the check whether ackno < ISN is already made earlier, * this Response is likely the 1st packet with an Ackno that the client gets, * so when dccp_ackvec_add() fails, the reason is likely not a packet error. 3. When feature negotiation fails, the socket should be marked as not usable, so that the application is notified that an error occurs. This is achieved by a new label, which uses an error code of `Aborted' and which sets the socket state to CLOSED, as well as sk_err. 4. Avoids parsing the Ack twice in Respond state by not doing option processing again in dccp_rcv_respond_partopen_state_process (as option processing has already been done on the request_sock in dccp_check_req). Since this addresses congestion-control initialisation, a corresponding FIXME has been removed. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/input.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/input.c b/net/dccp/input.c index 3070015edc75..0672b7e1763e 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -421,8 +421,13 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } + /* + * If option processing (Step 8) failed, return 1 here so that + * dccp_v4_do_rcv() sends a Reset. The Reset code depends on + * the option type and is set in dccp_parse_options(). + */ if (dccp_parse_options(sk, NULL, skb)) - goto out_invalid_packet; + return 1; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) @@ -475,6 +480,15 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, */ dccp_set_state(sk, DCCP_PARTOPEN); + /* + * If feature negotiation was successful, activate features now; + * an activation failure means that this host could not activate + * one ore more features (e.g. insufficient memory), which would + * leave at least one feature in an undefined state. + */ + if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) + goto unable_to_proceed; + /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -509,6 +523,16 @@ out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; + +unable_to_proceed: + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; + /* + * We mark this socket as no longer usable, so that the loop in + * dccp_sendmsg() terminates and the application gets notified. + */ + dccp_set_state(sk, DCCP_CLOSED); + sk->sk_err = ECOMM; + return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, @@ -600,7 +624,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; } - if (sk->sk_state != DCCP_REQUESTING) { + if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { if (dccp_check_seqno(sk, skb)) goto discard; @@ -665,8 +689,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; case DCCP_REQUESTING: - /* FIXME: do congestion control initialization */ - queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; -- cgit v1.2.3 From b235dc4abbc1356284bd0dc730efa711f394e0e2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp ccid-2: Phase out the use of boolean Ack Vector sysctl This removes the use of the sysctl and the minisock variable for the Send Ack Vector feature, which is now handled fully dynamically via feature negotiation; i.e. when CCID2 is enabled, Ack Vectors are automatically enabled (as per RFC 4341, 4.). Using a sysctl in parallel to this implementation would open the door to crashes, since much of the code relies on tests of the boolean minisock / sysctl variable. Thus, this patch replaces all tests of type if (dccp_msk(sk)->dccpms_send_ack_vector) /* ... */ with if (dp->dccps_hc_rx_ackvec != NULL) /* ... */ The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature negotiation concluded that Ack Vectors are to be used on the half-connection. Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child), so that the test is a valid one. The activation handler for Ack Vectors is called as soon as the feature negotiation has concluded at the * server when the Ack marking the transition RESPOND => OPEN arrives; * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN. Adding the sequence number of the Response packet to the Ack Vector has been removed, since (a) connection establishment implies that the Response has been received; (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e. this entry will always be ignored; (c) it can not be used for anything useful - to detect loss for instance, only packets received after the loss can serve as pseudo-dupacks. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 3 --- net/dccp/dccp.h | 3 +-- net/dccp/diag.c | 2 +- net/dccp/input.c | 12 +++--------- net/dccp/minisocks.c | 1 - net/dccp/options.c | 7 ++----- net/dccp/proto.c | 3 +-- net/dccp/sysctl.c | 7 ------- 9 files changed, 8 insertions(+), 33 deletions(-) (limited to 'net/dccp/input.c') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 1403745ab406..7a3bb1abb830 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ackvec = 1 - Whether or not to send Ack Vector options (sec. 11.5). - tx_ccid = 2 Default CCID for the sender-receiver half-connection. Depending on the choice of CCID, the Send Ack Vector feature is enabled automatically. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 60e94438eadd..61734e27abb7 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -360,7 +360,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 #define DCCPF_INITIAL_ACK_RATIO 2 #define DCCPF_INITIAL_CCID DCCPC_CCID2 -#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 @@ -370,13 +369,11 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_send_ack_vector; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 51436c825655..3fd16e82c003 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -434,7 +433,7 @@ static inline int dccp_ack_pending(const struct sock *sk) const struct dccp_sock *dp = dccp_sk(sk); return dp->dccps_timestamp_echo != 0 || #ifdef CONFIG_IP_DCCP_ACKVEC - (dccp_msk(sk)->dccpms_send_ack_vector && + (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || #endif inet_csk_ack_scheduled(sk); diff --git a/net/dccp/diag.c b/net/dccp/diag.c index d8a3509b26f6..93aae7c95550 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/input.c b/net/dccp/input.c index 0672b7e1763e..5eb443f656c1 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_ack_seq); } @@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) @@ -434,12 +434,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); - if (dccp_msk(sk)->dccpms_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto out_invalid_packet; /* FIXME: change error code */ - /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); @@ -637,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 9e2232572662..0ebf8ebcf3de 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,7 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index 6b0704497e83..aca309e16632 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -145,8 +144,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) goto out_invalid_option; break; @@ -526,7 +524,6 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; @@ -547,7 +544,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) if (dccp_insert_option_timestamp(sk, skb)) return -1; - } else if (dmsk->dccpms_send_ack_vector && + } else if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && dccp_insert_option_ackvec(sk, skb)) { return -1; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d420790b795..775eaa3d0c49 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -207,7 +207,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -225,7 +224,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dmsk->dccpms_send_ack_vector) { + if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 587c12f915c1..018e210875e1 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ackvec", - .data = &sysctl_dccp_feat_send_ack_vector, - .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, -- cgit v1.2.3 From bfbddd085a5bced6efb9e1bc4d029438f9639784 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Fix the adjustments to AWL and SWL This fixes a problem and a potential loophole with regard to seqno/ackno validity: the problem is that the initial adjustments to AWL/SWL were only performed at the begin of the connection, during the handshake. Since the Sequence Window feature is always greater than Wmin=32 (7.5.2), it is however necessary to perform these adjustments at least for the first W/W' (variables as per 7.5.1) packets in the lifetime of a connection. This requirement is complicated by the fact that W/W' can change at any time during the lifetime of a connection. Therefore the consequence is to perform this safety check each time SWL/AWL are updated. A second problem solved by this patch is that the remote/local Sequence Window feature values (which set the bounds for AWL/SWL/SWH) are undefined until the feature negotiation has completed. During the initial handshake we have more stringent sequence number protection, the changes added by this patch effect that {A,S}W{L,H} are within the correct bounds at the instant that feature negotiation completes (since the SeqWin feature activation handlers call dccp_update_gsr/gss()). A detailed rationale is below -- can be removed from the commit message. 1. Server sequence number checks during initial handshake --------------------------------------------------------- The server can not use the fields of the listening socket for seqno/ackno checks and thus needs to store all relevant information on a per-connection basis on the dccp_request socket. This is a size-constrained structure and has currently only ISS (dreq_iss) and ISR (dreq_isr) defined. Adding further fields (SW{L,H}, AW{L,H}) would increase the size of the struct and it is questionable whether this will have any practical gain. The currently implemented solution is as follows. * receiving first Request: dccp_v{4,6}_conn_request sets ISR := P.seqno, ISS := dccp_v{4,6}_init_sequence() * sending first Response: dccp_v{4,6}_send_response via dccp_make_response() sets P.seqno := ISS, sets P.ackno := ISR * receiving retransmitted Request: dccp_check_req() overrides ISR := P.seqno * answering retransmitted Request: dccp_make_response() sets ISS += 1, otherwise as per first Response * completing the handshake: succeeds in dccp_check_req() for the first Ack where P.ackno == ISS (P.seqno is not tested) * creating child socket: ISS, ISR are copied from the request_sock This solution will succeed whenever the server can receive the Request and the subsequent Ack in succession, without retransmissions. If there is packet loss, the client needs to retransmit until this condition succeeds; it will otherwise eventually give up. Adding further fields to the request_sock could increase the robustness a bit, in that it would make possible to let a reordered Ack (from a retransmitted Response) pass. The argument against such a solution is that if the packet loss is not persistent and an Ack gets through, why not wait for the one answering the original response: if the loss is persistent, it is probably better to not start the connection in the first place. Long story short: the present design (by Arnaldo) is simple and will likely work just as well as a more complicated solution. As a consequence, {A,S}W{L,H} are not needed until the moment the request_sock is cloned into the accept queue. At that stage feature negotiation has completed, so that the values for the local and remote Sequence Window feature (7.5.2) are known, i.e. we are now in a better position to compute {A,S}W{L,H}. 2. Client sequence number checks during initial handshake --------------------------------------------------------- Until entering PARTOPEN the client does not need the adjustments, since it constrains the Ack window to the packet it sent. * sending first Request: dccp_v{4,6}_connect() choose ISS, dccp_connect() then sets GAR := ISS (as per 8.5), dccp_transmit_skb() (with the previous bug fix) sets GSS := ISS, AWL := ISS, AWH := GSS * n-th retransmitted Request (with previous patch): dccp_retransmit_skb() via timer calls dccp_transmit_skb(), which sets GSS := ISS+n and then AWL := ISS, AWH := ISS+n * receiving any Response: dccp_rcv_request_sent_state_process() -- accepts packet if AWL <= P.ackno <= AWH; -- sets GSR = ISR = P.seqno * sending the Ack completing the handshake: dccp_send_ack() calls dccp_transmit_skb(), which sets GSS += 1 and AWL := ISS, AWH := GSS Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 20 ++++++++++++++++++++ net/dccp/input.c | 18 ++++++------------ net/dccp/minisocks.c | 30 +++++++++--------------------- 3 files changed, 35 insertions(+), 33 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f9ed0cbd1bf3..e4d6e76ced41 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -415,6 +415,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq) dp->dccps_gsr = seq; /* Sequence validity window depends on remote Sequence Window (7.5.1) */ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + /* + * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, + * 7.5.1 we perform this check beyond the initial handshake: W/W' are + * always > 32, so for the first W/W' packets in the lifetime of a + * connection we always have to adjust SWL. + * A second reason why we are doing this is that the window depends on + * the feature-remote value of Sequence Window: nothing stops the peer + * from updating this value while we are busy adjusting SWL for the + * first W packets (we would have to count from scratch again then). + * Therefore it is safer to always make sure that the Sequence Window + * is not artificially extended by a peer who grows SWL downwards by + * continually updating the feature-remote Sequence-Window. + * If sequence numbers wrap it is bad luck. But that will take a while + * (48 bit), and this measure prevents Sequence-number attacks. + */ + if (before48(dp->dccps_swl, dp->dccps_isr)) + dp->dccps_swl = dp->dccps_isr; dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } @@ -425,6 +442,9 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) dp->dccps_gss = seq; /* Ack validity window depends on local Sequence Window value (7.5.1) */ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + /* Adjust AWL so that it is not below ISS - see comment above for SWL */ + if (before48(dp->dccps_awl, dp->dccps_iss)) + dp->dccps_awl = dp->dccps_iss; dp->dccps_awh = dp->dccps_gss; } diff --git a/net/dccp/input.c b/net/dccp/input.c index 5eb443f656c1..e3f43d55e3ce 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -440,20 +440,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; - dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_update_gsr(sk, dp->dccps_isr); /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - * - * AWL was adjusted in dccp_v4_connect -acme + * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect + * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH + * is done as part of activating the feature values below, since + * these settings depend on the local/remote Sequence Window + * features, which were undefined or not confirmed until now. */ - dccp_set_seqno(&dp->dccps_swl, - max48(dp->dccps_swl, dp->dccps_isr)); + dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 0ecb19c5e8ce..f4d9c8f60ede 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -120,30 +120,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - */ - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; - dccp_update_gss(newsk, dreq->dreq_iss); - - newdp->dccps_isr = dreq->dreq_isr; - dccp_update_gsr(newsk, dreq->dreq_isr); - - /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. + * Set S.ISR, S.GSR from packet (or Init Cookies) + * + * Setting AWL/AWH and SWL/SWH happens as part of the feature + * activation below, as these windows all depend on the local + * and remote Sequence Window feature values (7.5.2). */ - dccp_set_seqno(&newdp->dccps_swl, - max48(newdp->dccps_swl, newdp->dccps_isr)); - dccp_set_seqno(&newdp->dccps_awl, - max48(newdp->dccps_awl, newdp->dccps_iss)); + newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; + newdp->dccps_gar = newdp->dccps_iss; + newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; /* - * Activate features after initialising the sequence numbers, - * since CCID initialisation may depend on GSS, ISR, ISS etc. + * Activate features: initialise CCIDs, sequence windows etc. */ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { /* It is still raw copy of parent, so invalidate -- cgit v1.2.3 From ff49e27089ec363b7fc3849504e0435d447ab18a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp ccid-2: Ack Vector interface clean-up This patch brings the Ack Vector interface up to date. Its main purpose is to lay the basis for the subsequent patches of this set, which will use the new data structure fields and routines. There are no real algorithmic changes, rather an adaptation: (1) Replaced the static Ack Vector size (2) with a #define so that it can be adapted (with low loss / Ack Ratio, a value of 1 works, so 2 seems to be sufficient for the moment) and added a solution so that computing the ECN nonce will continue to work - even with larger Ack Vectors. (2) Replaced the #defines for Ack Vector states with a complete enum. (3) Replaced #defines to compute Ack Vector length and state with general purpose routines (inlines), and updated code to use these. (4) Added a `tail' field (conversion to circular buffer in subsequent patch). (5) Updated the (outdated) documentation for Ack Vector struct. (6) All sequence number containers now trimmed to 48 bits. (7) Removal of unused bits: * removed dccpav_ack_nonce from struct dccp_ackvec, since this is already redundantly stored in the `dccpavr_ack_nonce' (of Ack Vector record); * removed Elapsed Time for Ack Vectors (it was nowhere used); * replaced semantics of dccpavr_sent_len with dccpavr_ack_runlen, since the code needs to be able to remember the old run length; * reduced the de-/allocation routines (redundant / duplicate tests). Justification for removing Elapsed Time information [can be removed]: --------------------------------------------------------------------- 1. The Elapsed Time information for Ack Vectors was nowhere used in the code. 2. DCCP does not implement rate-based pacing of acknowledgments. The only recommendation for always including Elapsed Time is in section 11.3 of RFC 4340: "Receivers that rate-pace acknowledgements SHOULD [...] include Elapsed Time options". But such is not the case here. 3. It does not really improve estimation accuracy. The Elapsed Time field only records the time between the arrival of the last acknowledgeable packet and the time the Ack Vector is sent out. Since Linux does not (yet) implement delayed Acks, the time difference will typically be small, since often the arrival of a data packet triggers sending feedback at the HC-receiver. Justification for changes in de-/allocation routines [can be removed]: ---------------------------------------------------------------------- * INIT_LIST_HEAD in dccp_ackvec_record_new was redundant, since the list pointers were later overwritten when the node was added via list_add(); * dccp_ackvec_record_new() was called in a single place only; * calls to list_del_init() before calling dccp_ackvec_record_delete() were redundant, since subsequently the entire element was k-freed; * since all calls to dccp_ackvec_record_delete() were preceded to a call to list_del_init(), the WARN_ON test would never evaluate to true; * since all calls to dccp_ackvec_record_delete() were made from within list_for_each_entry_safe(), the test for avr == NULL was redundant; * list_empty() in ackvec_free was redundant, since the same condition is embedded in the loop condition of the subsequent list_for_each_entry_safe(). Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 178 ++++++++++++++++++------------------------------- net/dccp/ackvec.h | 103 +++++++++++++++------------- net/dccp/ccids/ccid2.c | 13 ++-- net/dccp/input.c | 6 +- 4 files changed, 127 insertions(+), 173 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 01e4d39fa232..85ad70cfba5b 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -1,7 +1,8 @@ /* * net/dccp/ackvec.c * - * An implementation of the DCCP protocol + * An implementation of Ack Vectors for the DCCP protocol + * Copyright (c) 2007 University of Aberdeen, Scotland, UK * Copyright (c) 2005 Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or modify it @@ -23,24 +24,32 @@ static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; -static struct dccp_ackvec_record *dccp_ackvec_record_new(void) +struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) { - struct dccp_ackvec_record *avr = - kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); + struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); + + if (av != NULL) { + av->av_buf_head = DCCPAV_MAX_ACKVEC_LEN - 1; + INIT_LIST_HEAD(&av->av_records); + } + return av; +} - if (avr != NULL) - INIT_LIST_HEAD(&avr->avr_node); +static void dccp_ackvec_purge_records(struct dccp_ackvec *av) +{ + struct dccp_ackvec_record *cur, *next; - return avr; + list_for_each_entry_safe(cur, next, &av->av_records, avr_node) + kmem_cache_free(dccp_ackvec_record_slab, cur); + INIT_LIST_HEAD(&av->av_records); } -static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) +void dccp_ackvec_free(struct dccp_ackvec *av) { - if (unlikely(avr == NULL)) - return; - /* Check if deleting a linked record */ - WARN_ON(!list_empty(&avr->avr_node)); - kmem_cache_free(dccp_ackvec_record_slab, avr); + if (likely(av != NULL)) { + dccp_ackvec_purge_records(av); + kmem_cache_free(dccp_ackvec_slab, av); + } } static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, @@ -68,24 +77,16 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); - u16 len = av->av_vec_len + 2 * nr_opts, i; - u32 elapsed_time; + u16 len = av->av_vec_len + 2 * nr_opts; + u8 i, nonce = 0; const unsigned char *tail, *from; unsigned char *to; struct dccp_ackvec_record *avr; - suseconds_t delta; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) return -1; - delta = ktime_us_delta(ktime_get_real(), av->av_time); - elapsed_time = delta / 10; - - if (elapsed_time != 0 && - dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) - return -1; - - avr = dccp_ackvec_record_new(); + avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); if (avr == NULL) return -1; @@ -94,7 +95,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) to = skb_push(skb, len); len = av->av_vec_len; from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; + tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; for (i = 0; i < nr_opts; ++i) { int copylen = len; @@ -102,7 +103,13 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) if (len > DCCP_SINGLE_OPT_MAXLEN) copylen = DCCP_SINGLE_OPT_MAXLEN; - *to++ = DCCPO_ACK_VECTOR_0; + /* + * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via + * its type; ack_nonce is the sum of all individual buf_nonce's. + */ + nonce ^= av->av_buf_nonce[i]; + + *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; *to++ = copylen + 2; /* Check if buf_head wraps */ @@ -123,75 +130,24 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) } /* - * From RFC 4340, A.2: - * - * For each acknowledgement it sends, the HC-Receiver will add an - * acknowledgement record. ack_seqno will equal the HC-Receiver - * sequence number it used for the ack packet; ack_ptr will equal - * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will - * equal buf_nonce. + * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. */ - avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = av->av_buf_nonce; - avr->avr_sent_len = av->av_vec_len; + avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = nonce; + avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); dccp_ackvec_insert_avr(av, avr); dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " "ack_ackno=%llu\n", - dccp_role(sk), avr->avr_sent_len, + dccp_role(sk), avr->avr_ack_runlen, (unsigned long long)avr->avr_ack_seqno, (unsigned long long)avr->avr_ack_ackno); return 0; } -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); - - if (av != NULL) { - av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; - av->av_buf_ackno = UINT48_MAX + 1; - av->av_buf_nonce = 0; - av->av_time = ktime_set(0, 0); - av->av_vec_len = 0; - INIT_LIST_HEAD(&av->av_records); - } - - return av; -} - -void dccp_ackvec_free(struct dccp_ackvec *av) -{ - if (unlikely(av == NULL)) - return; - - if (!list_empty(&av->av_records)) { - struct dccp_ackvec_record *avr, *next; - - list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { - list_del_init(&avr->avr_node); - dccp_ackvec_record_delete(avr); - } - } - - kmem_cache_free(dccp_ackvec_slab, av); -} - -static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, - const u32 index) -{ - return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; -} - -static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, - const u32 index) -{ - return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; -} - /* * If several packets are missing, the HC-Receiver may prefer to enter multiple * bytes with run length 0, rather than a single byte with a larger run length; @@ -204,7 +160,7 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, unsigned int gap; long new_head; - if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) + if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN) return -ENOBUFS; gap = packets - 1; @@ -212,18 +168,18 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, if (new_head < 0) { if (gap > 0) { - memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, + memset(av->av_buf, DCCPAV_NOT_RECEIVED, gap + new_head + 1); gap = -new_head; } - new_head += DCCP_MAX_ACKVEC_LEN; + new_head += DCCPAV_MAX_ACKVEC_LEN; } av->av_buf_head = new_head; if (gap > 0) memset(av->av_buf + av->av_buf_head + 1, - DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); + DCCPAV_NOT_RECEIVED, gap); av->av_buf[av->av_buf_head] = state; av->av_vec_len += packets; @@ -236,6 +192,8 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, const u64 ackno, const u8 state) { + u8 *cur_head = av->av_buf + av->av_buf_head, + *buf_end = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; /* * Check at the right places if the buffer is full, if it is, tell the * caller to start dropping packets till the HC-Sender acks our ACK @@ -260,7 +218,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, /* See if this is the first ackno being inserted */ if (av->av_vec_len == 0) { - av->av_buf[av->av_buf_head] = state; + *cur_head = state; av->av_vec_len = 1; } else if (after48(ackno, av->av_buf_ackno)) { const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); @@ -269,10 +227,9 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * Look if the state of this packet is the same as the * previous ackno and if so if we can bump the head len. */ - if (delta == 1 && - dccp_ackvec_state(av, av->av_buf_head) == state && - dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) - av->av_buf[av->av_buf_head]++; + if (delta == 1 && dccp_ackvec_state(cur_head) == state && + dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN) + *cur_head += 1; else if (dccp_ackvec_set_buf_head_state(av, delta, state)) return -ENOBUFS; } else { @@ -285,21 +242,17 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * could reduce the complexity of this scan.) */ u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); - u32 index = av->av_buf_head; while (1) { - const u8 len = dccp_ackvec_len(av, index); - const u8 av_state = dccp_ackvec_state(av, index); + const u8 len = dccp_ackvec_runlen(cur_head); /* * valid packets not yet in av_buf have a reserved * entry, with a len equal to 0. */ - if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && - len == 0 && delta == 0) { /* Found our - reserved seat! */ + if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) { dccp_pr_debug("Found %llu reserved seat!\n", (unsigned long long)ackno); - av->av_buf[index] = state; + *cur_head = state; goto out; } /* len == 0 means one packet */ @@ -307,13 +260,12 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, goto out_duplicate; delta -= len + 1; - if (++index == DCCP_MAX_ACKVEC_LEN) - index = 0; + if (++cur_head == buf_end) + cur_head = av->av_buf; } } av->av_buf_ackno = ackno; - av->av_time = ktime_get_real(); out: return 0; @@ -333,13 +285,13 @@ static void dccp_ackvec_throw_record(struct dccp_ackvec *av, if (av->av_buf_head <= avr->avr_ack_ptr) av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; else - av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - + av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 - av->av_buf_head + avr->avr_ack_ptr; /* free records */ list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del_init(&avr->avr_node); - dccp_ackvec_record_delete(avr); + list_del(&avr->avr_node); + kmem_cache_free(dccp_ackvec_record_slab, avr); } } @@ -357,7 +309,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, if (ackno == avr->avr_ack_seqno) { dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " "ack_ackno=%llu, ACKED!\n", - dccp_role(sk), 1, + dccp_role(sk), avr->avr_ack_runlen, (unsigned long long)avr->avr_ack_seqno, (unsigned long long)avr->avr_ack_ackno); dccp_ackvec_throw_record(av, avr); @@ -387,7 +339,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, */ avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); while (i--) { - const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + const u8 rl = dccp_ackvec_runlen(vector); u64 ackno_end_rl; dccp_set_seqno(&ackno_end_rl, *ackno - rl); @@ -404,8 +356,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, break; found: if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { - const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; - if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { + if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) { dccp_pr_debug("%s ACK vector 0, len=%d, " "ack_seqno=%llu, ack_ackno=%llu, " "ACKED!\n", @@ -448,10 +399,9 @@ int __init dccp_ackvec_init(void) if (dccp_ackvec_slab == NULL) goto out_err; - dccp_ackvec_record_slab = - kmem_cache_create("dccp_ackvec_record", - sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", + sizeof(struct dccp_ackvec_record), + 0, SLAB_HWCACHE_ALIGN, NULL); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 1c10814fdf72..df18f9030dbb 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -3,9 +3,9 @@ /* * net/dccp/ackvec.h * - * An implementation of the DCCP protocol + * An implementation of Ack Vectors for the DCCP protocol + * Copyright (c) 2007 University of Aberdeen, Scotland, UK * Copyright (c) 2005 Arnaldo Carvalho de Melo - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -13,75 +13,84 @@ #include #include -#include #include #include -/* We can spread an ack vector across multiple options */ -#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) +/* + * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, + * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 + * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives + * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. + */ +#define DCCPAV_NUM_ACKVECS 2 +#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) /* Estimated minimum average Ack Vector length - used for updating MPS */ #define DCCPAV_MIN_OPTLEN 16 -#define DCCP_ACKVEC_STATE_RECEIVED 0 -#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) -#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) +enum dccp_ackvec_states { + DCCPAV_RECEIVED = 0x00, + DCCPAV_ECN_MARKED = 0x40, + DCCPAV_RESERVED = 0x80, + DCCPAV_NOT_RECEIVED = 0xC0 +}; +#define DCCPAV_MAX_RUNLEN 0x3F -#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ -#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ +static inline u8 dccp_ackvec_runlen(const u8 *cell) +{ + return *cell & DCCPAV_MAX_RUNLEN; +} -/** struct dccp_ackvec - ack vector - * - * This data structure is the one defined in RFC 4340, Appendix A. - * - * @av_buf_head - circular buffer head - * @av_buf_tail - circular buffer tail - * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the - * buffer (i.e. %av_buf_head) - * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked - * by the buffer with State 0 - * - * Additionally, the HC-Receiver must keep some information about the - * Ack Vectors it has recently sent. For each packet sent carrying an - * Ack Vector, it remembers four variables: +static inline u8 dccp_ackvec_state(const u8 *cell) +{ + return *cell & ~DCCPAV_MAX_RUNLEN; +} + +/** struct dccp_ackvec - Ack Vector main data structure * - * @av_records - list of dccp_ackvec_record - * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * This implements a fixed-size circular buffer within an array and is largely + * based on Appendix A of RFC 4340. * - * @av_time - the time in usecs - * @av_buf - circular buffer of acknowledgeable packets + * @av_buf: circular buffer storage area + * @av_buf_head: head index; begin of live portion in @av_buf + * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf + * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf + * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to + * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf + * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) + * @av_veclen: length of the live portion of @av_buf */ struct dccp_ackvec { - u64 av_buf_ackno; - struct list_head av_records; - ktime_t av_time; + u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; u16 av_buf_head; + u16 av_buf_tail; + u64 av_buf_ackno:48; + bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; + struct list_head av_records; u16 av_vec_len; - u8 av_buf_nonce; - u8 av_ack_nonce; - u8 av_buf[DCCP_MAX_ACKVEC_LEN]; }; -/** struct dccp_ackvec_record - ack vector record +/** struct dccp_ackvec_record - Records information about sent Ack Vectors * - * ACK vector record as defined in Appendix A of spec. + * These list entries define the additional information which the HC-Receiver + * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. * - * The list is sorted by avr_ack_seqno + * @avr_node: the list node in @av_records + * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on + * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to + * @avr_ack_ptr: pointer into @av_buf where this record starts + * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending + * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent * - * @avr_node - node in av_records - * @avr_ack_seqno - sequence number of the packet this record was sent on - * @avr_ack_ackno - sequence number being acknowledged - * @avr_ack_ptr - pointer into av_buf where this record starts - * @avr_ack_nonce - av_ack_nonce at the time this record was sent - * @avr_sent_len - lenght of the record in av_buf + * The list as a whole is sorted in descending order by @avr_ack_seqno. */ struct dccp_ackvec_record { struct list_head avr_node; - u64 avr_ack_seqno; - u64 avr_ack_ackno; + u64 avr_ack_seqno:48; + u64 avr_ack_ackno:48; u16 avr_ack_ptr; - u16 avr_sent_len; - u8 avr_ack_nonce; + u8 avr_ack_runlen; + u8 avr_ack_nonce:1; }; struct sock; diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f56ab68a4b78..813d5cd40e8b 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -580,8 +580,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) &vector, &veclen)) != -1) { /* go through this ack vector */ while (veclen--) { - const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; - u64 ackno_end_rl = SUB48(ackno, rl); + u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector)); ccid2_pr_debug("ackvec start:%llu end:%llu\n", (unsigned long long)ackno, @@ -604,17 +603,15 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = *vector & - DCCP_ACKVEC_STATE_MASK; + const u8 state = dccp_ackvec_state(vector); /* new packet received or marked */ - if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && + if (state != DCCPAV_NOT_RECEIVED && !seqp->ccid2s_acked) { - if (state == - DCCP_ACKVEC_STATE_ECN_MARKED) { + if (state == DCCPAV_ECN_MARKED) ccid2_congestion_event(sk, seqp); - } else + else ccid2_new_ack(sk, seqp, &maxincr); diff --git a/net/dccp/input.c b/net/dccp/input.c index e3f43d55e3ce..70ad0ba72146 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -377,8 +377,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) + DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) goto discard; dccp_deliver_input_to_ccids(sk, skb); @@ -627,8 +626,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) + DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) goto discard; dccp_deliver_input_to_ccids(sk, skb); -- cgit v1.2.3 From 68b1de15765f2b0e0925e692dab2b2fa2abd93fc Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp ccid-2: Algorithm to update buffer state This provides a routine to consistently update the buffer state when the peer acknowledges receipt of Ack Vectors; updating state in the list of Ack Vectors as well as in the circular buffer. While based on RFC 4340, several additional (and necessary) precautions were added to protect the consistency of the buffer state. These additions are essential, since analysis and experience showed that the basic algorithm was insufficient for this task (which lead to problems that were hard to debug). The algorithm now * deals with HC-sender acknowledging to HC-receiver and vice versa, * keeps track of the last unacknowledged but received seqno in tail_ackno, * has special cases to reset the overflow condition when appropriate, * is protected against receiving older information (would mess up buffer state). Note: The older code performed an unnecessary step, where the sender cleared Ack Vector state by parsing the Ack Vector received by the HC-receiver. Doing this was entirely redundant, since * the receiver always puts the full acknowledgment window (groups 2,3 in 11.4.2) into the Ack Vectors it sends; hence the HC-receiver is only interested in the highest state that the HC-sender received; * this means that the acknowledgment number on the (Data)Ack from the HC-sender is sufficient; and work done in parsing earlier state is not necessary, since the later state subsumes the earlier one (see also RFC 4340, A.4). This older interface (dccp_ackvec_parse()) is therefore removed. Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dccp/ackvec.h | 6 ++++ net/dccp/input.c | 4 +-- net/dccp/options.c | 6 ++-- 4 files changed, 98 insertions(+), 6 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1184d5e5dc96..f1341a617f96 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -92,6 +92,24 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) return 0; } +static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, + const u64 ackno) +{ + struct dccp_ackvec_record *avr; + /* + * Exploit that records are inserted in descending order of sequence + * number, start with the oldest record first. If @ackno is `before' + * the earliest ack_ackno, the packet is too old to be considered. + */ + list_for_each_entry_reverse(avr, av_list, avr_node) { + if (avr->avr_ack_seqno == ackno) + return avr; + if (before48(ackno, avr->avr_ack_seqno)) + break; + } + return NULL; +} + /* * Buffer index and length computation using modulo-buffersize arithmetic. * Note that, as pointers move from right to left, head is `before' tail. @@ -356,6 +374,76 @@ int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, return 0; } +/** + * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection + * This routine is called when the peer acknowledges the receipt of Ack Vectors + * up to and including @ackno. While based on on section A.3 of RFC 4340, here + * are additional precautions to prevent corrupted buffer state. In particular, + * we use tail_ackno to identify outdated records; it always marks the earliest + * packet of group (2) in 11.4.2. + */ +void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) + { + struct dccp_ackvec_record *avr, *next; + u8 runlen_now, eff_runlen; + s64 delta; + + avr = dccp_ackvec_lookup(&av->av_records, ackno); + if (avr == NULL) + return; + /* + * Deal with outdated acknowledgments: this arises when e.g. there are + * several old records and the acks from the peer come in slowly. In + * that case we may still have records that pre-date tail_ackno. + */ + delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); + if (delta < 0) + goto free_records; + /* + * Deal with overlapping Ack Vectors: don't subtract more than the + * number of packets between tail_ackno and ack_ackno. + */ + eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; + + runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); + /* + * The run length of Ack Vector cells does not decrease over time. If + * the run length is the same as at the time the Ack Vector was sent, we + * free the ack_ptr cell. That cell can however not be freed if the run + * length has increased: in this case we need to move the tail pointer + * backwards (towards higher indices), to its next-oldest neighbour. + */ + if (runlen_now > eff_runlen) { + + av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; + av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); + + /* This move may not have cleared the overflow flag. */ + if (av->av_overflow) + av->av_overflow = (av->av_buf_head == av->av_buf_tail); + } else { + av->av_buf_tail = avr->avr_ack_ptr; + /* + * We have made sure that avr points to a valid cell within the + * buffer. This cell is either older than head, or equals head + * (empty buffer): in both cases we no longer have any overflow. + */ + av->av_overflow = 0; + } + + /* + * The peer has acknowledged up to and including ack_ackno. Hence the + * first packet in group (2) of 11.4.2 is the successor of ack_ackno. + */ + av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); + +free_records: + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del(&avr->avr_node); + kmem_cache_free(dccp_ackvec_record_slab, avr); + } +} + int __init dccp_ackvec_init(void) { dccp_ackvec_slab = kmem_cache_create("dccp_ackvec", diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 92f65b0fef5b..b757e9b4110f 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -118,6 +118,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, const u8 *value, const u8 len); extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); +extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) @@ -149,6 +150,11 @@ static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, return -1; } +static inline void dccp_ackvec_clear_state(struct dccp_ackvec *av, + const u64 ackno) +{ +} + static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, const u64 ackno) { diff --git a/net/dccp/input.c b/net/dccp/input.c index 70ad0ba72146..77a5d57ab702 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -164,8 +164,8 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); if (dp->dccps_hc_rx_ackvec != NULL) - dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvec_clear_state(dp->dccps_hc_rx_ackvec, + DCCP_SKB_CB(skb)->dccpd_ack_seq); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) diff --git a/net/dccp/options.c b/net/dccp/options.c index 3163ae980f16..b11d7b7167f0 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; const unsigned char *opt_end = (unsigned char *)dh + @@ -133,9 +132,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) - goto out_invalid_option; + dccp_pr_debug("%s Ack Vector (len=%u)\n", dccp_role(sk), + len); break; case DCCPO_TIMESTAMP: if (len != 4) -- cgit v1.2.3 From 283fb4a5f39d1521d53e1044bff0ba2654acf145 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp ccid-2: Consolidate Ack-Vector processing within main DCCP module This aggregates Ack Vector processing (handling input and clearing old state) into one function, for the following reasons and benefits: * all Ack Vector-specific processing is now in one place; * duplicated code is removed; * ensuring sanity: from an Ack Vector point of view, it is better to clear the old state first before entering new state; * Ack Event handling happens mostly within the CCIDs, not the main DCCP module. Signed-off-by: Gerrit Renker --- net/dccp/input.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/input.c b/net/dccp/input.c index 77a5d57ab702..9a108ce17fc7 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -159,13 +159,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } -static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) +static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) { - struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - if (dp->dccps_hc_rx_ackvec != NULL) - dccp_ackvec_clear_state(dp->dccps_hc_rx_ackvec, - DCCP_SKB_CB(skb)->dccpd_ack_seq); + if (av == NULL) + return; + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvec_input(av, skb); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) @@ -364,21 +366,13 @@ discard: int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len) { - struct dccp_sock *dp = dccp_sk(sk); - if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) - goto discard; + dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); @@ -621,14 +615,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dccp_parse_options(sk, NULL, skb)) return 1; - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) - goto discard; - + dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); } -- cgit v1.2.3 From ddab05568eaa70fc92b2aae957136f188f724e9c Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Clean up slow-path input processing This patch rearranges the order of statements of the slow-path input processing (i.e. any other state than OPEN), to resolve the following issues. 1. Dependencies: the order of statements now better matches RFC 4340, 8.5, i.e. step 7 is before step 9 (previously 9 was before 7), and parsing options in step 8 (which can consume resources) now comes after step 7. 2. Bug-fix: in state CLOSED, there should not be any sequence number checking or option processing. This is why the test for CLOSED has been moved after the test for LISTEN. 3. As before sequence number checks are omitted if in state LISTEN/REQUEST, due to the note underneath the table in RFC 4340, 7.5.3. 4. Packets are now passed on to Ack Vector / CCID processing only after - step 7 (receive unexpected packets), - step 9 (receive Reset), - step 13 (receive CloseReq), - step 14 (receive Close) and only if the state is PARTOPEN. This simplifies CCID processing: - in LISTEN/CLOSED the CCIDs are non-existent; - in RESPOND/REQUEST the CCIDs have not yet been negotiated; - in CLOSEREQ and active-CLOSING the node has already closed this socket; - in passive-CLOSING the client is waiting for its Reset. In the last case, RFC 4340, 8.3 leaves it open to ignore further incoming data, which is the approach taken here. As a result of (3), CCID processing is now indeed confined to OPEN/PARTOPEN states, i.e. congestion control is performed only on the flow of data packets. This avoids pathological cases of doing congestion control on those messages which set up and terminate the connection. I have done a few checks to see if this creates a problem in other parts of the code. This seems not to be the case; even if there were one, it would be better to fix it than to perform congestion control on Close/Request/Response messages. Similarly for Ack Vectors (as they depend on the negotiated CCID). Signed-off-by: Gerrit Renker --- net/dccp/input.c | 68 +++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/input.c b/net/dccp/input.c index 9a108ce17fc7..b1e38bf94456 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -603,22 +603,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; + } else if (sk->sk_state == DCCP_CLOSED) { + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; } - if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { - if (dccp_check_seqno(sk, skb)) - goto discard; - - /* - * Step 8: Process options and mark acknowledgeable - */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; + /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ + if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) + goto discard; - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if ((dp->dccps_role != DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_RESPONSE) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); + goto discard; } + /* Step 8: Process options */ + if (dccp_parse_options(sk, NULL, skb)) + return 1; + /* * Step 9: Process Reset * If P.type == Reset, @@ -626,41 +640,21 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return - */ + */ if (dh->dccph_type == DCCP_PKT_RESET) { dccp_rcv_reset(sk, skb); return 0; - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && - dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ if (dccp_rcv_close(sk, skb)) return 0; goto discard; } switch (sk->sk_state) { - case DCCP_CLOSED: - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - case DCCP_REQUESTING: queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) @@ -669,8 +663,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, __kfree_skb(skb); return 0; - case DCCP_RESPOND: case DCCP_PARTOPEN: + /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ + dccp_handle_ackvec_processing(sk, skb); + dccp_deliver_input_to_ccids(sk, skb); + /* fall through */ + case DCCP_RESPOND: queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); break; -- cgit v1.2.3 From 49ffc29a0223adbe0ea7005eea3ab2a03abbeb06 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Clamping RTT values This extracts the clamping part of dccp_sample_rtt() and makes it available to other parts of the code (as e.g. used in the next patch). Note: The function dccp_sample_rtt() now reduces to subtracting the elapsed time. This could be eliminated but would require shorter prefixes and thus is not done by this patch - maybe an idea for later. Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 9 ++++++++- net/dccp/input.c | 11 +---------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'net/dccp/input.c') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b63a82ccb2b2..5281190aa19c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -334,7 +334,14 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); extern int dccp_invalid_packet(struct sk_buff *skb); -extern u32 dccp_sample_rtt(struct sock *sk, long delta); + +static inline u32 dccp_sane_rtt(long usec_sample) +{ + if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) + DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); + return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); +} +extern u32 dccp_sample_rtt(struct sock *sk, long delta); static inline int dccp_bad_service_code(const struct sock *sk, const __be32 service) diff --git a/net/dccp/input.c b/net/dccp/input.c index b1e38bf94456..df0e6714aa11 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -707,16 +707,7 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) /* dccpor_elapsed_time is either zeroed out or set and > 0 */ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - if (unlikely(delta <= 0)) { - DCCP_WARN("unusable RTT sample %ld, using min\n", delta); - return DCCP_SANE_RTT_MIN; - } - if (unlikely(delta > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %ld too large, using max\n", delta); - return DCCP_SANE_RTT_MAX; - } - - return delta; + return dccp_sane_rtt(delta); } EXPORT_SYMBOL_GPL(dccp_sample_rtt); -- cgit v1.2.3 From 410e27a49bb98bc7fa3ff5fc05cc313817b9f253 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 9 Sep 2008 13:27:22 +0200 Subject: This reverts "Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp" as it accentally contained the wrong set of patches. These will be submitted separately. Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 54 +- include/linux/dccp.h | 122 ++- include/net/tcp.h | 15 - net/dccp/Kconfig | 3 + net/dccp/Makefile | 5 +- net/dccp/ackvec.c | 619 ++++++------ net/dccp/ackvec.h | 204 ++-- net/dccp/ccid.c | 101 +- net/dccp/ccid.h | 113 +-- net/dccp/ccids/Kconfig | 30 +- net/dccp/ccids/ccid2.c | 622 +++++++----- net/dccp/ccids/ccid2.h | 63 +- net/dccp/ccids/ccid3.c | 762 +++++++++------ net/dccp/ccids/ccid3.h | 153 +-- net/dccp/ccids/lib/loss_interval.c | 30 +- net/dccp/ccids/lib/loss_interval.h | 4 +- net/dccp/ccids/lib/packet_history.c | 282 +++--- net/dccp/ccids/lib/packet_history.h | 78 +- net/dccp/ccids/lib/tfrc.h | 16 - net/dccp/ccids/lib/tfrc_equation.c | 29 +- net/dccp/dccp.h | 104 +- net/dccp/diag.c | 2 +- net/dccp/feat.c | 1805 +++++++++-------------------------- net/dccp/feat.h | 144 +-- net/dccp/input.c | 164 ++-- net/dccp/ipv4.c | 4 +- net/dccp/ipv6.c | 4 +- net/dccp/minisocks.c | 87 +- net/dccp/options.c | 341 ++++--- net/dccp/output.c | 279 ++---- net/dccp/probe.c | 75 +- net/dccp/proto.c | 281 ++---- net/dccp/qpolicy.c | 137 --- net/dccp/sysctl.c | 64 +- net/dccp/timer.c | 42 +- net/ipv4/tcp_input.c | 17 +- 36 files changed, 2884 insertions(+), 3971 deletions(-) delete mode 100644 net/dccp/qpolicy.c (limited to 'net/dccp/input.c') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index fcfc12534428..39131a3c78f8 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -45,25 +45,6 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree Socket options ============== -DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes -a policy ID as argument and can only be set before the connection (i.e. changes -during an established connection are not supported). Currently, two policies are -defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, -and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an -u32 priority value as ancillary data to sendmsg(), where higher numbers indicate -a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows: - cmsg->cmsg_level = SOL_DCCP; - cmsg->cmsg_type = DCCP_SCM_PRIORITY; - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ - -DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero -value is always interpreted as unbounded queue length. If different from zero, -the interpretation of this parameter depends on the current dequeuing policy -(see above): the "simple" policy will enforce a fixed queue size by returning -EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the -lowest-priority packet first. The default value for this parameter is -initialised from /proc/sys/net/dccp/default/tx_qlen. DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, @@ -76,24 +57,6 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. -DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs -supported by the endpoint (see include/linux/dccp.h for symbolic constants). -The caller needs to provide a sufficiently large (> 2) array of type uint8_t. - -DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same -time, combining the operation of the next two socket options. This option is -preferrable over the latter two, since often applications will use the same -type of CCID for both directions; and mixed use of CCIDs is not currently well -understood. This socket option takes as argument at least one uint8_t value, or -an array of uint8_t values, which must match available CCIDS (see above). CCIDs -must be registered on the socket before calling connect() or listen(). - -DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets -the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is `int', not uint8_t. - -DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. - DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait @@ -152,16 +115,23 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. +send_ndp = 1 + Whether or not to send NDP count options (sec. 7.7.2). + +send_ackvec = 1 + Whether or not to send Ack Vector options (sec. 11.5). + +ack_ratio = 2 + The default Ack Ratio (sec. 11.3) to use. + tx_ccid = 2 - Default CCID for the sender-receiver half-connection. Depending on the - choice of CCID, the Send Ack Vector feature is enabled automatically. + Default CCID for the sender-receiver half-connection. rx_ccid = 2 - Default CCID for the receiver-sender half-connection; see tx_ccid. + Default CCID for the receiver-sender half-connection. seq_window = 100 - The initial sequence window (sec. 7.5.2) of the sender. This influences - the local ackno validity and the remote seqno validity windows (7.5.1). + The initial sequence window (sec. 7.5.2). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 010e2d87ed75..6080449fbec9 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -165,13 +165,9 @@ enum { DCCPO_TIMESTAMP_ECHO = 42, DCCPO_ELAPSED_TIME = 43, DCCPO_MAX = 45, - DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ - DCCPO_MAX_RX_CCID_SPECIFIC = 191, - DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ - DCCPO_MAX_TX_CCID_SPECIFIC = 255, + DCCPO_MIN_CCID_SPECIFIC = 128, + DCCPO_MAX_CCID_SPECIFIC = 255, }; -/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ -#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -180,36 +176,27 @@ enum { }; /* DCCP features (RFC 4340 section 6.4) */ -enum dccp_feature_numbers { +enum { DCCPF_RESERVED = 0, DCCPF_CCID = 1, - DCCPF_SHORT_SEQNOS = 2, + DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ DCCPF_SEQUENCE_WINDOW = 3, - DCCPF_ECN_INCAPABLE = 4, + DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ DCCPF_ACK_RATIO = 5, DCCPF_SEND_ACK_VECTOR = 6, DCCPF_SEND_NDP_COUNT = 7, DCCPF_MIN_CSUM_COVER = 8, - DCCPF_DATA_CHECKSUM = 9, + DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ /* 10-127 reserved */ DCCPF_MIN_CCID_SPECIFIC = 128, - DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ DCCPF_MAX_CCID_SPECIFIC = 255, }; -/* DCCP socket control message types for cmsg */ -enum dccp_cmsg_type { - DCCP_SCM_PRIORITY = 1, - DCCP_SCM_QPOLICY_MAX = 0xFFFF, - /* ^-- Up to here reserved exclusively for qpolicy parameters */ - DCCP_SCM_MAX -}; - -/* DCCP priorities for outgoing/queued packets */ -enum dccp_packet_dequeueing_policy { - DCCPQ_POLICY_SIMPLE, - DCCPQ_POLICY_PRIO, - DCCPQ_POLICY_MAX +/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ +struct dccp_so_feat { + __u8 dccpsf_feat; + __u8 __user *dccpsf_val; + __u8 dccpsf_len; }; /* DCCP socket options */ @@ -221,12 +208,6 @@ enum dccp_packet_dequeueing_policy { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 -#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 -#define DCCP_SOCKOPT_CCID 13 -#define DCCP_SOCKOPT_TX_CCID 14 -#define DCCP_SOCKOPT_RX_CCID 15 -#define DCCP_SOCKOPT_QPOLICY_ID 16 -#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 @@ -374,13 +355,62 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) return __dccp_hdr_len(dccp_hdr(skb)); } + +/* initial values for each feature */ +#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 +#define DCCPF_INITIAL_ACK_RATIO 2 +#define DCCPF_INITIAL_CCID DCCPC_CCID2 +#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 +/* FIXME: for now we're default to 1 but it should really be 0 */ +#define DCCPF_INITIAL_SEND_NDP_COUNT 1 + +/** + * struct dccp_minisock - Minimal DCCP connection representation + * + * Will be used to pass the state from dccp_request_sock to dccp_sock. + * + * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) + * @dccpms_ccid - Congestion Control Id (CCID) (section 10) + * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) + * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) + * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) + * @dccpms_pending - List of features being negotiated + * @dccpms_conf - + */ +struct dccp_minisock { + __u64 dccpms_sequence_window; + __u8 dccpms_rx_ccid; + __u8 dccpms_tx_ccid; + __u8 dccpms_send_ack_vector; + __u8 dccpms_send_ndp_count; + __u8 dccpms_ack_ratio; + struct list_head dccpms_pending; + struct list_head dccpms_conf; +}; + +struct dccp_opt_conf { + __u8 *dccpoc_val; + __u8 dccpoc_len; +}; + +struct dccp_opt_pend { + struct list_head dccpop_node; + __u8 dccpop_type; + __u8 dccpop_feat; + __u8 *dccpop_val; + __u8 dccpop_len; + int dccpop_conf; + struct dccp_opt_conf *dccpop_sc; +}; + +extern void dccp_minisock_init(struct dccp_minisock *dmsk); + /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -390,7 +420,6 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; - struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -462,28 +491,21 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) + * @dccps_minisock - associated minisock (accessed via dccp_msk) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) + * @dccps_xmit_timer - timer for when CCID is not ready to send * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { @@ -507,26 +529,19 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; + __u16 dccps_pcslen; + __u16 dccps_pcrlen; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct list_head dccps_featneg; + struct dccp_minisock dccps_minisock; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; @@ -535,6 +550,11 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } +static inline struct dccp_minisock *dccp_msk(const struct sock *sk) +{ + return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; +} + static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 6bc4b8148ca0..8983386356a5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -782,21 +782,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) -/* - * Convert RFC3390 larger initial windows into an equivalent number of packets. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than a multiplier in the relevant range. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 bytes) -{ - return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); -} - extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 206c16ad9c3c..7aa2a7acc7ec 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -25,6 +25,9 @@ config INET_DCCP_DIAG def_tristate y if (IP_DCCP = y && INET_DIAG = y) def_tristate m +config IP_DCCP_ACKVEC + bool + source "net/dccp/ccids/Kconfig" menu "DCCP Kernel Hacking" diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 0c1c9af2bf7e..f4f8793aafff 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,6 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o -dccp-y := ccid.o feat.o input.o minisocks.o options.o \ - qpolicy.o output.o proto.o timer.o ackvec.o +dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o dccp_ipv4-y := ipv4.o @@ -9,6 +8,8 @@ dccp_ipv4-y := ipv4.o obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o dccp_ipv6-y := ipv6.o +dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o + obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 41819848bdda..1e8be246ad15 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -1,375 +1,445 @@ /* * net/dccp/ackvec.c * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * An implementation of the DCCP protocol * Copyright (c) 2005 Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; version 2 of the License; */ + +#include "ackvec.h" #include "dccp.h" + +#include +#include +#include #include +#include #include +#include + static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +static struct dccp_ackvec_record *dccp_ackvec_record_new(void) { - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); + struct dccp_ackvec_record *avr = + kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); - if (av != NULL) { - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; - INIT_LIST_HEAD(&av->av_records); - } - return av; + if (avr != NULL) + INIT_LIST_HEAD(&avr->avr_node); + + return avr; } -static void dccp_ackvec_purge_records(struct dccp_ackvec *av) +static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) { - struct dccp_ackvec_record *cur, *next; - - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) - kmem_cache_free(dccp_ackvec_record_slab, cur); - INIT_LIST_HEAD(&av->av_records); + if (unlikely(avr == NULL)) + return; + /* Check if deleting a linked record */ + WARN_ON(!list_empty(&avr->avr_node)); + kmem_cache_free(dccp_ackvec_record_slab, avr); } -void dccp_ackvec_free(struct dccp_ackvec *av) +static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, + struct dccp_ackvec_record *avr) { - if (likely(av != NULL)) { - dccp_ackvec_purge_records(av); - kmem_cache_free(dccp_ackvec_slab, av); + /* + * AVRs are sorted by seqno. Since we are sending them in order, we + * just add the AVR at the head of the list. + * -sorbo. + */ + if (!list_empty(&av->av_records)) { + const struct dccp_ackvec_record *head = + list_entry(av->av_records.next, + struct dccp_ackvec_record, + avr_node); + BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); } + + list_add(&avr->avr_node, &av->av_records); } -/** - * dccp_ackvec_update_records - Record information about sent Ack Vectors - * @av: Ack Vector records to update - * @seqno: Sequence number of the packet carrying the Ack Vector just sent - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector - */ -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) +int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + /* Figure out how many options do we need to represent the ackvec */ + const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + u16 len = av->av_vec_len + 2 * nr_opts, i; + u32 elapsed_time; + const unsigned char *tail, *from; + unsigned char *to; struct dccp_ackvec_record *avr; + suseconds_t delta; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return -1; + + delta = ktime_us_delta(ktime_get_real(), av->av_time); + elapsed_time = delta / 10; - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); + if (elapsed_time != 0 && + dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) + return -1; + + avr = dccp_ackvec_record_new(); if (avr == NULL) - return -ENOBUFS; + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + len = av->av_vec_len; + from = av->av_buf + av->av_buf_head; + tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; + + for (i = 0; i < nr_opts; ++i) { + int copylen = len; + + if (len > DCCP_MAX_ACKVEC_OPT_LEN) + copylen = DCCP_MAX_ACKVEC_OPT_LEN; + + *to++ = DCCPO_ACK_VECTOR_0; + *to++ = copylen + 2; + + /* Check if buf_head wraps */ + if (from + copylen > tail) { + const u16 tailsize = tail - from; + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + copylen -= tailsize; + from = av->av_buf; + } + + memcpy(to, from, copylen); + from += copylen; + to += copylen; + len -= copylen; + } - avr->avr_ack_seqno = seqno; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce_sum; - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); - /* - * When the buffer overflows, we keep no more than one record. This is - * the simplest way of disambiguating sender-Acks dating from before the - * overflow from sender-Acks which refer to after the overflow; a simple - * solution is preferable here since we are handling an exception. - */ - if (av->av_overflow) - dccp_ackvec_purge_records(av); /* - * Since GSS is incremented for each packet, the list is automatically - * arranged in descending order of @ack_seqno. + * From RFC 4340, A.2: + * + * For each acknowledgement it sends, the HC-Receiver will add an + * acknowledgement record. ack_seqno will equal the HC-Receiver + * sequence number it used for the ack packet; ack_ptr will equal + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will + * equal buf_nonce. */ - list_add(&avr->avr_node, &av->av_records); + avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = av->av_buf_nonce; + avr->avr_sent_len = av->av_vec_len; - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", + dccp_ackvec_insert_avr(av, avr); + + dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu\n", + dccp_role(sk), avr->avr_sent_len, (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno, - avr->avr_ack_runlen); + (unsigned long long)avr->avr_ack_ackno); return 0; } -static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, - const u64 ackno) +struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) { - struct dccp_ackvec_record *avr; - /* - * Exploit that records are inserted in descending order of sequence - * number, start with the oldest record first. If @ackno is `before' - * the earliest ack_ackno, the packet is too old to be considered. - */ - list_for_each_entry_reverse(avr, av_list, avr_node) { - if (avr->avr_ack_seqno == ackno) - return avr; - if (before48(ackno, avr->avr_ack_seqno)) - break; + struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); + + if (av != NULL) { + av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; + av->av_buf_ackno = UINT48_MAX + 1; + av->av_buf_nonce = 0; + av->av_time = ktime_set(0, 0); + av->av_vec_len = 0; + INIT_LIST_HEAD(&av->av_records); } - return NULL; + + return av; } -/* - * Buffer index and length computation using modulo-buffersize arithmetic. - * Note that, as pointers move from right to left, head is `before' tail. - */ -static inline u16 __ackvec_idx_add(const u16 a, const u16 b) +void dccp_ackvec_free(struct dccp_ackvec *av) { - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; + if (unlikely(av == NULL)) + return; + + if (!list_empty(&av->av_records)) { + struct dccp_ackvec_record *avr, *next; + + list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); + dccp_ackvec_record_delete(avr); + } + } + + kmem_cache_free(dccp_ackvec_slab, av); } -static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) +static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, + const u32 index) { - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); + return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; } -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) +static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, + const u32 index) { - if (unlikely(av->av_overflow)) - return DCCPAV_MAX_ACKVEC_LEN; - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); + return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; } -/** - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 - * @av: non-empty buffer to update - * @distance: negative or zero distance of @seqno from buf_ackno downward - * @seqno: the (old) sequence number whose record is to be updated - * @state: state in which packet carrying @seqno was received +/* + * If several packets are missing, the HC-Receiver may prefer to enter multiple + * bytes with run length 0, rather than a single byte with a larger run length; + * this simplifies table updates if one of the missing packets arrives. */ -static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, - u64 seqno, enum dccp_ackvec_states state) +static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, + const unsigned int packets, + const unsigned char state) { - u16 ptr = av->av_buf_head; + unsigned int gap; + long new_head; - BUG_ON(distance > 0); - if (unlikely(dccp_ackvec_is_empty(av))) - return; + if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) + return -ENOBUFS; - do { - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); + gap = packets - 1; + new_head = av->av_buf_head - packets; - if (distance + runlen >= 0) { - /* - * Only update the state if packet has not been received - * yet. This is OK as per the second table in RFC 4340, - * 11.4.1; i.e. here we are using the following table: - * RECEIVED - * 0 1 3 - * S +---+---+---+ - * T 0 | 0 | 0 | 0 | - * O +---+---+---+ - * R 1 | 1 | 1 | 1 | - * E +---+---+---+ - * D 3 | 0 | 1 | 3 | - * +---+---+---+ - * The "Not Received" state was set by reserve_seats(). - */ - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) - av->av_buf[ptr] = state; - else - dccp_pr_debug("Not changing %llu state to %u\n", - (unsigned long long)seqno, state); - break; + if (new_head < 0) { + if (gap > 0) { + memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, + gap + new_head + 1); + gap = -new_head; } + new_head += DCCP_MAX_ACKVEC_LEN; + } - distance += runlen + 1; - ptr = __ackvec_idx_add(ptr, 1); + av->av_buf_head = new_head; - } while (ptr != av->av_buf_tail); -} + if (gap > 0) + memset(av->av_buf + av->av_buf_head + 1, + DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); -/* Mark @num entries after buf_head as "Not yet received". */ -static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) -{ - u16 start = __ackvec_idx_add(av->av_buf_head, 1), - len = DCCPAV_MAX_ACKVEC_LEN - start; - - /* check for buffer wrap-around */ - if (num > len) { - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); - start = 0; - num -= len; - } - if (num) - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); + av->av_buf[av->av_buf_head] = state; + av->av_vec_len += packets; + return 0; } -/** - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer - * @av: container of buffer to update (can be empty or non-empty) - * @num_packets: number of packets to register (must be >= 1) - * @seqno: sequence number of the first packet in @num_packets - * @state: state in which packet carrying @seqno was received +/* + * Implements the RFC 4340, Appendix A */ -static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, - u64 seqno, enum dccp_ackvec_states state) +int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state) { - u32 num_cells = num_packets; + /* + * Check at the right places if the buffer is full, if it is, tell the + * caller to start dropping packets till the HC-Sender acks our ACK + * vectors, when we will free up space in av_buf. + * + * We may well decide to do buffer compression, etc, but for now lets + * just drop. + * + * From Appendix A.1.1 (`New Packets'): + * + * Of course, the circular buffer may overflow, either when the + * HC-Sender is sending data at a very high rate, when the + * HC-Receiver's acknowledgements are not reaching the HC-Sender, + * or when the HC-Sender is forgetting to acknowledge those acks + * (so the HC-Receiver is unable to clean up old state). In this + * case, the HC-Receiver should either compress the buffer (by + * increasing run lengths when possible), transfer its state to + * a larger buffer, or, as a last resort, drop all received + * packets, without processing them whatsoever, until its buffer + * shrinks again. + */ - if (num_packets > DCCPAV_BURST_THRESH) { - u32 lost_packets = num_packets - 1; + /* See if this is the first ackno being inserted */ + if (av->av_vec_len == 0) { + av->av_buf[av->av_buf_head] = state; + av->av_vec_len = 1; + } else if (after48(ackno, av->av_buf_ackno)) { + const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); - DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); /* - * We received 1 packet and have a loss of size "num_packets-1" - * which we squeeze into num_cells-1 rather than reserving an - * entire byte for each lost packet. - * The reason is that the vector grows in O(burst_length); when - * it grows too large there will no room left for the payload. - * This is a trade-off: if a few packets out of the burst show - * up later, their state will not be changed; it is simply too - * costly to reshuffle/reallocate/copy the buffer each time. - * Should such problems persist, we will need to switch to a - * different underlying data structure. + * Look if the state of this packet is the same as the + * previous ackno and if so if we can bump the head len. */ - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { - u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; + if (delta == 1 && + dccp_ackvec_state(av, av->av_buf_head) == state && + dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) + av->av_buf[av->av_buf_head]++; + else if (dccp_ackvec_set_buf_head_state(av, delta, state)) + return -ENOBUFS; + } else { + /* + * A.1.2. Old Packets + * + * When a packet with Sequence Number S <= buf_ackno + * arrives, the HC-Receiver will scan the table for + * the byte corresponding to S. (Indexing structures + * could reduce the complexity of this scan.) + */ + u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); + u32 index = av->av_buf_head; - lost_packets -= len; + while (1) { + const u8 len = dccp_ackvec_len(av, index); + const u8 av_state = dccp_ackvec_state(av, index); + /* + * valid packets not yet in av_buf have a reserved + * entry, with a len equal to 0. + */ + if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && + len == 0 && delta == 0) { /* Found our + reserved seat! */ + dccp_pr_debug("Found %llu reserved seat!\n", + (unsigned long long)ackno); + av->av_buf[index] = state; + goto out; + } + /* len == 0 means one packet */ + if (delta < len + 1) + goto out_duplicate; + + delta -= len + 1; + if (++index == DCCP_MAX_ACKVEC_LEN) + index = 0; } } - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); - av->av_overflow = true; - } - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); - if (av->av_overflow) - av->av_buf_tail = av->av_buf_head; - - av->av_buf[av->av_buf_head] = state; - av->av_buf_ackno = seqno; + av->av_buf_ackno = ackno; + av->av_time = ktime_get_real(); +out: + return 0; - if (num_packets > 1) - dccp_ackvec_reserve_seats(av, num_packets - 1); +out_duplicate: + /* Duplicate packet */ + dccp_pr_debug("Received a dup or already considered lost " + "packet: %llu\n", (unsigned long long)ackno); + return -EILSEQ; } -/** - * dccp_ackvec_input - Register incoming packet in the buffer - */ -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) +static void dccp_ackvec_throw_record(struct dccp_ackvec *av, + struct dccp_ackvec_record *avr) { - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; - enum dccp_ackvec_states state = DCCPAV_RECEIVED; + struct dccp_ackvec_record *next; - if (dccp_ackvec_is_empty(av)) { - dccp_ackvec_add_new(av, 1, seqno, state); - av->av_tail_ackno = seqno; + /* sort out vector length */ + if (av->av_buf_head <= avr->avr_ack_ptr) + av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; + else + av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - + av->av_buf_head + avr->avr_ack_ptr; - } else { - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); - u8 *current_head = av->av_buf + av->av_buf_head; - - if (num_packets == 1 && - dccp_ackvec_state(current_head) == state && - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { + /* free records */ + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); + dccp_ackvec_record_delete(avr); + } +} - *current_head += 1; - av->av_buf_ackno = seqno; +void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, + const u64 ackno) +{ + struct dccp_ackvec_record *avr; - } else if (num_packets > 0) { - dccp_ackvec_add_new(av, num_packets, seqno, state); - } else { - dccp_ackvec_update_old(av, num_packets, seqno, state); - } + /* + * If we traverse backwards, it should be faster when we have large + * windows. We will be receiving ACKs for stuff we sent a while back + * -sorbo. + */ + list_for_each_entry_reverse(avr, &av->av_records, avr_node) { + if (ackno == avr->avr_ack_seqno) { + dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu, ACKED!\n", + dccp_role(sk), 1, + (unsigned long long)avr->avr_ack_seqno, + (unsigned long long)avr->avr_ack_ackno); + dccp_ackvec_throw_record(av, avr); + break; + } else if (avr->avr_ack_seqno > ackno) + break; /* old news */ } } -/** - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection - * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. While based on on section A.3 of RFC 4340, here - * are additional precautions to prevent corrupted buffer state. In particular, - * we use tail_ackno to identify outdated records; it always marks the earliest - * packet of group (2) in 11.4.2. - */ -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) - { - struct dccp_ackvec_record *avr, *next; - u8 runlen_now, eff_runlen; - s64 delta; +static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, + struct sock *sk, u64 *ackno, + const unsigned char len, + const unsigned char *vector) +{ + unsigned char i; + struct dccp_ackvec_record *avr; - avr = dccp_ackvec_lookup(&av->av_records, ackno); - if (avr == NULL) + /* Check if we actually sent an ACK vector */ + if (list_empty(&av->av_records)) return; - /* - * Deal with outdated acknowledgments: this arises when e.g. there are - * several old records and the acks from the peer come in slowly. In - * that case we may still have records that pre-date tail_ackno. - */ - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); - if (delta < 0) - goto free_records; - /* - * Deal with overlapping Ack Vectors: don't subtract more than the - * number of packets between tail_ackno and ack_ackno. - */ - eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); + i = len; /* - * The run length of Ack Vector cells does not decrease over time. If - * the run length is the same as at the time the Ack Vector was sent, we - * free the ack_ptr cell. That cell can however not be freed if the run - * length has increased: in this case we need to move the tail pointer - * backwards (towards higher indices), to its next-oldest neighbour. + * XXX + * I think it might be more efficient to work backwards. See comment on + * rcv_ackno. -sorbo. */ - if (runlen_now > eff_runlen) { + avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); + while (i--) { + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + u64 ackno_end_rl; - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); + dccp_set_seqno(&ackno_end_rl, *ackno - rl); - /* This move may not have cleared the overflow flag. */ - if (av->av_overflow) - av->av_overflow = (av->av_buf_head == av->av_buf_tail); - } else { - av->av_buf_tail = avr->avr_ack_ptr; /* - * We have made sure that avr points to a valid cell within the - * buffer. This cell is either older than head, or equals head - * (empty buffer): in both cases we no longer have any overflow. + * If our AVR sequence number is greater than the ack, go + * forward in the AVR list until it is not so. */ - av->av_overflow = 0; - } - - /* - * The peer has acknowledged up to and including ack_ackno. Hence the - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. - */ - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); + list_for_each_entry_from(avr, &av->av_records, avr_node) { + if (!after48(avr->avr_ack_seqno, *ackno)) + goto found; + } + /* End of the av_records list, not found, exit */ + break; +found: + if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { + const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { + dccp_pr_debug("%s ACK vector 0, len=%d, " + "ack_seqno=%llu, ack_ackno=%llu, " + "ACKED!\n", + dccp_role(sk), len, + (unsigned long long) + avr->avr_ack_seqno, + (unsigned long long) + avr->avr_ack_ackno); + dccp_ackvec_throw_record(av, avr); + break; + } + /* + * If it wasn't received, continue scanning... we might + * find another one. + */ + } -free_records: - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); + dccp_set_seqno(ackno, ackno_end_rl - 1); + ++vector; } } -/* - * Routines to keep track of Ack Vectors received in an skb - */ -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) +int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); - - if (new == NULL) - return -ENOBUFS; - new->vec = vec; - new->len = len; - new->nonce = nonce; + if (len > DCCP_MAX_ACKVEC_OPT_LEN) + return -1; - list_add_tail(&new->node, head); + /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ + dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, + ackno, len, value); return 0; } -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); - -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) -{ - struct dccp_ackvec_parsed *cur, *next; - - list_for_each_entry_safe(cur, next, parsed_chunks, node) - kfree(cur); - INIT_LIST_HEAD(parsed_chunks); -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); int __init dccp_ackvec_init(void) { @@ -379,9 +449,10 @@ int __init dccp_ackvec_init(void) if (dccp_ackvec_slab == NULL) goto out_err; - dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", - sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_record_slab = + kmem_cache_create("dccp_ackvec_record", + sizeof(struct dccp_ackvec_record), + 0, SLAB_HWCACHE_ALIGN, NULL); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 6cdca79a99f7..bcb64fb4acef 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -3,134 +3,156 @@ /* * net/dccp/ackvec.h * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * An implementation of the DCCP protocol * Copyright (c) 2005 Arnaldo Carvalho de Melo + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include #include +#include #include #include -/* - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. - * The maximum value is bounded by the u16 types for indices and functions. - */ -#define DCCPAV_NUM_ACKVECS 2 -#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) - -/* Estimated minimum average Ack Vector length - used for updating MPS */ -#define DCCPAV_MIN_OPTLEN 16 - -/* Threshold for coping with large bursts of losses */ -#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) - -enum dccp_ackvec_states { - DCCPAV_RECEIVED = 0x00, - DCCPAV_ECN_MARKED = 0x40, - DCCPAV_RESERVED = 0x80, - DCCPAV_NOT_RECEIVED = 0xC0 -}; -#define DCCPAV_MAX_RUNLEN 0x3F +/* Read about the ECN nonce to see why it is 253 */ +#define DCCP_MAX_ACKVEC_OPT_LEN 253 +/* We can spread an ack vector across multiple options */ +#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) -static inline u8 dccp_ackvec_runlen(const u8 *cell) -{ - return *cell & DCCPAV_MAX_RUNLEN; -} +#define DCCP_ACKVEC_STATE_RECEIVED 0 +#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) +#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) -static inline u8 dccp_ackvec_state(const u8 *cell) -{ - return *cell & ~DCCPAV_MAX_RUNLEN; -} +#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ +#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ -/** struct dccp_ackvec - Ack Vector main data structure +/** struct dccp_ackvec - ack vector + * + * This data structure is the one defined in RFC 4340, Appendix A. * - * This implements a fixed-size circular buffer within an array and is largely - * based on Appendix A of RFC 4340. + * @av_buf_head - circular buffer head + * @av_buf_tail - circular buffer tail + * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the + * buffer (i.e. %av_buf_head) + * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked + * by the buffer with State 0 * - * @av_buf: circular buffer storage area - * @av_buf_head: head index; begin of live portion in @av_buf - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) + * Additionally, the HC-Receiver must keep some information about the + * Ack Vectors it has recently sent. For each packet sent carrying an + * Ack Vector, it remembers four variables: + * + * @av_records - list of dccp_ackvec_record + * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * + * @av_time - the time in usecs + * @av_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; - u16 av_buf_head; - u16 av_buf_tail; - u64 av_buf_ackno:48; - u64 av_tail_ackno:48; - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; - u8 av_overflow:1; + u64 av_buf_ackno; struct list_head av_records; + ktime_t av_time; + u16 av_buf_head; + u16 av_vec_len; + u8 av_buf_nonce; + u8 av_ack_nonce; + u8 av_buf[DCCP_MAX_ACKVEC_LEN]; }; -/** struct dccp_ackvec_record - Records information about sent Ack Vectors +/** struct dccp_ackvec_record - ack vector record * - * These list entries define the additional information which the HC-Receiver - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. + * ACK vector record as defined in Appendix A of spec. * - * @avr_node: the list node in @av_records - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to - * @avr_ack_ptr: pointer into @av_buf where this record starts - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent + * The list is sorted by avr_ack_seqno * - * The list as a whole is sorted in descending order by @avr_ack_seqno. + * @avr_node - node in av_records + * @avr_ack_seqno - sequence number of the packet this record was sent on + * @avr_ack_ackno - sequence number being acknowledged + * @avr_ack_ptr - pointer into av_buf where this record starts + * @avr_ack_nonce - av_ack_nonce at the time this record was sent + * @avr_sent_len - lenght of the record in av_buf */ struct dccp_ackvec_record { struct list_head avr_node; - u64 avr_ack_seqno:48; - u64 avr_ack_ackno:48; + u64 avr_ack_seqno; + u64 avr_ack_ackno; u16 avr_ack_ptr; - u8 avr_ack_runlen; - u8 avr_ack_nonce:1; + u16 avr_sent_len; + u8 avr_ack_nonce; }; -extern int dccp_ackvec_init(void); +struct sock; +struct sk_buff; + +#ifdef CONFIG_IP_DCCP_ACKVEC +extern int dccp_ackvec_init(void); extern void dccp_ackvec_exit(void); extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); extern void dccp_ackvec_free(struct dccp_ackvec *av); -extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); -extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); -extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); -extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); +extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state); + +extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, + struct sock *sk, const u64 ackno); +extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + u64 *ackno, const u8 opt, + const u8 *value, const u8 len); -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) +extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); + +static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +{ + return av->av_vec_len; +} +#else /* CONFIG_IP_DCCP_ACKVEC */ +static inline int dccp_ackvec_init(void) { - return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; + return 0; } -/** - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb - * @vec: start of vector (offset into skb) - * @len: length of @vec - * @nonce: whether @vec had an ECN nonce of 0 or 1 - * @node: FIFO - arranged in descending order of ack_ackno - * This structure is used by CCIDs to access Ack Vectors in a received skb. - */ -struct dccp_ackvec_parsed { - u8 *vec, - len, - nonce:1; - struct list_head node; -}; +static inline void dccp_ackvec_exit(void) +{ +} + +static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +{ + return NULL; +} + +static inline void dccp_ackvec_free(struct dccp_ackvec *av) +{ +} + +static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state) +{ + return -1; +} -extern int dccp_ackvec_parsed_add(struct list_head *head, - u8 *vec, u8 len, u8 nonce); -extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); +static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, + struct sock *sk, const u64 ackno) +{ +} + +static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + const u64 *ackno, const u8 opt, + const u8 *value, const u8 len) +{ + return -1; +} + +static inline int dccp_insert_option_ackvec(const struct sock *sk, + const struct sk_buff *skb) +{ + return -1; +} + +static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +{ + return 0; +} +#endif /* CONFIG_IP_DCCP_ACKVEC */ #endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index e3fb52b4f5c6..4809753d12ae 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -13,13 +13,6 @@ #include "ccid.h" -static u8 builtin_ccids[] = { - DCCPC_CCID2, /* CCID2 is supported by default */ -#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) - DCCPC_CCID3, -#endif -}; - static struct ccid_operations *ccids[CCID_MAX]; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t ccids_lockct = ATOMIC_INIT(0); @@ -93,47 +86,6 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) } } -/* check that up to @array_len members in @ccid_array are supported */ -bool ccid_support_check(u8 const *ccid_array, u8 array_len) -{ - u8 i, j, found; - - for (i = 0, found = 0; i < array_len; i++, found = 0) { - for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) - found = (ccid_array[i] == builtin_ccids[j]); - if (!found) - return false; - } - return true; -} - -/** - * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array - * @ccid_array: pointer to copy into - * @array_len: value to return length into - * This function allocates memory - caller must see that it is freed after use. - */ -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) -{ - *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); - if (*ccid_array == NULL) - return -ENOBUFS; - *array_len = ARRAY_SIZE(builtin_ccids); - return 0; -} - -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *optval, int __user *optlen) -{ - if (len < sizeof(builtin_ccids)) - return -EINVAL; - - if (put_user(sizeof(builtin_ccids), optlen) || - copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) - return -EFAULT; - return 0; -} - int ccid_register(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; @@ -196,41 +148,22 @@ int ccid_unregister(struct ccid_operations *ccid_ops) EXPORT_SYMBOL_GPL(ccid_unregister); -/** - * ccid_request_module - Pre-load CCID module for later use - * This should be called only from process context (e.g. during connection - * setup) and is necessary for later calls to ccid_new (typically in software - * interrupt), so that it has the modules available when they are needed. - */ -static int ccid_request_module(u8 id) -{ - if (!in_atomic()) { - ccids_read_lock(); - if (ccids[id] == NULL) { - ccids_read_unlock(); - return request_module("net-dccp-ccid-%d", id); - } - ccids_read_unlock(); - } - return 0; -} - -int ccid_request_modules(u8 const *ccid_array, u8 array_len) -{ -#ifdef CONFIG_KMOD - while (array_len--) - if (ccid_request_module(ccid_array[array_len])) - return -1; -#endif - return 0; -} - struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) { struct ccid_operations *ccid_ops; struct ccid *ccid = NULL; ccids_read_lock(); +#ifdef CONFIG_KMOD + if (ccids[id] == NULL) { + /* We only try to load if in process context */ + ccids_read_unlock(); + if (gfp & GFP_ATOMIC) + goto out; + request_module("net-dccp-ccid-%d", id); + ccids_read_lock(); + } +#endif ccid_ops = ccids[id]; if (ccid_ops == NULL) goto out_unlock; @@ -272,6 +205,20 @@ out_module_put: EXPORT_SYMBOL_GPL(ccid_new); +struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) +{ + return ccid_new(id, sk, 1, gfp); +} + +EXPORT_SYMBOL_GPL(ccid_hc_rx_new); + +struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) +{ + return ccid_new(id, sk, 0, gfp); +} + +EXPORT_SYMBOL_GPL(ccid_hc_tx_new); + static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) { struct ccid_operations *ccid_ops; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index d27054ba2159..fdeae7b57319 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -60,18 +60,22 @@ struct ccid_operations { void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); + int (*ccid_hc_rx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); + int (*ccid_hc_tx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, - unsigned int len); + int more, unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, @@ -99,78 +103,31 @@ static inline void *ccid_priv(const struct ccid *ccid) return (void *)ccid->ccid_priv; } -extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); -extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); -extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *, int __user *); - -extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); -static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_rx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_tx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} +extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, + gfp_t gfp); +extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, + gfp_t gfp); extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); -/* - * Congestion control of queued data packets via CCID decision. - * - * The TX CCID performs its congestion-control by indicating whether and when a - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). - * The following modes are supported via the symbolic constants below: - * - timer-based pacing (CCID returns a delay value in milliseconds); - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). - */ - -enum ccid_dequeueing_decision { - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ - CCID_PACKET_ERR = 0xF0000, /* error condition */ -}; - -static inline int ccid_packet_dequeue_eval(const int return_code) -{ - if (return_code < 0) - return CCID_PACKET_ERR; - if (return_code == 0) - return CCID_PACKET_SEND_AT_ONCE; - if (return_code <= CCID_PACKET_DELAY_MAX) - return CCID_PACKET_DELAY; - return return_code; -} - static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { + int rc = 0; if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return CCID_PACKET_SEND_AT_ONCE; + rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return rc; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - unsigned int len) + int more, unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, @@ -187,31 +144,27 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } -/** - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) - * @val: value of @opt - * @len: length of @val in bytes - */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) { - if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) - return 0; - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); + int rc = 0; + if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) + rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, + value); + return rc; } -/** - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender - * Arguments are analogous to ccid_hc_tx_parse_options() - */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) { - if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) - return 0; - return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); + int rc = 0; + if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) + rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); + return rc; } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index fb168be2cb43..12275943eab8 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,8 +1,10 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on EXPERIMENTAL config IP_DCCP_CCID2 - tristate "CCID2 (TCP-Like)" + tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" def_tristate IP_DCCP + select IP_DCCP_ACKVEC ---help--- CCID 2, TCP-like Congestion Control, denotes Additive Increase, Multiplicative Decrease (AIMD) congestion control with behavior @@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG If in doubt, say N. config IP_DCCP_CCID3 - tristate "CCID3 (TCP-Friendly)" + tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" def_tristate IP_DCCP select IP_DCCP_TFRC_LIB ---help--- @@ -62,9 +64,9 @@ config IP_DCCP_CCID3 If in doubt, say M. -if IP_DCCP_CCID3 config IP_DCCP_CCID3_DEBUG bool "CCID3 debugging messages" + depends on IP_DCCP_CCID3 ---help--- Enable CCID3-specific debugging messages. @@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -choice - prompt "Select method for measuring the packet size s" - default IP_DCCP_CCID3_MEASURE_S_AS_MPS - -config IP_DCCP_CCID3_MEASURE_S_AS_MPS - bool "Always use MPS in place of s" - ---help--- - This use is recommended as it is consistent with the initialisation - of X and suggested when s varies (rfc3448bis, (1) in section 4.1). -config IP_DCCP_CCID3_MEASURE_S_AS_AVG - bool "Use moving average" - ---help--- - An alternative way of tracking s, also supported by rfc3448bis. - This used to be the default for CCID-3 in previous kernels. -config IP_DCCP_CCID3_MEASURE_S_AS_MAX - bool "Track the maximum payload length" - ---help--- - An experimental method based on tracking the maximum packet size. -endchoice - config IP_DCCP_CCID3_RTO int "Use higher bound for nofeedback timer" default 100 + depends on IP_DCCP_CCID3 && EXPERIMENTAL ---help--- Use higher lower bound for nofeedback timer expiration. @@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO The purpose of the nofeedback timer is to slow DCCP down when there is serious network congestion: experimenting with larger values should therefore not be performed on WANs. -endif # IP_DCCP_CCID3 config IP_DCCP_TFRC_LIB tristate diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index fa713227c66f..9a430734530c 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,7 +25,7 @@ /* * This implementation should follow RFC 4341 */ -#include "../feat.h" + #include "../ccid.h" #include "../dccp.h" #include "ccid2.h" @@ -34,8 +34,51 @@ #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) + +static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) +{ + int len = 0; + int pipe = 0; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; + + /* there is data in the chain */ + if (seqp != hctx->ccid2hctx_seqt) { + seqp = seqp->ccid2s_prev; + len++; + if (!seqp->ccid2s_acked) + pipe++; + + while (seqp != hctx->ccid2hctx_seqt) { + struct ccid2_seq *prev = seqp->ccid2s_prev; + + len++; + if (!prev->ccid2s_acked) + pipe++; + + /* packets are sent sequentially */ + BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, + prev->ccid2s_seq ) >= 0); + BUG_ON(time_before(seqp->ccid2s_sent, + prev->ccid2s_sent)); + + seqp = prev; + } + } + + BUG_ON(pipe != hctx->ccid2hctx_pipe); + ccid2_pr_debug("len of chain=%d\n", len); + + do { + seqp = seqp->ccid2s_prev; + len++; + } while (seqp != hctx->ccid2hctx_seqh); + + ccid2_pr_debug("total len=%d\n", len); + BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); +} #else #define ccid2_pr_debug(format, a...) +#define ccid2_hc_tx_check_sanity(hctx) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) @@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) int i; /* check if we have space to preserve the pointer to the buffer */ - if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) + if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / + sizeof(struct ccid2_seq*))) return -ENOMEM; /* allocate buffer and initialize linked list */ @@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; /* This is the first allocation. Initiate the head and tail. */ - if (hctx->seqbufc == 0) - hctx->seqh = hctx->seqt = seqp; + if (hctx->ccid2hctx_seqbufc == 0) + hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; else { /* link the existing list with the one we just created */ - hctx->seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hctx->seqh; + hctx->ccid2hctx_seqh->ccid2s_next = seqp; + seqp->ccid2s_prev = hctx->ccid2hctx_seqh; - hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; + hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; } /* store the original pointer to the buffer so we can free it */ - hctx->seqbuf[hctx->seqbufc] = seqp; - hctx->seqbufc++; + hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; + hctx->ccid2hctx_seqbufc++; return 0; } static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) + return 0; + + return 1; /* XXX CCID should dequeue when ready instead of polling */ } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); /* * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from @@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } - if (val > DCCPF_ACK_RATIO_MAX) - val = DCCPF_ACK_RATIO_MAX; + if (val > 0xFFFF) /* RFC 4340, 11.3 */ + val = 0xFFFF; if (val == dp->dccps_l_ack_ratio) return; @@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } +static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) +{ + ccid2_pr_debug("change SRTT to %ld\n", val); + hctx->ccid2hctx_srtt = val; +} + +static void ccid2_start_rto_timer(struct sock *sk); + static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); + long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + HZ / 5); goto out; } ccid2_pr_debug("RTO_EXPIRE\n"); + ccid2_hc_tx_check_sanity(hctx); + /* back-off timer */ - hctx->rto <<= 1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; + hctx->ccid2hctx_rto <<= 1; + + s = hctx->ccid2hctx_rto / HZ; + if (s > 60) + hctx->ccid2hctx_rto = 60 * HZ; + + ccid2_start_rto_timer(sk); /* adjust pipe, cwnd etc */ - hctx->ssthresh = hctx->cwnd / 2; - if (hctx->ssthresh < 2) - hctx->ssthresh = 2; - hctx->cwnd = 1; - hctx->pipe = 0; + hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; + if (hctx->ccid2hctx_ssthresh < 2) + hctx->ccid2hctx_ssthresh = 2; + hctx->ccid2hctx_cwnd = 1; + hctx->ccid2hctx_pipe = 0; /* clear state about stuff we sent */ - hctx->seqt = hctx->seqh; - hctx->packets_acked = 0; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; + hctx->ccid2hctx_packets_acked = 0; /* clear ack ratio state. */ - hctx->rpseq = 0; - hctx->rpdupack = -1; + hctx->ccid2hctx_rpseq = 0; + hctx->ccid2hctx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - /* restart backed-off timer */ - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + ccid2_hc_tx_check_sanity(hctx); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid2_start_rto_timer(struct sock *sk) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); + + BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + hctx->ccid2hctx_rto); +} + +static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); struct ccid2_seq *next; - hctx->pipe++; + hctx->ccid2hctx_pipe++; - hctx->seqh->ccid2s_seq = dp->dccps_gss; - hctx->seqh->ccid2s_acked = 0; - hctx->seqh->ccid2s_sent = jiffies; + hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; + hctx->ccid2hctx_seqh->ccid2s_acked = 0; + hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; - next = hctx->seqh->ccid2s_next; + next = hctx->ccid2hctx_seqh->ccid2s_next; /* check if we need to alloc more space */ - if (next == hctx->seqt) { + if (next == hctx->ccid2hctx_seqt) { if (ccid2_hc_tx_alloc_seq(hctx)) { DCCP_CRIT("packet history - out of memory!"); /* FIXME: find a more graceful way to bail out */ return; } - next = hctx->seqh->ccid2s_next; - BUG_ON(next == hctx->seqt); + next = hctx->ccid2hctx_seqh->ccid2s_next; + BUG_ON(next == hctx->ccid2hctx_seqt); } - hctx->seqh = next; + hctx->ccid2hctx_seqh = next; - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); + ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, + hctx->ccid2hctx_pipe); /* * FIXME: The code below is broken and the variables have been removed @@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) */ #if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hctx->arsent++; + hctx->ccid2hctx_arsent++; /* We had an ack loss in this window... */ - if (hctx->ackloss) { - if (hctx->arsent >= hctx->cwnd) { - hctx->arsent = 0; - hctx->ackloss = 0; + if (hctx->ccid2hctx_ackloss) { + if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_arsent = 0; + hctx->ccid2hctx_ackloss = 0; } } else { /* No acks lost up to now... */ @@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - dp->dccps_l_ack_ratio; - denom = hctx->cwnd * hctx->cwnd / denom; + denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; - if (hctx->arsent >= denom) { + if (hctx->ccid2hctx_arsent >= denom) { ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hctx->arsent = 0; + hctx->ccid2hctx_arsent = 0; } } else { /* we can't increase ack ratio further [1] */ - hctx->arsent = 0; /* or maybe set it to cwnd*/ + hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ } } #endif /* setup RTO timer */ - if (!timer_pending(&hctx->rtotimer)) - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + if (!timer_pending(&hctx->ccid2hctx_rtotimer)) + ccid2_start_rto_timer(sk); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { - struct ccid2_seq *seqp = hctx->seqt; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; - while (seqp != hctx->seqh) { + while (seqp != hctx->ccid2hctx_seqh) { ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); @@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) } } while (0); ccid2_pr_debug("=========\n"); + ccid2_hc_tx_check_sanity(hctx); #endif } -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. +/* XXX Lame code duplication! + * returns -1 if none was found. + * else returns the next offset to use in the function call. */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, + unsigned char **vec, unsigned char *veclen) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hctx->srtt == 0) { - /* First measurement m */ - hctx->srtt = m << 3; - hctx->mdev = m << 1; - - hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); - hctx->rttvar = hctx->mdev_max; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hctx->srtt >> 3); - hctx->srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hctx->mdev >> 2); + const struct dccp_hdr *dh = dccp_hdr(skb); + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr; + const unsigned char *opt_end = (unsigned char *)dh + + (dh->dccph_doff * 4); + unsigned char opt, len; + unsigned char *value; + + BUG_ON(offset < 0); + options += offset; + opt_ptr = options; + if (opt_ptr >= opt_end) + return -1; + + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_invalid_option; + + len = *opt_ptr++; + if (len < 3) + goto out_invalid_option; /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). + * Remove the type and len fields, leaving + * just the value size */ - if (m > 0) - m >>= 3; - } else { - m -= (hctx->mdev >> 2); - } - hctx->mdev += m; + len -= 2; + value = opt_ptr; + opt_ptr += len; - if (hctx->mdev > hctx->mdev_max) { - hctx->mdev_max = hctx->mdev; - if (hctx->mdev_max > hctx->rttvar) - hctx->rttvar = hctx->mdev_max; + if (opt_ptr > opt_end) + goto out_invalid_option; } - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe, AWL is probably - * too low as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { - if (hctx->mdev_max < hctx->rttvar) - hctx->rttvar -= (hctx->rttvar - - hctx->mdev_max) >> 2; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - hctx->mdev_max = TCP_RTO_MIN; + switch (opt) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + *vec = value; + *veclen = len; + return offset + (opt_ptr - options); } } - /* - * Set RTO from SRTT and RTTVAR - * Clock granularity is ignored since the minimum error for RTTVAR is - * clamped to 50msec (corresponding to HZ=20). This leads to a minimum - * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP - * does not retransmit data, DCCP does not require TCP's recommended - * minimum timeout of one second". - */ - hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; + return -1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; +out_invalid_option: + DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); + return -1; } -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) +static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (hctx->cwnd < hctx->ssthresh) { - if (*maxincr > 0 && ++hctx->packets_acked == 2) { - hctx->cwnd += 1; - *maxincr -= 1; - hctx->packets_acked = 0; - } - } else if (++hctx->packets_acked >= hctx->cwnd) { - hctx->cwnd += 1; - hctx->packets_acked = 0; - } - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps - need to be resolved at some time. - */ - ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); + sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); + ccid2_pr_debug("deleted RTO timer\n"); } -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +static inline void ccid2_new_ack(struct sock *sk, + struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { + if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { + hctx->ccid2hctx_cwnd += 1; + *maxincr -= 1; + hctx->ccid2hctx_packets_acked = 0; + } + } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_cwnd += 1; + hctx->ccid2hctx_packets_acked = 0; } - hctx->last_cong = jiffies; + /* update RTO */ + if (hctx->ccid2hctx_srtt == -1 || + time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { + unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; + int s; + + /* first measurement */ + if (hctx->ccid2hctx_srtt == -1) { + ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", + r, jiffies, + (unsigned long long)seqp->ccid2s_seq); + ccid2_change_srtt(hctx, r); + hctx->ccid2hctx_rttvar = r >> 1; + } else { + /* RTTVAR */ + long tmp = hctx->ccid2hctx_srtt - r; + long srtt; + + if (tmp < 0) + tmp *= -1; + + tmp >>= 2; + hctx->ccid2hctx_rttvar *= 3; + hctx->ccid2hctx_rttvar >>= 2; + hctx->ccid2hctx_rttvar += tmp; + + /* SRTT */ + srtt = hctx->ccid2hctx_srtt; + srtt *= 7; + srtt >>= 3; + tmp = r >> 3; + srtt += tmp; + ccid2_change_srtt(hctx, srtt); + } + s = hctx->ccid2hctx_rttvar << 2; + /* clock granularity is 1 when based on jiffies */ + if (!s) + s = 1; + hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; + + /* must be at least a second */ + s = hctx->ccid2hctx_rto / HZ; + /* DCCP doesn't require this [but I like it cuz my code sux] */ +#if 1 + if (s < 1) + hctx->ccid2hctx_rto = HZ; +#endif + /* max 60 seconds */ + if (s > 60) + hctx->ccid2hctx_rto = HZ * 60; - hctx->cwnd = hctx->cwnd / 2 ? : 1U; - hctx->ssthresh = max(hctx->cwnd, 2U); + hctx->ccid2hctx_lastrtt = jiffies; - /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) - ccid2_change_l_ack_ratio(sk, hctx->cwnd); + ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", + hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, + hctx->ccid2hctx_rto, HZ, r); + } + + /* we got a new ack, so re-start RTO timer */ + ccid2_hc_tx_kill_rto_timer(sk); + ccid2_start_rto_timer(sk); } -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static void ccid2_hc_tx_dec_pipe(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); + if (hctx->ccid2hctx_pipe == 0) + DCCP_BUG("pipe == 0"); + else + hctx->ccid2hctx_pipe--; + + if (hctx->ccid2hctx_pipe == 0) + ccid2_hc_tx_kill_rto_timer(sk); +} + +static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { + ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); + return; } - return 0; + + hctx->ccid2hctx_last_cong = jiffies; + + hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; + hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); + + /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) + ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); - struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; + unsigned char *vector; + unsigned char veclen; + int offset = 0; int done = 0; unsigned int maxincr = 0; + ccid2_hc_tx_check_sanity(hctx); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * -sorbo. */ /* need to bootstrap */ - if (hctx->rpdupack == -1) { - hctx->rpdupack = 0; - hctx->rpseq = seqno; + if (hctx->ccid2hctx_rpdupack == -1) { + hctx->ccid2hctx_rpdupack = 0; + hctx->ccid2hctx_rpseq = seqno; } else { /* check if packet is consecutive */ - if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) - hctx->rpseq = seqno; + if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) + hctx->ccid2hctx_rpseq = seqno; /* it's a later packet */ - else if (after48(seqno, hctx->rpseq)) { - hctx->rpdupack++; + else if (after48(seqno, hctx->ccid2hctx_rpseq)) { + hctx->ccid2hctx_rpdupack++; /* check if we got enough dupacks */ - if (hctx->rpdupack >= NUMDUPACK) { - hctx->rpdupack = -1; /* XXX lame */ - hctx->rpseq = 0; + if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { + hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ + hctx->ccid2hctx_rpseq = 0; ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); } @@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) + /* still didn't send out new data packets */ + if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) return; - /* still didn't send out new data packets */ - if (hctx->seqh == hctx->seqt) - goto done; + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_ACK: + case DCCP_PKT_DATAACK: + break; + default: + return; + } ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hctx->high_ack)) - hctx->high_ack = ackno; + if (after48(ackno, hctx->ccid2hctx_high_ack)) + hctx->ccid2hctx_high_ack = ackno; - seqp = hctx->seqt; + seqp = hctx->ccid2hctx_seqt; while (before48(seqp->ccid2s_seq, ackno)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * packets per acknowledgement. Rounding up avoids that cwnd is not * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ - if (hctx->cwnd < hctx->ssthresh) + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ - list_for_each_entry(avp, &hctx->av_chunks, node) { + while ((offset = ccid2_ackvector(sk, skb, offset, + &vector, &veclen)) != -1) { /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); + while (veclen--) { + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + u64 ackno_end_rl = SUB48(ackno, rl); - ccid2_pr_debug("ackvec %llu |%u,%u|\n", + ccid2_pr_debug("ackvec start:%llu end:%llu\n", (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); + (unsigned long long)ackno_end_rl); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. */ while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); + const u8 state = *vector & + DCCP_ACKVEC_STATE_MASK; /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) + if (state == + DCCP_ACKVEC_STATE_ECN_MARKED) { ccid2_congestion_event(sk, seqp); - else + } else ccid2_new_ack(sk, seqp, &maxincr); seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) break; ackno = SUB48(ackno_end_rl, 1); + vector++; } if (done) break; @@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* The state about what is acked should be correct now * Check for NUMDUPACK */ - seqp = hctx->seqt; - while (before48(seqp->ccid2s_seq, hctx->high_ack)) { + seqp = hctx->ccid2hctx_seqt; + while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (done == NUMDUPACK) break; } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } @@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } - hctx->seqt = last_acked; + hctx->ccid2hctx_seqt = last_acked; } /* trim acked packets in tail */ - while (hctx->seqt != hctx->seqh) { - if (!hctx->seqt->ccid2s_acked) + while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { + if (!hctx->ccid2hctx_seqt->ccid2s_acked) break; - hctx->seqt = hctx->seqt->ccid2s_next; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; } - /* restart RTO timer if not all outstanding data has been acked */ - if (hctx->pipe == 0) - sk_stop_timer(sk, &hctx->rtotimer); - else - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - dccp_ackvec_parsed_cleanup(&hctx->av_chunks); + ccid2_hc_tx_check_sanity(hctx); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) u32 max_ratio; /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hctx->ssthresh = ~0U; + hctx->ccid2hctx_ssthresh = ~0U; - /* Use larger initial windows (RFC 3390, rfc2581bis) */ - hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); + /* + * RFC 4341, 5: "The cwnd parameter is initialized to at most four + * packets for new connections, following the rules from [RFC3390]". + * We need to convert the bytes of RFC3390 into the packets of RFC 4341. + */ + hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); + max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) dp->dccps_l_ack_ratio = max_ratio; @@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hctx)) return -ENOMEM; - hctx->rto = DCCP_TIMEOUT_INIT; - hctx->rpdupack = -1; - hctx->last_cong = jiffies; - setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - INIT_LIST_HEAD(&hctx->av_chunks); + hctx->ccid2hctx_rto = 3 * HZ; + ccid2_change_srtt(hctx, -1); + hctx->ccid2hctx_rttvar = -1; + hctx->ccid2hctx_rpdupack = -1; + hctx->ccid2hctx_last_cong = jiffies; + setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, + (unsigned long)sk); + + ccid2_hc_tx_check_sanity(hctx); return 0; } @@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); int i; - sk_stop_timer(sk, &hctx->rtotimer); + ccid2_hc_tx_kill_rto_timer(sk); - for (i = 0; i < hctx->seqbufc; i++) - kfree(hctx->seqbuf[i]); - hctx->seqbufc = 0; + for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) + kfree(hctx->ccid2hctx_seqbuf[i]); + hctx->ccid2hctx_seqbufc = 0; } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (DCCP_SKB_CB(skb)->dccpd_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: - hcrx->data++; - if (hcrx->data >= dp->dccps_r_ack_ratio) { + hcrx->ccid2hcrx_data++; + if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { dccp_send_ack(sk); - hcrx->data = 0; + hcrx->ccid2hcrx_data = 0; } break; } } static struct ccid_operations ccid2 = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_owner = THIS_MODULE, - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_owner = THIS_MODULE, + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 8b7a2dee2f6d..2c94ca029010 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,49 +42,34 @@ struct ccid2_seq { /** struct ccid2_hc_tx_sock - CCID2 TX half connection * - * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @srtt: smoothed RTT estimate, scaled by 2^3 - * @mdev: smoothed RTT variation, scaled by 2^2 - * @mdev_max: maximum of @mdev during one flight - * @rttvar: moving average/maximum of @mdev_max - * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @rtt_seq: to decay RTTVAR at most once per flight - * @rpseq: last consecutive seqno - * @rpdupack: dupacks since rpseq - * @av_chunks: list of Ack Vectors received on current skb - */ + * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 + * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) + * @ccid2hctx_lastrtt -time RTT was last measured + * @ccid2hctx_rpseq - last consecutive seqno + * @ccid2hctx_rpdupack - dupacks since rpseq +*/ struct ccid2_hc_tx_sock { - u32 cwnd; - u32 ssthresh; - u32 pipe; - u32 packets_acked; - struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; - int seqbufc; - struct ccid2_seq *seqh; - struct ccid2_seq *seqt; - /* RTT measurement: variables/principles are the same as in TCP */ - u32 srtt, - mdev, - mdev_max, - rttvar, - rto; - u64 rtt_seq:48; - struct timer_list rtotimer; - u64 rpseq; - int rpdupack; - unsigned long last_cong; - u64 high_ack; - struct list_head av_chunks; + u32 ccid2hctx_cwnd; + u32 ccid2hctx_ssthresh; + u32 ccid2hctx_pipe; + u32 ccid2hctx_packets_acked; + struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; + int ccid2hctx_seqbufc; + struct ccid2_seq *ccid2hctx_seqh; + struct ccid2_seq *ccid2hctx_seqt; + long ccid2hctx_rto; + long ccid2hctx_srtt; + long ccid2hctx_rttvar; + unsigned long ccid2hctx_lastrtt; + struct timer_list ccid2hctx_rtotimer; + u64 ccid2hctx_rpseq; + int ccid2hctx_rpdupack; + unsigned long ccid2hctx_last_cong; + u64 ccid2hctx_high_ack; }; -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) -{ - return (hctx->pipe >= hctx->cwnd); -} - struct ccid2_hc_rx_sock { - int data; + int ccid2hcrx_data; }; static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 06cfdad84a6a..3b8bd7ca6761 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,41 +49,75 @@ static int ccid3_debug; /* * Transmitter Half-Connection Routines */ -/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ -static int do_osc_prev = true; +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static char *ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + [TFRC_SSTATE_TERM] = "TERM", + }; + + return ccid3_state_names[state]; +} +#endif + +static void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) +{ + struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), + ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hctx->ccid3hctx_state = state; +} /* * Compute the initial sending rate X_init in the manner of RFC 3390: * - * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT + * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT * + * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis + * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. * For consistency with other parts of the code, X_init is scaled by 2^6. */ static inline u64 rfc3390_initial_rate(struct sock *sk) { - const u32 mps = dccp_sk(sk)->dccps_mss_cache, - w_init = clamp(4380U, 2 * mps, 4 * mps); + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + const __u32 w_init = clamp_t(__u32, 4380U, + 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); - return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); + return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); } -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X - * This respects the granularity of X (64 * bytes/second) and enforces the - * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. +/* + * Recalculate t_ipi and delta (should be called whenever X changes) */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { - if (unlikely(hctx->x <= hctx->s)) - hctx->x = hctx->s; - hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); + /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ + hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_x); + + /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, + TFRC_OPSYS_HALF_TIME_GRAN); + + ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", + hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, + hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); + } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count); + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); - return delta / hctx->rtt; + return delta / hctx->ccid3hctx_rtt; } /** @@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - u64 min_rate = 2 * hctx->x_recv; - const u64 old_x = hctx->x; + __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; + const __u64 old_x = hctx->ccid3hctx_x; ktime_t now = stamp ? *stamp : ktime_get_real(); /* @@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) */ if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hctx->x_recv); + min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); } - if (hctx->p > 0) { + if (hctx->ccid3hctx_p > 0) { - hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); + hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, + min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); - } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { + } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) + - (s64)hctx->ccid3hctx_rtt >= 0) { - hctx->x = min(2 * hctx->x, min_rate); - hctx->x = max(hctx->x, - scaled_div(((u64)hctx->s) << 6, hctx->rtt)); - hctx->t_ld = now; + hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + scaled_div(((__u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_rtt)); + hctx->ccid3hctx_t_ld = now; } - if (hctx->x != old_x) { + if (hctx->ccid3hctx_x != old_x) { ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " "X_recv=%u\n", (unsigned)(old_x >> 6), - (unsigned)(hctx->x >> 6), hctx->x_calc, - (unsigned)(hctx->x_recv >> 6)); + (unsigned)(hctx->ccid3hctx_x >> 6), + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6)); ccid3_update_send_interval(hctx); } } /* - * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) - * @new_len: DCCP payload size in bytes (not used by all methods) + * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) + * @len: DCCP packet payload size in bytes */ -static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) +static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) { -#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) - return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); -#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) - return max(ccid3_hc_tx_sk(sk)->s, new_len); -#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ - return dccp_sk(sk)->dccps_mss_cache; -#endif + const u16 old_s = hctx->ccid3hctx_s; + + hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); + + if (hctx->ccid3hctx_s != old_s) + ccid3_update_send_interval(hctx); } /* @@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count), - quarter_rtts = (4 * delta) / hctx->rtt; + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), + quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; if (quarter_rtts > 0) { - hctx->t_last_win_count = now; - hctx->last_win_count += min(quarter_rtts, 5U); - hctx->last_win_count &= 0xF; /* mod 16 */ + hctx->ccid3hctx_t_last_win_count = now; + hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); + hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ } } @@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) goto restart_timer; } - ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, - hctx->feedback ? "" : "out"); + ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) goto out; - /* Reset feedback state to "no feedback received" */ - hctx->feedback = false; - /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. */ - if (hctx->t_rto == 0 || hctx->p == 0) { + if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ + hctx->ccid3hctx_p == 0) { /* halve send rate directly */ - hctx->x /= 2; + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); ccid3_update_send_interval(hctx); - } else { /* * Modify the cached value of X_recv @@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * * Note that X_recv is scaled by 2^6 while X_calc is not */ - BUG_ON(hctx->p && !hctx->x_calc); + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - if (hctx->x_calc > (hctx->x_recv >> 5)) - hctx->x_recv /= 2; + if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) + hctx->ccid3hctx_x_recv = + max(hctx->ccid3hctx_x_recv / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + (2 * TFRC_T_MBI)); else { - hctx->x_recv = hctx->x_calc; - hctx->x_recv <<= 4; + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; + hctx->ccid3hctx_x_recv <<= 4; } ccid3_hc_tx_update_x(sk, NULL); } ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hctx->x); + (unsigned long long)hctx->ccid3hctx_x); /* * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ + if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); restart_timer: - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); out: bh_unlock_sock(sk); sock_put(sk); } -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @skb: next packet candidate to send on @sk - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. +/* + * returns + * > 0: delay (in msecs) that should pass before actually sending + * = 0: can send immediately + * < 0: error condition; do not send packet */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - if (hctx->s == 0) { - sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hctx->last_win_count = 0; - hctx->t_last_win_count = now; + hctx->ccid3hctx_last_win_count = 0; + hctx->ccid3hctx_t_last_win_count = now; /* Set t_0 for initial packet */ - hctx->t_nom = now; + hctx->ccid3hctx_t_nom = now; + + hctx->ccid3hctx_s = skb->len; /* * Use initial RTT sample when available: recommended by erratum @@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) */ if (dp->dccps_syn_rtt) { ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hctx->rtt = dp->dccps_syn_rtt; - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; } else { /* * Sender does not have RTT sample: @@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * is needed in several parts (e.g. window counter); * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. */ - hctx->rtt = DCCP_FALLBACK_RTT; - hctx->x = dp->dccps_mss_cache; - hctx->x <<= 6; + hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; + hctx->ccid3hctx_x = hctx->ccid3hctx_s; + hctx->ccid3hctx_x <<= 6; } - - /* Compute t_ipi = s / X */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); ccid3_update_send_interval(hctx); - /* Seed value for Oscillation Prevention (sec. 4.5) */ - hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); - - } else { - delay = ktime_us_delta(hctx->t_nom, now); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + break; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* * Scheduling of packet transmissions [RFC 3448, 4.6] @@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; + if (delay - (s64)hctx->ccid3hctx_delta >= 1000) + return (u32)delay / 1000L; ccid3_hc_tx_update_win_count(hctx, now); + break; + case TFRC_SSTATE_TERM: + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); + return -EINVAL; } /* prepare to send now (add options etc.) */ dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; /* set the nominal send time for the next following packet */ - hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); - return CCID_PACKET_SEND_AT_ONCE; + hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, + unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - /* Changes to s will become effective the next time X is computed */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); + ccid3_hc_tx_update_s(hctx, len); - if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) + if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; + struct ccid3_options_received *opt_recv; ktime_t now; unsigned long t_nfb; - u32 r_sample; + u32 pinv, r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) + /* ... and only in the established state */ + if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && + hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + return; + + opt_recv = &hctx->ccid3hctx_options_received; + now = ktime_get_real(); + + /* Estimate RTT from history if ACK number is valid */ + r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, + DCCP_SKB_CB(skb)->dccpd_ack_seq, now); + if (r_sample == 0) { + DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); + } - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); + /* Update receive rate in units of 64 * bytes/second */ + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; + hctx->ccid3hctx_x_recv <<= 6; + /* Update loss event rate (which is scaled by 1e6) */ + pinv = opt_recv->ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + hctx->ccid3hctx_p = 0; + else /* can not exceed 100% */ + hctx->ccid3hctx_p = scaled_div(1, pinv); + /* + * Validate new RTT sample and update moving average + */ + r_sample = dccp_sample_rtt(sk, r_sample); + hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ - if (!hctx->feedback) { - hctx->feedback = true; + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - if (hctx->t_rto == 0) { + if (hctx->ccid3hctx_t_rto == 0) { /* * Initial feedback packet: Larger Initial Windows (4.2) */ - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; ccid3_update_send_interval(hctx); goto done_computing_x; - } else if (hctx->p == 0) { + } else if (hctx->ccid3hctx_p == 0) { /* * First feedback after nofeedback timer expiry (4.3) */ @@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hctx->p > 0) - hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); + if (hctx->ccid3hctx_p > 0) + hctx->ccid3hctx_x_calc = + tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); ccid3_hc_tx_update_x(sk, &now); done_computing_x: ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hctx->rtt, r_sample, - hctx->s, hctx->p, hctx->x_calc, - (unsigned)(hctx->x_recv >> 6), - (unsigned)(hctx->x >> 6)); - /* - * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to - * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This - * can be useful if few connections share a link, avoiding that buffer - * fill levels (RTT) oscillate as a result of frequent adjustments to X. - * A useful presentation with background information is in - * Joerg Widmer, "Equation-Based Congestion Control", - * MSc Thesis, University of Mannheim, Germany, 2000 - * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). - */ - if (do_osc_prev) { - r_sample = tfrc_scaled_sqrt(r_sample); - /* - * The modulation can work in both ways: increase/decrease t_ipi - * according to long-term increases/decreases of the RTT. The - * former is a useful measure, since it works against queue - * build-up. The latter temporarily increases the sending rate, - * so that buffers fill up more quickly. This in turn causes - * the RTT to increase, so that either later reduction becomes - * necessary or the RTT stays at a very high level. Decreasing - * t_ipi is therefore not supported. - * Furthermore, during the initial slow-start phase the RTT - * naturally increases, where using the algorithm would cause - * delays. Hence it is disabled during the initial slow-start. - */ - if (r_sample > hctx->r_sqmean && hctx->p > 0) - hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, - hctx->r_sqmean); - hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); - /* update R_sqmean _after_ computing the modulation factor */ - hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); - } + dccp_role(sk), + sk, hctx->ccid3hctx_rtt, r_sample, + hctx->ccid3hctx_s, hctx->ccid3hctx_p, + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6), + (unsigned)(hctx->ccid3hctx_x >> 6)); /* unschedule no feedback timer */ - sk_stop_timer(sk, &hctx->no_feedback_timer); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); /* * As we have calculated new ipi, delta, t_nom it is possible @@ -453,66 +488,95 @@ done_computing_x: * This can help avoid triggering the nofeedback timer too * often ('spinning') on LANs with small RTTs. */ - hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + (CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC / 1000))); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); + dccp_role(sk), + sk, usecs_to_jiffies(t_nfb), t_nfb); - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); } -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, + unsigned char len, u16 idx, + unsigned char *value) { + int rc = 0; + const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + struct ccid3_options_received *opt_recv; __be32 opt_val; - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); + opt_recv = &hctx->ccid3hctx_options_received; - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hctx->x_recv = opt_val; - hctx->x_recv <<= 6; + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { + opt_recv->ccid3or_seqno = dp->dccps_gsr; + opt_recv->ccid3or_loss_event_rate = ~0; + opt_recv->ccid3or_loss_intervals_idx = 0; + opt_recv->ccid3or_loss_intervals_len = 0; + opt_recv->ccid3or_receive_rate = 0; + } - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); + switch (option) { + case TFRC_OPT_LOSS_EVENT_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; } else { - /* Update the fixpoint Loss Event Rate fraction */ - hctx->p = tfrc_invert_loss_event_rate(opt_val); - + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); + dccp_role(sk), sk, + opt_recv->ccid3or_loss_event_rate); } + break; + case TFRC_OPT_LOSS_INTERVALS: + opt_recv->ccid3or_loss_intervals_idx = idx; + opt_recv->ccid3or_loss_intervals_len = len; + ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_intervals_idx, + opt_recv->ccid3or_loss_intervals_len); + break; + case TFRC_OPT_RECEIVE_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; + } else { + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_receive_rate = ntohl(opt_val); + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_receive_rate); + } + break; } - return 0; + + return rc; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); - hctx->hist = NULL; - setup_timer(&hctx->no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; + hctx->ccid3hctx_hist = NULL; + setup_timer(&hctx->ccid3hctx_no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + return 0; } @@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - sk_stop_timer(sk, &hctx->no_feedback_timer); - tfrc_tx_hist_purge(&hctx->hist); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; + struct ccid3_hc_tx_sock *hctx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hctx = ccid3_hc_tx_sk(sk); + info->tcpi_rto = hctx->ccid3hctx_t_rto; + info->tcpi_rtt = hctx->ccid3hctx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; + const struct ccid3_hc_tx_sock *hctx; const void *val; + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hctx = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) + if (len < sizeof(hctx->ccid3hctx_tfrc)) return -EINVAL; - tfrc.tfrctx_x = hctx->x; - tfrc.tfrctx_x_recv = hctx->x_recv; - tfrc.tfrctx_x_calc = hctx->x_calc; - tfrc.tfrctx_rtt = hctx->rtt; - tfrc.tfrctx_p = hctx->p; - tfrc.tfrctx_rto = hctx->t_rto; - tfrc.tfrctx_ipi = hctx->t_ipi; - len = sizeof(tfrc); - val = &tfrc; + len = sizeof(hctx->ccid3hctx_tfrc); + val = &hctx->ccid3hctx_tfrc; break; default: return -ENOPROTOOPT; @@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, /* * Receiver Half-Connection Routines */ + +/* CCID3 feedback types */ +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE +}; + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static char *ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + [TFRC_RSTATE_TERM] = "TERM", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) +{ + struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), + ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hcrx->ccid3hcrx_state = state; +} + static void ccid3_hc_rx_send_feedback(struct sock *sk, const struct sk_buff *skb, enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + ktime_t now; + s64 delta = 0; + + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) + return; + + now = ktime_get_real(); switch (fbtype) { case CCID3_FBACK_INITIAL: - hcrx->x_recv = 0; - hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ + hcrx->ccid3hcrx_x_recv = 0; + hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { - /* - * rfc3448bis-06, 6.3.1: First packet(s) lost or marked - * FIXME: in rfc3448bis the receiver returns X_recv=0 - * here as it normally would in the first feedback packet. - * However this is not possible yet, since the code still - * uses RFC 3448, i.e. - * If (p > 0) - * Calculate X_calc using the TCP throughput equation. - * X = max(min(X_calc, 2*X_recv), s/t_mbi); - * would bring X down to s/t_mbi. That is why we return - * X_recv according to rfc3448bis-06 for the moment. - */ - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist); - - hcrx->x_recv = scaled_div32(s, 2 * rtt); - break; - } /* * When parameters change (new loss or p > p_prev), we do not * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * always check whether at least RTT time units were covered. + * need to reuse the previous value of X_recv. However, when + * X_recv was 0 (due to early loss), this would kill X down to + * s/t_mbi (i.e. one packet in 64 seconds). + * To avoid such drastic reduction, we approximate X_recv as + * the number of bytes since last feedback. + * This is a safe fallback, since X is bounded above by X_calc. */ - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - break; + if (hcrx->ccid3hcrx_x_recv > 0) + break; + /* fall through */ case CCID3_FBACK_PERIODIC: - /* - * Step (2) of rfc3448bis-06, 6.2: - * - if no data packets have been received, just restart timer - * - if data packets have been received, re-compute X_recv - */ - if (hcrx->hist.bytes_recvd == 0) - goto prepare_for_next_time; - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); + delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); + if (delta <= 0) + DCCP_BUG("delta (%ld) <= 0", (long)delta); + else + hcrx->ccid3hcrx_x_recv = + scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); break; default: return; } - ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); - dccp_sk(sk)->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); + hcrx->ccid3hcrx_tstamp_last_feedback = now; + hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; + hcrx->ccid3hcrx_bytes_recv = 0; -prepare_for_next_time: - tfrc_rx_hist_restart_byte_counter(&hcrx->hist); - hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; - hcrx->feedback = fbtype; + dp->dccps_hc_rx_insert_options = 1; + dccp_send_ack(sk); } static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; + hcrx = ccid3_hc_rx_sk(sk); + if (dccp_packet_without_ack(skb)) return 0; - x_recv = htonl(hcrx->x_recv); - pinv = htonl(hcrx->p_inverse); + x_recv = htonl(hcrx->ccid3hcrx_x_recv); + pinv = htonl(hcrx->ccid3hcrx_pinv); if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)) || @@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; + u32 x_recv, p, delta; u64 fval; - /* - * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p - * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p - * is about 20.64%. This yields an interval length of 4.84 (rounded up). - */ - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) - return 5; + if (hcrx->ccid3hcrx_rtt == 0) { + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); + hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; + } - x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - if (x_recv == 0) - goto failed; + delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); + x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN("X_recv==0\n"); + if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { + DCCP_BUG("stored value of X_recv is zero"); + return ~0U; + } + } - fval = scaled_div32(scaled_div(s, rtt), x_recv); + fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); + fval = scaled_div32(fval, x_recv); p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - if (p > 0) - return scaled_div(1, p); -failed: - return UINT_MAX; + return p == 0 ? ~0U : scaled_div(1, p); } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + do_feedback = CCID3_FBACK_INITIAL; + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + hcrx->ccid3hcrx_s = payload; + /* + * Not necessary to update ccid3hcrx_bytes_recv here, + * since X_recv = 0 for the first feedback packet (cf. + * RFC 3448, 6.3) -- gerrit + */ + } + goto update_records; + } + + if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) + return; /* done receiving */ + + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + /* + * Update moving-average of s and the sum of received payload bytes + */ + hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); + hcrx->ccid3hcrx_bytes_recv += payload; + } + /* * Perform loss detection and handle pending losses */ - if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, - skb, ndp, ccid3_first_li, sk)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); + if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, + skb, ndp, ccid3_first_li, sk)) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; + } + + if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) + return; /* done receiving */ + /* - * Feedback for first non-empty data packet (RFC 3448, 6.3) + * Handle data packets: RTT sampling and monitoring p */ - else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); + if (unlikely(!is_data_packet)) + goto update_records; + + if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { + const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); + /* + * Empty loss history: no loss so far, hence p stays 0. + * Sample RTT values, since an RTT estimate is required for the + * computation of p when the first loss occurs; RFC 3448, 6.3.1. + */ + if (sample != 0) + hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); + + } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { + /* + * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean + * has decreased (resp. p has increased), send feedback now. + */ + do_feedback = CCID3_FBACK_PARAM_CHANGE; + } + /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ - else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && - SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); + if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) + do_feedback = CCID3_FBACK_PERIODIC; + +update_records: + tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); + +done_receiving: + if (do_feedback) + ccid3_hc_rx_send_feedback(sk, skb, do_feedback); } static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - tfrc_lh_init(&hcrx->li_hist); - return tfrc_rx_hist_init(&hcrx->hist, sk); + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; + tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); + return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); } static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - tfrc_rx_hist_purge(&hcrx->hist); - tfrc_lh_cleanup(&hcrx->li_hist); + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); + + tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); + tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { + const struct ccid3_hc_rx_sock *hcrx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hcrx = ccid3_hc_rx_sk(sk); + info->tcpi_ca_state = hcrx->ccid3hcrx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; struct tfrc_rx_info rx_info; const void *val; + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hcrx = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) return -EINVAL; - rx_info.tfrcrx_x_recv = hcrx->x_recv; - rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); + rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; + rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; + rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : + scaled_div(1, hcrx->ccid3hcrx_pinv); len = sizeof(rx_info); val = &rx_info; break; @@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = { .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, }; -module_param(do_osc_prev, bool, 0644); -MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); - #ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, bool, 0644); MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); @@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); static __init int ccid3_module_init(void) { - struct timespec tp; - - /* - * Without a fine-grained clock resolution, RTTs/X_recv are not sampled - * correctly and feedback is sent either too early or too late. - */ - hrtimer_get_res(CLOCK_MONOTONIC, &tp); - if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { - printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" - " resolution - check your clocksource.\n", __func__, - tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); - return -ESOCKTNOSUPPORT; - } return ccid_register(&ccid3); } module_init(ccid3_module_init); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index af6e1bf937d9..49ca32bd7e79 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -47,22 +47,11 @@ /* Two seconds as per RFC 3448 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ -#define TFRC_T_MBI (64 * USEC_PER_SEC) +/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) -/* - * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. -#endif +/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ +#define TFRC_T_MBI 64 enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, @@ -70,43 +59,62 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; +struct ccid3_options_received { + u64 ccid3or_seqno:48, + ccid3or_loss_intervals_idx:16; + u16 ccid3or_loss_intervals_len; + u32 ccid3or_loss_event_rate; + u32 ccid3or_receive_rate; +}; + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, + TFRC_SSTATE_TERM, +}; + /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket * - * @x - Current sending rate in 64 * bytes per second - * @x_recv - Receive rate in 64 * bytes per second - * @x_calc - Calculated rate in bytes per second - * @rtt - Estimate of current round trip time in usecs - * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) - * @p - Current loss event rate (0-1) scaled by 1000000 - * @s - Packet size in bytes - * @t_rto - Nofeedback Timer setting in usecs - * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @feedback - Whether feedback has been received or not - * @last_win_count - Last window counter sent - * @t_last_win_count - Timestamp of earliest packet with - * last_win_count value sent - * @no_feedback_timer - Handle to no feedback timer - * @t_ld - Time last doubled during slow start - * @t_nom - Nominal send time of next packet - * @hist - Packet history + * @ccid3hctx_x - Current sending rate in 64 * bytes per second + * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second + * @ccid3hctx_x_calc - Calculated rate in bytes per second + * @ccid3hctx_rtt - Estimate of current round trip time in usecs + * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 + * @ccid3hctx_s - Packet size in bytes + * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs + * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs + * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states + * @ccid3hctx_last_win_count - Last window counter sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet + * with last_win_count value sent + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer + * @ccid3hctx_t_ld - Time last doubled during slow start + * @ccid3hctx_t_nom - Nominal send time of next packet + * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs + * @ccid3hctx_hist - Packet history + * @ccid3hctx_options_received - Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - u64 x; - u64 x_recv; - u32 x_calc; - u32 rtt; - u16 r_sqmean; - u32 p; - u32 t_rto; - u32 t_ipi; - u16 s; - bool feedback:1; - u8 last_win_count; - ktime_t t_last_win_count; - struct timer_list no_feedback_timer; - ktime_t t_ld; - ktime_t t_nom; - struct tfrc_tx_hist_entry *hist; + struct tfrc_tx_info ccid3hctx_tfrc; +#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x +#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv +#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc +#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt +#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p +#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto +#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi + u16 ccid3hctx_s; + enum ccid3_hc_tx_states ccid3hctx_state:8; + u8 ccid3hctx_last_win_count; + ktime_t ccid3hctx_t_last_win_count; + struct timer_list ccid3hctx_no_feedback_timer; + ktime_t ccid3hctx_t_ld; + ktime_t ccid3hctx_t_nom; + u32 ccid3hctx_delta; + struct tfrc_tx_hist_entry *ccid3hctx_hist; + struct ccid3_options_received ccid3hctx_options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) @@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) return hctx; } - -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, + TFRC_RSTATE_TERM = 127, }; /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket * - * @last_counter - Tracks window counter (RFC 4342, 8.1) - * @feedback - The type of the feedback last sent - * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @tstamp_last_feedback - Time at which last feedback was sent - * @hist - Packet history (loss detection + RTT sampling) - * @li_hist - Loss Interval database - * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) + * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) + * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) + * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) + * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states + * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @ccid3hcrx_rtt - Receiver estimate of RTT + * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent + * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent + * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) + * @ccid3hcrx_li_hist - Loss Interval database + * @ccid3hcrx_s - Received packet size in bytes + * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) */ struct ccid3_hc_rx_sock { - u8 last_counter:4; - enum ccid3_fback_type feedback:4; - u32 x_recv; - ktime_t tstamp_last_feedback; - struct tfrc_rx_hist hist; - struct tfrc_loss_hist li_hist; -#define p_inverse li_hist.i_mean + u8 ccid3hcrx_last_counter:4; + enum ccid3_hc_rx_states ccid3hcrx_state:8; + u32 ccid3hcrx_bytes_recv; + u32 ccid3hcrx_x_recv; + u32 ccid3hcrx_rtt; + ktime_t ccid3hcrx_tstamp_last_feedback; + struct tfrc_rx_hist ccid3hcrx_hist; + struct tfrc_loss_hist ccid3hcrx_li_hist; + u16 ccid3hcrx_s; +#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean }; static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index b1ae8f8259e5..5b3ce0688c5c 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) /** * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * This updates I_mean as the sequence numbers increase. As a consequence, the - * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) - * decreases, and thus there is no need to send renewed feedback. + * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev */ -void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) +u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); + u32 old_i_mean = lh->i_mean; s64 len; if (cur == NULL) /* not initialised */ - return; - - /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ - if (!dccp_data_packet(skb)) - return; + return 0; len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return; + return 0; if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) /* @@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) cur->li_is_closed = 1; if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return; + return 0; cur->li_length = len; tfrc_lh_calc_i_mean(lh); + + return (lh->i_mean < old_i_mean); } +EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, @@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, * @sk: Used by @calc_first_li in caller-specific way (subtyping) * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. */ -bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) +int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return false; + return 0; new = tfrc_lh_demand_next(lh); if (unlikely(new == NULL)) { DCCP_CRIT("Cannot allocate/add loss record."); - return false; + return 0; } new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; @@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, tfrc_lh_calc_i_mean(lh); } - return true; + return 1; } EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index d08a226db43e..246018a3b269 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) struct tfrc_rx_hist; -extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, +extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, u32 (*first_li)(struct sock *), struct sock *); -extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); +extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); #endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index cce9f03bda3e..6cc108afdc3b 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -40,6 +40,18 @@ #include "packet_history.h" #include "../../dccp.h" +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + /* * Transmitter History Routines */ @@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void) } } +static struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + + return head; +} + int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); @@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) } EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); +u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, + const ktime_t now) +{ + u32 rtt = 0; + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); + + if (packet != NULL) { + rtt = ktime_us_delta(now, packet->stamp); + /* + * Garbage-collect older (irrelevant) entries: + */ + tfrc_tx_hist_purge(&packet->next); + } + + return rtt; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); + + /* * Receiver History Routines */ @@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); - -static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - struct tfrc_rx_hist_entry *tmp = h->ring[a]; - - h->ring[a] = h->ring[b]; - h->ring[b] = tmp; -} - static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { - __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), - tfrc_rx_hist_index(h, b)); -} + const u8 idx_a = tfrc_rx_hist_index(h, a), + idx_b = tfrc_rx_hist_index(h, b); + struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; -/** - * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling - * This is called after loss detection has finished, when the history entry - * with the index of `loss_count' holds the highest-received sequence number. - * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). - */ -static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) -{ - __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); - h->loss_count = h->loss_start = 0; + h->ring[idx_a] = h->ring[idx_b]; + h->ring[idx_b] = tmp; } /* @@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = DCCP_SKB_CB(skb)->dccpd_seq; - if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ + if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ h->loss_count = 1; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); + } } static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) @@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2 if (dccp_loss_free(s2, s1, n1)) { /* hole is filled: S0, S2, and S1 are consecutive */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_count = 0; + h->loss_start = tfrc_rx_hist_index(h, 1); } else /* gap between S2 and S1: just update loss_prev */ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); @@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) if (dccp_loss_free(s1, s2, n2)) { /* entire hole filled by S0, S3, S1, S2 */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 0; } else { /* gap remains between S1 and S2 */ h->loss_start = tfrc_rx_hist_index(h, 1); @@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h) if (dccp_loss_free(s2, s3, n3)) { /* no gap between S2 and S3: entire hole is filled */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 3); + h->loss_count = 0; } else { /* gap between S2 and S3 */ h->loss_start = tfrc_rx_hist_index(h, 2); @@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h) } /** - * tfrc_rx_congestion_event - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * tfrc_rx_handle_loss - Loss detection and further processing + * @h: The non-empty RX history object + * @lh: Loss Intervals database to update + * @skb: Currently received packet + * @ndp: The NDP count belonging to @skb + * @calc_first_li: Caller-dependent computation of first loss interval in @lh + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) * Chooses action according to pending loss, updates LI database when a new * loss was detected, and does required post-processing. Returns 1 when caller * should send feedback, 0 otherwise. @@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h) * records accordingly, the caller should not perform any more RX history * operations when loss_count is greater than 0 after calling this function. */ -bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *), struct sock *sk) +int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { - bool new_event = false; - - if (tfrc_rx_hist_duplicate(h, skb)) - return 0; + int is_new_loss = 0; if (h->loss_count == 0) { __do_track_loss(h, skb, ndp); - tfrc_rx_hist_sample_rtt(h, skb); - tfrc_rx_hist_add_packet(h, skb, ndp); } else if (h->loss_count == 1) { __one_after_loss(h, skb, ndp); } else if (h->loss_count != 2) { @@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, /* * Update Loss Interval database and recycle RX records */ - new_event = tfrc_lh_interval_add(lh, h, first_li, sk); + is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); __three_after_loss(h); } - - /* - * Update moving-average of `s' and the sum of received payload bytes. - */ - if (dccp_data_packet(skb)) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - - h->packet_size = tfrc_ewma(h->packet_size, payload, 9); - h->bytes_recvd += payload; - } - - /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ - if (!new_event) - tfrc_lh_update_i_mean(lh, skb); - - return new_event; + return is_new_loss; } -EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); +EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); -/* Compute the sending rate X_recv measured between feedback intervals */ -u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) +int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) { - u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; - s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); - - WARN_ON(delta <= 0); - /* - * Ensure that the sampling interval for X_recv is at least one RTT, - * by extending the sampling interval backwards in time, over the last - * R_(m-1) seconds, as per rfc3448bis-06, 6.2. - * To reduce noise (e.g. when the RTT changes often), this is only - * done when delta is smaller than RTT/2. - */ - if (last_x_recv > 0 && delta < last_rtt/2) { - tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", - (long)delta, (unsigned)last_rtt); + int i; - delta = (bytes ? delta : 0) + last_rtt; - bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); + for (i = 0; i <= TFRC_NDUPACK; i++) { + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); + if (h->ring[i] == NULL) + goto out_free; } - if (unlikely(bytes == 0)) { - DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); - return last_x_recv; + h->loss_count = h->loss_start = 0; + return 0; + +out_free: + while (i-- != 0) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; } - return scaled_div32(bytes, delta); + return -ENOBUFS; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); +EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) { @@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); -static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) +/** + * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) { - int i; - - memset(h, 0, sizeof(*h)); - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) { - tfrc_rx_hist_purge(h); - return -ENOBUFS; - } - } - return 0; + return h->ring[0]; } -int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) +/** + * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) { - if (tfrc_rx_hist_alloc(h)) - return -ENOBUFS; - /* - * Initialise first entry with GSR to start loss detection as early as - * possible. Code using this must not use any other fields. The entry - * will be overwritten once the CCID updates its received packets. - */ - tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; - return 0; + return h->ring[h->rtt_sample_prev]; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); /** * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss - * is pending and uses the following history entries (via rtt_sample_prev): - * - h->ring[0] contains the most recent history entry prior to @skb; - * - h->ring[1] is an unused `dummy' entry when the current difference is 0; + * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able + * to compute a sample with given data - calling function should check this. */ -void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) +u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) { - struct tfrc_rx_hist_entry *last = h->ring[0]; - u32 sample, delta_v; - - /* - * When not to sample: - * - on non-data packets - * (RFC 4342, 8.1: CCVal only fully defined for data packets); - * - when no data packets have been received yet - * (FIXME: using sampled packet size as indicator here); - * - as long as there are gaps in the sequence space (pending loss). - */ - if (!dccp_data_packet(skb) || h->packet_size == 0 || - tfrc_rx_hist_loss_pending(h)) - return; + u32 sample = 0, + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + + if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ + if (h->rtt_sample_prev == 2) { /* previous candidate stored */ + sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + if (sample) + sample = 4 / sample * + ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); + else /* + * FIXME: This condition is in principle not + * possible but occurs when CCID is used for + * two-way data traffic. I have tried to trace + * it, but the cause does not seem to be here. + */ + DCCP_BUG("please report to dccp@vger.kernel.org" + " => prev = %u, last = %u", + tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + } else if (delta_v < 1) { + h->rtt_sample_prev = 1; + goto keep_ref_for_next_time; + } - h->rtt_sample_prev = 0; /* reset previous candidate */ + } else if (delta_v == 4) /* optimal match */ + sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); + else { /* suboptimal match */ + h->rtt_sample_prev = 2; + goto keep_ref_for_next_time; + } - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); - if (delta_v == 0) { /* less than RTT/4 difference */ - h->rtt_sample_prev = 1; - return; + if (unlikely(sample > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %u too large, using max\n", sample); + sample = DCCP_SANE_RTT_MAX; } - sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); - if (delta_v <= 4) /* between RTT/4 and RTT */ - sample *= 4 / delta_v; - else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) - /* - * Optimisation: CCVal difference is greater than 1 RTT, yet the - * sample is less than the local RTT estimate; which means that - * the RTT estimate is too high. - * To avoid noise, it is not done if the sample is below RTT/2. - */ - return; + h->rtt_sample_prev = 0; /* use current entry as next reference */ +keep_ref_for_next_time: - /* Use a lower weight than usual to increase responsiveness */ - h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); + return sample; } EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 555e65cd73a0..461cc91cce88 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -40,28 +40,12 @@ #include #include "tfrc.h" -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} +struct tfrc_tx_hist_entry; extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); +extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, + const u64 seqno, const ktime_t now); /* Subtraction a-b modulo-16, respects circular wrap-around */ #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) @@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry { * @loss_count: Number of entries in circular history * @loss_start: Movable index (for loss detection) * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - * @rtt_estimate: Receiver RTT estimate - * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) - * @bytes_recvd: Number of bytes received since @bytes_start - * @bytes_start: Start time for counting @bytes_recvd */ struct tfrc_rx_hist { struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; u8 loss_count:2, loss_start:2; - /* Receiver RTT sampling */ #define rtt_sample_prev loss_start - u32 rtt_estimate; - /* Receiver sampling of application payload lengths */ - u32 packet_size, - bytes_recvd; - ktime_t bytes_start; }; /** @@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) return h->loss_count > 0; } -/* - * Accessor functions to retrieve parameters sampled by the RX history - */ -static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) -{ - if (h->packet_size == 0) { - DCCP_WARN("No sample for s, using fallback\n"); - return TCP_MIN_RCVMSS; - } - return h->packet_size; - -} -static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) -{ - if (h->rtt_estimate == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - return DCCP_FALLBACK_RTT; - } - return h->rtt_estimate; -} - -static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) -{ - h->bytes_recvd = 0; - h->bytes_start = ktime_get_real(); -} - -extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); - - extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u64 ndp); extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); struct tfrc_loss_hist; -extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), - struct sock *sk); -extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, - const struct sk_buff *skb); -extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); +extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*first_li)(struct sock *sk), + struct sock *sk); +extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, + const struct sk_buff *skb); +extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); #endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ede12f53de5a..ed9857527acf 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -47,21 +47,6 @@ static inline u32 scaled_div32(u64 a, u64 b) return result; } -/** - * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 - * Uses scaling to improve accuracy of the integer approximation of sqrt(). The - * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for - * clamped RTT samples (dccp_sample_rtt). - * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the - * scaling factor is neutralised. For this purpose, it avoids returning zero. - */ -static inline u16 tfrc_scaled_sqrt(const u32 sample) -{ - const unsigned long non_zero_sample = sample ? : 1; - - return int_sqrt(non_zero_sample << 10); -} - /** * tfrc_ewma - Exponentially weighted moving average * @weight: Weight to be used as damping factor, in units of 1/10 @@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); extern int tfrc_tx_packet_history_init(void); extern void tfrc_tx_packet_history_exit(void); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 38239c4d5e14..2f20a29cffe4 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - /* - * In the congestion-avoidance phase p decays towards 0 - * when there are no further losses, so this case is - * natural. Truncating to p_min = 0.01% means that the - * maximum achievable throughput is limited to about - * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. - * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. - */ - tfrc_pr_debug("Value of p (%d) below resolution. " - "Substituting %d\n", p, TFRC_SMALLEST_P); + DCCP_WARN("Value of p (%d) below resolution. " + "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; @@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) result = scaled_div(s, R); return scaled_div32(result, f); } + EXPORT_SYMBOL_GPL(tfrc_calc_x); /** @@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } -EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} -EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); +EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 5281190aa19c..b4bc6e095a0e 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -42,11 +42,9 @@ extern int dccp_debug; #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else #define dccp_pr_debug(format, a...) #define dccp_pr_debug_cat(format, a...) -#define dccp_debug(format, a...) #endif extern struct inet_hashinfo dccp_hashinfo; @@ -63,14 +61,11 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) +#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ @@ -86,13 +81,10 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); */ #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) -/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */ -#define DCCP_TIME_RESOLUTION 10 - /* * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 */ -#define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) +#define DCCP_SANE_RTT_MIN 100 #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) @@ -103,6 +95,12 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; +extern int sysctl_dccp_feat_sequence_window; +extern int sysctl_dccp_feat_rx_ccid; +extern int sysctl_dccp_feat_tx_ccid; +extern int sysctl_dccp_feat_ack_ratio; +extern int sysctl_dccp_feat_send_ack_vector; +extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -237,22 +235,8 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -/* - * TX Packet Dequeueing Interface - */ -extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -extern bool dccp_qpolicy_full(struct sock *sk); -extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); -extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -extern void dccp_write_xmit(struct sock *sk); +extern void dccp_write_xmit(struct sock *sk, int block); extern void dccp_write_space(struct sock *sk); -extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) @@ -268,8 +252,7 @@ extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); -extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); +extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); @@ -334,14 +317,7 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); extern int dccp_invalid_packet(struct sk_buff *skb); - -static inline u32 dccp_sane_rtt(long usec_sample) -{ - if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) - DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); - return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); -} -extern u32 dccp_sample_rtt(struct sock *sk, long delta); +extern u32 dccp_sample_rtt(struct sock *sk, long delta); static inline int dccp_bad_service_code(const struct sock *sk, const __be32 service) @@ -435,62 +411,36 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); + const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). - * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); + dccp_set_seqno(&dp->dccps_swl, + dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); + dccp_set_seqno(&dp->dccps_swh, + dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); + dp->dccps_awh = dp->dccps_gss = seq; + dccp_set_seqno(&dp->dccps_awl, + (dp->dccps_gss - + dccp_msk(sk)->dccpms_sequence_window + 1)); } static inline int dccp_ack_pending(const struct sock *sk) { - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); + const struct dccp_sock *dp = dccp_sk(sk); + return dp->dccps_timestamp_echo != 0 || +#ifdef CONFIG_IP_DCCP_ACKVEC + (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || +#endif + inet_csk_ack_scheduled(sk); } -extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -extern int dccp_feat_finalise_settings(struct dccp_sock *dp); -extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -extern void dccp_feat_list_purge(struct list_head *fn_list); - extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sock *sk, diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 93aae7c95550..d8a3509b26f6 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dp->dccps_hc_rx_ackvec != NULL) + if (dccp_msk(sk)->dccpms_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index f94c7c9d1a7f..933a0ecf8d46 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1,19 +1,11 @@ /* * net/dccp/feat.c * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 The University of Aberdeen, Scotland, UK - * Copyright (c) 2008 Gerrit Renker - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau - * + * An implementation of the DCCP protocol + * Andrea Bittau * * ASSUMPTIONS * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. * o All currently known SP features have 1-byte quantities. If in the future * extensions of RFCs 4340..42 define features with item lengths larger than * one byte, a feature-specific extension of the code will be required. @@ -23,1510 +15,635 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + #include + #include "ccid.h" #include "feat.h" -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; +#define DCCP_FEAT_SP_NOAGREE (-123) -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. - */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) +int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, + u8 *val, u8 len, gfp_t gfp) { - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); + struct dccp_opt_pend *opt; - if (new_ccid == NULL) - return -ENOMEM; + dccp_feat_debug(type, feature, *val); - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; + if (len > 3) { + DCCP_WARN("invalid length %d\n", len); + return -EINVAL; + } + /* XXX add further sanity checks */ + + /* check if that feature is already being negotiated */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + /* ok we found a negotiation for this option already */ + if (opt->dccpop_feat == feature && opt->dccpop_type == type) { + dccp_pr_debug("Replacing old\n"); + /* replace */ + BUG_ON(opt->dccpop_val == NULL); + kfree(opt->dccpop_val); + opt->dccpop_val = val; + opt->dccpop_len = len; + opt->dccpop_conf = 0; + return 0; + } } - return 0; -} -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); + /* negotiation for a new feature */ + opt = kmalloc(sizeof(*opt), gfp); + if (opt == NULL) + return -ENOMEM; - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} + opt->dccpop_type = type; + opt->dccpop_feat = feature; + opt->dccpop_len = len; + opt->dccpop_val = val; + opt->dccpop_conf = 0; + opt->dccpop_sc = NULL; -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ -#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ - /* - * FIXME: This is required until several problems in the CCID-2 code are - * resolved. The CCID-2 code currently does not cope well; using dynamic - * Ack Ratios greater than 1 caused instabilities. These were manifest - * in hangups and long RTO timeouts (1...3 seconds). Until this has been - * stabilised, it is safer not to activate dynamic Ack Ratio changes. - */ - dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", - rx ? "RX" : "TX", (u16)ratio); - ratio = 1; -#endif - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; + BUG_ON(opt->dccpop_val == NULL); + + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); return 0; } -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) +EXPORT_SYMBOL_GPL(dccp_feat_change); + +static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); + /* figure out if we are changing our CCID or the peer's */ + const int rx = type == DCCPO_CHANGE_R; + const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; + struct ccid *new_ccid; + + /* Check if nothing is being changed. */ + if (ccid_nr == new_ccid_nr) + return 0; + + new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); + if (new_ccid == NULL) + return -ENOMEM; if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + dp->dccps_hc_rx_ccid = new_ccid; + dmsk->dccpms_rx_ccid = new_ccid_nr; + } else { + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = new_ccid; + dmsk->dccpms_tx_ccid = new_ccid_nr; } - return 0; -} -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. - */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) +static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) { - struct dccp_sock *dp = dccp_sk(sk); + dccp_feat_debug(type, feat, val); - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); + switch (feat) { + case DCCPF_CCID: + return dccp_feat_update_ccid(sk, type, val); + default: + dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", + dccp_feat_typename(type), feat); + break; } return 0; } -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. | Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) +static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, + u8 *rpref, u8 rlen) { - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; + struct dccp_sock *dp = dccp_sk(sk); + u8 *spref, slen, *res = NULL; + int i, j, rc, agree = 1; + BUG_ON(rpref == NULL); + + /* check if we are the black sheep */ + if (dp->dccps_role == DCCP_ROLE_CLIENT) { + spref = rpref; + slen = rlen; + rpref = opt->dccpop_val; + rlen = opt->dccpop_len; + } else { + spref = opt->dccpop_val; + slen = opt->dccpop_len; + } /* - * Other features: add cases for new feature types here after adding - * them to the above table. + * Now we have server preference list in spref and client preference in + * rpref */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} + BUG_ON(spref == NULL); + BUG_ON(rpref == NULL); -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); + /* FIXME sanity check vals */ - return idx < 0 ? : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", - "UNSTABLE", "STABLE" }; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; + /* Are values in any order? XXX Lame "algorithm" here */ + for (i = 0; i < slen; i++) { + for (j = 0; j < rlen; j++) { + if (spref[i] == rpref[j]) { + res = &spref[i]; + break; + } + } + if (res) + break; } - return NULL; -} -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} + /* we didn't agree on anything */ + if (res == NULL) { + /* confirm previous value */ + switch (opt->dccpop_feat) { + case DCCPF_CCID: + /* XXX did i get this right? =P */ + if (opt->dccpop_type == DCCPO_CHANGE_L) + res = &dccp_msk(sk)->dccpms_tx_ccid; + else + res = &dccp_msk(sk)->dccpms_rx_ccid; + break; -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? "(Confirm pending)" : ""); -} + default: + DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); + /* XXX implement res */ + return -EFAULT; + } -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif + dccp_pr_debug("Don't agree... reconfirming %d\n", *res); + agree = 0; /* this is used for mandatory options... */ + } -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; + /* need to put result and our preference list */ + rlen = 1 + opt->dccpop_len; + rpref = kmalloc(rlen, GFP_ATOMIC); + if (rpref == NULL) + return -ENOMEM; - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; + *rpref = *res; + memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; + /* put it in the "confirm queue" */ + if (opt->dccpop_sc == NULL) { + opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); + if (opt->dccpop_sc == NULL) { + kfree(rpref); + return -ENOMEM; } } else { - val = fval->nn; + /* recycle the confirm slot */ + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); + kfree(opt->dccpop_sc->dccpoc_val); + dccp_pr_debug("recycling confirm slot\n"); + } + memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); + + opt->dccpop_sc->dccpoc_val = rpref; + opt->dccpop_sc->dccpoc_len = rlen; + + /* update the option on our side [we are about to send the confirm] */ + rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); + if (rc) { + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + opt->dccpop_sc = NULL; + return rc; } - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? "" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * For general use this function is preferable over __dccp_feat_activate(). - */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} + dccp_pr_debug("Will confirm %d\n", *rpref); -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOBUFS; - } + /* say we want to change to X but we just got a confirm X, suppress our + * change + */ + if (!opt->dccpop_conf) { + if (*opt->dccpop_val == *res) + opt->dccpop_conf = 1; + dccp_pr_debug("won't ask for change of same feature\n"); } - return 0; -} -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); + return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ } -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) +static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; + struct dccp_minisock *dmsk = dccp_msk(sk); + struct dccp_opt_pend *opt; + int rc = 1; + u8 t; - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; + /* + * We received a CHANGE. We gotta match it against our own preference + * list. If we got a CHANGE_R it means it's a change for us, so we need + * to compare our CHANGE_L list. + */ + if (type == DCCPO_CHANGE_L) + t = DCCPO_CHANGE_R; + else + t = DCCPO_CHANGE_L; - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} + /* find our preference list for this feature */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (opt->dccpop_type != t || opt->dccpop_feat != feature) + continue; -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); + /* find the winner from the two preference lists */ + rc = dccp_feat_reconcile(sk, opt, val, len); + break; } -} -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; + /* We didn't deal with the change. This can happen if we have no + * preference list for the feature. In fact, it just shouldn't + * happen---if we understand a feature, we should have a preference list + * with at least the default value. + */ + BUG_ON(rc == 1); - list_for_each_entry(entry, fn_list, node) - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - return NULL; + return rc; } -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * This is the only constructor and serves to ensure the above invariants. - */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) +static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } + struct dccp_opt_pend *opt; + struct dccp_minisock *dmsk = dccp_msk(sk); + u8 *copy; + int rc; - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); + /* NN features must be Change L (sec. 6.3.2) */ + if (type != DCCPO_CHANGE_L) { + dccp_pr_debug("received %s for NN feature %d\n", + dccp_feat_typename(type), feature); + return -EFAULT; } - return entry; -} -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @needs_mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + /* XXX sanity check opt val */ - if (new == NULL) + /* copy option so we can confirm it */ + opt = kzalloc(sizeof(*opt), GFP_ATOMIC); + if (opt == NULL) return -ENOMEM; - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = 0; - new->empty_confirm = 0; - new->val = *fval; - new->needs_mandatory = mandatory; + copy = kmemdup(val, len, GFP_ATOMIC); + if (copy == NULL) { + kfree(opt); + return -ENOMEM; + } - return 0; -} + opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ + opt->dccpop_feat = feature; + opt->dccpop_val = copy; + opt->dccpop_len = len; -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * Returns 0 on success, a Reset code for further processing otherwise. - */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + /* change feature */ + rc = dccp_feat_update(sk, type, feature, *val); + if (rc) { + kfree(opt->dccpop_val); + kfree(opt); + return rc; + } - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; + dccp_feat_debug(type, feature, *copy); - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = 1; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = 0; + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); return 0; } -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) +static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, + u8 type, u8 feature) { - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} + /* XXX check if other confirms for that are queued and recycle slot */ + struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); + if (opt == NULL) { + /* XXX what do we do? Ignoring should be fine. It's a change + * after all =P + */ + return; } - return 0; -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} + switch (type) { + case DCCPO_CHANGE_L: + opt->dccpop_type = DCCPO_CONFIRM_R; + break; + case DCCPO_CHANGE_R: + opt->dccpop_type = DCCPO_CONFIRM_L; + break; + default: + DCCP_WARN("invalid type %d\n", type); + kfree(opt); + return; + } + opt->dccpop_feat = feature; + opt->dccpop_val = NULL; + opt->dccpop_len = 0; -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. - */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} + /* change feature */ + dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); } -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) +static void dccp_feat_flush_confirm(struct sock *sk) { - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} + struct dccp_minisock *dmsk = dccp_msk(sk); + /* Check if there is anything to confirm in the first place */ + int yes = !list_empty(&dmsk->dccpms_conf); -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} + if (!yes) { + struct dccp_opt_pend *opt; -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (opt->dccpop_conf) { + yes = 1; + break; } } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - /* - * Enter CHANGING after transmitting the Change option (6.6.2). - */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * Note that NN features are local by definition (RFC 4340, 6.3.2). - */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; + if (!yes) + return; - return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); + /* OK there is something to confirm... */ + /* XXX check if packet is in flight? Send delayed ack?? */ + if (sk->sk_state == DCCP_OPEN) + dccp_send_ack(sk); } -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ -int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) +int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_NN) - return -EINVAL; - return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); -} + int rc; -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * This function is used to communicate NN updates out-of-band. The difference - * to feature negotiation during connection setup is that values are activated - * immediately after validation, i.e. we don't wait for the Confirm: either the - * value is accepted by the peer (and then the waiting is futile), or it is not - * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values - * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). - */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; + dccp_feat_debug(type, feature, *val); - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; + /* figure out if it's SP or NN feature */ + switch (feature) { + /* deal with SP features */ + case DCCPF_CCID: + rc = dccp_feat_sp(sk, type, feature, val, len); + break; - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; + /* deal with NN features */ + case DCCPF_ACK_RATIO: + rc = dccp_feat_nn(sk, type, feature, val, len); + break; - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", - (unsigned long long)nn_val, - (unsigned long long)entry->val.nn, - dccp_feat_sname[entry->state]); - return 0; + /* XXX implement other features */ + default: + dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", + dccp_feat_typename(type), feature); + rc = -EFAULT; + break; } - if (dccp_feat_activate(sk, feat, 1, &fval)) - return -EADV; - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. + /* check if there were problems changing features */ + if (rc) { + /* If we don't agree on SP, we sent a confirm for old value. + * However we propagate rc to caller in case option was + * mandatory */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. - */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; + if (rc != DCCP_FEAT_SP_NOAGREE) + dccp_feat_empty_confirm(dccp_msk(sk), type, feature); } -} -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. - */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; + /* generate the confirm [if required] */ + dccp_feat_flush_confirm(sk); - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; + return rc; } -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. - */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} +EXPORT_SYMBOL_GPL(dccp_feat_change_recv); -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) +int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len) { - u8 c, s; + u8 t; + struct dccp_opt_pend *opt; + struct dccp_minisock *dmsk = dccp_msk(sk); + int found = 0; + int all_confirmed = 1; - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} + dccp_feat_debug(type, feature, *val); -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; + /* locate our change request */ + switch (type) { + case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; + case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; + default: DCCP_WARN("invalid type %d\n", type); + return 1; - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; } - return does_occur; -} + /* XXX sanity check feature value */ -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fval: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). - */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (!opt->dccpop_conf && opt->dccpop_type == t && + opt->dccpop_feat == feature) { + found = 1; + dccp_pr_debug("feature %d found\n", opt->dccpop_feat); - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } + /* XXX do sanity check */ - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; + opt->dccpop_conf = 1; - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} + /* We got a confirmation---change the option */ + dccp_feat_update(sk, opt->dccpop_type, + opt->dccpop_feat, *val); -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. - */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; + /* XXX check the return value of dccp_feat_update */ + break; + } - return dccp_feat_push_confirm(fn, feat, local, &fval); + if (!opt->dccpop_conf) + all_confirmed = 0; } - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) + /* fix re-transmit timer */ + /* XXX gotta make sure that no option negotiation occurs during + * connection shutdown. Consider that the CLOSEREQ is sent and timer is + * on. if all options are confirmed it might kill timer which should + * remain alive until close is received. */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. - */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - return dccp_feat_push_confirm(fn, feat, local, &fval); - - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; + if (all_confirmed) { + dccp_pr_debug("clear feat negotiation timer %p\n", sk); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = 0; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = 1; - } - entry->needs_confirm = 1; - entry->needs_mandatory = 0; - entry->state = FEAT_STABLE; + if (!found) + dccp_pr_debug("%s(%d, ...) never requested\n", + dccp_feat_typename(type), feature); return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; } -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. - */ - dccp_feat_list_pop(entry); - return 0; - } +EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; +void dccp_feat_clean(struct dccp_minisock *dmsk) +{ + struct dccp_opt_pend *opt, *next; - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; + list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, + dccpop_node) { + BUG_ON(opt->dccpop_val == NULL); + kfree(opt->dccpop_val); - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } + if (opt->dccpop_sc != NULL) { + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + } - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; + kfree(opt); } + INIT_LIST_HEAD(&dmsk->dccpms_pending); - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { + BUG_ON(opt == NULL); + if (opt->dccpop_val != NULL) + kfree(opt->dccpop_val); + kfree(opt); } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; + INIT_LIST_HEAD(&dmsk->dccpms_conf); } -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. - * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). +EXPORT_SYMBOL_GPL(dccp_feat_clean); + +/* this is to be called only when a listening sock creates its child. It is + * assumed by the function---the confirm is not duplicated, but rather it is + * "passed on". */ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) +int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) { - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; + struct dccp_minisock *olddmsk = dccp_msk(oldsk); + struct dccp_minisock *newdmsk = dccp_msk(newsk); + struct dccp_opt_pend *opt; + int rc = 0; - dccp_feat_print_opt(opt, feat, val, len, mandatory); + INIT_LIST_HEAD(&newdmsk->dccpms_pending); + INIT_LIST_HEAD(&newdmsk->dccpms_conf); - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; + list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { + struct dccp_opt_pend *newopt; + /* copy the value of the option */ + u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; + if (val == NULL) + goto out_clean; - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - if (fval.nn != entry->val.nn) { - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto fast_path_failed; + newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); + if (newopt == NULL) { + kfree(val); + goto out_clean; } - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); + /* insert the option */ + newopt->dccpop_val = val; + list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; + /* XXX what happens with backlogs and multiple connections at + * once... + */ + /* the master socket no longer needs to worry about confirms */ + opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ + + /* reset state for a new socket */ + opt->dccpop_conf = 0; } - return 0; -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); + /* XXX not doing anything about the conf queue */ + +out: + return rc; -fast_path_failed: - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; +out_clean: + dccp_feat_clean(newdmsk); + rc = -ENOMEM; + goto out; } -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) +EXPORT_SYMBOL_GPL(dccp_feat_clone); + +static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, + u8 *val, u8 len) { - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; + int rc = -ENOMEM; + u8 *copy = kmemdup(val, len, GFP_KERNEL); - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; /* fall through */ - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection - * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); + if (copy != NULL) { + rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); + if (rc) + kfree(copy); } - return 0; /* ignore FN options in all other states */ + return rc; } -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) +int dccp_feat_init(struct dccp_minisock *dmsk) { - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; + INIT_LIST_HEAD(&dmsk->dccpms_pending); + INIT_LIST_HEAD(&dmsk->dccpms_conf); - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + /* CCID L */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, + &dmsk->dccpms_tx_ccid, 1); if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || - ccid_get_builtin_ccids(&rx.val, &rx.len)) - return -ENOBUFS; - - /* Pre-load all CCID modules that are going to be advertised */ - rc = -EUNATCH; - if (ccid_request_modules(tx.val, tx.len)) - goto free_ccid_lists; - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; + goto out; - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + /* CCID R */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, + &dmsk->dccpms_rx_ccid, 1); if (rc) - goto free_ccid_lists; + goto out; - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); + /* Ack ratio */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, + &dmsk->dccpms_ack_ratio, 1); +out: return rc; } -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. - */ - if (cur->empty_confirm) - continue; +EXPORT_SYMBOL_GPL(dccp_feat_init); - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; +#ifdef CONFIG_IP_DCCP_DEBUG +const char *dccp_feat_typename(const u8 type) +{ + switch(type) { + case DCCPO_CHANGE_L: return("ChangeL"); + case DCCPO_CONFIRM_L: return("ConfirmL"); + case DCCPO_CHANGE_R: return("ChangeR"); + case DCCPO_CONFIRM_R: return("ConfirmR"); + /* the following case must not appear in feature negotation */ + default: dccp_pr_debug("unknown type %d [BUG!]\n", type); } + return NULL; +} - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). - */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } +EXPORT_SYMBOL_GPL(dccp_feat_typename); - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); +const char *dccp_feat_name(const u8 feat) +{ + static const char *feature_names[] = { + [DCCPF_RESERVED] = "Reserved", + [DCCPF_CCID] = "CCID", + [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", + [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", + [DCCPF_ECN_INCAPABLE] = "ECN Incapable", + [DCCPF_ACK_RATIO] = "Ack Ratio", + [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", + [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", + [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", + [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", + }; + if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) + return feature_names[DCCPF_RESERVED]; - dccp_pr_debug("Activation OK\n"); - return 0; + if (feat >= DCCPF_MIN_CCID_SPECIFIC) + return "CCID-specific"; -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; + return feature_names[feat]; } + +EXPORT_SYMBOL_GPL(dccp_feat_name); +#endif /* CONFIG_IP_DCCP_DEBUG */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 2217066e22d7..e272222c7ace 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -3,134 +3,38 @@ /* * net/dccp/feat.h * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker + * An implementation of the DCCP protocol * Copyright (c) 2005 Andrea Bittau * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ + #include #include "dccp.h" -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; +#ifdef CONFIG_IP_DCCP_DEBUG +extern const char *dccp_feat_typename(const u8 type); +extern const char *dccp_feat_name(const u8 feat); -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to perform feature negotiation - * @feat_num: one of %dccp_feature_numbers - * @val: feature's current value (SP features may have preference list) - * @state: feature's current state - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - u8 feat_num; - dccp_feat_val val; - enum dccp_feat_state state:8; - bool needs_mandatory:1, - needs_confirm:1, - empty_confirm:1, - is_local:1; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) +static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) { - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; + dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), + dccp_feat_name(feat), feat, val); } +#else +#define dccp_feat_debug(type, feat, val) +#endif /* CONFIG_IP_DCCP_DEBUG */ + +extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, + u8 *val, u8 len, gfp_t gfp); +extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len); +extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len); +extern void dccp_feat_clean(struct dccp_minisock *dmsk); +extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); +extern int dccp_feat_init(struct dccp_minisock *dmsk); -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -extern int dccp_feat_init(struct sock *sk); -extern void dccp_feat_initialise_sysctls(void); -extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); -extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. - */ -#define DCCP_OPTVAL_MAXLEN 6 - -extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); - -extern int dccp_insert_option_mandatory(struct sk_buff *skb); -extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c index df0e6714aa11..779d0ed9ae94 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -159,15 +159,13 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) +static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; + struct dccp_sock *dp = dccp_sk(sk); - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); + if (dccp_msk(sk)->dccpms_send_ack_vector) + dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) @@ -366,13 +364,22 @@ discard: int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len) { + struct dccp_sock *dp = dccp_sk(sk); + if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; - dccp_handle_ackvec_processing(sk, skb); + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); @@ -414,33 +421,40 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ if (dccp_parse_options(sk, NULL, skb)) - return 1; + goto out_invalid_packet; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto out_invalid_packet; /* FIXME: change error code */ + /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_update_gsr(sk, dp->dccps_isr); /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + * + * AWL was adjusted in dccp_v4_connect -acme */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_set_seqno(&dp->dccps_swl, + max48(dp->dccps_swl, dp->dccps_isr)); dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -461,15 +475,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, */ dccp_set_state(sk, DCCP_PARTOPEN); - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one ore more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state. - */ - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -504,16 +509,6 @@ out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, @@ -595,6 +590,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) return 1; + + /* FIXME: do congestion control initialization */ goto discard; } if (dh->dccph_type == DCCP_PKT_RESET) @@ -603,35 +600,29 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; - } else if (sk->sk_state == DCCP_CLOSED) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; } - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) - goto discard; + if (sk->sk_state != DCCP_REQUESTING) { + if (dccp_check_seqno(sk, skb)) + goto discard; - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } + /* + * Step 8: Process options and mark acknowledgeable + */ + if (dccp_parse_options(sk, NULL, skb)) + return 1; - /* Step 8: Process options */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; + if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; + + dccp_deliver_input_to_ccids(sk, skb); + } /* * Step 9: Process Reset @@ -640,22 +631,44 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return - */ + */ if (dh->dccph_type == DCCP_PKT_RESET) { dccp_rcv_reset(sk, skb); return 0; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_RESPONSE) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && + dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { if (dccp_rcv_close(sk, skb)) return 0; goto discard; } switch (sk->sk_state) { + case DCCP_CLOSED: + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; + case DCCP_REQUESTING: + /* FIXME: do congestion control initialization */ + queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; @@ -663,12 +676,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, __kfree_skb(skb); return 0; - case DCCP_PARTOPEN: - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - /* fall through */ case DCCP_RESPOND: + case DCCP_PARTOPEN: queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); break; @@ -707,7 +716,16 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) /* dccpor_elapsed_time is either zeroed out or set and > 0 */ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - return dccp_sane_rtt(delta); + if (unlikely(delta <= 0)) { + DCCP_WARN("unusable RTT sample %ld, using min\n", delta); + return DCCP_SANE_RTT_MIN; + } + if (unlikely(delta > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %ld too large, using max\n", delta); + return DCCP_SANE_RTT_MAX; + } + + return delta; } EXPORT_SYMBOL_GPL(dccp_sample_rtt); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b623f6b25482..882c5c4de69e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -545,7 +545,6 @@ out: static void dccp_v4_reqsk_destructor(struct request_sock *req) { - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); kfree(inet_rsk(req)->opt); } @@ -596,8 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; + dccp_reqsk_init(req, skb); dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ad6212e00435..5e1ee0da2c40 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -302,7 +302,6 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); if (inet6_rsk(req)->pktopts != NULL) kfree_skb(inet6_rsk(req)->pktopts); } @@ -425,8 +424,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; + dccp_reqsk_init(req, skb); dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index f4d9c8f60ede..b2804e2d1b8c 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,6 +42,16 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); +void dccp_minisock_init(struct dccp_minisock *dmsk) +{ + dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; + dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; + dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; + dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; + dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; + dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; +} + void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -102,9 +112,10 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); + const struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); + struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -114,32 +125,65 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - INIT_LIST_HEAD(&newdp->dccps_featneg); + if (dccp_feat_clone(sk, newsk)) + goto out_free; + + if (newdmsk->dccpms_send_ack_vector) { + newdp->dccps_hc_rx_ackvec = + dccp_ackvec_alloc(GFP_ATOMIC); + if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) + goto out_free; + } + + newdp->dccps_hc_rx_ccid = + ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, + newsk, GFP_ATOMIC); + newdp->dccps_hc_tx_ccid = + ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, + newsk, GFP_ATOMIC); + if (unlikely(newdp->dccps_hc_rx_ccid == NULL || + newdp->dccps_hc_tx_ccid == NULL)) { + dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); + ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); + ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); +out_free: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + /* * Step 3: Process LISTEN state * * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; + + /* See dccp_v4_conn_request */ + newdmsk->dccpms_sequence_window = req->rcv_wnd; + + newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; + dccp_update_gss(newsk, dreq->dreq_iss); + + newdp->dccps_isr = dreq->dreq_isr; + dccp_update_gsr(newsk, dreq->dreq_isr); /* - * Activate features: initialise CCIDs, sequence windows etc. + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. */ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } + dccp_set_seqno(&newdp->dccps_swl, + max48(newdp->dccps_swl, newdp->dccps_isr)); + dccp_set_seqno(&newdp->dccps_awl, + max48(newdp->dccps_awl, newdp->dccps_iss)); + dccp_init_xmit_timers(newsk); DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); @@ -260,17 +304,14 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) +void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) { struct dccp_request_sock *dreq = dccp_rsk(req); inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->acked = 0; + req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c index e5a32979d7d7..0809b63cb055 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,20 +23,23 @@ #include "dccp.h" #include "feat.h" -u64 dccp_decode_value_var(const u8 *bf, const u8 len) +int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; +int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; +int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; +int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; +int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; +int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; + +static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) { - u64 value = 0; + u32 value = 0; - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; if (len > 3) - value += ((u64)*bf++) << 24; + value += *bf++ << 24; if (len > 2) - value += ((u64)*bf++) << 16; + value += *bf++ << 16; if (len > 1) - value += ((u64)*bf++) << 8; + value += *bf++ << 8; if (len > 0) value += *bf; @@ -54,6 +57,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; + u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; const unsigned char *opt_end = (unsigned char *)dh + @@ -95,11 +99,18 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, } /* + * CCID-Specific Options (from RFC 4340, sec. 10.3): + * + * Option numbers 128 through 191 are for options sent from the + * HC-Sender to the HC-Receiver; option numbers 192 through 255 + * are for options sent from the HC-Receiver to the HC-Sender. + * * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. + * */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || + if (dreq != NULL && (opt >= 128 || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; @@ -120,13 +131,43 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), (unsigned long long)opt_recv->dccpor_ndp); break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ + case DCCPO_CHANGE_L: + /* fall through */ + case DCCPO_CHANGE_R: + if (pkt_type == DCCP_PKT_DATA) break; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; + if (len < 2) + goto out_invalid_option; + rc = dccp_feat_change_recv(sk, opt, *value, value + 1, + len - 1); + /* + * When there is a change error, change_recv is + * responsible for dealing with it. i.e. reply with an + * empty confirm. + * If the change was mandatory, then we need to die. + */ + if (rc && mandatory) + goto out_invalid_option; + break; + case DCCPO_CONFIRM_L: + /* fall through */ + case DCCPO_CONFIRM_R: + if (pkt_type == DCCP_PKT_DATA) + break; + if (len < 2) /* FIXME this disallows empty confirm */ + goto out_invalid_option; + if (dccp_feat_confirm_recv(sk, opt, *value, + value + 1, len - 1)) + goto out_invalid_option; + break; + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ + break; + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) + goto out_invalid_option; break; case DCCPO_TIMESTAMP: if (len != 4) @@ -154,8 +195,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); break; case DCCPO_TIMESTAMP_ECHO: if (len != 4 && len != 6 && len != 8) @@ -212,25 +251,23 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: + case 128 ... 191: { + const u16 idx = value - options; + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) + opt, len, idx, + value) != 0) goto out_invalid_option; + } break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - * Fall through. - */ - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: + case 192 ... 255: { + const u16 idx = value - options; + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) + opt, len, idx, + value) != 0) goto out_invalid_option; + } break; default: DCCP_CRIT("DCCP(%p): option %d(len=%d) not " @@ -252,10 +289,8 @@ out_nonsensical_length: out_invalid_option: DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; + DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; @@ -264,12 +299,9 @@ out_featneg_failed: EXPORT_SYMBOL_GPL(dccp_parse_options); -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) +static void dccp_encode_value_var(const u32 value, unsigned char *to, + const unsigned int len) { - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; if (len > 3) *to++ = (value & 0xFF000000) >> 24; if (len > 2) @@ -429,140 +461,92 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, return 0; } -static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) +static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const u16 buflen = dccp_ackvec_buflen(av); - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); - u16 len = buflen + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; + u8 *to; - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, - dccp_packet_name(dcb->dccpd_type)); + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { + DCCP_WARN("packet too small for feature %d option!\n", feat); return -1; } - /* - * Since Ack Vectors are variable-length, we can not always predict - * their size. To catch exception cases where the space is running out - * on the skb, a separate Sync is scheduled to carry the Ack Vector. - */ - if (len > DCCPAV_MIN_OPTLEN && - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " - "MPS=%u ==> reduce payload size?\n", len, skb->len, - dcb->dccpd_opt_len, dp->dccps_mss_cache); - dp->dccps_sync_scheduled = 1; - return 0; - } - dcb->dccpd_opt_len += len; - to = skb_push(skb, len); - len = buflen; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. - */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} + to = skb_push(skb, len + 3); + *to++ = type; + *to++ = len + 3; + *to++ = feat; -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; + if (len) + memcpy(to, val, len); - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *skb_push(skb, 1) = DCCPO_MANDATORY; + dccp_pr_debug("%s(%s (%d), ...), length %d\n", + dccp_feat_typename(type), + dccp_feat_name(feat), feat, len); return 0; } -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. - */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) +static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) { - u8 tot_len, *to; + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); + struct dccp_opt_pend *opt, *next; + int change = 0; + + /* confirm any options [NN opts] */ + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { + dccp_insert_feat_opt(skb, opt->dccpop_type, + opt->dccpop_feat, opt->dccpop_val, + opt->dccpop_len); + /* fear empty confirms */ + if (opt->dccpop_val) + kfree(opt->dccpop_val); + kfree(opt); + } + INIT_LIST_HEAD(&dmsk->dccpms_conf); + + /* see which features we need to send */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + /* see if we need to send any confirm */ + if (opt->dccpop_sc) { + dccp_insert_feat_opt(skb, opt->dccpop_type + 1, + opt->dccpop_feat, + opt->dccpop_sc->dccpoc_val, + opt->dccpop_sc->dccpoc_len); + + BUG_ON(!opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + opt->dccpop_sc = NULL; + } - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; + /* any option not confirmed, re-send it */ + if (!opt->dccpop_conf) { + dccp_insert_feat_opt(skb, opt->dccpop_type, + opt->dccpop_feat, opt->dccpop_val, + opt->dccpop_len); + change++; + } } - if (unlikely(val == NULL || len == 0)) - len = repeat_first = 0; - tot_len = 3 + repeat_first + len; + /* Retransmit timer. + * If this is the master listening sock, we don't set a timer on it. It + * should be fine because if the dude doesn't receive our RESPONSE + * [which will contain the CHANGE] he will send another REQUEST which + * will "retrnasmit" the change. + */ + if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { + dccp_pr_debug("reset feat negotiation timer %p\n", sk); - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; + /* XXX don't reset the timer on re-transmissions. I.e. reset it + * only when sending new stuff i guess. Currently the timer + * never backs off because on re-transmission it just resets it! + */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); return 0; } @@ -581,30 +565,19 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) + if (dmsk->dccpms_send_ndp_count && + dccp_insert_option_ndp(sk, skb)) return -1; - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) + if (!dccp_packet_without_ack(skb)) { + if (dmsk->dccpms_send_ack_vector && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && + dccp_insert_option_ackvec(sk, skb)) return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. - * This is currently used in CCID 3 initialisation. - */ - if (dccp_insert_option_timestamp(sk, skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } } if (dp->dccps_hc_rx_insert_options) { @@ -613,6 +586,21 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dp->dccps_hc_rx_insert_options = 0; } + /* Feature negotiation */ + /* Data packets can't do feat negotiation */ + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && + DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && + dccp_insert_options_feat(sk, skb)) + return -1; + + /* + * Obtain RTT sample from Request/Response exchange. + * This is currently used in CCID 3 initialisation. + */ + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && + dccp_insert_option_timestamp(sk, skb)) + return -1; + if (dp->dccps_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(dp, NULL, skb)) return -1; @@ -625,9 +613,6 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) { DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - if (dreq->dreq_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(NULL, dreq, skb)) return -1; diff --git a/net/dccp/output.c b/net/dccp/output.c index 2532797a8009..d06945c7d3df 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -26,13 +26,11 @@ static inline void dccp_event_ack_sent(struct sock *sk) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) +static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) { skb_set_owner_w(skb, sk); WARN_ON(sk->sk_send_head); sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); } /* @@ -163,27 +161,21 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; + int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; /* Account for header lengths and IPv4/v6 option overhead */ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) + * FIXME: this should come from the CCID infrastructure, where, say, + * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets + * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED + * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to + * make it a multiple of 4 */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); + + cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -208,158 +200,95 @@ void dccp_write_space(struct sock *sk) } /** - * dccp_wait_for_ccid - Await CCID send permission + * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet * @sk: socket to wait for - * @delay: timeout in jiffies - * This is used by CCIDs which need to delay the send time in process context. + * @skb: current skb to pass on for waiting + * @delay: sleep timeout in milliseconds (> 0) + * This function is called by default when the socket is closed, and + * when a non-zero linger time is set on the socket. For consistency */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) +static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) { + struct dccp_sock *dp = dccp_sk(sk); DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); + unsigned long jiffdelay; + int rc; - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk->sk_sleep, &wait); + do { + dccp_pr_debug("delayed send by %d msec\n", delay); + jiffdelay = msecs_to_jiffies(delay); - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - if (unlikely(skb == NULL)) - return; - len = skb->len; + sk->sk_write_pending++; + release_sock(sk); + schedule_timeout(jiffdelay); + lock_sock(sk); + sk->sk_write_pending--; - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } + if (sk->sk_err) + goto do_error; + if (signal_pending(current)) + goto do_interrupted; - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. - */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + } while ((delay = rc) > 0); +out: + finish_wait(sk->sk_sleep, &wait); + return rc; + +do_error: + rc = -EPIPE; + goto out; +do_interrupted: + rc = -EINTR; + goto out; } -/** - * dccp_flush_write_queue - Drain queue at end of connection - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) +void dccp_write_xmit(struct sock *sk, int block) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); + while ((skb = skb_peek(&sk->sk_write_queue))) { + int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + + if (err > 0) { + if (!block) { + sk_reset_timer(sk, &dp->dccps_xmit_timer, + msecs_to_jiffies(err)+jiffies); + break; + } else + err = dccp_wait_for_ccid(sk, skb, err); + if (err && err != -EINTR) + DCCP_BUG("err=%d after dccp_wait_for_ccid", err); } - } -} -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; + skb_dequeue(&sk->sk_write_queue); + if (err == 0) { + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const int len = skb->len; - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); + if (sk->sk_state == DCCP_PARTOPEN) { + /* See 8.1.5. Handshake Completion */ + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + dcb->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) + dcb->dccpd_type = DCCP_PKT_DATAACK; + else + dcb->dccpd_type = DCCP_PKT_DATA; + + err = dccp_transmit_skb(sk, skb); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + if (err) + DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", + err); + } else { + dccp_pr_debug("packet discarded due to err=%d\n", err); + kfree_skb(skb); } } } @@ -410,12 +339,10 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; + if (dccp_insert_options_rsk(dreq, skb)) { + kfree_skb(skb); + return NULL; + } /* Build and checksum header */ dh = dccp_zeroed_hdr(skb, dccp_header_size); @@ -436,9 +363,6 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, inet_rsk(req)->acked = 1; DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; -response_failed: - kfree_skb(skb); - return NULL; } EXPORT_SYMBOL_GPL(dccp_make_response); @@ -523,9 +447,8 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) /* * Do all connect socket setups that can be done AF independent. */ -int dccp_connect(struct sock *sk) +static inline void dccp_connect_init(struct sock *sk) { - struct sk_buff *skb; struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -535,13 +458,19 @@ int dccp_connect(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ dp->dccps_gar = dp->dccps_iss; + icsk->icsk_retransmits = 0; +} + +int dccp_connect(struct sock *sk) +{ + struct sk_buff *skb; + struct inet_connection_sock *icsk = inet_csk(sk); + + dccp_connect_init(sk); + skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); if (unlikely(skb == NULL)) return -ENOBUFS; @@ -551,11 +480,11 @@ int dccp_connect(struct sock *sk) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); return 0; @@ -642,12 +571,6 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. - */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - dccp_transmit_skb(sk, skb); } @@ -676,7 +599,9 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - skb = dccp_skb_entail(sk, skb); + dccp_write_xmit(sk, 1); + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, prio)); /* * Retransmission timer for active-close: RFC 4340, 8.3 requires * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ @@ -689,6 +614,6 @@ void dccp_send_close(struct sock *sk, const int active) */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); + } else + dccp_transmit_skb(sk, skb); } diff --git a/net/dccp/probe.c b/net/dccp/probe.c index eaa59d82ab0f..81368a7f5379 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -46,54 +46,75 @@ static struct { struct kfifo *fifo; spinlock_t lock; wait_queue_head_t wait; - ktime_t start; + struct timespec tstart; } dccpw; -static void jdccp_write_xmit(struct sock *sk) +static void printl(const char *fmt, ...) { - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hctx = NULL; - struct timespec tv; - char buf[256]; - int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); + va_list args; + int len; + struct timespec now; + char tbuf[256]; - if (ccid == DCCPC_CCID3) - hctx = ccid3_hc_tx_sk(sk); + va_start(args, fmt); + getnstimeofday(&now); - if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { + now = timespec_sub(now, dccpw.tstart); - tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); - len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", - (unsigned long)tv.tv_sec, - (unsigned long)tv.tv_nsec, - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_nsec / NSEC_PER_USEC); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + kfifo_put(dccpw.fifo, tbuf, len); + wake_up(&dccpw.wait); +} + +static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct dccp_minisock *dmsk = dccp_msk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ccid3_hc_tx_sock *hctx; + + if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) + hctx = ccid3_hc_tx_sk(sk); + else + hctx = NULL; + + if (port == 0 || ntohs(inet->dport) == port || + ntohs(inet->sport) == port) { if (hctx) - len += sprintf(buf + len, " %d %d %d %u %u %u %d", - hctx->s, hctx->rtt, hctx->p, hctx->x_calc, - (unsigned)(hctx->x_recv >> 6), - (unsigned)(hctx->x >> 6), hctx->t_ipi); - - len += sprintf(buf + len, "\n"); - kfifo_put(dccpw.fifo, buf, len); - wake_up(&dccpw.wait); + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " + "%llu %llu %d\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size, + hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, + hctx->ccid3hctx_x_recv >> 6, + hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); + else + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size); } jprobe_return(); + return 0; } static struct jprobe dccp_send_probe = { .kp = { - .symbol_name = "dccp_write_xmit", + .symbol_name = "dccp_sendmsg", }, - .entry = jdccp_write_xmit, + .entry = jdccp_sendmsg, }; static int dccpprobe_open(struct inode *inode, struct file *file) { kfifo_reset(dccpw.fifo); - dccpw.start = ktime_get(); + getnstimeofday(&dccpw.tstart); return 0; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ecf3be961e11..d0bd34819761 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -67,9 +67,6 @@ void dccp_set_state(struct sock *sk, const int state) case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: @@ -178,25 +175,63 @@ EXPORT_SYMBOL_GPL(dccp_state_name); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + dccp_minisock_init(&dp->dccps_minisock); + icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = TCP_MIN_RCVMSS; + dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; + dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; dccp_init_xmit_timers(sk); - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); + /* + * FIXME: We're hardcoding the CCID, and doing this at this point makes + * the listening (master) sock get CCID control blocks, which is not + * necessary, but for now, to not mess with the test userspace apps, + * lets leave it here, later the real solution is to do this in a + * setsockopt(CCIDs-I-want/accept). -acme + */ + if (likely(ctl_sock_initialized)) { + int rc = dccp_feat_init(dmsk); + + if (rc) + return rc; + + if (dmsk->dccpms_send_ack_vector) { + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); + if (dp->dccps_hc_rx_ackvec == NULL) + return -ENOMEM; + } + dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, + sk, GFP_KERNEL); + dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, + sk, GFP_KERNEL); + if (unlikely(dp->dccps_hc_rx_ccid == NULL || + dp->dccps_hc_tx_ccid == NULL)) { + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + if (dmsk->dccpms_send_ack_vector) { + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + } + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + return -ENOMEM; + } + } else { + /* control socket doesn't need feat nego */ + INIT_LIST_HEAD(&dmsk->dccpms_pending); + INIT_LIST_HEAD(&dmsk->dccpms_conf); + } + return 0; } @@ -205,6 +240,7 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -222,7 +258,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dp->dccps_hc_rx_ackvec != NULL) { + if (dmsk->dccpms_send_ack_vector) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } @@ -231,7 +267,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); + dccp_feat_clean(dmsk); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); @@ -241,9 +277,6 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) - return -EPROTO; return inet_csk_listen_start(sk, backlog); } @@ -433,70 +466,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) +/* byte 1 is feature. the rest is the preference list */ +static int dccp_setsockopt_change(struct sock *sk, int type, + struct dccp_so_feat __user *optval) { - u8 *list, len; - int i, rc; + struct dccp_so_feat opt; + u8 *val; + int rc; - if (cscov < 0 || cscov > 15) - return -EINVAL; + if (copy_from_user(&opt, optval, sizeof(opt))) + return -EFAULT; /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. + * rfc4340: 6.1. Change Options */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - char __user *optval, int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + if (opt.dccpsf_len < 1) return -EINVAL; - val = kmalloc(optlen, GFP_KERNEL); - if (val == NULL) + val = kmalloc(opt.dccpsf_len, GFP_KERNEL); + if (!val) return -ENOMEM; - if (copy_from_user(val, optval, optlen)) { - kfree(val); - return -EFAULT; + if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { + rc = -EFAULT; + goto out_free_val; } - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, + val, opt.dccpsf_len, GFP_KERNEL); + if (rc) + goto out_free_val; - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); +out: + return rc; +out_free_val: kfree(val); - return rc; + goto out; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, @@ -505,21 +510,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) + if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -530,38 +521,53 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); + err = 0; + break; + case DCCP_SOCKOPT_CHANGE_L: + if (optlen != sizeof(struct dccp_so_feat)) + err = -EINVAL; + else + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, + (struct dccp_so_feat __user *) + optval); + break; + case DCCP_SOCKOPT_CHANGE_R: + if (optlen != sizeof(struct dccp_so_feat)) + err = -EINVAL; + else + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, + (struct dccp_so_feat __user *) + optval); + break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) + case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ + if (val < 0 || val > 15) err = -EINVAL; else - dp->dccps_qpolicy = val; + dp->dccps_pcslen = val; break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) + case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ + if (val < 0 || val > 15) err = -EINVAL; - else - dp->dccps_tx_qlen = val; + else { + dp->dccps_pcrlen = val; + /* FIXME: add feature negotiation, + * ChangeL(MinimumChecksumCoverage, val) */ + } break; default: err = -ENOPROTOOPT; break; } - release_sock(sk); + release_sock(sk); return err; } @@ -642,18 +648,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; @@ -663,12 +657,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); @@ -711,47 +699,6 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); #endif -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { - - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -767,7 +714,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - if (dccp_qpolicy_full(sk)) { + if (sysctl_dccp_tx_qlen && + (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { rc = -EAGAIN; goto out_release; } @@ -795,12 +743,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - dccp_write_xmit(sk); + skb_queue_tail(&sk->sk_write_queue, skb); + dccp_write_xmit(sk,0); out_release: release_sock(sk); return rc ? : len; @@ -1023,22 +967,9 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - sk_stream_wait_close(sk, timeout); adjudge_to_death: diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 27383f88c75f..000000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License v2 - * as published by the Free Software Foundation. - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. - */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - */ -static struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; - -} qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - /* Clear any skb fields that we used internally */ - skb->priority = 0; - - if (skb) - skb_unlink(skb, &sk->sk_write_queue); - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index a5a1856234e7..21295993fdb8 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -18,72 +18,76 @@ #error This file should not be compiled without CONFIG_SYSCTL defined #endif -/* Boundary values */ -static int zero = 0, - u8_max = 0xFF; -static unsigned long seqw_min = 32; - static struct ctl_table dccp_default_table[] = { { .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), + .data = &sysctl_dccp_feat_sequence_window, + .maxlen = sizeof(sysctl_dccp_feat_sequence_window), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ + .proc_handler = proc_dointvec, }, { .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), + .data = &sysctl_dccp_feat_rx_ccid, + .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, /* RFC 4340, 10. */ + .proc_handler = proc_dointvec, }, { .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), + .data = &sysctl_dccp_feat_tx_ccid, + .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ack_ratio", + .data = &sysctl_dccp_feat_ack_ratio, + .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "send_ackvec", + .data = &sysctl_dccp_feat_send_ack_vector, + .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "send_ndp", + .data = &sysctl_dccp_feat_send_ndp_count, + .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, /* RFC 4340, 10. */ + .proc_handler = proc_dointvec, }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "retries1", .data = &sysctl_dccp_retries1, .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "retries2", .data = &sysctl_dccp_retries2, .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "tx_qlen", .data = &sysctl_dccp_tx_qlen, .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_dointvec, }, { .procname = "sync_ratelimit", diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 16359e29e7f5..54b3c7e9e016 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -87,6 +87,17 @@ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + /* retransmit timer is used for feature negotiation throughout + * connection. In this case, no packet is re-transmitted, but rather an + * ack is generated and pending changes are placed into its options. + */ + if (sk->sk_send_head == NULL) { + dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); + if (sk->sk_state == DCCP_OPEN) + dccp_send_ack(sk); + goto backoff; + } + /* * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. @@ -115,6 +126,7 @@ static void dccp_retransmit_timer(struct sock *sk) return; } +backoff: icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); @@ -237,35 +249,32 @@ out: sock_put(sk); } -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * See the comments above %ccid_dequeueing_decision for supported modes. - */ -static void dccp_write_xmitlet(unsigned long data) +/* Transmit-delay timer: used by the CCIDs to delay actual send time */ +static void dccp_write_xmit_timer(unsigned long data) { struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); + sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); else - dccp_write_xmit(sk); + dccp_write_xmit(sk, 0); bh_unlock_sock(sk); + sock_put(sk); } -static void dccp_write_xmit_timer(unsigned long data) +static void dccp_init_write_xmit_timer(struct sock *sk) { - dccp_write_xmitlet(data); - sock_put((struct sock *)data); + struct dccp_sock *dp = dccp_sk(sk); + + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); } void dccp_init_xmit_timers(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_init_write_xmit_timer(sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } @@ -281,7 +290,8 @@ u32 dccp_timestamp(void) { s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - return div_u64(delta, DCCP_TIME_RESOLUTION); + do_div(delta, 10); + return delta; } EXPORT_SYMBOL_GPL(dccp_timestamp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9da9f19ece8a..f79a51607292 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -811,12 +811,25 @@ void tcp_update_metrics(struct sock *sk) } } +/* Numbers are taken from RFC3390. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than use a multiplier in the relevant range. + */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); + if (!cwnd) { + if (tp->mss_cache > 1460) + cwnd = 2; + else + cwnd = (tp->mss_cache > 1095) ? 3 : 4; + } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -- cgit v1.2.3