From b4eec206370b7154dc354dc30f0a3f02ea8468b2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Implement lookup table for feature-negotiation information A lookup table for feature-negotiation information, extracted from RFC 4340/42, is provided by this patch. All currently known features can be found in this table, along with their feature location, their default value, and type. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6080449fbec9..3978aff197d9 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -176,19 +176,20 @@ enum { }; /* DCCP features (RFC 4340 section 6.4) */ -enum { +enum dccp_feature_numbers { DCCPF_RESERVED = 0, DCCPF_CCID = 1, - DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ + DCCPF_SHORT_SEQNOS = 2, DCCPF_SEQUENCE_WINDOW = 3, - DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ + DCCPF_ECN_INCAPABLE = 4, DCCPF_ACK_RATIO = 5, DCCPF_SEND_ACK_VECTOR = 6, DCCPF_SEND_NDP_COUNT = 7, DCCPF_MIN_CSUM_COVER = 8, - DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ + DCCPF_DATA_CHECKSUM = 9, /* 10-127 reserved */ DCCPF_MIN_CCID_SPECIFIC = 128, + DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ DCCPF_MAX_CCID_SPECIFIC = 255, }; -- cgit v1.2.3 From 828755cee087e4a34f45d6c9db661ccd0631cc6d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Per-socket initialisation of feature negotiation This provides feature-negotiation initialisation for both DCCP sockets and DCCP request_sockets, to support feature negotiation during connection setup. It also resolves a FIXME regarding the congestion control initialisation. Thanks to Wei Yongjun for help with the IPv6 side of this patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 4 ++++ net/dccp/dccp.h | 3 ++- net/dccp/feat.c | 19 +++++++++++++++++++ net/dccp/feat.h | 1 + net/dccp/input.c | 2 -- net/dccp/ipv4.c | 3 ++- net/dccp/ipv6.c | 3 ++- net/dccp/minisocks.c | 7 ++++++- net/dccp/proto.c | 1 + 9 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 3978aff197d9..484b8a1fb023 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -412,6 +412,7 @@ extern void dccp_minisock_init(struct dccp_minisock *dmsk); * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) + * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -421,6 +422,7 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; + struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -498,6 +500,7 @@ struct dccp_ackvec; * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) * @dccps_minisock - associated minisock (accessed via dccp_msk) + * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) @@ -535,6 +538,7 @@ struct dccp_sock { __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; + struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b4bc6e095a0e..ab096c06bba0 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -252,7 +252,8 @@ extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); -extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); +extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, + struct sk_buff const *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 2ec2cd117699..faade82856fe 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -279,6 +279,25 @@ void dccp_feat_list_purge(struct list_head *fn_list) } EXPORT_SYMBOL_GPL(dccp_feat_list_purge); +/* generate @to as full clone of @from - @to must not contain any nodes */ +int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) +{ + struct dccp_feat_entry *entry, *new; + + INIT_LIST_HEAD(to); + list_for_each_entry(entry, from, node) { + new = dccp_feat_clone_entry(entry); + if (new == NULL) + goto cloning_failed; + list_add_tail(&new->node, to); + } + return 0; + +cloning_failed: + dccp_feat_list_purge(to); + return -ENOMEM; +} + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, u8 *val, u8 len, gfp_t gfp) { diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 94203c230c65..7e953fd0a79b 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -95,6 +95,7 @@ extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len); extern void dccp_feat_clean(struct dccp_minisock *dmsk); extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); +extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); extern int dccp_feat_init(struct dccp_minisock *dmsk); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c index 779d0ed9ae94..3070015edc75 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -590,8 +590,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) return 1; - - /* FIXME: do congestion control initialization */ goto discard; } if (dh->dccph_type == DCCP_PKT_RESET) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 882c5c4de69e..0ce84ea89119 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -595,7 +595,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5e1ee0da2c40..33e8a1ea3041 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -424,7 +424,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b2804e2d1b8c..e487133ae079 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -125,6 +125,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; newicsk->icsk_rto = DCCP_TIMEOUT_INIT; + INIT_LIST_HEAD(&newdp->dccps_featneg); if (dccp_feat_clone(sk, newsk)) goto out_free; @@ -304,7 +305,8 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); -void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) +int dccp_reqsk_init(struct request_sock *req, + struct dccp_sock const *dp, struct sk_buff const *skb) { struct dccp_request_sock *dreq = dccp_rsk(req); @@ -312,6 +314,9 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) inet_rsk(req)->acked = 0; req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; + + /* inherit feature negotiation options from listening socket */ + return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d0bd34819761..1cdf4ae99605 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -193,6 +193,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dccp_init_xmit_timers(sk); + INIT_LIST_HEAD(&dp->dccps_featneg); /* * FIXME: We're hardcoding the CCID, and doing this at this point makes * the listening (master) sock get CCID control blocks, which is not -- cgit v1.2.3 From 71bb49596bbf4e5a3328e1704d18604e822ba181 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Query supported CCIDs This provides a data structure to record which CCIDs are locally supported and three accessor functions: - a test function for internal use which is used to validate CCID requests made by the user; - a copy function so that the list can be used for feature-negotiation; - documented getsockopt() support so that the user can query capabilities. The data structure is a table which is filled in at compile-time with the list of available CCIDs (which in turn depends on the Kconfig choices). Using the copy function for cloning the list of supported CCIDs is useful for feature negotiation, since the negotiation is now with the full list of available CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation will not fail if the peer requests to use CCID3 instead of CCID2. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 4 ++++ include/linux/dccp.h | 1 + net/dccp/ccid.c | 48 +++++++++++++++++++++++++++++++++++++++ net/dccp/ccid.h | 5 ++++ net/dccp/feat.c | 4 ++++ net/dccp/proto.c | 2 ++ 6 files changed, 64 insertions(+) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 39131a3c78f8..f0aeb20fa63b 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -57,6 +57,10 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. +DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs +supported by the endpoint (see include/linux/dccp.h for symbolic constants). +The caller needs to provide a sufficiently large (> 2) array of type uint8_t. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 484b8a1fb023..d3ac1bde60b4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -209,6 +209,7 @@ struct dccp_so_feat { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 +#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 4809753d12ae..f72ca83df552 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -13,6 +13,13 @@ #include "ccid.h" +static u8 builtin_ccids[] = { + DCCPC_CCID2, /* CCID2 is supported by default */ +#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) + DCCPC_CCID3, +#endif +}; + static struct ccid_operations *ccids[CCID_MAX]; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t ccids_lockct = ATOMIC_INIT(0); @@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) } } +/* check that up to @array_len members in @ccid_array are supported */ +bool ccid_support_check(u8 const *ccid_array, u8 array_len) +{ + u8 i, j, found; + + for (i = 0, found = 0; i < array_len; i++, found = 0) { + for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) + found = (ccid_array[i] == builtin_ccids[j]); + if (!found) + return false; + } + return true; +} + +/** + * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array + * @ccid_array: pointer to copy into + * @array_len: value to return length into + * This function allocates memory - caller must see that it is freed after use. + */ +int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) +{ + *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); + if (*ccid_array == NULL) + return -ENOBUFS; + *array_len = ARRAY_SIZE(builtin_ccids); + return 0; +} + +int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + if (len < sizeof(builtin_ccids)) + return -EINVAL; + + if (put_user(sizeof(builtin_ccids), optlen) || + copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) + return -EFAULT; + return 0; +} + int ccid_register(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index fdeae7b57319..259f5469d7d0 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -103,6 +103,11 @@ static inline void *ccid_priv(const struct ccid *ccid) return (void *)ccid->ccid_priv; } +extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); +extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); +extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *, int __user *); + extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index b859722fba72..9399554878cc 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -383,6 +383,10 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) return -EINVAL; + /* Avoid negotiating alien CCIDs by only advertising supported ones */ + if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) + return -EOPNOTSUPP; + if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) return -ENOMEM; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 01332fe7a99a..b4b10cbd8880 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -649,6 +649,8 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; break; + case DCCP_SOCKOPT_AVAILABLE_CCIDS: + return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; -- cgit v1.2.3 From 668144f7b41716a9efe1b398e15ead32a26cd101 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Deprecate old setsockopt framework The previous setsockopt interface, which passed socket options via struct dccp_so_feat, is complicated/difficult to use. Continuing to support it leads to ugly code since the old approach did not distinguish between NN and SP values. This patch removes the old setsockopt interface and replaces it with two new functions to register NN/SP values for feature negotiation. These are essentially wrappers around the internal __feat_register functions, with checking added to avoid * wrong usage (type); * changing values while the connection is in progress. Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 7 ----- net/dccp/feat.c | 72 ++++++++++++++++++++-------------------------------- net/dccp/feat.h | 5 ++-- net/dccp/proto.c | 53 ++------------------------------------ 4 files changed, 32 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index d3ac1bde60b4..6eaaca9b037a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -193,13 +193,6 @@ enum dccp_feature_numbers { DCCPF_MAX_CCID_SPECIFIC = 255, }; -/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ -struct dccp_so_feat { - __u8 dccpsf_feat; - __u8 __user *dccpsf_val; - __u8 dccpsf_len; -}; - /* DCCP socket options */ #define DCCP_SOCKOPT_PACKET_SIZE 1 /* XXX deprecated, without effect */ #define DCCP_SOCKOPT_SERVICE 2 diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 6852960bb0a9..44b10afd3fb6 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -393,53 +393,35 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); } -int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, - u8 *val, u8 len, gfp_t gfp) -{ - struct dccp_opt_pend *opt; - - dccp_feat_debug(type, feature, *val); - - if (len > 3) { - DCCP_WARN("invalid length %d\n", len); +/** + * dccp_feat_register_sp - Register requests to change SP feature values + * @sk: client or listening socket + * @feat: one of %dccp_feature_numbers + * @is_local: whether the local (1) or remote (0) @feat is meant + * @list: array of preferred values, in descending order of preference + * @len: length of @list in bytes + */ +int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len) +{ /* any changes must be registered before establishing the connection */ + if (sk->sk_state != DCCP_CLOSED) + return -EISCONN; + if (dccp_feat_type(feat) != FEAT_SP) return -EINVAL; - } - /* XXX add further sanity checks */ - - /* check if that feature is already being negotiated */ - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - /* ok we found a negotiation for this option already */ - if (opt->dccpop_feat == feature && opt->dccpop_type == type) { - dccp_pr_debug("Replacing old\n"); - /* replace */ - BUG_ON(opt->dccpop_val == NULL); - kfree(opt->dccpop_val); - opt->dccpop_val = val; - opt->dccpop_len = len; - opt->dccpop_conf = 0; - return 0; - } - } - - /* negotiation for a new feature */ - opt = kmalloc(sizeof(*opt), gfp); - if (opt == NULL) - return -ENOMEM; - - opt->dccpop_type = type; - opt->dccpop_feat = feature; - opt->dccpop_len = len; - opt->dccpop_val = val; - opt->dccpop_conf = 0; - opt->dccpop_sc = NULL; - - BUG_ON(opt->dccpop_val == NULL); - - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); - return 0; + return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, + 0, list, len); } -EXPORT_SYMBOL_GPL(dccp_feat_change); +/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ +int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) +{ + /* any changes must be registered before establishing the connection */ + if (sk->sk_state != DCCP_CLOSED) + return -EISCONN; + if (dccp_feat_type(feat) != FEAT_NN) + return -EINVAL; + return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); +} /* * Tracking features whose value depend on the choice of CCID @@ -1137,7 +1119,7 @@ int dccp_feat_init(struct sock *sk) /* Ack ratio */ rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, - dmsk->dccpms_ack_ratio); + dp->dccps_l_ack_ratio); out: return rc; } diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 9eefdb43783e..2c92bd18e5f4 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -110,8 +110,9 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #define dccp_feat_debug(type, feat, val) #endif /* CONFIG_IP_DCCP_DEBUG */ -extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, - u8 *val, u8 len, gfp_t gfp); +extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len); +extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len); extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 46cb3490d48e..108d56bd25c5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -470,44 +470,6 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } -/* byte 1 is feature. the rest is the preference list */ -static int dccp_setsockopt_change(struct sock *sk, int type, - struct dccp_so_feat __user *optval) -{ - struct dccp_so_feat opt; - u8 *val; - int rc; - - if (copy_from_user(&opt, optval, sizeof(opt))) - return -EFAULT; - /* - * rfc4340: 6.1. Change Options - */ - if (opt.dccpsf_len < 1) - return -EINVAL; - - val = kmalloc(opt.dccpsf_len, GFP_KERNEL); - if (!val) - return -ENOMEM; - - if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { - rc = -EFAULT; - goto out_free_val; - } - - rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, - val, opt.dccpsf_len, GFP_KERNEL); - if (rc) - goto out_free_val; - -out: - return rc; - -out_free_val: - kfree(val); - goto out; -} - static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -530,20 +492,9 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, err = 0; break; case DCCP_SOCKOPT_CHANGE_L: - if (optlen != sizeof(struct dccp_so_feat)) - err = -EINVAL; - else - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, - (struct dccp_so_feat __user *) - optval); - break; case DCCP_SOCKOPT_CHANGE_R: - if (optlen != sizeof(struct dccp_so_feat)) - err = -EINVAL; - else - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, - (struct dccp_so_feat __user *) - optval); + DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); + err = 0; break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) -- cgit v1.2.3 From 20f41eee82864e308a5499308a1722dc3181cc3a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Feature negotiation for minimum-checksum-coverage This provides feature negotiation for server minimum checksum coverage which so far has been missing. Since sender/receiver coverage values range only from 0...15, their type has also been reduced in size from u16 to u4. Feature-negotiation options are now generated for both sender and receiver coverage, i.e. when the peer has `forgotten' to enable partial coverage then feature negotiation will automatically enable (negotiate) the partial coverage value for this connection. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 4 ++-- net/dccp/proto.c | 53 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6eaaca9b037a..5a5a89935dbc 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -527,8 +527,8 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; - __u16 dccps_pcslen; - __u16 dccps_pcrlen; + __u8 dccps_pcslen:4; + __u8 dccps_pcrlen:4; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 108d56bd25c5..47b137a3feaf 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -470,6 +470,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } +static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) +{ + u8 *list, len; + int i, rc; + + if (cscov < 0 || cscov > 15) + return -EINVAL; + /* + * Populate a list of permissible values, in the range cscov...15. This + * is necessary since feature negotiation of single values only works if + * both sides incidentally choose the same value. Since the list starts + * lowest-value first, negotiation will pick the smallest shared value. + */ + if (cscov == 0) + return 0; + len = 16 - cscov; + + list = kmalloc(len, GFP_KERNEL); + if (list == NULL) + return -ENOBUFS; + + for (i = 0; i < len; i++) + list[i] = cscov++; + + rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); + + if (rc == 0) { + if (rx) + dccp_sk(sk)->dccps_pcrlen = cscov; + else + dccp_sk(sk)->dccps_pcslen = cscov; + } + kfree(list); + return rc; +} + static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -502,20 +538,11 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, else dp->dccps_server_timewait = (val != 0); break; - case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ - if (val < 0 || val > 15) - err = -EINVAL; - else - dp->dccps_pcslen = val; + case DCCP_SOCKOPT_SEND_CSCOV: + err = dccp_setsockopt_cscov(sk, val, false); break; - case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ - if (val < 0 || val > 15) - err = -EINVAL; - else { - dp->dccps_pcrlen = val; - /* FIXME: add feature negotiation, - * ChangeL(MinimumChecksumCoverage, val) */ - } + case DCCP_SOCKOPT_RECV_CSCOV: + err = dccp_setsockopt_cscov(sk, val, true); break; default: err = -ENOPROTOOPT; -- cgit v1.2.3 From 17c30b40ed79e9f3955e884632c8f01e577b204a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Deprecate Ack Ratio sysctl This patch deprecates the Ack Ratio sysctl, since * Ack Ratio is entirely ignored by CCID-3 and CCID-4, * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1); * even if it would work in CCID-2, there is no point for a user to change it: - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2), - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts (since waiting for Acks which will never arrive in this window), - cwnd is not a user-configurable value. The only reasonable place for Ack Ratio is to print it for debugging. It is planned to do this later on, as part of e.g. dccp_probe. With this patch Ack Ratio is now under full control of feature negotiation: * Ack Ratio is resolved as a dependency of the selected CCID; * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to the default of 2, following RFC 4340, 11.3 - "New connections start with Ack Ratio 2 for both endpoints"; * what happens then is part of another patch set, since it concerns the dynamic update of Ack Ratio while the connection is in full flight. Thanks to Tomasz Grobelny for discussion leading up to this patch. Signed-off-by: Gerrit Renker Acked-by: Arnaldo Carvalho de Melo --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 2 -- net/dccp/dccp.h | 1 - net/dccp/minisocks.c | 1 - net/dccp/options.c | 1 - net/dccp/sysctl.c | 7 ------- 6 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index f0aeb20fa63b..43df4487379b 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -125,9 +125,6 @@ send_ndp = 1 send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). -ack_ratio = 2 - The default Ack Ratio (sec. 11.3) to use. - tx_ccid = 2 Default CCID for the sender-receiver half-connection. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 5a5a89935dbc..eda389ce04f4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -368,7 +368,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) - * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ @@ -378,7 +377,6 @@ struct dccp_minisock { __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; - __u8 dccpms_ack_ratio; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e656dafb5d96..031ce350d3c1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_ack_ratio; extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index e487133ae079..ee7f40f8ddfa 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -47,7 +47,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; - dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } diff --git a/net/dccp/options.c b/net/dccp/options.c index 67a171a1268c..515ad45013ad 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 21295993fdb8..f6e54f433e29 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "ack_ratio", - .data = &sysctl_dccp_feat_ack_ratio, - .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "send_ackvec", .data = &sysctl_dccp_feat_send_ack_vector, -- cgit v1.2.3 From fade756f18d42694e3acb00e3471ab43002cba16 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Set per-connection CCIDs via socket options With this patch, TX/RX CCIDs can now be changed on a per-connection basis, which overrides the defaults set by the global sysctl variables for TX/RX CCIDs. To make full use of this facility, the remaining patches of this patch set are needed, which track dependencies and activate negotiated feature values. Note on the maximum number of CCIDs that can be registered: ----------------------------------------------------------- The maximum number of CCIDs that can be registered on the socket is constrained by the space in a Confirm/Change feature negotiation option. The space in these in turn depends on the size of header options as defined in RFC 4340, 5.8. Since this is a recurring constant, it has been moved from ackvec.h into linux/dccp.h, clarifying its purpose. Relative to this size, the maximum number of CCID identifiers that can be present in a Confirm option (which always consumes 1 byte more than a Change option, cf. 6.1) is 2 bytes less than the maximum TLV size: one for the CCID-feature-type and one for the selected value. Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 14 ++++++++++++++ include/linux/dccp.h | 5 +++++ net/dccp/ackvec.c | 9 ++++----- net/dccp/ackvec.h | 5 ++--- net/dccp/feat.h | 2 ++ net/dccp/proto.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 61 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 43df4487379b..610083ff73f6 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -61,6 +61,20 @@ DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs supported by the endpoint (see include/linux/dccp.h for symbolic constants). The caller needs to provide a sufficiently large (> 2) array of type uint8_t. +DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same +time, combining the operation of the next two socket options. This option is +preferrable over the latter two, since often applications will use the same +type of CCID for both directions; and mixed use of CCIDs is not currently well +understood. This socket option takes as argument at least one uint8_t value, or +an array of uint8_t values, which must match available CCIDS (see above). CCIDs +must be registered on the socket before calling connect() or listen(). + +DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets +the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. +Please note that the getsockopt argument type here is `int', not uint8_t. + +DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eda389ce04f4..6a72ff52a8a4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -168,6 +168,8 @@ enum { DCCPO_MIN_CCID_SPECIFIC = 128, DCCPO_MAX_CCID_SPECIFIC = 255, }; +/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ +#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -203,6 +205,9 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 +#define DCCP_SOCKOPT_CCID 13 +#define DCCP_SOCKOPT_TX_CCID 14 +#define DCCP_SOCKOPT_RX_CCID 15 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1e8be246ad15..01e4d39fa232 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -12,7 +12,6 @@ #include "ackvec.h" #include "dccp.h" -#include #include #include #include @@ -68,7 +67,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); u16 len = av->av_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; @@ -100,8 +99,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) for (i = 0; i < nr_opts; ++i) { int copylen = len; - if (len > DCCP_MAX_ACKVEC_OPT_LEN) - copylen = DCCP_MAX_ACKVEC_OPT_LEN; + if (len > DCCP_SINGLE_OPT_MAXLEN) + copylen = DCCP_SINGLE_OPT_MAXLEN; *to++ = DCCPO_ACK_VECTOR_0; *to++ = copylen + 2; @@ -432,7 +431,7 @@ found: int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - if (len > DCCP_MAX_ACKVEC_OPT_LEN) + if (len > DCCP_SINGLE_OPT_MAXLEN) return -1; /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index bcb64fb4acef..4ccee030524e 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -11,15 +11,14 @@ * published by the Free Software Foundation. */ +#include #include #include #include #include -/* Read about the ECN nonce to see why it is 253 */ -#define DCCP_MAX_ACKVEC_OPT_LEN 253 /* We can spread an ack vector across multiple options */ -#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) +#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) #define DCCP_ACKVEC_STATE_RECEIVED 0 #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 2c92bd18e5f4..b53b11717c40 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -22,6 +22,8 @@ /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ #define DCCPF_SEQ_WMIN 32 #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull +/* Maximum number of SP values that fit in a single (Confirm) option */ +#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) enum dccp_feat_type { FEAT_AT_RX = 1, /* located at RX side of half-connection */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e29bbf914057..2cd56df44d8e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -506,6 +506,36 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) return rc; } +static int dccp_setsockopt_ccid(struct sock *sk, int type, + char __user *optval, int optlen) +{ + u8 *val; + int rc = 0; + + if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + return -EINVAL; + + val = kmalloc(optlen, GFP_KERNEL); + if (val == NULL) + return -ENOMEM; + + if (copy_from_user(val, optval, optlen)) { + kfree(val); + return -EFAULT; + } + + lock_sock(sk); + if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + + if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); + release_sock(sk); + + kfree(val); + return rc; +} + static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -520,6 +550,10 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; + case DCCP_SOCKOPT_CCID: + case DCCP_SOCKOPT_RX_CCID: + case DCCP_SOCKOPT_TX_CCID: + return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) -- cgit v1.2.3 From 78673e24df27c76ec75565f4024d45c2c74ef148 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Remove obsolete parts of the old CCID interface The TX/RX CCIDs of the minisock are now redundant: similar to the Ack Vector case, their value equals initially that of the sysctl, but at the end of feature negotiation may be something different. The old interface removed by this patch thus has been replaced by the newer interface to dynamically query the currently loaded CCIDs earlier in this patch set. Also removed the constructors for the TX CCID and the RX CCID, since the switch rx/non-rx is done by the handler in minisocks.c (and the handler is the only place in the code where CCIDs are loaded). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 5 +++-- include/linux/dccp.h | 3 --- net/dccp/ccid.c | 14 -------------- net/dccp/ccid.h | 5 ----- net/dccp/feat.c | 12 ------------ net/dccp/minisocks.c | 2 -- 6 files changed, 3 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 610083ff73f6..a203d132dbef 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -140,10 +140,11 @@ send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). tx_ccid = 2 - Default CCID for the sender-receiver half-connection. + Default CCID for the sender-receiver half-connection. Depending on the + choice of CCID, the Send Ack Vector feature is enabled automatically. rx_ccid = 2 - Default CCID for the receiver-sender half-connection. + Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 The initial sequence window (sec. 7.5.2). diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6a72ff52a8a4..46daea312d92 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -370,7 +370,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated @@ -378,8 +377,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_rx_ccid; - __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index f72ca83df552..330372a1a0b6 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -253,20 +253,6 @@ out_module_put: EXPORT_SYMBOL_GPL(ccid_new); -struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 1, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_rx_new); - -struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 0, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_tx_new); - static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) { struct ccid_operations *ccid_ops; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 803343aed004..18f69423a708 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -111,11 +111,6 @@ extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); -extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, - gfp_t gfp); -extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, - gfp_t gfp); - static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_rx_ccid; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 6c82dea409d9..cb2ddd2b894b 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1119,18 +1119,6 @@ int dccp_feat_init(struct sock *sk) INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ - /* CCID L */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0, - &dmsk->dccpms_tx_ccid, 1); - if (rc) - goto out; - - /* CCID R */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0, - &dmsk->dccpms_rx_ccid, 1); - if (rc) - goto out; - /* Ack ratio */ rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, dp->dccps_l_ack_ratio); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 258195972bce..486d61df2604 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,8 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; - dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } -- cgit v1.2.3 From 68e074bfcef269bc61006c2740d7f89ccbbd93d7 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Remove manual influence on NDP Count feature Updating the NDP count feature is handled automatically now: * for CCID-2 it is disabled, since the code does not use NDP counts; * for CCID-3 it is enabled, as NDP counts are used to determine loss lengths. Allowing the user to change NDP values leads to unpredictable and failing behaviour, since it is then possible to disable NDP counts even when they are needed (e.g. in CCID-3). This means that only those user settings are sensible that agree with the values for Send NDP Count implied by the choice of CCID. But those settings are already activated by the feature negotiation (CCID dependency tracking), hence this form of support is redundant. At startup the initialisation of the NDP count feature is with the default value of 0, which is done implicitly by the zeroing-out of the socket when it is allocated. If the choice of CCID or feature negotiation enables NDP count, this will then be updated via the NDP activation handler. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 4 ++-- net/dccp/dccp.h | 1 - net/dccp/feat.c | 2 +- net/dccp/minisocks.c | 1 - net/dccp/options.c | 4 +--- net/dccp/sysctl.c | 7 ------- 7 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index a203d132dbef..1403745ab406 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ndp = 1 - Whether or not to send NDP count options (sec. 7.7.2). - send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 46daea312d92..60e94438eadd 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -371,14 +371,12 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) - * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; __u8 dccpms_send_ack_vector; - __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; struct list_head dccpms_conf; }; @@ -490,6 +488,7 @@ struct dccp_ackvec; * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) + * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) @@ -529,6 +528,7 @@ struct dccp_sock { __u16 dccps_r_ack_ratio; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; + __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1baed789bcc2..51436c825655 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -99,7 +99,6 @@ extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_feat_send_ack_vector; -extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index cb2ddd2b894b..35a57ab3bb1e 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -86,7 +86,7 @@ static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) { if (!rx) - dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0); + dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 486d61df2604..9e2232572662 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -46,7 +46,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; - dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index 3a9a22f0ac1a..6b0704497e83 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -27,7 +27,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; -int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -531,8 +530,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dmsk->dccpms_send_ndp_count && - dccp_insert_option_ndp(sk, skb)) + if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) return -1; if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index f6e54f433e29..587c12f915c1 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -47,13 +47,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ndp", - .data = &sysctl_dccp_feat_send_ndp_count, - .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, -- cgit v1.2.3 From b235dc4abbc1356284bd0dc730efa711f394e0e2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp ccid-2: Phase out the use of boolean Ack Vector sysctl This removes the use of the sysctl and the minisock variable for the Send Ack Vector feature, which is now handled fully dynamically via feature negotiation; i.e. when CCID2 is enabled, Ack Vectors are automatically enabled (as per RFC 4341, 4.). Using a sysctl in parallel to this implementation would open the door to crashes, since much of the code relies on tests of the boolean minisock / sysctl variable. Thus, this patch replaces all tests of type if (dccp_msk(sk)->dccpms_send_ack_vector) /* ... */ with if (dp->dccps_hc_rx_ackvec != NULL) /* ... */ The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature negotiation concluded that Ack Vectors are to be used on the half-connection. Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child), so that the test is a valid one. The activation handler for Ack Vectors is called as soon as the feature negotiation has concluded at the * server when the Ack marking the transition RESPOND => OPEN arrives; * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN. Adding the sequence number of the Response packet to the Ack Vector has been removed, since (a) connection establishment implies that the Response has been received; (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e. this entry will always be ignored; (c) it can not be used for anything useful - to detect loss for instance, only packets received after the loss can serve as pseudo-dupacks. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 3 --- net/dccp/dccp.h | 3 +-- net/dccp/diag.c | 2 +- net/dccp/input.c | 12 +++--------- net/dccp/minisocks.c | 1 - net/dccp/options.c | 7 ++----- net/dccp/proto.c | 3 +-- net/dccp/sysctl.c | 7 ------- 9 files changed, 8 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 1403745ab406..7a3bb1abb830 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ackvec = 1 - Whether or not to send Ack Vector options (sec. 11.5). - tx_ccid = 2 Default CCID for the sender-receiver half-connection. Depending on the choice of CCID, the Send Ack Vector feature is enabled automatically. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 60e94438eadd..61734e27abb7 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -360,7 +360,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 #define DCCPF_INITIAL_ACK_RATIO 2 #define DCCPF_INITIAL_CCID DCCPC_CCID2 -#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 @@ -370,13 +369,11 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_send_ack_vector; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 51436c825655..3fd16e82c003 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -434,7 +433,7 @@ static inline int dccp_ack_pending(const struct sock *sk) const struct dccp_sock *dp = dccp_sk(sk); return dp->dccps_timestamp_echo != 0 || #ifdef CONFIG_IP_DCCP_ACKVEC - (dccp_msk(sk)->dccpms_send_ack_vector && + (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || #endif inet_csk_ack_scheduled(sk); diff --git a/net/dccp/diag.c b/net/dccp/diag.c index d8a3509b26f6..93aae7c95550 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/input.c b/net/dccp/input.c index 0672b7e1763e..5eb443f656c1 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_ack_seq); } @@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) @@ -434,12 +434,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); - if (dccp_msk(sk)->dccpms_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto out_invalid_packet; /* FIXME: change error code */ - /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); @@ -637,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 9e2232572662..0ebf8ebcf3de 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,7 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index 6b0704497e83..aca309e16632 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -145,8 +144,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) goto out_invalid_option; break; @@ -526,7 +524,6 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; @@ -547,7 +544,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) if (dccp_insert_option_timestamp(sk, skb)) return -1; - } else if (dmsk->dccpms_send_ack_vector && + } else if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && dccp_insert_option_ackvec(sk, skb)) { return -1; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d420790b795..775eaa3d0c49 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -207,7 +207,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -225,7 +224,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dmsk->dccpms_send_ack_vector) { + if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 587c12f915c1..018e210875e1 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ackvec", - .data = &sysctl_dccp_feat_send_ack_vector, - .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, -- cgit v1.2.3 From 5d3dac267a7fd0811ec777e76a81f97f5cdcb395 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Initialisation framework for feature negotiation This initialises feature negotiation from two tables, which are initialised from sysctls. As a novel feature, specifics of the implementation (e.g. currently short seqnos and ECN are not supported) are advertised for robustness. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 19 --------------- net/dccp/feat.c | 66 ++++++++++++++++++++++++++++++++++++++++++++-------- net/dccp/feat.h | 2 +- 3 files changed, 57 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 61734e27abb7..990e97fa1f07 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -369,28 +369,9 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_pending - List of features being negotiated - * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - struct list_head dccpms_pending; - struct list_head dccpms_conf; -}; - -struct dccp_opt_conf { - __u8 *dccpoc_val; - __u8 dccpoc_len; -}; - -struct dccp_opt_pend { - struct list_head dccpop_node; - __u8 dccpop_type; - __u8 dccpop_feat; - __u8 *dccpop_val; - __u8 dccpop_len; - int dccpop_conf; - struct dccp_opt_conf *dccpop_sc; }; extern void dccp_minisock_init(struct dccp_minisock *dmsk); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 35a57ab3bb1e..a687740e4420 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1110,24 +1110,70 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, return 0; /* ignore FN options in all other states */ } +/** + * dccp_feat_init - Seed feature negotiation with host-specific defaults + * This initialises global defaults, depending on the value of the sysctls. + * These can later be overridden by registering changes via setsockopt calls. + * The last link in the chain is finalise_settings, to make sure that between + * here and the start of actual feature negotiation no inconsistencies enter. + * + * All features not appearing below use either defaults or are otherwise + * later adjusted through dccp_feat_finalise_settings(). + */ int dccp_feat_init(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; + u8 on = 1, off = 0; int rc; + struct { + u8 *val; + u8 len; + } tx, rx; + + /* Non-negotiable (NN) features */ + rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, + sysctl_dccp_feat_sequence_window); + if (rc) + return rc; + + /* Server-priority (SP) features */ + + /* Advertise that short seqnos are not supported (7.6.1) */ + rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); + if (rc) + return rc; - INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ - INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ + /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ + rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + if (rc) + return rc; + + /* + * We advertise the available list of CCIDs and reorder according to + * preferences, to avoid failure resulting from negotiating different + * singleton values (which always leads to failure). + * These settings can still (later) be overridden via sockopts. + */ + if (ccid_get_builtin_ccids(&tx.val, &tx.len) || + ccid_get_builtin_ccids(&rx.val, &rx.len)) + return -ENOBUFS; - /* Ack ratio */ - rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, - dp->dccps_l_ack_ratio); -out: + if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len)) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + if (rc) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); + +free_ccid_lists: + kfree(tx.val); + kfree(rx.val); return rc; } -EXPORT_SYMBOL_GPL(dccp_feat_init); - int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) { struct dccp_sock *dp = dccp_sk(sk); diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 177e9c39ea9f..f73b47abca93 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -112,13 +112,13 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #define dccp_feat_debug(type, feat, val) #endif /* CONFIG_IP_DCCP_DEBUG */ +extern int dccp_feat_init(struct sock *sk); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); -extern int dccp_feat_init(struct sock *sk); /* * Encoding variable-length options and their maximum length. -- cgit v1.2.3 From 51c7d4fa2675c106a980ddcdbe308b54b5151945 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Implement both feature-local and feature-remote Sequence Window feature This adds full support for local/remote Sequence Window feature, from which the * sequence-number-validity (W) and * acknowledgment-number-validity (W') windows derive as specified in RFC 4340, 7.5.3. Specifically, the following changes are introduced: * integrated new socket fields into dccp_sk; * updated the update_gsr/gss routines with regard to these fields; * updated handler code: the Sequence Window feature is located at the TX side, so the local feature is meant if the handler-rx flag is false; * the initialisation of `rcv_wnd' in reqsk is removed, since - rcv_wnd is not used by the code anywhere; - sequence number checks are not done in the LISTEN state (cf. 7.5.3); - dccp_check_req checks the Ack number validity more rigorously; * the `struct dccp_minisock' became empty and is now removed. Until the handshake completes with activating negotiated values, the local/remote Sequence-Window values are undefined and thus can not reliably be estimated. This issue is addressed in a separate patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 ++- include/linux/dccp.h | 24 ++++-------------------- net/dccp/dccp.h | 16 +++++++--------- net/dccp/feat.c | 13 +++++++++++-- net/dccp/minisocks.c | 11 ----------- net/dccp/proto.c | 2 -- 6 files changed, 24 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 7a3bb1abb830..b132e4a3cf0f 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -141,7 +141,8 @@ rx_ccid = 2 Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 - The initial sequence window (sec. 7.5.2). + The initial sequence window (sec. 7.5.2) of the sender. This influences + the local ackno validity and the remote seqno validity windows (7.5.1). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 990e97fa1f07..7a0502ab383a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -363,19 +363,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 -/** - * struct dccp_minisock - Minimal DCCP connection representation - * - * Will be used to pass the state from dccp_request_sock to dccp_sock. - * - * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - */ -struct dccp_minisock { - __u64 dccpms_sequence_window; -}; - -extern void dccp_minisock_init(struct dccp_minisock *dmsk); - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from @@ -464,13 +451,14 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio + * @dccps_l_seq_win - local Sequence Window (influences ack number validity) + * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_minisock - associated minisock (accessed via dccp_msk) * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) @@ -504,12 +492,13 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; + __u64 dccps_l_seq_win:48; + __u64 dccps_r_seq_win:48; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct dccp_minisock dccps_minisock; struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; @@ -527,11 +516,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } -static inline struct dccp_minisock *dccp_msk(const struct sock *sk) -{ - return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; -} - static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3fd16e82c003..6101ecdb85b6 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -409,23 +409,21 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - dccp_set_seqno(&dp->dccps_swl, - dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); - dccp_set_seqno(&dp->dccps_swh, - dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); + /* Sequence validity window depends on remote Sequence Window (7.5.1) */ + dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_awh = dp->dccps_gss = seq; - dccp_set_seqno(&dp->dccps_awl, - (dp->dccps_gss - - dccp_msk(sk)->dccpms_sequence_window + 1)); + dp->dccps_gss = seq; + /* Ack validity window depends on local Sequence Window value (7.5.1) */ + dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + dp->dccps_awh = dp->dccps_gss; } static inline int dccp_ack_pending(const struct sock *sk) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 9a4938092783..843465999854 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -52,8 +52,17 @@ static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) { - if (!rx) - dccp_msk(sk)->dccpms_sequence_window = seq_win; + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + dp->dccps_r_seq_win = seq_win; + /* propagate changes to update SWL/SWH */ + dccp_update_gsr(sk, dp->dccps_gsr); + } else { + dp->dccps_l_seq_win = seq_win; + /* propagate changes to update AWL */ + dccp_update_gss(sk, dp->dccps_gss); + } return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 0ebf8ebcf3de..0ecb19c5e8ce 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,11 +42,6 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); -void dccp_minisock_init(struct dccp_minisock *dmsk) -{ - dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; -} - void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -110,7 +105,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); - struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -128,10 +122,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - - /* See dccp_v4_conn_request */ - newdmsk->dccpms_sequence_window = req->rcv_wnd; - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; dccp_update_gss(newsk, dreq->dreq_iss); @@ -289,7 +279,6 @@ int dccp_reqsk_init(struct request_sock *req, inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->acked = 0; - req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; /* inherit feature negotiation options from listening socket */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 775eaa3d0c49..392a5d822b33 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -180,8 +180,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - dccp_minisock_init(&dp->dccps_minisock); - icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; -- cgit v1.2.3 From 0a4822679d94e2b0117aeead06a19fad59533905 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Initialisation and type-checking of feature sysctls This patch takes care of initialising and type-checking sysctls related to feature negotiation. Type checking is important since some of the sysctls now directly act on the feature-negotiation process. The sysctls are initialised with the known default values for each feature. For the type-checking the value constraints from RFC 4340 are used: * Sequence Window uses the specified Wmin=32, the maximum is ulong (4 bytes), tested and confirmed that it works up to 4294967295 - for Gbps speed; * Ack Ratio is between 0 .. 0xffff (2-byte unsigned integer); * CCIDs are between 0 .. 255; * request_retries, retries1, retries2 also between 0..255 for good measure; * tx_qlen is checked to be non-negative; * sync_ratelimit remains as before. Further changes: ---------------- Performed s@sysctl_dccp_feat@sysctl_dccp@g since the sysctls are now in feat.c. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 8 -------- net/dccp/dccp.h | 3 --- net/dccp/feat.c | 11 ++++++++--- net/dccp/feat.h | 8 ++++++++ net/dccp/options.c | 4 ---- net/dccp/sysctl.c | 43 ++++++++++++++++++++++++++++++------------- 6 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7a0502ab383a..7434a8353e23 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -355,14 +355,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) return __dccp_hdr_len(dccp_hdr(skb)); } - -/* initial values for each feature */ -#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 -#define DCCPF_INITIAL_ACK_RATIO 2 -#define DCCPF_INITIAL_CCID DCCPC_CCID2 -/* FIXME: for now we're default to 1 but it should really be 0 */ -#define DCCPF_INITIAL_SEND_NDP_COUNT 1 - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 6101ecdb85b6..4f681f1a16cc 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -95,9 +95,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; -extern int sysctl_dccp_feat_sequence_window; -extern int sysctl_dccp_feat_rx_ccid; -extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 843465999854..4c95cbdb0e01 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -26,6 +26,11 @@ #include "ccid.h" #include "feat.h" +/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ +unsigned long sysctl_dccp_sequence_window __read_mostly = 100; +int sysctl_dccp_rx_ccid __read_mostly = 2, + sysctl_dccp_tx_ccid __read_mostly = 2; + /* * Feature activation handlers. * @@ -1141,7 +1146,7 @@ int dccp_feat_init(struct sock *sk) /* Non-negotiable (NN) features */ rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_feat_sequence_window); + sysctl_dccp_sequence_window); if (rc) return rc; @@ -1172,8 +1177,8 @@ int dccp_feat_init(struct sock *sk) if (ccid_request_modules(tx.val, tx.len)) goto free_ccid_lists; - if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len)) + if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) goto free_ccid_lists; rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); diff --git a/net/dccp/feat.h b/net/dccp/feat.h index f73b47abca93..f8456bca4eeb 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -99,6 +99,13 @@ struct ccid_dependency { u8 val; }; +/* + * Sysctls to seed defaults for feature negotiation + */ +extern unsigned long sysctl_dccp_sequence_window; +extern int sysctl_dccp_rx_ccid; +extern int sysctl_dccp_tx_ccid; + #ifdef CONFIG_IP_DCCP_DEBUG extern const char *dccp_feat_typename(const u8 type); extern const char *dccp_feat_name(const u8 feat); @@ -113,6 +120,7 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #endif /* CONFIG_IP_DCCP_DEBUG */ extern int dccp_feat_init(struct sock *sk); +extern void dccp_feat_initialise_sysctls(void); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); diff --git a/net/dccp/options.c b/net/dccp/options.c index aca309e16632..5906e96eedde 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,10 +23,6 @@ #include "dccp.h" #include "feat.h" -int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; -int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; - u64 dccp_decode_value_var(const u8 *bf, const u8 len) { u64 value = 0; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 018e210875e1..a5a1856234e7 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -18,55 +18,72 @@ #error This file should not be compiled without CONFIG_SYSCTL defined #endif +/* Boundary values */ +static int zero = 0, + u8_max = 0xFF; +static unsigned long seqw_min = 32; + static struct ctl_table dccp_default_table[] = { { .procname = "seq_window", - .data = &sysctl_dccp_feat_sequence_window, - .maxlen = sizeof(sysctl_dccp_feat_sequence_window), + .data = &sysctl_dccp_sequence_window, + .maxlen = sizeof(sysctl_dccp_sequence_window), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ }, { .procname = "rx_ccid", - .data = &sysctl_dccp_feat_rx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), + .data = &sysctl_dccp_rx_ccid, + .maxlen = sizeof(sysctl_dccp_rx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ }, { .procname = "tx_ccid", - .data = &sysctl_dccp_feat_tx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), + .data = &sysctl_dccp_tx_ccid, + .maxlen = sizeof(sysctl_dccp_tx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries1", .data = &sysctl_dccp_retries1, .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries2", .data = &sysctl_dccp_retries2, .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "tx_qlen", .data = &sysctl_dccp_tx_qlen, .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "sync_ratelimit", -- cgit v1.2.3 From f10ecaee6dc2c6d56783462b2a82e98bc81b55f4 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Replace magic CCID-specific numbers by symbolic constants The constants DCCPO_{MIN,MAX}_CCID_SPECIFIC are nowhere used in the code, but instead for the CCID-specific options numbers are used. This patch unifies the use of CCID-specific option numbers, by adding symbolic names reflecting the definitions in RFC 4340, 10.3. Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 6 ++++-- net/dccp/options.c | 13 +++---------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7434a8353e23..7187bd8a75f6 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -165,8 +165,10 @@ enum { DCCPO_TIMESTAMP_ECHO = 42, DCCPO_ELAPSED_TIME = 43, DCCPO_MAX = 45, - DCCPO_MIN_CCID_SPECIFIC = 128, - DCCPO_MAX_CCID_SPECIFIC = 255, + DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ + DCCPO_MAX_RX_CCID_SPECIFIC = 191, + DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ + DCCPO_MAX_TX_CCID_SPECIFIC = 255, }; /* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ #define DCCP_SINGLE_OPT_MAXLEN 253 diff --git a/net/dccp/options.c b/net/dccp/options.c index b102774694c6..e1c94e2b8be0 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, } /* - * CCID-Specific Options (from RFC 4340, sec. 10.3): - * - * Option numbers 128 through 191 are for options sent from the - * HC-Sender to the HC-Receiver; option numbers 192 through 255 - * are for options sent from the HC-Receiver to the HC-Sender. - * * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. - * */ - if (dreq != NULL && (opt >= 128 || + if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; @@ -226,12 +219,12 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case 128 ... 191: + case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; break; - case 192 ... 255: + case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; -- cgit v1.2.3 From c2f42077bd06f300ae959204f3c007f820f5e769 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp ccid-2: Schedule Sync as out-of-band mechanism The problem with Ack Vectors is that i) their length is variable and can in principle grow quite large, ii) it is hard to predict exactly how large they will be. Due to the second point it seems not a good idea to reduce the MPS; in particular when on average there is enough room for the Ack Vector and an increase in length is momentarily due to some burst loss, after which the Ack Vector returns to its normal/average length. The solution taken by this patch is to subtract a minimum-expected Ack Vector length from the MPS (previous patch), and to defer any larger Ack Vectors onto a separate Sync - but only if indeed there is no space left on the skb. This patch provides the infrastructure to schedule Sync-packets for transporting (urgent) out-of-band data. Its signalling is quicker than scheduling an Ack, since it does not need to wait for new application data. It can thus serve other parts of the DCCP code as well. Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 2 ++ net/dccp/options.c | 24 ++++++++++++++++++++---- net/dccp/output.c | 8 ++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7187bd8a75f6..83197b601a4f 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -462,6 +462,7 @@ struct dccp_ackvec; * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) + * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" * @dccps_xmit_timer - timer for when CCID is not ready to send * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ @@ -502,6 +503,7 @@ struct dccp_sock { __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; + __u8 dccps_sync_scheduled:1; struct timer_list dccps_xmit_timer; }; diff --git a/net/dccp/options.c b/net/dccp/options.c index b11d7b7167f0..791e07853a79 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -430,6 +430,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const u16 buflen = dccp_ackvec_buflen(av); /* Figure out how many options do we need to represent the ackvec */ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); @@ -438,10 +439,25 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) const unsigned char *tail, *from; unsigned char *to; - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, + dccp_packet_name(dcb->dccpd_type)); return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; + } + /* + * Since Ack Vectors are variable-length, we can not always predict + * their size. To catch exception cases where the space is running out + * on the skb, a separate Sync is scheduled to carry the Ack Vector. + */ + if (len > DCCPAV_MIN_OPTLEN && + len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { + DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " + "MPS=%u ==> reduce payload size?\n", len, skb->len, + dcb->dccpd_opt_len, dp->dccps_mss_cache); + dp->dccps_sync_scheduled = 1; + return 0; + } + dcb->dccpd_opt_len += len; to = skb_push(skb, len); len = buflen; @@ -482,7 +498,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) /* * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. */ - if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce)) + if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) return -ENOBUFS; return 0; } diff --git a/net/dccp/output.c b/net/dccp/output.c index 1b3168307586..bfda071559f4 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -305,6 +305,8 @@ void dccp_write_xmit(struct sock *sk, int block) if (err) DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", err); + if (dp->dccps_sync_scheduled) + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); } else { dccp_pr_debug("packet discarded due to err=%d\n", err); kfree_skb(skb); @@ -591,6 +593,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; + /* + * Clear the flag in case the Sync was scheduled for out-of-band data, + * such as carrying a long Ack Vector. + */ + dccp_sk(sk)->dccps_sync_scheduled = 0; + dccp_transmit_skb(sk, skb); } -- cgit v1.2.3 From e7937772d7a2b0127cc4cbc67bc594e139fdaf63 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Extend CCID packet dequeueing interface This extends the packet dequeuing interface of dccp_write_xmit() to allow 1. CCIDs to take care of timing when the next packet may be sent; 2. delayed sending (as before, with an inter-packet gap up to 65.535 seconds). The main purpose is to take CCID2 out of its polling mode (when it is network- limited, it tries every millisecond to send, without interruption). The interface can also be used to support other CCIDs. The mode of operation for (2) is as follows: * new packet is enqueued via dccp_sendmsg() => dccp_write_xmit(), * ccid_hc_tx_send_packet() detects that it may not send (e.g. window full), * it signals this condition via `CCID_PACKET_WILL_DEQUEUE_LATER', * dccp_write_xmit() returns without further action; * after some time the wait-condition for CCID becomes true, * that CCID schedules the tasklet, * tasklet function calls ccid_hc_tx_send_packet() via dccp_write_xmit(), * since the wait-condition is now true, ccid_hc_tx_packet() returns "send now", * packet is sent, and possibly more (since dccp_write_xmit() loops). Code reuse: the taskled function calls dccp_write_xmit(), the timer function reduces to a wrapper around the same code. If the tasklet finds that the socket is locked, it re-schedules the tasklet function (not the tasklet) after one jiffy. Changed DCCP_BUG to dccp_pr_debug when transmit_skb returns an error (e.g. when a local qdisc is used, NET_XMIT_DROP=1 can be returned for many packets). Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 4 +- net/dccp/output.c | 129 ++++++++++++++++++++++++++++++++------------------- net/dccp/timer.c | 25 +++++----- 3 files changed, 98 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 83197b601a4f..eed52bcd35d0 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -463,7 +463,8 @@ struct dccp_ackvec; * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmit_timer - timer for when CCID is not ready to send + * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets + * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { @@ -504,6 +505,7 @@ struct dccp_sock { __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; __u8 dccps_sync_scheduled:1; + struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; diff --git a/net/dccp/output.c b/net/dccp/output.c index bfda071559f4..9afd58e39e23 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -251,65 +251,98 @@ do_interrupted: goto out; } +/** + * dccp_xmit_packet - Send data packet under control of CCID + * Transmits next-queued payload and informs CCID to account for the packet. + */ +static void dccp_xmit_packet(struct sock *sk) +{ + int err, len; + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); + + if (unlikely(skb == NULL)) + return; + len = skb->len; + + if (sk->sk_state == DCCP_PARTOPEN) { + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } + + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; + } + + err = dccp_transmit_skb(sk, skb); + if (err) + dccp_pr_debug("transmit_skb() returned err=%d\n", err); + /* + * Register this one as sent even if an error occurred. To the remote + * end a local packet drop is indistinguishable from network loss, i.e. + * any local drop will eventually be reported via receiver feedback. + */ + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); + + /* + * If the CCID needs to transfer additional header options out-of-band + * (e.g. Ack Vectors or feature-negotiation options), it activates this + * flag to schedule a Sync. The Sync will automatically incorporate all + * currently pending header options, thus clearing the backlog. + */ + if (dp->dccps_sync_scheduled) + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); +} + void dccp_write_xmit(struct sock *sk, int block) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_write_queue))) { - int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - if (err > 0) { + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + return; + case CCID_PACKET_DELAY: if (!block) { sk_reset_timer(sk, &dp->dccps_xmit_timer, - msecs_to_jiffies(err)+jiffies); + msecs_to_jiffies(rc)+jiffies); + return; + } + rc = dccp_wait_for_ccid(sk, skb, rc); + if (rc && rc != -EINTR) { + DCCP_BUG("err=%d after dccp_wait_for_ccid", rc); + skb_dequeue(&sk->sk_write_queue); + kfree_skb(skb); break; - } else - err = dccp_wait_for_ccid(sk, skb, err); - if (err && err != -EINTR) - DCCP_BUG("err=%d after dccp_wait_for_ccid", err); - } - - skb_dequeue(&sk->sk_write_queue); - if (err == 0) { - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - dcb->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) - dcb->dccpd_type = DCCP_PKT_DATAACK; - else - dcb->dccpd_type = DCCP_PKT_DATA; - - err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - if (err) - DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", - err); - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); - } else { - dccp_pr_debug("packet discarded due to err=%d\n", err); + } + /* fall through */ + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%d\n", rc); } } } diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 162d1e683c39..9369aca4b0e9 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -237,32 +237,35 @@ out: sock_put(sk); } -/* Transmit-delay timer: used by the CCIDs to delay actual send time */ -static void dccp_write_xmit_timer(unsigned long data) +/** + * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * See the comments above %ccid_dequeueing_decision for supported modes. + */ +static void dccp_write_xmitlet(unsigned long data) { struct sock *sk = (struct sock *)data; - struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); + sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else dccp_write_xmit(sk, 0); bh_unlock_sock(sk); - sock_put(sk); } -static void dccp_init_write_xmit_timer(struct sock *sk) +static void dccp_write_xmit_timer(unsigned long data) { - struct dccp_sock *dp = dccp_sk(sk); - - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_write_xmitlet(data); + sock_put((struct sock *)data); } void dccp_init_xmit_timers(struct sock *sk) { - dccp_init_write_xmit_timer(sk); + struct dccp_sock *dp = dccp_sk(sk); + + tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } -- cgit v1.2.3 From d6da3511d6b558d0b017777b61dc08b8fbc06ea4 Mon Sep 17 00:00:00 2001 From: Tomasz Grobelny Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Policy-based packet dequeueing infrastructure This patch adds a generic infrastructure for policy-based dequeueing of TX packets and provides two policies: * a simple FIFO policy (which is the default) and * a priority based policy (set via socket options). Both policies honour the tx_qlen sysctl for the maximum size of the write queue (can be overridden via socket options). The priority policy uses skb->priority internally to assign an u32 priority identifier, using the same ranking as SO_PRIORITY. The skb->priority field is set to 0 when the packet leaves DCCP. The priority is supplied as ancillary data using cmsg(3), the patch also provides the requisite parsing routines. Signed-off-by: Tomasz Grobelny Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 19 ++++++ include/linux/dccp.h | 21 +++++++ net/dccp/Makefile | 2 +- net/dccp/dccp.h | 12 ++++ net/dccp/output.c | 7 +-- net/dccp/proto.c | 67 +++++++++++++++++++- net/dccp/qpolicy.c | 126 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 246 insertions(+), 8 deletions(-) create mode 100644 net/dccp/qpolicy.c (limited to 'include/linux') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index b132e4a3cf0f..fcfc12534428 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -45,6 +45,25 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree Socket options ============== +DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes +a policy ID as argument and can only be set before the connection (i.e. changes +during an established connection are not supported). Currently, two policies are +defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, +and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an +u32 priority value as ancillary data to sendmsg(), where higher numbers indicate +a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to +be formatted using a cmsg(3) message header filled in as follows: + cmsg->cmsg_level = SOL_DCCP; + cmsg->cmsg_type = DCCP_SCM_PRIORITY; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ + +DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero +value is always interpreted as unbounded queue length. If different from zero, +the interpretation of this parameter depends on the current dequeuing policy +(see above): the "simple" policy will enforce a fixed queue size by returning +EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the +lowest-priority packet first. The default value for this parameter is +initialised from /proc/sys/net/dccp/default/tx_qlen. DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eed52bcd35d0..010e2d87ed75 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -197,6 +197,21 @@ enum dccp_feature_numbers { DCCPF_MAX_CCID_SPECIFIC = 255, }; +/* DCCP socket control message types for cmsg */ +enum dccp_cmsg_type { + DCCP_SCM_PRIORITY = 1, + DCCP_SCM_QPOLICY_MAX = 0xFFFF, + /* ^-- Up to here reserved exclusively for qpolicy parameters */ + DCCP_SCM_MAX +}; + +/* DCCP priorities for outgoing/queued packets */ +enum dccp_packet_dequeueing_policy { + DCCPQ_POLICY_SIMPLE, + DCCPQ_POLICY_PRIO, + DCCPQ_POLICY_MAX +}; + /* DCCP socket options */ #define DCCP_SOCKOPT_PACKET_SIZE 1 /* XXX deprecated, without effect */ #define DCCP_SOCKOPT_SERVICE 2 @@ -210,6 +225,8 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_CCID 13 #define DCCP_SOCKOPT_TX_CCID 14 #define DCCP_SOCKOPT_RX_CCID 15 +#define DCCP_SOCKOPT_QPOLICY_ID 16 +#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 @@ -458,6 +475,8 @@ struct dccp_ackvec; * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options + * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy + * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending @@ -500,6 +519,8 @@ struct dccp_sock { struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; + __u8 dccps_qpolicy; + __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; diff --git a/net/dccp/Makefile b/net/dccp/Makefile index b68440bd7fa2..0c1c9af2bf7e 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o dccp-y := ccid.o feat.o input.o minisocks.o options.o \ - output.o proto.o timer.o ackvec.o + qpolicy.o output.o proto.o timer.o ackvec.o dccp_ipv4-y := ipv4.o diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 74c90cd27677..ce2dd6f6f34d 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -234,6 +234,18 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); +/* + * TX Packet Dequeueing Interface + */ +extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); +extern bool dccp_qpolicy_full(struct sock *sk); +extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); +extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); +extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); + +/* + * TX Packet Output and TX Timers + */ extern void dccp_write_xmit(struct sock *sk); extern void dccp_write_space(struct sock *sk); extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); diff --git a/net/dccp/output.c b/net/dccp/output.c index b1eaf7bcfb11..2532797a8009 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -241,7 +241,7 @@ static void dccp_xmit_packet(struct sock *sk) { int err, len; struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); + struct sk_buff *skb = dccp_qpolicy_pop(sk); if (unlikely(skb == NULL)) return; @@ -344,7 +344,7 @@ void dccp_write_xmit(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_write_queue))) { + while ((skb = dccp_qpolicy_top(sk))) { int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); switch (ccid_packet_dequeue_eval(rc)) { @@ -358,8 +358,7 @@ void dccp_write_xmit(struct sock *sk) dccp_xmit_packet(sk); break; case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); + dccp_qpolicy_drop(sk, skb); dccp_pr_debug("packet discarded due to err=%d\n", rc); } } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8c125ffab1c5..b56efdd2a421 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -189,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; + dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); @@ -541,6 +542,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; + case DCCP_SOCKOPT_QPOLICY_ID: + if (sk->sk_state != DCCP_CLOSED) + err = -EISCONN; + else if (val < 0 || val >= DCCPQ_POLICY_MAX) + err = -EINVAL; + else + dp->dccps_qpolicy = val; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + if (val < 0) + err = -EINVAL; + else + dp->dccps_tx_qlen = val; + break; default: err = -ENOPROTOOPT; break; @@ -648,6 +663,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; + case DCCP_SOCKOPT_QPOLICY_ID: + val = dp->dccps_qpolicy; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + val = dp->dccps_tx_qlen; + break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); @@ -690,6 +711,43 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); #endif +static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) +{ + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + + /* + * Assign an (opaque) qpolicy priority value to skb->priority. + * + * We are overloading this skb field for use with the qpolicy subystem. + * The skb->priority is normally used for the SO_PRIORITY option, which + * is initialised from sk_priority. Since the assignment of sk_priority + * to skb->priority happens later (on layer 3), we overload this field + * for use with queueing priorities as long as the skb is on layer 4. + * The default priority value (if nothing is set) is 0. + */ + skb->priority = 0; + + for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_DCCP) + continue; + + switch (cmsg->cmsg_type) { + case DCCP_SCM_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) + return -EINVAL; + skb->priority = *(__u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -705,8 +763,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - if (sysctl_dccp_tx_qlen && - (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { + if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_release; } @@ -734,7 +791,11 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - skb_queue_tail(&sk->sk_write_queue, skb); + rc = dccp_msghdr_parse(msg, skb); + if (rc != 0) + goto out_discard; + + dccp_qpolicy_push(sk, skb); dccp_write_xmit(sk); out_release: release_sock(sk); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c new file mode 100644 index 000000000000..414696b0d830 --- /dev/null +++ b/net/dccp/qpolicy.c @@ -0,0 +1,126 @@ +/* + * net/dccp/qpolicy.c + * + * Policy-based packet dequeueing interface for DCCP. + * + * Copyright (c) 2008 Tomasz Grobelny + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License v2 + * as published by the Free Software Foundation. + */ +#include "dccp.h" + +/* + * Simple Dequeueing Policy: + * If tx_qlen is different from 0, enqueue up to tx_qlen elements. + */ +static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) +{ + skb_queue_tail(&sk->sk_write_queue, skb); +} + +static bool qpolicy_simple_full(struct sock *sk) +{ + return dccp_sk(sk)->dccps_tx_qlen && + sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; +} + +static struct sk_buff *qpolicy_simple_top(struct sock *sk) +{ + return skb_peek(&sk->sk_write_queue); +} + +/* + * Priority-based Dequeueing Policy: + * If tx_qlen is different from 0 and the queue has reached its upper bound + * of tx_qlen elements, replace older packets lowest-priority-first. + */ +static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) +{ + struct sk_buff *skb, *best = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (best == NULL || skb->priority > best->priority) + best = skb; + return best; +} + +static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) +{ + struct sk_buff *skb, *worst = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (worst == NULL || skb->priority < worst->priority) + worst = skb; + return worst; +} + +static bool qpolicy_prio_full(struct sock *sk) +{ + if (qpolicy_simple_full(sk)) + dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); + return false; +} + +/** + * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface + * @push: add a new @skb to the write queue + * @full: indicates that no more packets will be admitted + * @top: peeks at whatever the queueing policy defines as its `top' + */ +static struct dccp_qpolicy_operations { + void (*push) (struct sock *sk, struct sk_buff *skb); + bool (*full) (struct sock *sk); + struct sk_buff* (*top) (struct sock *sk); + +} qpol_table[DCCPQ_POLICY_MAX] = { + [DCCPQ_POLICY_SIMPLE] = { + .push = qpolicy_simple_push, + .full = qpolicy_simple_full, + .top = qpolicy_simple_top, + }, + [DCCPQ_POLICY_PRIO] = { + .push = qpolicy_simple_push, + .full = qpolicy_prio_full, + .top = qpolicy_prio_best_skb, + }, +}; + +/* + * Externally visible interface + */ +void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) +{ + qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); +} + +bool dccp_qpolicy_full(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); +} + +void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) +{ + if (skb != NULL) { + skb_unlink(skb, &sk->sk_write_queue); + kfree_skb(skb); + } +} + +struct sk_buff *dccp_qpolicy_top(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); +} + +struct sk_buff *dccp_qpolicy_pop(struct sock *sk) +{ + struct sk_buff *skb = dccp_qpolicy_top(sk); + + /* Clear any skb fields that we used internally */ + skb->priority = 0; + + if (skb) + skb_unlink(skb, &sk->sk_write_queue); + return skb; +} -- cgit v1.2.3