From bf1ecd210541ef5f3a110e88e8ca5d33b4aa5c23 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 31 May 2016 00:16:50 +0300 Subject: cfg80211: Allow cfg80211_connect_result() errors to be distinguished Previously, the status parameter to cfg80211_connect_result() was documented as using WLAN_STATUS_UNSPECIFIED_FAILURE (1) when the real status code for the failure is not known. This value can be used by an AP (and often is) and as such, user space cannot distinguish between explicitly rejected authentication/association and not being able to even try to associate or not receiving a response from the AP. Add a new inline function, cfg80211_connect_timeout(), to be used when the driver knows that the connection attempt failed due to a reason where connection could not be attempt or no response was received from the AP. The internal functions now allow a negative status value (-1) to be used as an indication of this special case. This results in the NL80211_ATTR_TIMED_OUT to be added to the NL80211_CMD_CONNECT event to allow user space to determine this case was hit. For backwards compatibility, NL80211_STATUS_CODE with the value WLAN_STATUS_UNSPECIFIED_FAILURE is still indicated in the event in such a case. Signed-off-by: Jouni Malinen [johannes: fix cfg80211_connect_bss() prototype to use int for status, add cfg80211_connect_timeout() to docbook, fix docbook] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 53 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 63921672bed0..537f010cf5e1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2367,19 +2367,23 @@ struct cfg80211_qos_map { * (invoked with the wireless_dev mutex held) * * @connect: Connect to the ESS with the specified parameters. When connected, - * call cfg80211_connect_result() with status code %WLAN_STATUS_SUCCESS. - * If the connection fails for some reason, call cfg80211_connect_result() - * with the status from the AP. The driver is allowed to roam to other - * BSSes within the ESS when the other BSS matches the connect parameters. - * When such roaming is initiated by the driver, the driver is expected to - * verify that the target matches the configured security parameters and - * to use Reassociation Request frame instead of Association Request frame. - * The connect function can also be used to request the driver to perform - * a specific roam when connected to an ESS. In that case, the prev_bssid + * call cfg80211_connect_result()/cfg80211_connect_bss() with status code + * %WLAN_STATUS_SUCCESS. If the connection fails for some reason, call + * cfg80211_connect_result()/cfg80211_connect_bss() with the status code + * from the AP or cfg80211_connect_timeout() if no frame with status code + * was received. + * The driver is allowed to roam to other BSSes within the ESS when the + * other BSS matches the connect parameters. When such roaming is initiated + * by the driver, the driver is expected to verify that the target matches + * the configured security parameters and to use Reassociation Request + * frame instead of Association Request frame. + * The connect function can also be used to request the driver to perform a + * specific roam when connected to an ESS. In that case, the prev_bssid * parameter is set to the BSSID of the currently associated BSS as an - * indication of requesting reassociation. In both the driver-initiated and - * new connect() call initiated roaming cases, the result of roaming is - * indicated with a call to cfg80211_roamed() or cfg80211_roamed_bss(). + * indication of requesting reassociation. + * In both the driver-initiated and new connect() call initiated roaming + * cases, the result of roaming is indicated with a call to + * cfg80211_roamed() or cfg80211_roamed_bss(). * (invoked with the wireless_dev mutex held) * @disconnect: Disconnect from the BSS/ESS. * (invoked with the wireless_dev mutex held) @@ -4680,7 +4684,7 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, struct cfg80211_bss *bss, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, u16 status, gfp_t gfp); + size_t resp_ie_len, int status, gfp_t gfp); /** * cfg80211_connect_result - notify cfg80211 of connection result @@ -4709,6 +4713,29 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid, resp_ie_len, status, gfp); } +/** + * cfg80211_connect_timeout - notify cfg80211 of connection timeout + * + * @dev: network device + * @bssid: the BSSID of the AP + * @req_ie: association request IEs (maybe be %NULL) + * @req_ie_len: association request IEs length + * @gfp: allocation flags + * + * It should be called by the underlying driver whenever connect() has failed + * in a sequence where no explicit authentication/association rejection was + * received from the AP. This could happen, e.g., due to not being able to send + * out the Authentication or Association Request frame or timing out while + * waiting for the response. + */ +static inline void +cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, + const u8 *req_ie, size_t req_ie_len, gfp_t gfp) +{ + cfg80211_connect_bss(dev, bssid, NULL, req_ie, req_ie_len, NULL, 0, -1, + gfp); +} + /** * cfg80211_roamed - notify cfg80211 of roaming * -- cgit v1.2.3 From 019ae3a918811715192b22c400ac78d54acc26a9 Mon Sep 17 00:00:00 2001 From: "Kanchanapally, Vidyullatha" Date: Mon, 16 May 2016 10:41:04 +0530 Subject: cfg80211: Advertise extended capabilities per interface type to userspace The driver extended capabilities may differ for different interface types which the userspace needs to know (for example the fine timing measurement initiator and responder bits might differ for a station and AP). Add a new nl80211 attribute to provide extended capabilities per interface type to userspace. Signed-off-by: Vidyullatha Kanchanapally Reviewed-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 28 +++++++++++++++++++++++++++- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/core.c | 30 ++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 106 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 537f010cf5e1..7bbb00d8b2cd 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3083,6 +3083,24 @@ struct wiphy_vendor_command { unsigned long *storage); }; +/** + * struct wiphy_iftype_ext_capab - extended capabilities per interface type + * @iftype: interface type + * @extended_capabilities: extended capabilities supported by the driver, + * additional capabilities might be supported by userspace; these are the + * 802.11 extended capabilities ("Extended Capabilities element") and are + * in the same format as in the information element. See IEEE Std + * 802.11-2012 8.4.2.29 for the defined fields. + * @extended_capabilities_mask: mask of the valid values + * @extended_capabilities_len: length of the extended capabilities + */ +struct wiphy_iftype_ext_capab { + enum nl80211_iftype iftype; + const u8 *extended_capabilities; + const u8 *extended_capabilities_mask; + u8 extended_capabilities_len; +}; + /** * struct wiphy - wireless hardware description * @reg_notifier: the driver's regulatory notification callback, @@ -3203,9 +3221,14 @@ struct wiphy_vendor_command { * additional capabilities might be supported by userspace; these are * the 802.11 extended capabilities ("Extended Capabilities element") * and are in the same format as in the information element. See - * 802.11-2012 8.4.2.29 for the defined fields. + * 802.11-2012 8.4.2.29 for the defined fields. These are the default + * extended capabilities to be used if the capabilities are not specified + * for a specific interface type in iftype_ext_capab. * @extended_capabilities_mask: mask of the valid values * @extended_capabilities_len: length of the extended capabilities + * @iftype_ext_capab: array of extended capabilities per interface type + * @num_iftype_ext_capab: number of interface types for which extended + * capabilities are specified separately. * @coalesce: packet coalescing support information * * @vendor_commands: array of vendor commands supported by the hardware @@ -3305,6 +3328,9 @@ struct wiphy { const u8 *extended_capabilities, *extended_capabilities_mask; u8 extended_capabilities_len; + const struct wiphy_iftype_ext_capab *iftype_ext_capab; + unsigned int num_iftype_ext_capab; + /* If multiple wiphys are registered and you're handed e.g. * a regular netdev with assigned ieee80211_ptr, you won't * know whether it points to a wiphy your driver has registered diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8d995316aadb..53c8278827a0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1824,6 +1824,11 @@ enum nl80211_commands { * * @NL80211_ATTR_PAD: attribute used for padding for 64-bit alignment * + * @NL80211_ATTR_IFTYPE_EXT_CAPA: Nested attribute of the following attributes: + * %NL80211_ATTR_IFTYPE, %NL80211_ATTR_EXT_CAPA, + * %NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities per + * interface type. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2206,6 +2211,8 @@ enum nl80211_attrs { NL80211_ATTR_PAD, + NL80211_ATTR_IFTYPE_EXT_CAPA, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index d25c82bc1bbe..b8e10a952111 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -750,6 +750,36 @@ int wiphy_register(struct wiphy *wiphy) nl80211_send_reg_change_event(&request); } + /* Check that nobody globally advertises any capabilities they do not + * advertise on all possible interface types. + */ + if (wiphy->extended_capabilities_len && + wiphy->num_iftype_ext_capab && + wiphy->iftype_ext_capab) { + u8 supported_on_all, j; + const struct wiphy_iftype_ext_capab *capab; + + capab = wiphy->iftype_ext_capab; + for (j = 0; j < wiphy->extended_capabilities_len; j++) { + if (capab[0].extended_capabilities_len > j) + supported_on_all = + capab[0].extended_capabilities[j]; + else + supported_on_all = 0x00; + for (i = 1; i < wiphy->num_iftype_ext_capab; i++) { + if (j >= capab[i].extended_capabilities_len) { + supported_on_all = 0x00; + break; + } + supported_on_all &= + capab[i].extended_capabilities[j]; + } + if (WARN_ON(wiphy->extended_capabilities[j] & + ~supported_on_all)) + break; + } + } + rdev->wiphy.registered = true; rtnl_unlock(); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 03ac2ba8b174..d12044996a0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1264,7 +1264,7 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg, struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; - long split_start, band_start, chan_start; + long split_start, band_start, chan_start, capa_start; bool split; }; @@ -1761,6 +1761,47 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, nla_nest_end(msg, nested); } + state->split_start++; + break; + case 13: + if (rdev->wiphy.num_iftype_ext_capab && + rdev->wiphy.iftype_ext_capab) { + struct nlattr *nested_ext_capab, *nested; + + nested = nla_nest_start(msg, + NL80211_ATTR_IFTYPE_EXT_CAPA); + if (!nested) + goto nla_put_failure; + + for (i = state->capa_start; + i < rdev->wiphy.num_iftype_ext_capab; i++) { + const struct wiphy_iftype_ext_capab *capab; + + capab = &rdev->wiphy.iftype_ext_capab[i]; + + nested_ext_capab = nla_nest_start(msg, i); + if (!nested_ext_capab || + nla_put_u32(msg, NL80211_ATTR_IFTYPE, + capab->iftype) || + nla_put(msg, NL80211_ATTR_EXT_CAPA, + capab->extended_capabilities_len, + capab->extended_capabilities) || + nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK, + capab->extended_capabilities_len, + capab->extended_capabilities_mask)) + goto nla_put_failure; + + nla_nest_end(msg, nested_ext_capab); + if (state->split) + break; + } + nla_nest_end(msg, nested); + if (i < rdev->wiphy.num_iftype_ext_capab) { + state->capa_start = i + 1; + break; + } + } + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From 595d0b29463343c3be995d3948930b8231e5b8cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 May 2016 15:22:41 -0700 Subject: udp: avoid csum_partial() for validated skb In commit e6afc8ace6dd5 ("udp: remove headers from UDP packets before queueing"), udp_csum_pull_header() helper was added but missed fact that CHECKSUM_UNNECESSARY packets were now converted to CHECKSUM_NONE and skb->csum_valid was set to 1 for them. Since csum_partial() is quite expensive, even for 8-byte area, it is worth adding a test. We also can use skb->data instead of udp_hdr() as we are pulling UDP headers, as it is sightly faster. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index ae07f375370d..8894d7144189 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -160,8 +160,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, static inline void udp_csum_pull_header(struct sk_buff *skb) { - if (skb->ip_summed == CHECKSUM_NONE) - skb->csum = csum_partial(udp_hdr(skb), sizeof(struct udphdr), + if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE) + skb->csum = csum_partial(skb->data, sizeof(struct udphdr), skb->csum); skb_pull_rcsum(skb, sizeof(struct udphdr)); UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); -- cgit v1.2.3 From 90017accff61ae89283ad9a51f9ac46ca01633fb Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 2 Jun 2016 15:05:43 -0300 Subject: sctp: Add GSO support SCTP has this pecualiarity that its packets cannot be just segmented to (P)MTU. Its chunks must be contained in IP segments, padding respected. So we can't just generate a big skb, set gso_size to the fragmentation point and deliver it to IP layer. This patch takes a different approach. SCTP will now build a skb as it would be if it was received using GRO. That is, there will be a cover skb with protocol headers and children ones containing the actual segments, already segmented to a way that respects SCTP RFCs. With that, we can tell skb_segment() to just split based on frag_list, trusting its sizes are already in accordance. This way SCTP can benefit from GSO and instead of passing several packets through the stack, it can pass a single large packet. v2: - Added support for receiving GSO frames, as requested by Dave Miller. - Clear skb->cb if packet is GSO (otherwise it's not used by SCTP) - Added heuristics similar to what we have in TCP for not generating single GSO packets that fills cwnd. v3: - consider sctphdr size in skb_gso_transport_seglen() - rebased due to 5c7cdf339af5 ("gso: Remove arbitrary checks for unsupported GSO") Signed-off-by: Marcelo Ricardo Leitner Tested-by: Xin Long Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 7 +- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 2 + include/net/sctp/sctp.h | 4 + include/net/sctp/structs.h | 5 + net/core/ethtool.c | 1 + net/core/skbuff.c | 3 + net/sctp/Makefile | 3 +- net/sctp/input.c | 12 +- net/sctp/inqueue.c | 51 +++++- net/sctp/offload.c | 98 +++++++++++ net/sctp/output.c | 363 +++++++++++++++++++++++++++------------- net/sctp/protocol.c | 3 + net/sctp/socket.c | 2 + 14 files changed, 429 insertions(+), 126 deletions(-) create mode 100644 net/sctp/offload.c (limited to 'include/net') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index aa7b2400f98c..9c6c8ef2e9e7 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -53,8 +53,9 @@ enum { * headers in software. */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ + NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_TUNNEL_REMCSUM_BIT, + NETIF_F_GSO_SCTP_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -128,6 +129,7 @@ enum { #define NETIF_F_TSO_MANGLEID __NETIF_F(TSO_MANGLEID) #define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL) #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) +#define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) @@ -166,7 +168,8 @@ enum { NETIF_F_FSO) /* List of features with software fallbacks. */ -#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_UFO) +#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_UFO | \ + NETIF_F_GSO_SCTP) /* * If one device supports one of these features, then enable them diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f45929ce8157..fa6df2699532 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4012,6 +4012,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index aa3f9d7e8d5c..dc0fca747c5e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -487,6 +487,8 @@ enum { SKB_GSO_PARTIAL = 1 << 13, SKB_GSO_TUNNEL_REMCSUM = 1 << 14, + + SKB_GSO_SCTP = 1 << 15, }; #if BITS_PER_LONG > 32 diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index b392ac8382f2..632e205ca54b 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -186,6 +186,10 @@ void sctp_assocs_proc_exit(struct net *net); int sctp_remaddr_proc_init(struct net *net); void sctp_remaddr_proc_exit(struct net *net); +/* + * sctp/offload.c + */ +int sctp_offload_init(void); /* * Module global variables diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 16b013a6191c..83c5ec58b93a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -566,6 +566,9 @@ struct sctp_chunk { /* This points to the sk_buff containing the actual data. */ struct sk_buff *skb; + /* In case of GSO packets, this will store the head one */ + struct sk_buff *head_skb; + /* These are the SCTP headers by reverse order in a packet. * Note that some of these may happen more than once. In that * case, we point at the "current" one, whatever that means @@ -696,6 +699,8 @@ struct sctp_packet { size_t overhead; /* This is the total size of all chunks INCLUDING padding. */ size_t size; + /* This is the maximum size this packet may have */ + size_t max_size; /* The packet is destined for this transport address. * The function we finally use to pass down to the next lower diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f4034817d255..977489820eb9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -89,6 +89,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5ca562b56ec3..b6e0f95bef36 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #ifdef CONFIG_NET_CLS_ACT #include @@ -4383,6 +4384,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); + } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) { + thlen = sizeof(struct sctphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 0fca5824ad0e..6c4f7496cec6 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -11,7 +11,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ - output.o input.o debug.o ssnmap.o auth.o + output.o input.o debug.o ssnmap.o auth.o \ + offload.o sctp_probe-y := probe.o diff --git a/net/sctp/input.c b/net/sctp/input.c index 5cff2546c3dd..6f8e676d285e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -139,7 +139,9 @@ int sctp_rcv(struct sk_buff *skb) skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); - else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0) + else if (!sctp_checksum_disable && + !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) && + sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; @@ -1175,6 +1177,14 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, { sctp_chunkhdr_t *ch; + /* We do not allow GSO frames here as we need to linearize and + * then cannot guarantee frame boundaries. This shouldn't be an + * issue as packets hitting this are mostly INIT or INIT-ACK and + * those cannot be on GSO-style anyway. + */ + if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) + return NULL; + if (skb_linearize(skb)) return NULL; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 5ba08ceda3ab..edabbbdfca54 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -138,6 +138,17 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { + if (chunk->head_skb == chunk->skb) { + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + goto new_skb; + } + if (chunk->skb->next) { + chunk->skb = chunk->skb->next; + goto new_skb; + } + + if (chunk->head_skb) + chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { @@ -155,15 +166,15 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) next_chunk: /* Is the queue empty? */ - if (list_empty(&queue->in_chunk_list)) + entry = sctp_list_dequeue(&queue->in_chunk_list); + if (!entry) return NULL; - entry = queue->in_chunk_list.next; chunk = list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); /* Linearize if it's not GSO */ - if (skb_is_nonlinear(chunk->skb)) { + if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) != SKB_GSO_SCTP && + skb_is_nonlinear(chunk->skb)) { if (skb_linearize(chunk->skb)) { __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); sctp_chunk_free(chunk); @@ -174,15 +185,39 @@ next_chunk: chunk->sctp_hdr = sctp_hdr(chunk->skb); } + if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) { + /* GSO-marked skbs but without frags, handle + * them normally + */ + if (skb_shinfo(chunk->skb)->frag_list) + chunk->head_skb = chunk->skb; + + /* skbs with "cover letter" */ + if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + + if (WARN_ON(!chunk->skb)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } + } + + if (chunk->asoc) + sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); + queue->in_progress = chunk; +new_skb: /* This is the first chunk in the packet. */ - chunk->singleton = 1; ch = (sctp_chunkhdr_t *) chunk->skb->data; + chunk->singleton = 1; chunk->data_accepted = 0; - - if (chunk->asoc) - sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); + chunk->pdiscard = 0; + chunk->auth = 0; + chunk->has_asconf = 0; + chunk->end_of_packet = 0; + chunk->ecn_ce_done = 0; } chunk->chunk_hdr = ch; diff --git a/net/sctp/offload.c b/net/sctp/offload.c new file mode 100644 index 000000000000..a37887b373a7 --- /dev/null +++ b/net/sctp/offload.c @@ -0,0 +1,98 @@ +/* + * sctp_offload - GRO/GSO Offloading for SCTP + * + * Copyright (C) 2015, Marcelo Ricardo Leitner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static __le32 sctp_gso_make_checksum(struct sk_buff *skb) +{ + skb->ip_summed = CHECKSUM_NONE; + return sctp_compute_cksum(skb, skb_transport_offset(skb)); +} + +static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sctphdr *sh; + + sh = sctp_hdr(skb); + if (!pskb_may_pull(skb, sizeof(*sh))) + goto out; + + __skb_pull(skb, sizeof(*sh)); + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + struct skb_shared_info *pinfo = skb_shinfo(skb); + struct sk_buff *frag_iter; + + pinfo->gso_segs = 0; + if (skb->len != skb->data_len) { + /* Means we have chunks in here too */ + pinfo->gso_segs++; + } + + skb_walk_frags(skb, frag_iter) + pinfo->gso_segs++; + + segs = NULL; + goto out; + } + + segs = skb_segment(skb, features | NETIF_F_HW_CSUM); + if (IS_ERR(segs)) + goto out; + + /* All that is left is update SCTP CRC if necessary */ + if (!(features & NETIF_F_SCTP_CRC)) { + for (skb = segs; skb; skb = skb->next) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + sh = sctp_hdr(skb); + sh->checksum = sctp_gso_make_checksum(skb); + } + } + } + +out: + return segs; +} + +static const struct net_offload sctp_offload = { + .callbacks = { + .gso_segment = sctp_gso_segment, + }, +}; + +int __init sctp_offload_init(void) +{ + return inet_add_offload(&sctp_offload, IPPROTO_SCTP); +} diff --git a/net/sctp/output.c b/net/sctp/output.c index 9844fe573029..60499a69179d 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -84,18 +84,42 @@ static void sctp_packet_reset(struct sctp_packet *packet) struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { - struct sctp_chunk *chunk = NULL; + struct sctp_transport *tp = packet->transport; + struct sctp_association *asoc = tp->asoc; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; + if (asoc && tp->dst) { + struct sock *sk = asoc->base.sk; + + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + + if (sk_can_gso(sk)) { + struct net_device *dev = tp->dst->dev; + + packet->max_size = dev->gso_max_size; + } else { + packet->max_size = asoc->pathmtu; + } + rcu_read_unlock(); + + } else { + packet->max_size = tp->pathmtu; + } + if (ecn_capable && sctp_packet_empty(packet)) { - chunk = sctp_get_ecne_prepend(packet->transport->asoc); + struct sctp_chunk *chunk; /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ + chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } @@ -381,12 +405,15 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - struct sk_buff *nskb; + struct sk_buff *nskb = NULL, *head = NULL; struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ + int pkt_size; __u8 has_data = 0; + int gso = 0; + int pktcount = 0; struct dst_entry *dst; unsigned char *auth = NULL; /* pointer to auth in skb data */ @@ -400,18 +427,37 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; - /* Allocate the new skb. */ - nskb = alloc_skb(packet->size + MAX_HEADER, gfp); - if (!nskb) + /* Allocate the head skb, or main one if not in GSO */ + if (packet->size > tp->pathmtu && !packet->ipfragok) { + if (sk_can_gso(sk)) { + gso = 1; + pkt_size = packet->overhead; + } else { + /* If this happens, we trash this packet and try + * to build a new one, hopefully correct this + * time. Application may notice this error. + */ + pr_err_once("Trying to GSO but underlying device doesn't support it."); + goto nomem; + } + } else { + pkt_size = packet->size; + } + head = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!head) goto nomem; + if (gso) { + NAPI_GRO_CB(head)->last = head; + skb_shinfo(head)->gso_type = sk->sk_gso_type; + } /* Make sure the outbound skb has enough header room reserved. */ - skb_reserve(nskb, packet->overhead + MAX_HEADER); + skb_reserve(head, packet->overhead + MAX_HEADER); /* Set the owning socket so that we know where to get the * destination IP address. */ - sctp_packet_set_owner_w(nskb, sk); + sctp_packet_set_owner_w(head, sk); if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); @@ -422,11 +468,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) dst = dst_clone(tp->dst); if (!dst) goto no_route; - skb_dst_set(nskb, dst); + skb_dst_set(head, dst); /* Build the SCTP header. */ - sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); - skb_reset_transport_header(nskb); + sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr)); + skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); @@ -441,90 +487,133 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sh->vtag = htonl(packet->vtag); sh->checksum = 0; - /** - * 6.10 Bundling - * - * An endpoint bundles chunks by simply including multiple - * chunks in one outbound SCTP packet. ... - */ - - /** - * 3.2 Chunk Field Descriptions - * - * The total length of a chunk (including Type, Length and - * Value fields) MUST be a multiple of 4 bytes. If the length - * of the chunk is not a multiple of 4 bytes, the sender MUST - * pad the chunk with all zero bytes and this padding is not - * included in the chunk length field. The sender should - * never pad with more than 3 bytes. - * - * [This whole comment explains WORD_ROUND() below.] - */ - pr_debug("***sctp_transmit_packet***\n"); - list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { - list_del_init(&chunk->list); - if (sctp_chunk_is_data(chunk)) { - /* 6.3.1 C4) When data is in flight and when allowed - * by rule C5, a new RTT measurement MUST be made each - * round trip. Furthermore, new RTT measurements - * SHOULD be made no more than once per round-trip - * for a given destination transport address. - */ + do { + /* Set up convenience variables... */ + chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); + pktcount++; - if (!chunk->resent && !tp->rto_pending) { - chunk->rtt_in_progress = 1; - tp->rto_pending = 1; + /* Calculate packet size, so it fits in PMTU. Leave + * other chunks for the next packets. + */ + if (gso) { + pkt_size = packet->overhead; + list_for_each_entry(chunk, &packet->chunk_list, list) { + int padded = WORD_ROUND(chunk->skb->len); + + if (pkt_size + padded > tp->pathmtu) + break; + pkt_size += padded; } - has_data = 1; - } + /* Allocate a new skb. */ + nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!nskb) + goto nomem; - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; - if (padding) - memset(skb_put(chunk->skb, padding), 0, padding); + /* Make sure the outbound skb has enough header + * room reserved. + */ + skb_reserve(nskb, packet->overhead + MAX_HEADER); + } else { + nskb = head; + } - /* if this is the auth chunk that we are adding, - * store pointer where it will be added and put - * the auth into the packet. + /** + * 3.2 Chunk Field Descriptions + * + * The total length of a chunk (including Type, Length and + * Value fields) MUST be a multiple of 4 bytes. If the length + * of the chunk is not a multiple of 4 bytes, the sender MUST + * pad the chunk with all zero bytes and this padding is not + * included in the chunk length field. The sender should + * never pad with more than 3 bytes. + * + * [This whole comment explains WORD_ROUND() below.] */ - if (chunk == packet->auth) - auth = skb_tail_pointer(nskb); - memcpy(skb_put(nskb, chunk->skb->len), + pkt_size -= packet->overhead; + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); + if (sctp_chunk_is_data(chunk)) { + /* 6.3.1 C4) When data is in flight and when allowed + * by rule C5, a new RTT measurement MUST be made each + * round trip. Furthermore, new RTT measurements + * SHOULD be made no more than once per round-trip + * for a given destination transport address. + */ + + if (!chunk->resent && !tp->rto_pending) { + chunk->rtt_in_progress = 1; + tp->rto_pending = 1; + } + + has_data = 1; + } + + padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + if (padding) + memset(skb_put(chunk->skb, padding), 0, padding); + + /* if this is the auth chunk that we are adding, + * store pointer where it will be added and put + * the auth into the packet. + */ + if (chunk == packet->auth) + auth = skb_tail_pointer(nskb); + + memcpy(skb_put(nskb, chunk->skb->len), chunk->skb->data, chunk->skb->len); - pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, " - "rtt_in_progress:%d\n", chunk, - sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), - chunk->has_tsn ? "TSN" : "No TSN", - chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, - ntohs(chunk->chunk_hdr->length), chunk->skb->len, - chunk->rtt_in_progress); - - /* - * If this is a control chunk, this is our last - * reference. Free data chunks after they've been - * acknowledged or have failed. - */ - if (!sctp_chunk_is_data(chunk)) - sctp_chunk_free(chunk); - } + pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", + chunk, + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), + chunk->has_tsn ? "TSN" : "No TSN", + chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, + ntohs(chunk->chunk_hdr->length), chunk->skb->len, + chunk->rtt_in_progress); + + /* If this is a control chunk, this is our last + * reference. Free data chunks after they've been + * acknowledged or have failed. + * Re-queue auth chunks if needed. + */ + pkt_size -= WORD_ROUND(chunk->skb->len); - /* SCTP-AUTH, Section 6.2 - * The sender MUST calculate the MAC as described in RFC2104 [2] - * using the hash function H as described by the MAC Identifier and - * the shared association key K based on the endpoint pair shared key - * described by the shared key identifier. The 'data' used for the - * computation of the AUTH-chunk is given by the AUTH chunk with its - * HMAC field set to zero (as shown in Figure 6) followed by all - * chunks that are placed after the AUTH chunk in the SCTP packet. - */ - if (auth) - sctp_auth_calculate_hmac(asoc, nskb, - (struct sctp_auth_chunk *)auth, - gfp); + if (chunk == packet->auth && !list_empty(&packet->chunk_list)) + list_add(&chunk->list, &packet->chunk_list); + else if (!sctp_chunk_is_data(chunk)) + sctp_chunk_free(chunk); + + if (!pkt_size) + break; + } + + /* SCTP-AUTH, Section 6.2 + * The sender MUST calculate the MAC as described in RFC2104 [2] + * using the hash function H as described by the MAC Identifier and + * the shared association key K based on the endpoint pair shared key + * described by the shared key identifier. The 'data' used for the + * computation of the AUTH-chunk is given by the AUTH chunk with its + * HMAC field set to zero (as shown in Figure 6) followed by all + * chunks that are placed after the AUTH chunk in the SCTP packet. + */ + if (auth) + sctp_auth_calculate_hmac(asoc, nskb, + (struct sctp_auth_chunk *)auth, + gfp); + + if (!gso) + break; + + if (skb_gro_receive(&head, nskb)) + goto nomem; + nskb = NULL; + if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= + sk->sk_gso_max_segs)) + goto nomem; + } while (!list_empty(&packet->chunk_list)); /* 2) Calculate the Adler-32 checksum of the whole packet, * including the SCTP common header and all the @@ -532,16 +621,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . + * + * If it's a GSO packet, it's postponed to sctp_skb_segment. */ - if (!sctp_checksum_disable) { - if (!(dst->dev->features & NETIF_F_SCTP_CRC) || - (dst_xfrm(dst) != NULL) || packet->ipfragok) { - sh->checksum = sctp_compute_cksum(nskb, 0); + if (!sctp_checksum_disable || gso) { + if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) || + dst_xfrm(dst) || packet->ipfragok)) { + sh->checksum = sctp_compute_cksum(head, 0); } else { /* no need to seed pseudo checksum for SCTP */ - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = skb_transport_header(nskb) - nskb->head; - nskb->csum_offset = offsetof(struct sctphdr, checksum); + head->ip_summed = CHECKSUM_PARTIAL; + head->csum_start = skb_transport_header(head) - head->head; + head->csum_offset = offsetof(struct sctphdr, checksum); } } @@ -557,7 +648,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * Note: The works for IPv6 layer checks this bit too later * in transmission. See IP6_ECN_flow_xmit(). */ - tp->af_specific->ecn_capable(nskb->sk); + tp->af_specific->ecn_capable(sk); /* Set up the IP options. */ /* BUG: not implemented @@ -566,7 +657,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) /* Dump that on IP! */ if (asoc) { - asoc->stats.opackets++; + asoc->stats.opackets += pktcount; if (asoc->peer.last_sent_to != tp) /* Considering the multiple CPU scenario, this is a * "correcter" place for last_sent_to. --xguo @@ -589,16 +680,36 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } } - pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len); + pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); + + if (gso) { + /* Cleanup our debris for IP stacks */ + memset(head->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); - nskb->ignore_df = packet->ipfragok; - tp->af_specific->sctp_xmit(nskb, tp); + skb_shinfo(head)->gso_segs = pktcount; + skb_shinfo(head)->gso_size = GSO_BY_FRAGS; + + /* We have to refresh this in case we are xmiting to + * more than one transport at a time + */ + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + rcu_read_unlock(); + } + head->ignore_df = packet->ipfragok; + tp->af_specific->sctp_xmit(head, tp); out: sctp_packet_reset(packet); return err; no_route: - kfree_skb(nskb); + kfree_skb(head); + if (nskb != head) + kfree_skb(nskb); if (asoc) IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); @@ -751,39 +862,63 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { - size_t psize; - size_t pmtu; - int too_big; + size_t psize, pmtu; sctp_xmit_t retval = SCTP_XMIT_OK; psize = packet->size; - pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pathmtu) : - (packet->transport->pathmtu)); - - too_big = (psize + chunk_len > pmtu); + if (packet->transport->asoc) + pmtu = packet->transport->asoc->pathmtu; + else + pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ - if (too_big) { - /* It's OK to fragmet at IP level if any one of the following + if (psize + chunk_len > pmtu) { + /* It's OK to fragment at IP level if any one of the following * is true: - * 1. The packet is empty (meaning this chunk is greater - * the MTU) - * 2. The chunk we are adding is a control chunk - * 3. The packet doesn't have any data in it yet and data - * requires authentication. + * 1. The packet is empty (meaning this chunk is greater + * the MTU) + * 2. The packet doesn't have any data in it yet and data + * requires authentication. */ - if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) || + if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; - } else { - retval = SCTP_XMIT_PMTU_FULL; + goto out; } + + /* It is also okay to fragment if the chunk we are + * adding is a control chunk, but only if current packet + * is not a GSO one otherwise it causes fragmentation of + * a large frame. So in this case we allow the + * fragmentation by forcing it to be in a new packet. + */ + if (!sctp_chunk_is_data(chunk) && packet->has_data) + retval = SCTP_XMIT_PMTU_FULL; + + if (psize + chunk_len > packet->max_size) + /* Hit GSO/PMTU limit, gotta flush */ + retval = SCTP_XMIT_PMTU_FULL; + + if (!packet->transport->burst_limited && + psize + chunk_len > (packet->transport->cwnd >> 1)) + /* Do not allow a single GSO packet to use more + * than half of cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + + if (packet->transport->burst_limited && + psize + chunk_len > (packet->transport->burst_limited >> 1)) + /* Do not allow a single GSO packet to use more + * than half of original cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + /* Otherwise it will fit in the GSO packet */ } +out: return retval; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d3d50daa248b..40022ee885d7 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1516,6 +1516,9 @@ static __init int sctp_init(void) if (status) goto err_v6_add_protocol; + if (sctp_offload_init() < 0) + pr_crit("%s: Cannot add SCTP protocol offload\n", __func__); + out: return status; err_v6_add_protocol: diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 67154b848aa9..712fb2339baa 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4003,6 +4003,8 @@ static int sctp_init_sock(struct sock *sk) return -ESOCKTNOSUPPORT; } + sk->sk_gso_type = SKB_GSO_SCTP; + /* Initialize default send parameters. These parameters can be * modified with the SCTP_DEFAULT_SEND_PARAM socket option. */ -- cgit v1.2.3 From c8b098086b4c744084350d2757a637ad756adf34 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:57 +0200 Subject: net: dsa: Add a ports structure and use it in the switch structure There are going to be more per-port members added to the switch structure. So add a port structure and move the netdev into it. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 4 ++-- drivers/net/dsa/mv88e6xxx.c | 27 ++++++++++++++++----------- include/net/dsa.h | 8 ++++++-- net/dsa/dsa.c | 8 ++++---- net/dsa/slave.c | 4 ++-- net/dsa/tag_brcm.c | 4 ++-- net/dsa/tag_dsa.c | 4 ++-- net/dsa/tag_edsa.c | 4 ++-- net/dsa/tag_trailer.c | 4 ++-- 9 files changed, 38 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 10ddd5a5dfb6..73df91bb0466 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -804,7 +804,7 @@ static int bcm_sf2_sw_fdb_dump(struct dsa_switch *ds, int port, int (*cb)(struct switchdev_obj *obj)) { struct bcm_sf2_priv *priv = ds_to_priv(ds); - struct net_device *dev = ds->ports[port]; + struct net_device *dev = ds->ports[port].netdev; struct bcm_sf2_arl_entry results[2]; unsigned int count = 0; int ret; @@ -1248,7 +1248,7 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port, * state machine and make it go in PHY_FORCING state instead. */ if (!status->link) - netif_carrier_off(ds->ports[port]); + netif_carrier_off(ds->ports[port].netdev); status->duplex = 1; } else { status->link = 1; diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index c361036e7f9c..85332d9a245a 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -1327,7 +1327,7 @@ static int _mv88e6xxx_port_state(struct mv88e6xxx_priv_state *ps, int port, if (ret) return ret; - netdev_dbg(ds->ports[port], "PortState %s (was %s)\n", + netdev_dbg(ds->ports[port].netdev, "PortState %s (was %s)\n", mv88e6xxx_port_state_names[state], mv88e6xxx_port_state_names[oldstate]); } @@ -1405,7 +1405,8 @@ static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port, mutex_unlock(&ps->smi_mutex); if (err) - netdev_err(ds->ports[port], "failed to update state to %s\n", + netdev_err(ds->ports[port].netdev, + "failed to update state to %s\n", mv88e6xxx_port_state_names[stp_state]); } @@ -1431,8 +1432,8 @@ static int _mv88e6xxx_port_pvid(struct mv88e6xxx_priv_state *ps, int port, if (ret < 0) return ret; - netdev_dbg(ds->ports[port], "DefaultVID %d (was %d)\n", *new, - pvid); + netdev_dbg(ds->ports[port].netdev, + "DefaultVID %d (was %d)\n", *new, pvid); } if (old) @@ -1847,7 +1848,8 @@ static int _mv88e6xxx_port_fid(struct mv88e6xxx_priv_state *ps, int port, if (ret < 0) return ret; - netdev_dbg(ds->ports[port], "FID %d (was %d)\n", *new, fid); + netdev_dbg(ds->ports[port].netdev, + "FID %d (was %d)\n", *new, fid); } if (old) @@ -2028,7 +2030,7 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, ps->ports[port].bridge_dev) break; /* same bridge, check next VLAN */ - netdev_warn(ds->ports[port], + netdev_warn(ds->ports[port].netdev, "hardware VLAN %d already used by %s\n", vlan.vid, netdev_name(ps->ports[i].bridge_dev)); @@ -2078,7 +2080,7 @@ static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port, if (ret < 0) goto unlock; - netdev_dbg(ds->ports[port], "802.1Q Mode %s (was %s)\n", + netdev_dbg(ds->ports[port].netdev, "802.1Q Mode %s (was %s)\n", mv88e6xxx_port_8021q_mode_names[new], mv88e6xxx_port_8021q_mode_names[old]); } @@ -2147,11 +2149,12 @@ static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) if (_mv88e6xxx_port_vlan_add(ps, port, vid, untagged)) - netdev_err(ds->ports[port], "failed to add VLAN %d%c\n", + netdev_err(ds->ports[port].netdev, + "failed to add VLAN %d%c\n", vid, untagged ? 'u' : 't'); if (pvid && _mv88e6xxx_port_pvid_set(ps, port, vlan->vid_end)) - netdev_err(ds->ports[port], "failed to set PVID %d\n", + netdev_err(ds->ports[port].netdev, "failed to set PVID %d\n", vlan->vid_end); mutex_unlock(&ps->smi_mutex); @@ -2336,7 +2339,8 @@ static void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, mutex_lock(&ps->smi_mutex); if (_mv88e6xxx_port_fdb_load(ps, port, fdb->addr, fdb->vid, state)) - netdev_err(ds->ports[port], "failed to load MAC address\n"); + netdev_err(ds->ports[port].netdev, + "failed to load MAC address\n"); mutex_unlock(&ps->smi_mutex); } @@ -2537,7 +2541,8 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port) for (i = 0; i < ps->info->num_ports; ++i) if (i == port || ps->ports[i].bridge_dev == bridge) if (_mv88e6xxx_port_based_vlan_map(ps, i)) - netdev_warn(ds->ports[i], "failed to remap\n"); + netdev_warn(ds->ports[i].netdev, + "failed to remap\n"); mutex_unlock(&ps->smi_mutex); } diff --git a/include/net/dsa.h b/include/net/dsa.h index 17c3d37b6779..9aed8572037c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -119,6 +119,10 @@ struct dsa_switch_tree { struct dsa_switch *ds[DSA_MAX_SWITCHES]; }; +struct dsa_port { + struct net_device *netdev; +}; + struct dsa_switch { struct device *dev; @@ -158,8 +162,8 @@ struct dsa_switch { u32 dsa_port_mask; u32 enabled_port_mask; u32 phys_mii_mask; + struct dsa_port ports[DSA_MAX_PORTS]; struct mii_bus *slave_mii_bus; - struct net_device *ports[DSA_MAX_PORTS]; }; static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) @@ -174,7 +178,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) static inline bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return ds->enabled_port_mask & (1 << p) && ds->ports[p]; + return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; } static inline u8 dsa_upstream_port(struct dsa_switch *ds) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index eff5dfc2e33f..18086e0cc617 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -437,10 +437,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(ds->enabled_port_mask & (1 << port))) continue; - if (!ds->ports[port]) + if (!ds->ports[port].netdev) continue; - dsa_slave_destroy(ds->ports[port]); + dsa_slave_destroy(ds->ports[port].netdev); } /* Remove any fixed link PHYs */ @@ -469,7 +469,7 @@ static int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i]); + ret = dsa_slave_suspend(ds->ports[i].netdev); if (ret) return ret; } @@ -495,7 +495,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i]); + ret = dsa_slave_resume(ds->ports[i].netdev); if (ret) return ret; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f640a48a6ff3..169abacbc6ce 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1183,12 +1183,12 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->old_link = -1; p->old_duplex = -1; - ds->ports[port] = slave_dev; + ds->ports[port].netdev = slave_dev; ret = register_netdev(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); - ds->ports[port] = NULL; + ds->ports[port].netdev = NULL; free_netdev(slave_dev); return ret; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e2aadb73111d..21bffde6e4bf 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -127,7 +127,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, source_port = brcm_tag[3] & BRCM_EG_PID_MASK; /* Validate port against switch setup, either the port is totally */ - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* Remove Broadcom tag and update checksum */ @@ -140,7 +140,7 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb->protocol = eth_type_trans(skb, skb->dev); skb->dev->stats.rx_packets++; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index f9832f097681..bce79ffe342b 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -114,7 +114,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds) goto out_drop; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* @@ -163,7 +163,7 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 3890aac8190f..6c1720e88537 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -127,7 +127,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds) goto out_drop; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; /* @@ -182,7 +182,7 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b6ca0890d018..5e3903eb1afa 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -82,12 +82,12 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, goto out_drop; source_port = trailer[1] & 7; - if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + if (source_port >= DSA_MAX_PORTS || !ds->ports[source_port].netdev) goto out_drop; pskb_trim_rcsum(skb, skb->len - 4); - skb->dev = ds->ports[source_port]; + skb->dev = ds->ports[source_port].netdev; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); -- cgit v1.2.3 From 189b0d93ec61e1f991e96d7bc03b03cf929d164c Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:58 +0200 Subject: net: dsa: Move port device node into port structure Move the port device node structure into the port structure, from the chip data. This information is needed in the next step of implementing the new binding. The chip data structure is used while parsing the whole old binding, before the individual switch structures exist. With the new bindings, this is reversed, the switches exist first, and the interconnections between the switches is derived from the individual switch bindings. Thus this chip data structure becomes unneeded. Signed-off-by: Andrew Lunn eviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 8 ++++---- net/dsa/slave.c | 5 ++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9aed8572037c..8314197d028f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -121,6 +121,7 @@ struct dsa_switch_tree { struct dsa_port { struct net_device *netdev; + struct device_node *dn; }; struct dsa_switch { diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 18086e0cc617..5907f8cd13b6 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -182,7 +182,6 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); /* basic switch operations **************************************************/ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) { - struct dsa_chip_data *cd = ds->cd; struct device_node *port_dn; struct phy_device *phydev; int ret, port, mode; @@ -191,7 +190,7 @@ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; - port_dn = cd->port_dn[port]; + port_dn = ds->ports[port].dn; if (of_phy_is_fixed_link(port_dn)) { ret = of_phy_register_fixed_link(port_dn); if (ret) { @@ -325,6 +324,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * Create network devices for physical switch ports. */ for (i = 0; i < DSA_MAX_PORTS; i++) { + ds->ports[i].dn = cd->port_dn[i]; + if (!(ds->enabled_port_mask & (1 << i))) continue; @@ -424,7 +425,6 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { struct device_node *port_dn; struct phy_device *phydev; - struct dsa_chip_data *cd = ds->cd; int port; #ifdef CONFIG_NET_DSA_HWMON @@ -445,7 +445,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) /* Remove any fixed link PHYs */ for (port = 0; port < DSA_MAX_PORTS; port++) { - port_dn = cd->port_dn[port]; + port_dn = ds->ports[port].dn; if (of_phy_is_fixed_link(port_dn)) { phydev = of_phy_find_device(port_dn); if (phydev) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 169abacbc6ce..52f1183c42a0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -998,13 +998,12 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) { struct dsa_switch *ds = p->parent; - struct dsa_chip_data *cd = ds->cd; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; u32 phy_flags = 0; int mode, ret; - port_dn = cd->port_dn[p->port]; + port_dn = ds->ports[p->port].dn; mode = of_get_phy_mode(port_dn); if (mode < 0) mode = PHY_INTERFACE_MODE_NA; @@ -1146,7 +1145,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, NULL); SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->cd->port_dn[port]; + slave_dev->dev.of_node = ds->ports[port].dn; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); -- cgit v1.2.3 From 4a7704ffa86705b0580b6473c407b7b7618e072d Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:16:59 +0200 Subject: net: dsa: Remove dynamic allocate of routing table With a maximum of four switches, the size of the routing table is the same as the pointer to it. Removing it makes the code simpler. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx.c | 3 +-- include/net/dsa.h | 9 ++++----- net/dsa/dsa.c | 12 ------------ 3 files changed, 5 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 85332d9a245a..d622c0fb76cc 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -3024,8 +3024,7 @@ static int mv88e6xxx_setup_global(struct mv88e6xxx_priv_state *ps) for (i = 0; i < 32; i++) { int nexthop = 0x1f; - if (ps->ds->cd->rtable && - i != ps->ds->index && i < ps->ds->dst->pd->nr_chips) + if (i != ps->ds->index && i < ps->ds->dst->pd->nr_chips) nexthop = ps->ds->cd->rtable[i] & 0x1f; err = _mv88e6xxx_reg_write( diff --git a/include/net/dsa.h b/include/net/dsa.h index 8314197d028f..4e3afa9648ca 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -58,12 +58,11 @@ struct dsa_chip_data { struct device_node *port_dn[DSA_MAX_PORTS]; /* - * An array (with nr_chips elements) of which element [a] - * indicates which port on this switch should be used to - * send packets to that are destined for switch a. Can be - * NULL if there is only one switch chip. + * An array of which element [a] indicates which port on this + * switch should be used to send packets to that are destined + * for switch a. Can be NULL if there is only one switch chip. */ - s8 *rtable; + s8 rtable[DSA_MAX_SWITCHES]; }; struct dsa_platform_data { diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5907f8cd13b6..6177dd750847 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -587,17 +587,6 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, if (link_sw_addr >= pd->nr_chips) return -EINVAL; - /* First time routing table allocation */ - if (!cd->rtable) { - cd->rtable = kmalloc_array(pd->nr_chips, sizeof(s8), - GFP_KERNEL); - if (!cd->rtable) - return -ENOMEM; - - /* default to no valid uplink/downlink */ - memset(cd->rtable, -1, pd->nr_chips * sizeof(s8)); - } - cd->rtable[link_sw_addr] = port_index; return 0; @@ -639,7 +628,6 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) kfree(pd->chip[i].port_names[port_index]); port_index++; } - kfree(pd->chip[i].rtable); /* Drop our reference to the MDIO bus device */ if (pd->chip[i].host_dev) -- cgit v1.2.3 From 66472fc04e8be62858f29c7798ed17e984c1ab3b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:00 +0200 Subject: net: dsa: Copy the routing table into the switch structure The new binding will not have a chip data structure, it will place the routing directly into the switch structure. To enable backwards compatibility, copy the routing from the chip data into the switch structure. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx.c | 4 ++-- include/net/dsa.h | 9 ++++++++- net/dsa/dsa.c | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index d622c0fb76cc..492801a6398c 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -3024,8 +3024,8 @@ static int mv88e6xxx_setup_global(struct mv88e6xxx_priv_state *ps) for (i = 0; i < 32; i++) { int nexthop = 0x1f; - if (i != ps->ds->index && i < ps->ds->dst->pd->nr_chips) - nexthop = ps->ds->cd->rtable[i] & 0x1f; + if (i != ds->index && i < DSA_MAX_SWITCHES) + nexthop = ds->rtable[i] & 0x1f; err = _mv88e6xxx_reg_write( ps, REG_GLOBAL2, diff --git a/include/net/dsa.h b/include/net/dsa.h index 4e3afa9648ca..b666f27b3daa 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -148,6 +148,13 @@ struct dsa_switch { */ struct dsa_switch_driver *drv; + /* + * An array of which element [a] indicates which port on this + * switch should be used to send packets to that are destined + * for switch a. Can be NULL if there is only one switch chip. + */ + s8 rtable[DSA_MAX_SWITCHES]; + #ifdef CONFIG_NET_DSA_HWMON /* * Hardware monitoring information @@ -194,7 +201,7 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) if (dst->cpu_switch == ds->index) return dst->cpu_port; else - return ds->cd->rtable[dst->cpu_switch]; + return ds->rtable[dst->cpu_switch]; } struct switchdev_trans; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6177dd750847..bfe1d03d4730 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -297,6 +297,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) dst->tag_protocol = drv->tag_protocol; } + memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); + /* * Do basic register setup. */ -- cgit v1.2.3 From 39a7f2a4eb496c0c68cc93fcb403190b48605168 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:03 +0200 Subject: net: dsa: Refactor selection of tag ops into a function Replace the two switch statements with an array lookup, and store the result in the dsa tree structure. The drivers no longer need to know the selected tag protocol, so remove it from the dsa switch structure. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 8 +++++- net/dsa/dsa.c | 71 ++++++++++++++++++++++++++++++++++-------------------- net/dsa/dsa_priv.h | 1 + net/dsa/slave.c | 35 +-------------------------- 4 files changed, 54 insertions(+), 61 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index b666f27b3daa..bd6ecaa0de96 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -26,6 +26,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_TRAILER, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_BRCM, + DSA_TAG_LAST, /* MUST BE LAST */ }; #define DSA_MAX_SWITCHES 4 @@ -99,7 +100,6 @@ struct dsa_switch_tree { struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); - enum dsa_tag_protocol tag_protocol; /* * Original copy of the master netdev ethtool_ops @@ -116,6 +116,12 @@ struct dsa_switch_tree { * Data for the individual switch chips. */ struct dsa_switch *ds[DSA_MAX_SWITCHES]; + + /* + * Tagging protocol operations for adding and removing an + * encapsulation tag. + */ + const struct dsa_device_ops *tag_ops; }; struct dsa_port { diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 7140de475c07..221ebde4318d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -29,6 +29,33 @@ char dsa_driver_version[] = "0.1"; +static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Just return the original SKB */ + return skb; +} + +static const struct dsa_device_ops none_ops = { + .xmit = dsa_slave_notag_xmit, + .rcv = NULL, +}; + +const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { +#ifdef CONFIG_NET_DSA_TAG_DSA + [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, +#endif + [DSA_TAG_PROTO_NONE] = &none_ops, +}; /* switch driver registration ***********************************************/ static DEFINE_MUTEX(dsa_switch_drivers_mutex); @@ -225,6 +252,20 @@ static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) return 0; } +const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) +{ + const struct dsa_device_ops *ops; + + if (tag_protocol >= DSA_TAG_LAST) + return ERR_PTR(-EINVAL); + ops = dsa_device_ops[tag_protocol]; + + if (!ops) + return ERR_PTR(-ENOPROTOOPT); + + return ops; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -277,35 +318,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * switch. */ if (dst->cpu_switch == index) { - switch (drv->tag_protocol) { -#ifdef CONFIG_NET_DSA_TAG_DSA - case DSA_TAG_PROTO_DSA: - dst->rcv = dsa_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - case DSA_TAG_PROTO_EDSA: - dst->rcv = edsa_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - case DSA_TAG_PROTO_TRAILER: - dst->rcv = trailer_netdev_ops.rcv; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - case DSA_TAG_PROTO_BRCM: - dst->rcv = brcm_netdev_ops.rcv; - break; -#endif - case DSA_TAG_PROTO_NONE: - break; - default: - ret = -ENOPROTOOPT; + dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol); + if (IS_ERR(dst->tag_ops)) { + ret = PTR_ERR(dst->tag_ops); goto out; } - dst->tag_protocol = drv->tag_protocol; + dst->rcv = dst->tag_ops->rcv; } memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index dbea5d9e7f75..72f7b8989cfb 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -53,6 +53,7 @@ extern char dsa_driver_version[]; int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct device_node *port_dn, int port); void dsa_cpu_dsa_destroy(struct device_node *port_dn); +const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 52f1183c42a0..35e5f0f6688b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -521,14 +521,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - /* Just return the original SKB */ - return skb; -} - - /* ethtool operations *******************************************************/ static int dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) @@ -1151,32 +1143,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, p = netdev_priv(slave_dev); p->parent = ds; p->port = port; - - switch (ds->dst->tag_protocol) { -#ifdef CONFIG_NET_DSA_TAG_DSA - case DSA_TAG_PROTO_DSA: - p->xmit = dsa_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - case DSA_TAG_PROTO_EDSA: - p->xmit = edsa_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - case DSA_TAG_PROTO_TRAILER: - p->xmit = trailer_netdev_ops.xmit; - break; -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM - case DSA_TAG_PROTO_BRCM: - p->xmit = brcm_netdev_ops.xmit; - break; -#endif - default: - p->xmit = dsa_slave_notag_xmit; - break; - } + p->xmit = dst->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; -- cgit v1.2.3 From 83c0afaec7b730b16c518aecc8e6246ec91b265e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 4 Jun 2016 21:17:07 +0200 Subject: net: dsa: Add new binding implementation The existing DSA binding has a number of limitations and problems. The main problem is that it cannot represent a switch as a linux device, hanging off some bus. It is limited to one CPU port. The DSA platform device is artificial, and does not really represent hardware. Implement a new binding which can be embedded into any type of node on a bus to represent one switch device, and its links to other switches. Signed-off-by: Andrew Lunn Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx.c | 7 + include/net/dsa.h | 20 ++ net/dsa/Makefile | 2 +- net/dsa/dsa.c | 5 + net/dsa/dsa2.c | 654 ++++++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa_priv.h | 2 +- net/dsa/slave.c | 8 +- 7 files changed, 694 insertions(+), 4 deletions(-) create mode 100644 net/dsa/dsa2.c (limited to 'include/net') diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 192b39cbdbcd..ee06055444f9 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -3748,6 +3748,12 @@ int mv88e6xxx_probe(struct mdio_device *mdiodev) dev_set_drvdata(dev, ds); + err = dsa_register_switch(ds, mdiodev->dev.of_node); + if (err) { + mv88e6xxx_mdio_unregister(ps); + return err; + } + dev_info(dev, "switch 0x%x probed: %s, revision %u\n", prod_num, ps->info->name, rev); @@ -3759,6 +3765,7 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev) struct dsa_switch *ds = dev_get_drvdata(&mdiodev->dev); struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); + dsa_unregister_switch(ds); put_device(&ps->bus->dev); mv88e6xxx_mdio_unregister(ps); diff --git a/include/net/dsa.h b/include/net/dsa.h index bd6ecaa0de96..cca7ef230742 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -85,6 +85,17 @@ struct dsa_platform_data { struct packet_type; struct dsa_switch_tree { + struct list_head list; + + /* Tree identifier */ + u32 tree; + + /* Number of switches attached to this tree */ + struct kref refcount; + + /* Has this tree been applied to the hardware? */ + bool applied; + /* * Configuration data for the platform device that owns * this dsa switch tree instance. @@ -169,10 +180,16 @@ struct dsa_switch { struct device *hwmon_dev; #endif + /* + * The lower device this switch uses to talk to the host + */ + struct net_device *master_netdev; + /* * Slave mii_bus and devices for the individual ports. */ u32 dsa_port_mask; + u32 cpu_port_mask; u32 enabled_port_mask; u32 phys_mii_mask; struct dsa_port ports[DSA_MAX_PORTS]; @@ -361,4 +378,7 @@ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) { return dst->rcv != NULL; } + +void dsa_unregister_switch(struct dsa_switch *ds); +int dsa_register_switch(struct dsa_switch *ds, struct device_node *np); #endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index da06ed1df620..8af4ded70f1c 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o slave.o +dsa_core-y += dsa.o slave.o dsa2.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6c314f300424..ce3b942dce76 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -294,6 +294,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) } dst->cpu_switch = index; dst->cpu_port = i; + ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; } else { @@ -492,6 +493,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) continue; dsa_cpu_dsa_destroy(ds->ports[port].dn); + + /* Clearing a bit which is not set does no harm */ + ds->cpu_port_mask |= ~(1 << port); + ds->dsa_port_mask |= ~(1 << port); } if (ds->slave_mii_bus && ds->drv->phy_read) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c new file mode 100644 index 000000000000..80dfe08db825 --- /dev/null +++ b/net/dsa/dsa2.c @@ -0,0 +1,654 @@ +/* + * net/dsa/dsa2.c - Hardware switch handling, binding version 2 + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli + * Copyright (c) 2016 Andrew Lunn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dsa_priv.h" + +static LIST_HEAD(dsa_switch_trees); +static DEFINE_MUTEX(dsa2_mutex); + +static struct dsa_switch_tree *dsa_get_dst(u32 tree) +{ + struct dsa_switch_tree *dst; + + list_for_each_entry(dst, &dsa_switch_trees, list) + if (dst->tree == tree) + return dst; + return NULL; +} + +static void dsa_free_dst(struct kref *ref) +{ + struct dsa_switch_tree *dst = container_of(ref, struct dsa_switch_tree, + refcount); + + list_del(&dst->list); + kfree(dst); +} + +static void dsa_put_dst(struct dsa_switch_tree *dst) +{ + kref_put(&dst->refcount, dsa_free_dst); +} + +static struct dsa_switch_tree *dsa_add_dst(u32 tree) +{ + struct dsa_switch_tree *dst; + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return NULL; + dst->tree = tree; + dst->cpu_switch = -1; + INIT_LIST_HEAD(&dst->list); + list_add_tail(&dsa_switch_trees, &dst->list); + kref_init(&dst->refcount); + + return dst; +} + +static void dsa_dst_add_ds(struct dsa_switch_tree *dst, + struct dsa_switch *ds, u32 index) +{ + kref_get(&dst->refcount); + dst->ds[index] = ds; +} + +static void dsa_dst_del_ds(struct dsa_switch_tree *dst, + struct dsa_switch *ds, u32 index) +{ + dst->ds[index] = NULL; + kref_put(&dst->refcount, dsa_free_dst); +} + +static bool dsa_port_is_dsa(struct device_node *port) +{ + const char *name; + + name = of_get_property(port, "label", NULL); + if (!name) + return false; + + if (!strcmp(name, "dsa")) + return true; + + return false; +} + +static bool dsa_port_is_cpu(struct device_node *port) +{ + const char *name; + + name = of_get_property(port, "label", NULL); + if (!name) + return false; + + if (!strcmp(name, "cpu")) + return true; + + return false; +} + +static bool dsa_ds_find_port(struct dsa_switch *ds, + struct device_node *port) +{ + u32 index; + + for (index = 0; index < DSA_MAX_PORTS; index++) + if (ds->ports[index].dn == port) + return true; + return false; +} + +static struct dsa_switch *dsa_dst_find_port(struct dsa_switch_tree *dst, + struct device_node *port) +{ + struct dsa_switch *ds; + u32 index; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + if (dsa_ds_find_port(ds, port)) + return ds; + } + + return NULL; +} + +static int dsa_port_complete(struct dsa_switch_tree *dst, + struct dsa_switch *src_ds, + struct device_node *port, + u32 src_port) +{ + struct device_node *link; + int index; + struct dsa_switch *dst_ds; + + for (index = 0;; index++) { + link = of_parse_phandle(port, "link", index); + if (!link) + break; + + dst_ds = dsa_dst_find_port(dst, link); + of_node_put(link); + + if (!dst_ds) + return 1; + + src_ds->rtable[dst_ds->index] = src_port; + } + + return 0; +} + +/* A switch is complete if all the DSA ports phandles point to ports + * known in the tree. A return value of 1 means the tree is not + * complete. This is not an error condition. A value of 0 is + * success. + */ +static int dsa_ds_complete(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (!dsa_port_is_dsa(port)) + continue; + + err = dsa_port_complete(dst, ds, port, index); + if (err != 0) + return err; + + ds->dsa_port_mask |= BIT(index); + } + + return 0; +} + +/* A tree is complete if all the DSA ports phandles point to ports + * known in the tree. A return value of 1 means the tree is not + * complete. This is not an error condition. A value of 0 is + * success. + */ +static int dsa_dst_complete(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_complete(dst, ds); + if (err != 0) + return err; + } + + return 0; +} + +static int dsa_dsa_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + int err; + + err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + if (err) { + dev_warn(ds->dev, "Failed to setup dsa port %d: %d\n", + index, err); + return err; + } + + return 0; +} + +static void dsa_dsa_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + dsa_cpu_dsa_destroy(port); +} + +static int dsa_cpu_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + int err; + + err = dsa_cpu_dsa_setup(ds, ds->dev, port, index); + if (err) { + dev_warn(ds->dev, "Failed to setup cpu port %d: %d\n", + index, err); + return err; + } + + ds->cpu_port_mask |= BIT(index); + + return 0; +} + +static void dsa_cpu_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + dsa_cpu_dsa_destroy(port); + ds->cpu_port_mask &= ~BIT(index); + +} + +static int dsa_user_port_apply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + const char *name; + int err; + + name = of_get_property(port, "label", NULL); + + err = dsa_slave_create(ds, ds->dev, index, name); + if (err) { + dev_warn(ds->dev, "Failed to create slave %d: %d\n", + index, err); + return err; + } + + return 0; +} + +static void dsa_user_port_unapply(struct device_node *port, u32 index, + struct dsa_switch *ds) +{ + if (ds->ports[index].netdev) { + dsa_slave_destroy(ds->ports[index].netdev); + ds->ports[index].netdev = NULL; + } +} + +static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + err = ds->drv->setup(ds); + if (err < 0) + return err; + + err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + + err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_dsa(port)) { + err = dsa_dsa_port_apply(port, index, ds); + if (err) + return err; + continue; + } + + if (dsa_port_is_cpu(port)) { + err = dsa_cpu_port_apply(port, index, ds); + if (err) + return err; + continue; + } + + err = dsa_user_port_apply(port, index, ds); + if (err) + continue; + } + + return 0; +} + +static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_dsa(port)) { + dsa_dsa_port_unapply(port, index, ds); + continue; + } + + if (dsa_port_is_cpu(port)) { + dsa_cpu_port_unapply(port, index, ds); + continue; + } + + dsa_user_port_unapply(port, index, ds); + } +} + +static int dsa_dst_apply(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_apply(dst, ds); + if (err) + return err; + } + + /* If we use a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point on get + * sent to the tag format's receive function. + */ + wmb(); + dst->master_netdev->dsa_ptr = (void *)dst; + dst->applied = true; + + return 0; +} + +static void dsa_dst_unapply(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + + if (!dst->applied) + return; + + dst->master_netdev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + dsa_ds_unapply(dst, ds); + } + + pr_info("DSA: tree %d unapplied\n", dst->tree); + dst->applied = false; +} + +static int dsa_cpu_parse(struct device_node *port, u32 index, + struct dsa_switch_tree *dst, + struct dsa_switch *ds) +{ + struct net_device *ethernet_dev; + struct device_node *ethernet; + + ethernet = of_parse_phandle(port, "ethernet", 0); + if (!ethernet) + return -EINVAL; + + ethernet_dev = of_find_net_device_by_node(ethernet); + if (!ethernet_dev) + return -EPROBE_DEFER; + + if (!ds->master_netdev) + ds->master_netdev = ethernet_dev; + + if (!dst->master_netdev) + dst->master_netdev = ethernet_dev; + + if (dst->cpu_switch == -1) { + dst->cpu_switch = ds->index; + dst->cpu_port = index; + } + + dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol); + if (IS_ERR(dst->tag_ops)) { + dev_warn(ds->dev, "No tagger for this switch\n"); + return PTR_ERR(dst->tag_ops); + } + + dst->rcv = dst->tag_ops->rcv; + + return 0; +} + +static int dsa_ds_parse(struct dsa_switch_tree *dst, struct dsa_switch *ds) +{ + struct device_node *port; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_PORTS; index++) { + port = ds->ports[index].dn; + if (!port) + continue; + + if (dsa_port_is_cpu(port)) { + err = dsa_cpu_parse(port, index, dst, ds); + if (err) + return err; + } + } + + pr_info("DSA: switch %d %d parsed\n", dst->tree, ds->index); + + return 0; +} + +static int dsa_dst_parse(struct dsa_switch_tree *dst) +{ + struct dsa_switch *ds; + u32 index; + int err; + + for (index = 0; index < DSA_MAX_SWITCHES; index++) { + ds = dst->ds[index]; + if (!ds) + continue; + + err = dsa_ds_parse(dst, ds); + if (err) + return err; + } + + if (!dst->master_netdev) { + pr_warn("Tree has no master device\n"); + return -EINVAL; + } + + pr_info("DSA: tree %d parsed\n", dst->tree); + + return 0; +} + +static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) +{ + struct device_node *port; + int err; + u32 reg; + + for_each_available_child_of_node(ports, port) { + err = of_property_read_u32(port, "reg", ®); + if (err) + return err; + + if (reg >= DSA_MAX_PORTS) + return -EINVAL; + + ds->ports[reg].dn = port; + } + + return 0; +} + +static int dsa_parse_member(struct device_node *np, u32 *tree, u32 *index) +{ + int err; + + *tree = *index = 0; + + err = of_property_read_u32_index(np, "dsa,member", 0, tree); + if (err) { + /* Does not exist, but it is optional */ + if (err == -EINVAL) + return 0; + return err; + } + + err = of_property_read_u32_index(np, "dsa,member", 1, index); + if (err) + return err; + + if (*index >= DSA_MAX_SWITCHES) + return -EINVAL; + + return 0; +} + +static struct device_node *dsa_get_ports(struct dsa_switch *ds, + struct device_node *np) +{ + struct device_node *ports; + + ports = of_get_child_by_name(np, "ports"); + if (!ports) { + dev_err(ds->dev, "no ports child node found\n"); + return ERR_PTR(-EINVAL); + } + + return ports; +} + +static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +{ + struct device_node *ports = dsa_get_ports(ds, np); + struct dsa_switch_tree *dst; + u32 tree, index; + int err; + + err = dsa_parse_member(np, &tree, &index); + if (err) + return err; + + if (IS_ERR(ports)) + return PTR_ERR(ports); + + err = dsa_parse_ports_dn(ports, ds); + if (err) + return err; + + dst = dsa_get_dst(tree); + if (!dst) { + dst = dsa_add_dst(tree); + if (!dst) + return -ENOMEM; + } + + if (dst->ds[index]) { + err = -EBUSY; + goto out; + } + + ds->dst = dst; + ds->index = index; + dsa_dst_add_ds(dst, ds, index); + + err = dsa_dst_complete(dst); + if (err < 0) + goto out_del_dst; + + if (err == 1) { + /* Not all switches registered yet */ + err = 0; + goto out; + } + + if (dst->applied) { + pr_info("DSA: Disjoint trees?\n"); + return -EINVAL; + } + + err = dsa_dst_parse(dst); + if (err) + goto out_del_dst; + + err = dsa_dst_apply(dst); + if (err) { + dsa_dst_unapply(dst); + goto out_del_dst; + } + + dsa_put_dst(dst); + return 0; + +out_del_dst: + dsa_dst_del_ds(dst, ds, ds->index); +out: + dsa_put_dst(dst); + + return err; +} + +int dsa_register_switch(struct dsa_switch *ds, struct device_node *np) +{ + int err; + + mutex_lock(&dsa2_mutex); + err = _dsa_register_switch(ds, np); + mutex_unlock(&dsa2_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(dsa_register_switch); + +void _dsa_unregister_switch(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + dsa_dst_unapply(dst); + + dsa_dst_del_ds(dst, ds, ds->index); +} + +void dsa_unregister_switch(struct dsa_switch *ds) +{ + mutex_lock(&dsa2_mutex); + _dsa_unregister_switch(ds); + mutex_unlock(&dsa2_mutex); +} +EXPORT_SYMBOL_GPL(dsa_unregister_switch); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 72f7b8989cfb..b42f1a5f95f3 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -59,7 +59,7 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_switch *ds, struct device *parent, - int port, char *name); + int port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 35e5f0f6688b..15a492261895 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1099,14 +1099,18 @@ int dsa_slave_resume(struct net_device *slave_dev) } int dsa_slave_create(struct dsa_switch *ds, struct device *parent, - int port, char *name) + int port, const char *name) { - struct net_device *master = ds->dst->master_netdev; struct dsa_switch_tree *dst = ds->dst; + struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name, NET_NAME_UNKNOWN, ether_setup); if (slave_dev == NULL) -- cgit v1.2.3 From 53eb440f4ada034ea43b295891feec3df0fa7a29 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 6 Jun 2016 06:32:54 -0400 Subject: net sched actions: introduce timestamp for firsttime use Useful to know when the action was first used for accounting (and debugging) Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 2 ++ include/uapi/linux/pkt_cls.h | 1 + net/sched/act_api.c | 1 + net/sched/act_bpf.c | 1 + net/sched/act_connmark.c | 1 + net/sched/act_csum.c | 1 + net/sched/act_gact.c | 1 + net/sched/act_ipt.c | 1 + net/sched/act_mirred.c | 1 + net/sched/act_nat.c | 1 + net/sched/act_pedit.c | 1 + net/sched/act_police.c | 2 ++ net/sched/act_simple.c | 1 + net/sched/act_skbedit.c | 1 + net/sched/act_vlan.c | 1 + 15 files changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9a9a8edc138f..8389c007076f 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -76,6 +76,8 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) if (tm->lastuse != now) tm->lastuse = now; + if (unlikely(!tm->firstuse)) + tm->firstuse = now; } struct tc_action { diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index f4297c8a42fe..9ba1410bd21d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -124,6 +124,7 @@ struct tcf_t { __u64 install; __u64 lastuse; __u64 expires; + __u64 firstuse; }; struct tc_cnt { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 336774a535c3..5ebf6d6f85f6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -283,6 +283,7 @@ err2: p->tcfc_index = index ? index : tcf_hash_new_index(tn); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; + p->tcfc_tm.firstuse = 0; if (est) { err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, &p->tcfc_rate_est, diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c7123e01c2ca..e4b877f9d322 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -156,6 +156,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); + tm.firstuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.firstuse); tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e0e6c6876bc7..e3f64f2d6206 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -163,6 +163,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(ci->tcf_tm.expires); + t.firstuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.firstuse); if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 065f71618276..7725eafbe581 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -548,6 +548,7 @@ static int tcf_csum_dump(struct sk_buff *skb, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ec5cc8435238..c9d59f38a3f8 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -190,6 +190,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int #endif t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 30e9087b3536..47525ee201a9 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -279,6 +279,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int goto nla_put_failure; tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); + tm.firstuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.firstuse); tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index d3ac73e90f00..1b06093f8ef8 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -220,6 +220,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - m->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(m->tcf_tm.expires); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9135aa8f2970..9fbf780a04c7 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -266,6 +266,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 67a17265c967..fb89275bc595 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -202,6 +202,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index b884dae692a1..820b11686f85 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -241,6 +241,7 @@ override: tcf_hash_new_index(tn); police->tcf_tm.install = jiffies; police->tcf_tm.lastuse = jiffies; + police->tcf_tm.firstuse = 0; h = tcf_hash(police->tcf_index, POL_TAB_MASK); spin_lock_bh(&hinfo->lock); hlist_add_head(&police->tcf_head, &hinfo->htab[h]); @@ -347,6 +348,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(police->tcf_tm.expires); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index f95d1c596986..81040f1cc3bb 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -160,6 +160,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 82105c8cb60f..cf34f31f4106 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -170,6 +170,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index da5120f5513f..978ec4c89684 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -184,6 +184,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); + t.firstuse = jiffies_to_clock_t(jiffies - v->tcf_tm.firstuse); t.expires = jiffies_to_clock_t(v->tcf_tm.expires); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; -- cgit v1.2.3 From 48d8ee1694dd1ab25614b58f968123a4598f887e Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 6 Jun 2016 06:32:55 -0400 Subject: net sched actions: aggregate dumping of actions timeinfo Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 8 ++++++++ net/sched/act_bpf.c | 6 +----- net/sched/act_connmark.c | 5 +---- net/sched/act_csum.c | 6 ++---- net/sched/act_gact.c | 5 +---- net/sched/act_ife.c | 4 +--- net/sched/act_ipt.c | 7 +++---- net/sched/act_mirred.c | 6 ++---- net/sched/act_nat.c | 6 ++---- net/sched/act_pedit.c | 7 +++---- net/sched/act_simple.c | 6 ++---- net/sched/act_skbedit.c | 6 ++---- net/sched/act_vlan.c | 5 +---- 13 files changed, 29 insertions(+), 48 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8389c007076f..a891978310e9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -80,6 +80,14 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) tm->firstuse = now; } +static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) +{ + dtm->install = jiffies_to_clock_t(jiffies - stm->install); + dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); + dtm->firstuse = jiffies_to_clock_t(jiffies - stm->firstuse); + dtm->expires = jiffies_to_clock_t(stm->expires); +} + struct tc_action { void *priv; const struct tc_action_ops *ops; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e4b877f9d322..ae0e7cbe488c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -154,11 +154,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, if (ret) goto nla_put_failure; - tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); - tm.firstuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.firstuse); - tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - + tcf_tm_dump(&tm, &prog->tcf_tm); if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, TCA_ACT_BPF_PAD)) goto nla_put_failure; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e3f64f2d6206..35a5270f289d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -160,10 +160,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(ci->tcf_tm.expires); - t.firstuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.firstuse); + tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 7725eafbe581..dcd9ababd351 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -546,10 +546,8 @@ static int tcf_csum_dump(struct sk_buff *skb, if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index c9d59f38a3f8..4c6e0085054a 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -188,10 +188,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int goto nla_put_failure; } #endif - t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); + tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 649157624f46..02f5a8ba95d7 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -553,9 +553,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(ife->tcf_tm.expires); + tcf_tm_dump(&t, &ife->tcf_tm); if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 47525ee201a9..3fcde44b8f4d 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -277,12 +277,11 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) goto nla_put_failure; - tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); - tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); - tm.firstuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.firstuse); - tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); + + tcf_tm_dump(&tm, &ipt->tcf_tm); if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; + kfree(t); return skb->len; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1b06093f8ef8..787751a7981a 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -218,10 +218,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - m->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(m->tcf_tm.expires); + + tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9fbf780a04c7..06ccb03f25da 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -264,10 +264,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fb89275bc595..82d3c1479029 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -200,12 +200,11 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - p->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + + tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; + kfree(opt); return skb->len; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 81040f1cc3bb..be5fbb51cfed 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -158,10 +158,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + + tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cf34f31f4106..7e2bc3c2b6da 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -168,10 +168,8 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), &d->mark)) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - d->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + + tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 978ec4c89684..f0a08a11f54f 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -182,10 +182,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto))) goto nla_put_failure; - t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); - t.firstuse = jiffies_to_clock_t(jiffies - v->tcf_tm.firstuse); - t.expires = jiffies_to_clock_t(v->tcf_tm.expires); + tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; return skb->len; -- cgit v1.2.3 From 0b0f43fe2e7291aa97b1febeaa5a0de453d007ca Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 5 Jun 2016 10:41:32 -0400 Subject: net sched: indentation and other OCD stylistic fixes Signed-off-by: Jamal Hadi Salim Acked-by: Cong Wang --- include/net/act_api.h | 14 ++++++++------ include/net/tc_act/tc_defact.h | 4 ++-- include/uapi/linux/pkt_cls.h | 6 +++--- net/sched/act_api.c | 19 +++++++++++-------- net/sched/act_bpf.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ipt.c | 6 ++++-- net/sched/act_vlan.c | 3 ++- net/sched/cls_api.c | 11 +++++++---- 9 files changed, 41 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index a891978310e9..db218a12efb5 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -2,8 +2,8 @@ #define __NET_ACT_API_H /* - * Public police action API for classifiers/qdiscs - */ + * Public action API for classifiers/qdiscs +*/ #include #include @@ -107,7 +107,8 @@ struct tc_action_ops { char kind[IFNAMSIZ]; __u32 type; /* TBD to match kind */ struct module *owner; - int (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *); + int (*act)(struct sk_buff *, const struct tc_action *, + struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *, int bind); int (*lookup)(struct net *, struct tc_action *, u32); @@ -125,8 +126,8 @@ struct tc_action_net { }; static inline -int tc_action_net_init(struct tc_action_net *tn, const struct tc_action_ops *ops, - unsigned int mask) +int tc_action_net_init(struct tc_action_net *tn, + const struct tc_action_ops *ops, unsigned int mask) { int err = 0; @@ -169,7 +170,8 @@ static inline int tcf_hash_release(struct tc_action *a, bool bind) } int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); -int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); +int tcf_unregister_action(struct tc_action_ops *a, + struct pernet_operations *ops); int tcf_action_destroy(struct list_head *actions, int bind); int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, struct tcf_result *res); diff --git a/include/net/tc_act/tc_defact.h b/include/net/tc_act/tc_defact.h index 9763dcbb9bc3..ab9b5d6be67b 100644 --- a/include/net/tc_act/tc_defact.h +++ b/include/net/tc_act/tc_defact.h @@ -5,8 +5,8 @@ struct tcf_defact { struct tcf_common common; - u32 tcfd_datalen; - void *tcfd_defdata; + u32 tcfd_datalen; + void *tcfd_defdata; }; #define to_defact(a) \ container_of(a->priv, struct tcf_defact, common) diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9ba1410bd21d..5702e933dc07 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -115,8 +115,8 @@ struct tc_police { __u32 mtu; struct tc_ratespec rate; struct tc_ratespec peakrate; - int refcnt; - int bindcnt; + int refcnt; + int bindcnt; __u32 capab; }; @@ -128,7 +128,7 @@ struct tcf_t { }; struct tc_cnt { - int refcnt; + int refcnt; int bindcnt; }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5ebf6d6f85f6..719bc2e85852 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -504,8 +504,8 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int -tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref) +int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, + int bind, int ref) { struct tc_action *a; int err = -EINVAL; @@ -688,9 +688,9 @@ errout: return -1; } -static int -tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq, - u16 flags, int event, int bind, int ref) +static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, + u32 portid, u32 seq, u16 flags, int event, int bind, + int ref) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -731,7 +731,8 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, + 0, 0) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -839,7 +840,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (a.ops == NULL) /*some idjot trying to flush unknown action */ goto err_out; - nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); + nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, + sizeof(*t), 0); if (!nlh) goto out_module_put; t = nlmsg_data(nlh); @@ -1002,7 +1004,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; - if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) + if ((n->nlmsg_type != RTM_GETACTION) && + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index ae0e7cbe488c..f7b6cf49ea6f 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -169,7 +169,8 @@ nla_put_failure: static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, - [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, + [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, + .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4c6e0085054a..19058a7f3e5c 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -162,7 +162,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, tm->lastuse = lastuse; } -static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gact *gact = a->priv; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 3fcde44b8f4d..e7c0f4d944a2 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -34,7 +34,8 @@ static int ipt_net_id; static int xt_net_id; -static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) +static int ipt_init_target(struct xt_entry_target *t, char *table, + unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; @@ -250,7 +251,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, } -static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_ipt *ipt = a->priv; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index f0a08a11f54f..b075d50e0fc3 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -179,7 +179,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (v->tcfv_action == TCA_VLAN_ACT_PUSH && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || - nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto))) + nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, + v->tcfv_push_proto))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a75864d93142..aafa6bce173e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -351,8 +351,9 @@ errout: return err; } -static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, - unsigned long fh, u32 portid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct net *net, struct sk_buff *skb, + struct tcf_proto *tp, unsigned long fh, u32 portid, + u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -474,9 +475,11 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) TC_H_MIN(tcm->tcm_info) != tp->protocol) continue; if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + memset(&cb->args[1], 0, + sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid, + if (tcf_fill_node(net, skb, tp, 0, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) break; -- cgit v1.2.3 From f9eb8aea2a1e12fc2f584d1627deeb957435a801 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jun 2016 09:37:15 -0700 Subject: net_sched: transform qdisc running bit into a seqcount Instead of using a single bit (__QDISC___STATE_RUNNING) in sch->__state, use a seqcount. This adds lockdep support, but more importantly it will allow us to sample qdisc/class statistics without having to grab qdisc root lock. Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jamal Hadi Salim Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 ++ drivers/net/ppp/ppp_generic.c | 3 +++ drivers/net/team/team.c | 2 ++ include/linux/netdevice.h | 1 + include/net/sch_generic.h | 15 ++++----------- net/bluetooth/6lowpan.c | 2 ++ net/core/dev.c | 2 +- net/ieee802154/6lowpan/core.c | 3 +++ net/l2tp/l2tp_eth.c | 4 ++++ net/sched/sch_generic.c | 14 ++++++++++---- 10 files changed, 32 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 941ec99cd3b6..681af31a60ed 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4610,6 +4610,7 @@ static int bond_check_params(struct bond_params *params) static struct lock_class_key bonding_netdev_xmit_lock_key; static struct lock_class_key bonding_netdev_addr_lock_key; static struct lock_class_key bonding_tx_busylock_key; +static struct lock_class_key bonding_qdisc_running_key; static void bond_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -4625,6 +4626,7 @@ static void bond_set_lockdep_class(struct net_device *dev) &bonding_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, bond_set_lockdep_class_one, NULL); dev->qdisc_tx_busylock = &bonding_tx_busylock_key; + dev->qdisc_running_key = &bonding_qdisc_running_key; } /* Called from registration process */ diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 8dedafa1a95d..aeabaa42317f 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1313,9 +1313,12 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64) } static struct lock_class_key ppp_tx_busylock; +static struct lock_class_key ppp_qdisc_running_key; + static int ppp_dev_init(struct net_device *dev) { dev->qdisc_tx_busylock = &ppp_tx_busylock; + dev->qdisc_running_key = &ppp_qdisc_running_key; return 0; } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 2ace126533cd..00eb38956a2c 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1577,6 +1577,7 @@ static const struct team_option team_options[] = { static struct lock_class_key team_netdev_xmit_lock_key; static struct lock_class_key team_netdev_addr_lock_key; static struct lock_class_key team_tx_busylock_key; +static struct lock_class_key team_qdisc_running_key; static void team_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -1590,6 +1591,7 @@ static void team_set_lockdep_class(struct net_device *dev) lockdep_set_class(&dev->addr_list_lock, &team_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, team_set_lockdep_class_one, NULL); dev->qdisc_tx_busylock = &team_tx_busylock_key; + dev->qdisc_running_key = &team_qdisc_running_key; } static int team_init(struct net_device *dev) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa6df2699532..59d7e06d88d5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1862,6 +1862,7 @@ struct net_device { #endif struct phy_device *phydev; struct lock_class_key *qdisc_tx_busylock; + struct lock_class_key *qdisc_running_key; bool proto_down; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a1fd76c22a59..bff8d895ef8a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -29,13 +29,6 @@ enum qdisc_state_t { __QDISC_STATE_THROTTLED, }; -/* - * following bits are only changed while qdisc lock is held - */ -enum qdisc___state_t { - __QDISC___STATE_RUNNING = 1, -}; - struct qdisc_size_table { struct rcu_head rcu; struct list_head list; @@ -93,7 +86,7 @@ struct Qdisc { unsigned long state; struct sk_buff_head q; struct gnet_stats_basic_packed bstats; - unsigned int __state; + seqcount_t running; struct gnet_stats_queue qstats; struct rcu_head rcu_head; int padded; @@ -104,20 +97,20 @@ struct Qdisc { static inline bool qdisc_is_running(const struct Qdisc *qdisc) { - return (qdisc->__state & __QDISC___STATE_RUNNING) ? true : false; + return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc_is_running(qdisc)) return false; - qdisc->__state |= __QDISC___STATE_RUNNING; + write_seqcount_begin(&qdisc->running); return true; } static inline void qdisc_run_end(struct Qdisc *qdisc) { - qdisc->__state &= ~__QDISC___STATE_RUNNING; + write_seqcount_end(&qdisc->running); } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 780089d75915..977a11e418d0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -629,6 +629,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) static struct lock_class_key bt_tx_busylock; static struct lock_class_key bt_netdev_xmit_lock_key; +static struct lock_class_key bt_qdisc_running_key; static void bt_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -641,6 +642,7 @@ static int bt_dev_init(struct net_device *dev) { netdev_for_each_tx_queue(dev, bt_set_lockdep_class_one, NULL); dev->qdisc_tx_busylock = &bt_tx_busylock; + dev->qdisc_running_key = &bt_qdisc_running_key; return 0; } diff --git a/net/core/dev.c b/net/core/dev.c index 896b686d1966..e0bcc39f4a7d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3075,7 +3075,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC___STATE_RUNNING owner to get the lock more + * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. */ contended = qdisc_is_running(q); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index dd085db8580e..14aa5effd29a 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -60,6 +60,7 @@ static struct header_ops lowpan_header_ops = { static struct lock_class_key lowpan_tx_busylock; static struct lock_class_key lowpan_netdev_xmit_lock_key; +static struct lock_class_key lowpan_qdisc_running_key; static void lowpan_set_lockdep_class_one(struct net_device *ldev, struct netdev_queue *txq, @@ -73,6 +74,8 @@ static int lowpan_dev_init(struct net_device *ldev) { netdev_for_each_tx_queue(ldev, lowpan_set_lockdep_class_one, NULL); ldev->qdisc_tx_busylock = &lowpan_tx_busylock; + ldev->qdisc_running_key = &lowpan_qdisc_running_key; + return 0; } diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index e253c26f31ac..c00d72d182fa 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -68,6 +68,8 @@ static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) } static struct lock_class_key l2tp_eth_tx_busylock; +static struct lock_class_key l2tp_qdisc_running_key; + static int l2tp_eth_dev_init(struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); @@ -76,6 +78,8 @@ static int l2tp_eth_dev_init(struct net_device *dev) eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock; + dev->qdisc_running_key = &l2tp_qdisc_running_key; + return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 269dd71b3828..cebea73e70ac 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -110,7 +110,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, /* * Transmit possibly several skbs, and handle the return status as - * required. Holding the __QDISC___STATE_RUNNING bit guarantees that + * required. Owning running seqcount bit guarantees that * only one CPU can execute this function. * * Returns to the caller: @@ -137,10 +137,10 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock(root_lock); + spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); return qdisc_qlen(q); } - spin_lock(root_lock); + spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ @@ -163,7 +163,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * - * __QDISC___STATE_RUNNING guarantees only one CPU can process + * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * @@ -379,6 +379,7 @@ struct Qdisc noop_qdisc = { .list = LIST_HEAD_INIT(noop_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, + .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), }; EXPORT_SYMBOL(noop_qdisc); @@ -537,6 +538,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops) @@ -570,6 +572,10 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); + sch->ops = ops; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; -- cgit v1.2.3 From edb09eb17ed89eaa82a52dd306beac93e292b485 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jun 2016 09:37:16 -0700 Subject: net: sched: do not acquire qdisc spinlock in qdisc/class stats dump Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by Google BwE host agent [1] are problematic at scale : For each qdisc/class found in the dump, we currently lock the root qdisc spinlock in order to get stats. Sampling stats every 5 seconds from thousands of HTB classes is a challenge when the root qdisc spinlock is under high pressure. Not only the dumps take time, they also slow down the fast path (queue/dequeue packets) by 10 % to 20 % in some cases. An audit of existing qdiscs showed that sch_fq_codel is the only qdisc that might need the qdisc lock in fq_codel_dump_stats() and fq_codel_dump_class_stats() In v2 of this patch, I now use the Qdisc running seqcount to provide consistent reads of packets/bytes counters, regardless of 32/64 bit arches. I also changed rate estimators to use the same infrastructure so that they no longer need to lock root qdisc lock. [1] http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Jamal Hadi Salim Cc: John Fastabend Cc: Kevin Athey Cc: Xiaotian Pei Signed-off-by: David S. Miller --- Documentation/networking/gen_stats.txt | 2 +- include/net/gen_stats.h | 12 ++++++++---- include/net/sch_generic.h | 8 ++++++++ net/core/gen_estimator.c | 24 ++++++++++++++++-------- net/core/gen_stats.c | 34 +++++++++++++++++++++++----------- net/netfilter/xt_RATEEST.c | 2 +- net/sched/act_api.c | 4 ++-- net/sched/act_police.c | 3 ++- net/sched/sch_api.c | 21 +++++++++++---------- net/sched/sch_atm.c | 3 ++- net/sched/sch_cbq.c | 9 ++++++--- net/sched/sch_drr.c | 9 ++++++--- net/sched/sch_fq_codel.c | 15 +++++++++++---- net/sched/sch_hfsc.c | 10 +++++----- net/sched/sch_htb.c | 11 ++++++----- net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 11 +++++++---- net/sched/sch_multiq.c | 3 ++- net/sched/sch_prio.c | 3 ++- net/sched/sch_qfq.c | 9 ++++++--- 20 files changed, 126 insertions(+), 69 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.txt index ff630a87b511..179b18ce45ff 100644 --- a/Documentation/networking/gen_stats.txt +++ b/Documentation/networking/gen_stats.txt @@ -21,7 +21,7 @@ struct mystruct { ... }; -Update statistics: +Update statistics, in dequeue() methods only, (while owning qdisc->running) mystruct->tstats.packet++; mystruct->qstats.backlog += skb->pkt_len; diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 610cd397890e..231e121cc7d9 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -33,10 +33,12 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr); -int gnet_stats_copy_basic(struct gnet_dump *d, +int gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); -void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, +void __gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); int gnet_stats_copy_rate_est(struct gnet_dump *d, @@ -52,13 +54,15 @@ int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, - spinlock_t *stats_lock, struct nlattr *opt); + spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt); void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est64 *rate_est); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, - spinlock_t *stats_lock, struct nlattr *opt); + spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt); bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, const struct gnet_stats_rate_est64 *rate_est); #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bff8d895ef8a..c4f5749342ec 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -314,6 +314,14 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) return qdisc_lock(root); } +static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) +{ + struct Qdisc *root = qdisc_root_sleeping(qdisc); + + ASSERT_RTNL(); + return &root->running; +} + static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) { return qdisc->dev_queue->dev; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 4573d81093fe..cad8e791f28e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -84,6 +84,7 @@ struct gen_estimator struct gnet_stats_basic_packed *bstats; struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; + seqcount_t *running; int ewma_log; u32 last_packets; unsigned long avpps; @@ -121,26 +122,28 @@ static void est_timer(unsigned long arg) unsigned long rate; u64 brate; - spin_lock(e->stats_lock); + if (e->stats_lock) + spin_lock(e->stats_lock); read_lock(&est_lock); if (e->bstats == NULL) goto skip; - __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats); + __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats); brate = (b.bytes - e->last_bytes)<<(7 - idx); e->last_bytes = b.bytes; e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); - e->rate_est->bps = (e->avbps+0xF)>>5; + WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5); rate = b.packets - e->last_packets; rate <<= (7 - idx); e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - e->rate_est->pps = (e->avpps + 0xF) >> 5; + WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5); skip: read_unlock(&est_lock); - spin_unlock(e->stats_lock); + if (e->stats_lock) + spin_unlock(e->stats_lock); } if (!list_empty(&elist[idx].list)) @@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock + * @running: qdisc running seqcount * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est @@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt) { struct gen_estimator *est; @@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (est == NULL) return -ENOBUFS; - __gnet_stats_copy_basic(&b, cpu_bstats, bstats); + __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats); idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; + est->running = running; est->ewma_log = parm->ewma_log; est->last_bytes = b.bytes; est->avbps = rate_est->bps<<5; @@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @stats_lock: statistics lock + * @running: qdisc running seqcount (might be NULL) * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling @@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, - spinlock_t *stats_lock, struct nlattr *opt) + spinlock_t *stats_lock, + seqcount_t *running, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index f96ee8b9478d..d9c210caff32 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) return 0; nla_put_failure: + if (d->lock) + spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; - spin_unlock_bh(d->lock); return -1; } @@ -65,15 +66,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, { memset(d, 0, sizeof(*d)); - spin_lock_bh(lock); - d->lock = lock; if (type) d->tail = (struct nlattr *)skb_tail_pointer(skb); d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; d->padattr = padattr; - + if (lock) { + d->lock = lock; + spin_lock_bh(lock); + } if (d->tail) return gnet_stats_copy(d, type, NULL, 0, padattr); @@ -126,16 +128,23 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, } void -__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, +__gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) { + unsigned int seq; + if (cpu) { __gnet_stats_copy_basic_cpu(bstats, cpu); - } else { + return; + } + do { + if (running) + seq = read_seqcount_begin(running); bstats->bytes = b->bytes; bstats->packets = b->packets; - } + } while (running && read_seqcount_retry(running, seq)); } EXPORT_SYMBOL(__gnet_stats_copy_basic); @@ -152,13 +161,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, +gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b) { struct gnet_stats_basic_packed bstats = {0}; - __gnet_stats_copy_basic(&bstats, cpu, b); + __gnet_stats_copy_basic(running, &bstats, cpu, b); if (d->compat_tc_stats) { d->tc_stats.bytes = bstats.bytes; @@ -328,8 +338,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) return 0; err_out: + if (d->lock) + spin_unlock_bh(d->lock); d->xstats_len = 0; - spin_unlock_bh(d->lock); return -1; } EXPORT_SYMBOL(gnet_stats_copy_app); @@ -363,10 +374,11 @@ gnet_stats_finish_copy(struct gnet_dump *d) return -1; } + if (d->lock) + spin_unlock_bh(d->lock); kfree(d->xstats); d->xstats = NULL; d->xstats_len = 0; - spin_unlock_bh(d->lock); return 0; } EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 604df6fae6fc..515131f9e021 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -137,7 +137,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rstats, - &est->lock, &cfg.opt); + &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 719bc2e85852..b6db56ec8117 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -287,7 +287,7 @@ err2: if (est) { err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, &p->tcfc_rate_est, - &p->tcfc_lock, est); + &p->tcfc_lock, NULL, est); if (err) { free_percpu(p->cpu_qstats); goto err2; @@ -671,7 +671,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 820b11686f85..bcb31142556b 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -185,7 +185,8 @@ override: if (est) { err = gen_replace_estimator(&police->tcf_bstats, NULL, &police->tcf_rate_est, - &police->tcf_lock, est); + &police->tcf_lock, + NULL, est); if (err) goto failure_unlock; } else if (tb[TCA_POLICE_AVRATE] && diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ddf047df5361..d4a8bbfcc953 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -982,7 +982,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { - spinlock_t *root_lock; + seqcount_t *running; err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) @@ -991,14 +991,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, if ((sch->parent != TC_H_ROOT) && !(sch->flags & TCQ_F_INGRESS) && (!p || !(p->flags & TCQ_F_MQROOT))) - root_lock = qdisc_root_sleeping_lock(sch); + running = qdisc_root_sleeping_running(sch); else - root_lock = qdisc_lock(sch); + running = &sch->running; err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, - root_lock, + NULL, + running, tca[TCA_RATE]); if (err) goto err_out4; @@ -1061,7 +1062,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) gen_replace_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); } out: @@ -1369,8 +1371,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d, - TCA_PAD) < 0) + NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) @@ -1381,7 +1382,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, cpu_qstats = q->cpu_qstats; } - if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), + &d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; @@ -1684,8 +1686,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d, - TCA_PAD) < 0) + NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 1911af3ca7c0..34f8f79e56d5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -637,7 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &flow->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index baafddf229ce..1b8128fb845d 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1600,7 +1600,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) return -1; @@ -1755,7 +1756,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { qdisc_put_rtab(rtab); @@ -1848,7 +1850,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { kfree(cl); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index a63e879e8975..1b7e1a27773d 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -91,7 +91,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -119,7 +120,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { qdisc_destroy(cl->qdisc); @@ -279,7 +281,8 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (qlen) xstats.deficit = cl->deficit; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) return -1; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 6883a8971562..1daa54237f4e 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -566,11 +566,13 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.memory_usage = q->memory_usage; st.qdisc_stats.drop_overmemory = q->drop_overmemory; + sch_tree_lock(sch); list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; list_for_each(pos, &q->old_flows) st.qdisc_stats.old_flows_len++; + sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); } @@ -624,7 +626,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (idx < q->flows_cnt) { const struct fq_codel_flow *flow = &q->flows[idx]; - const struct sk_buff *skb = flow->head; + const struct sk_buff *skb; memset(&xstats, 0, sizeof(xstats)); xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; @@ -642,9 +644,14 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, codel_time_to_us(delta) : -codel_time_to_us(-delta); } - while (skb) { - qs.qlen++; - skb = skb->next; + if (flow->head) { + sch_tree_lock(sch); + skb = flow->head; + while (skb) { + qs.qlen++; + skb = skb->next; + } + sch_tree_unlock(sch); } qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d783d7cc3348..74813dd49053 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1015,11 +1015,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cur_time = psched_get_time(); if (tca[TCA_RATE]) { - spinlock_t *lock = qdisc_root_sleeping_lock(sch); - err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - lock, + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -1068,7 +1067,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) { kfree(cl); @@ -1373,7 +1373,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index d4b4218af6b1..2b057649f24b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1141,7 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1395,7 +1396,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (htb_rate_est || tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE] ? : &est.nla); if (err) { kfree(cl); @@ -1457,11 +1459,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent->children++; } else { if (tca[TCA_RATE]) { - spinlock_t *lock = qdisc_root_sleeping_lock(sch); - err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - lock, + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 56a77b878eb3..b9439827c172 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -199,7 +199,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b8002ce3d010..549c66359924 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -342,7 +342,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, * hold here is the look on dev_queue->qdisc_sleeping * also acquired below. */ - spin_unlock_bh(d->lock); + if (d->lock) + spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); @@ -359,15 +360,17 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(qdisc_lock(qdisc)); } /* Reclaim root sleeping lock before completing stats */ - spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || + if (d->lock) + spin_lock_bh(d->lock); + if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index bcdd54bb101c..21e69d2e8347 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -356,7 +356,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index fee1b15506b2..06eca7060683 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -319,7 +319,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl_q->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8d2d8d953432..85d41979d825 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -460,7 +460,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) return err; @@ -486,7 +487,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + NULL, + qdisc_root_sleeping_running(sch), tca[TCA_RATE]); if (err) goto destroy_class; @@ -663,7 +665,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), + d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) -- cgit v1.2.3 From 0c73c523cf737b5d446705392e0e14ee0411a351 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Jun 2016 16:32:42 -0700 Subject: net: dsa: Initialize CPU port ethtool ops per tree Now that we can properly support multiple distinct trees in the system, using a global variable: dsa_cpu_port_ethtool_ops is getting clobbered as soon as the second switch tree gets probed, and we don't want that. We need to move this to be dynamically allocated, and since we can't really be comparing addresses anymore to determine first time initialization versus any other times, just move this to dsa.c and dsa2.c where the remainder of the dst/ds initialization happens. The operations teardown restores the master netdev's ethtool_ops to its original ethtool_ops pointer (typically within the Ethernet driver) Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa2.c | 6 ++++++ net/dsa/dsa_priv.h | 2 ++ net/dsa/slave.c | 10 ---------- 5 files changed, 50 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index cca7ef230742..20b3087ad193 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,6 +116,7 @@ struct dsa_switch_tree { * Original copy of the master netdev ethtool_ops */ struct ethtool_ops master_ethtool_ops; + const struct ethtool_ops *master_orig_ethtool_ops; /* * The switch and port to which the CPU is attached. diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ce3b942dce76..766d2a525ada 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -266,6 +266,41 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } +int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds) +{ + struct net_device *master; + struct ethtool_ops *cpu_ops; + + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + + cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL); + if (!cpu_ops) + return -ENOMEM; + + memcpy(&ds->dst->master_ethtool_ops, master->ethtool_ops, + sizeof(struct ethtool_ops)); + ds->dst->master_orig_ethtool_ops = master->ethtool_ops; + memcpy(cpu_ops, &ds->dst->master_ethtool_ops, + sizeof(struct ethtool_ops)); + dsa_cpu_port_ethtool_init(cpu_ops); + master->ethtool_ops = cpu_ops; + + return 0; +} + +void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) +{ + struct net_device *master; + + master = ds->dst->master_netdev; + if (ds->master_netdev) + master = ds->master_netdev; + + master->ethtool_ops = ds->dst->master_orig_ethtool_ops; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -379,6 +414,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) ret = 0; } + ret = dsa_cpu_port_ethtool_setup(ds); + if (ret) + return ret; + #ifdef CONFIG_NET_DSA_HWMON /* If the switch provides a temperature sensor, * register with hardware monitoring subsystem. @@ -963,6 +1002,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } + dsa_cpu_port_ethtool_restore(dst->ds[0]); + dev_put(dst->master_netdev); } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 4e0f3c268103..83b95fc4cede 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -394,6 +394,10 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } + err = dsa_cpu_port_ethtool_setup(dst->ds[0]); + if (err) + return err; + /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -429,6 +433,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } + dsa_cpu_port_ethtool_restore(dst->ds[0]); + pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 106a9f067f94..00077a9c97f4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -54,6 +54,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct device_node *port_dn, int port); void dsa_cpu_dsa_destroy(struct device_node *port_dn); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); +int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); +void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8d159932e082..7236eb26dc97 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -892,8 +892,6 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, }; -static struct ethtool_ops dsa_cpu_port_ethtool_ops; - static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -1126,14 +1124,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; - if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) { - memcpy(&dst->master_ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); - memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops, - sizeof(struct ethtool_ops)); - dsa_cpu_port_ethtool_init(&dsa_cpu_port_ethtool_ops); - master->ethtool_ops = &dsa_cpu_port_ethtool_ops; - } eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; -- cgit v1.2.3 From 96c63fa7393d0a346acfe5a91e0c7d4c7782641b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 8 Jun 2016 10:55:39 -0700 Subject: net: Add l3mdev rule Currently, VRFs require 1 oif and 1 iif rule per address family per VRF. As the number of VRF devices increases it brings scalability issues with the increasing rule list. All of the VRF rules have the same format with the exception of the specific table id to direct the lookup. Since the table id is available from the oif or iif in the loopup, the VRF rules can be consolidated to a single rule that pulls the table from the VRF device. This patch introduces a new rule attribute l3mdev. The l3mdev rule means the table id used for the lookup is pulled from the L3 master device (e.g., VRF) rather than being statically defined. With the l3mdev rule all of the basic VRF FIB rules are reduced to 1 l3mdev rule per address family (IPv4 and IPv6). If an admin wishes to insert higher priority rules for specific VRFs those rules will co-exist with the l3mdev rule. This capability means current VRF scripts will co-exist with this new simpler implementation. Currently, the rules list for both ipv4 and ipv6 look like this: $ ip ru ls 1000: from all oif vrf1 lookup 1001 1000: from all iif vrf1 lookup 1001 1000: from all oif vrf2 lookup 1002 1000: from all iif vrf2 lookup 1002 1000: from all oif vrf3 lookup 1003 1000: from all iif vrf3 lookup 1003 1000: from all oif vrf4 lookup 1004 1000: from all iif vrf4 lookup 1004 1000: from all oif vrf5 lookup 1005 1000: from all iif vrf5 lookup 1005 1000: from all oif vrf6 lookup 1006 1000: from all iif vrf6 lookup 1006 1000: from all oif vrf7 lookup 1007 1000: from all iif vrf7 lookup 1007 1000: from all oif vrf8 lookup 1008 1000: from all iif vrf8 lookup 1008 ... 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default With the l3mdev rule the list is just the following regardless of the number of VRFs: $ ip ru ls 1000: from all lookup [l3mdev table] 32765: from all lookup local 32766: from all lookup main 32767: from all lookup default (Note: the above pretty print of the rule is based on an iproute2 prototype. Actual verbage may change) Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/fib_rules.h | 24 ++++++++++++++++++++++-- include/net/l3mdev.h | 12 ++++++++++++ include/uapi/linux/fib_rules.h | 1 + net/core/fib_rules.c | 33 ++++++++++++++++++++++++++++----- net/ipv4/fib_rules.c | 6 ++++-- net/ipv6/fib6_rules.c | 6 ++++-- net/l3mdev/l3mdev.c | 38 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 109 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 59160de702b6..456e4a6006ab 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -17,7 +17,8 @@ struct fib_rule { u32 flags; u32 table; u8 action; - /* 3 bytes hole, try to use */ + u8 l3mdev; + /* 2 bytes hole, try to use */ u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; @@ -36,6 +37,7 @@ struct fib_lookup_arg { void *lookup_ptr; void *result; struct fib_rule *rule; + u32 table; int flags; #define FIB_LOOKUP_NOREF 1 #define FIB_LOOKUP_IGNORE_LINKSTATE 2 @@ -89,7 +91,8 @@ struct fib_rules_ops { [FRA_TABLE] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ - [FRA_GOTO] = { .type = NLA_U32 } + [FRA_GOTO] = { .type = NLA_U32 }, \ + [FRA_L3MDEV] = { .type = NLA_U8 } static inline void fib_rule_get(struct fib_rule *rule) { @@ -102,6 +105,20 @@ static inline void fib_rule_put(struct fib_rule *rule) kfree_rcu(rule, rcu); } +#ifdef CONFIG_NET_L3_MASTER_DEV +static inline u32 fib_rule_get_table(struct fib_rule *rule, + struct fib_lookup_arg *arg) +{ + return rule->l3mdev ? arg->table : rule->table; +} +#else +static inline u32 fib_rule_get_table(struct fib_rule *rule, + struct fib_lookup_arg *arg) +{ + return rule->table; +} +#endif + static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) { if (nla[FRA_TABLE]) @@ -117,4 +134,7 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); + +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh); +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh); #endif diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 374388dc01c8..34f33eb96a5e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -11,6 +11,8 @@ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ +#include + /** * struct l3mdev_ops - l3mdev operations * @@ -41,6 +43,9 @@ struct l3mdev_ops { #ifdef CONFIG_NET_L3_MASTER_DEV +int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, + struct fib_lookup_arg *arg); + int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { @@ -236,6 +241,13 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { return skb; } + +static inline +int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, + struct fib_lookup_arg *arg) +{ + return 1; +} #endif #endif /* _NET_L3MDEV_H_ */ diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 620c8a5ddc00..14404b3ebb89 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -50,6 +50,7 @@ enum { FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, FRA_PAD, + FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 840acebbb80c..98298b11f534 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -173,7 +173,8 @@ void fib_rules_unregister(struct fib_rules_ops *ops) EXPORT_SYMBOL_GPL(fib_rules_unregister); static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, - struct flowi *fl, int flags) + struct flowi *fl, int flags, + struct fib_lookup_arg *arg) { int ret = 0; @@ -189,6 +190,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; + if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -204,7 +208,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: - if (!fib_rule_match(rule, ops, fl, flags)) + if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { @@ -265,7 +269,7 @@ errout: return err; } -static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -336,6 +340,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_TUN_ID]) rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + if (tb[FRA_L3MDEV]) { +#ifdef CONFIG_NET_L3_MASTER_DEV + rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); + if (rule->l3mdev != 1) +#endif + goto errout_free; + } + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -371,6 +383,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } else if (rule->action == FR_ACT_GOTO) goto errout_free; + if (rule->l3mdev && rule->table) + goto errout_free; + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; @@ -424,8 +439,9 @@ errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_nl_newrule); -static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -483,6 +499,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) continue; + if (tb[FRA_L3MDEV] && + (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -536,6 +556,7 @@ errout: rules_ops_put(ops); return err; } +EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -607,7 +628,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || + (rule->l3mdev && + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f2bda9e89c61..6e9ea69e5f75 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -76,6 +76,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, { int err = -EAGAIN; struct fib_table *tbl; + u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: @@ -94,7 +95,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, rcu_read_lock(); - tbl = fib_get_table(rule->fr_net, rule->table); + tb_id = fib_rule_get_table(rule, arg); + tbl = fib_get_table(rule->fr_net, tb_id); if (tbl) err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *)arg->result, @@ -180,7 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (err) goto errout; - if (rule->table == RT_TABLE_UNSPEC) { + if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index ed33abf57abd..5857c1fc8b67 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -67,6 +67,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; int err = 0; + u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: @@ -86,7 +87,8 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto discard_pkt; } - table = fib6_get_table(net, rule->table); + tb_id = fib_rule_get_table(rule, arg); + table = fib6_get_table(net, tb_id); if (!table) { err = -EAGAIN; goto out; @@ -199,7 +201,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; - if (rule->action == FR_ACT_TO_TBL) { + if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) goto errout; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 6651a78e100c..7da97809a7e8 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -10,6 +10,7 @@ */ #include +#include #include /** @@ -160,3 +161,40 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) return rc; } EXPORT_SYMBOL_GPL(l3mdev_get_saddr); + +/** + * l3mdev_fib_rule_match - Determine if flowi references an + * L3 master device + * @net: network namespace for device index lookup + * @fl: flow struct + */ + +int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, + struct fib_lookup_arg *arg) +{ + struct net_device *dev; + int rc = 0; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_fib_table) { + arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); + rc = 1; + goto out; + } + + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_fib_table) { + arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); + rc = 1; + goto out; + } + +out: + rcu_read_unlock(); + + return rc; +} -- cgit v1.2.3 From dd47c1fa776cda48531b651c88341e951140b0a7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:40 +0200 Subject: cbq: remove TCA_CBQ_POLICE support iproute2 doesn't implement any cbq option that results in this attribute being sent to kernel. To make use of it, user would have to - patch iproute2 - add a class - attach a qdisc to the class (default pfifo doesn't work as q->handle is 0 and cbq_set_police() is a no-op in this case) - re-'add' the same class (tc class change ...) again - user must also specifiy a defmap (e.g. 'split 1:0 defmap 3f'), since this 'police' feature relies on its presence - the added qdisc must be one of bfifo, pfifo or netem If all of these conditions are met and _some_ leaf qdiscs, namely p/bfifo, netem, plug or tbf would drop a packet, kernel calls back into cbq, which will attempt to re-queue the skb into a different class as indicated by the parents' defmap entry for TC_PRIO_BESTEFFORT. [ i.e. we behave as if tc_classify returned TC_ACT_RECLASSIFY ]. This feature, which isn't documented or implemented in iproute2, and isn't implemented consistently (most qdiscs like sfq, codel, etc drop right away instead of attempting this reclassification) is the sole reason for the reshape_fail and __parent member in Qdisc struct. So remove TCA_CBQ_POLICE support from the kernel, reject it via EOPNOTSUPP so userspace knows we don't support it, and then remove no-longer needed infrastructure in followup commit. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 -- net/sched/sch_cbq.c | 95 +---------------------------------------------- 2 files changed, 1 insertion(+), 98 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c4f5749342ec..c069ac1dd75d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -68,10 +68,6 @@ struct Qdisc { void *u32_node; - /* This field is deprecated, but it is still used by CBQ - * and it will live until better solution will be invented. - */ - struct Qdisc *__parent; struct netdev_queue *dev_queue; struct gnet_stats_rate_est64 rate_est; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index fdca45e34230..7f4d6c5a0efe 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -80,9 +80,6 @@ struct cbq_class { unsigned char priority; /* class priority */ unsigned char priority2; /* priority to be used after overlimit */ unsigned char ewma_log; /* time constant for idle time calculation */ -#ifdef CONFIG_NET_CLS_ACT - unsigned char police; -#endif u32 defmap; @@ -377,9 +374,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; } -#ifdef CONFIG_NET_CLS_ACT - cl->q->__parent = sch; -#endif ret = qdisc_enqueue(skb, cl->q); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; @@ -524,40 +518,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) return HRTIMER_NORESTART; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) -{ - struct Qdisc *sch = child->__parent; - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = q->rx_class; - - q->rx_class = NULL; - - if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { - int ret; - - cbq_mark_toplevel(q, cl); - - q->rx_class = cl; - cl->q->__parent = sch; - - ret = qdisc_enqueue(skb, cl->q); - if (ret == NET_XMIT_SUCCESS) { - sch->q.qlen++; - if (!cl->next_alive) - cbq_activate_class(cl); - return 0; - } - if (net_xmit_drop_count(ret)) - qdisc_qstats_drop(sch); - return 0; - } - - qdisc_qstats_drop(sch); - return -1; -} -#endif - /* * It is mission critical procedure. * @@ -1179,21 +1139,6 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) return 0; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) -{ - cl->police = p->police; - - if (cl->q->handle) { - if (p->police == TC_POLICE_RECLASSIFY) - cl->q->reshape_fail = cbq_reshape_fail; - else - cl->q->reshape_fail = NULL; - } - return 0; -} -#endif - static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) { cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); @@ -1350,35 +1295,11 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NET_CLS_ACT -static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_police opt; - - if (cl->police) { - opt.police = cl->police; - opt.__res1 = 0; - opt.__res2 = 0; - if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) - goto nla_put_failure; - } - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} -#endif - static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) { if (cbq_dump_lss(skb, cl) < 0 || cbq_dump_rate(skb, cl) < 0 || cbq_dump_wrr(skb, cl) < 0 || -#ifdef CONFIG_NET_CLS_ACT - cbq_dump_police(skb, cl) < 0 || -#endif cbq_dump_fopt(skb, cl) < 0) return -1; return 0; @@ -1468,11 +1389,6 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, &pfifo_qdisc_ops, cl->common.classid); if (new == NULL) return -ENOBUFS; - } else { -#ifdef CONFIG_NET_CLS_ACT - if (cl->police == TC_POLICE_RECLASSIFY) - new->reshape_fail = cbq_reshape_fail; -#endif } *old = qdisc_replace(sch, new, &cl->q); @@ -1585,7 +1501,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (err < 0) return err; - if (tb[TCA_CBQ_OVL_STRATEGY]) + if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) return -EOPNOTSUPP; if (cl) { @@ -1636,11 +1552,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); } -#ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE]) - cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); -#endif - if (tb[TCA_CBQ_FOPT]) cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); @@ -1736,10 +1647,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->maxidle = q->link.maxidle; if (cl->avpkt == 0) cl->avpkt = q->link.avpkt; -#ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE]) - cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); -#endif if (tb[TCA_CBQ_FOPT]) cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); sch_tree_unlock(sch); -- cgit v1.2.3 From c3a173d7dba2d7c74dd4ab871b8f22bf56ac10b2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:41 +0200 Subject: sched: remove qdisc_rehape_fail After the removal of TCA_CBQ_POLICE in cbq scheduler qdisc->reshape_fail is always NULL, i.e. qdisc_rehape_fail is now the same as qdisc_drop. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 19 ------------------- net/sched/sch_fifo.c | 4 ++-- net/sched/sch_netem.c | 4 ++-- net/sched/sch_plug.c | 2 +- net/sched/sch_tbf.c | 4 ++-- 5 files changed, 7 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c069ac1dd75d..a9aec633d467 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -63,9 +63,6 @@ struct Qdisc { struct list_head list; u32 handle; u32 parent; - int (*reshape_fail)(struct sk_buff *skb, - struct Qdisc *q); - void *u32_node; struct netdev_queue *dev_queue; @@ -771,22 +768,6 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_DROP; } -static inline int qdisc_reshape_fail(struct sk_buff *skb, struct Qdisc *sch) -{ - qdisc_qstats_drop(sch); - -#ifdef CONFIG_NET_CLS_ACT - if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) - goto drop; - - return NET_XMIT_SUCCESS; - -drop: -#endif - kfree_skb(skb); - return NET_XMIT_DROP; -} - /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how long it will take to send a packet given its size. */ diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 2177eac0a61e..7857f748a6e4 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -24,7 +24,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) @@ -32,7 +32,7 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 205bed00dd34..31984c708382 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -407,7 +407,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { - qdisc_reshape_fail(skb, sch); + qdisc_drop(skb, sch); return NULL; } consume_skb(skb); @@ -499,7 +499,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); qdisc_qstats_backlog_inc(sch, skb); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 5abfe44678d4..ff0d968750df 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -96,7 +96,7 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) return qdisc_enqueue_tail(skb, sch); } - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 83b90b584fae..801148c8b6ac 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -166,7 +166,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); nb = 0; while (segs) { @@ -198,7 +198,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) return tbf_segment(skb, sch); - return qdisc_reshape_fail(skb, sch); + return qdisc_drop(skb, sch); } ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { -- cgit v1.2.3 From a09ceb0e08140a1eec05b49b4c232d3481339cb0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:42 +0200 Subject: sched: remove qdisc->drop after removal of TCA_CBQ_OVL_STRATEGY from cbq scheduler, there are no more callers of ->drop() outside of other ->drop functions, i.e. nothing calls them. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 36 ------------------------------------ net/sched/sch_atm.c | 15 --------------- net/sched/sch_cbq.c | 26 -------------------------- net/sched/sch_choke.c | 17 ----------------- net/sched/sch_drr.c | 21 --------------------- net/sched/sch_dsmark.c | 18 ------------------ net/sched/sch_fifo.c | 3 --- net/sched/sch_fq_codel.c | 10 ---------- net/sched/sch_gred.c | 35 ----------------------------------- net/sched/sch_hfsc.c | 26 -------------------------- net/sched/sch_hhf.c | 10 ---------- net/sched/sch_htb.c | 26 -------------------------- net/sched/sch_multiq.c | 22 ---------------------- net/sched/sch_netem.c | 30 ------------------------------ net/sched/sch_prio.c | 19 ------------------- net/sched/sch_qfq.c | 47 ----------------------------------------------- net/sched/sch_red.c | 20 -------------------- net/sched/sch_sfq.c | 1 - net/sched/sch_tbf.c | 13 ------------- 19 files changed, 395 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a9aec633d467..4dedb7f12ed5 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -174,7 +174,6 @@ struct Qdisc_ops { int (*enqueue)(struct sk_buff *, struct Qdisc *); struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); - unsigned int (*drop)(struct Qdisc *); int (*init)(struct Qdisc *, struct nlattr *arg); void (*reset)(struct Qdisc *); @@ -658,22 +657,6 @@ static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch) return __qdisc_queue_drop_head(sch, &sch->q); } -static inline struct sk_buff *__qdisc_dequeue_tail(struct Qdisc *sch, - struct sk_buff_head *list) -{ - struct sk_buff *skb = __skb_dequeue_tail(list); - - if (likely(skb != NULL)) - qdisc_qstats_backlog_dec(sch, skb); - - return skb; -} - -static inline struct sk_buff *qdisc_dequeue_tail(struct Qdisc *sch) -{ - return __qdisc_dequeue_tail(sch, &sch->q); -} - static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { return skb_peek(&sch->q); @@ -741,25 +724,6 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, return old; } -static inline unsigned int __qdisc_queue_drop(struct Qdisc *sch, - struct sk_buff_head *list) -{ - struct sk_buff *skb = __qdisc_dequeue_tail(sch, list); - - if (likely(skb != NULL)) { - unsigned int len = qdisc_pkt_len(skb); - kfree_skb(skb); - return len; - } - - return 0; -} - -static inline unsigned int qdisc_queue_drop(struct Qdisc *sch) -{ - return __qdisc_queue_drop(sch, &sch->q); -} - static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { kfree_skb(skb); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 34f8f79e56d5..7e6c12dfc66a 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -519,20 +519,6 @@ static struct sk_buff *atm_tc_peek(struct Qdisc *sch) return p->link.q->ops->peek(p->link.q); } -static unsigned int atm_tc_drop(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - unsigned int len; - - pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) - return len; - } - return 0; -} - static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { struct atm_qdisc_data *p = qdisc_priv(sch); @@ -672,7 +658,6 @@ static struct Qdisc_ops atm_qdisc_ops __read_mostly = { .enqueue = atm_tc_enqueue, .dequeue = atm_tc_dequeue, .peek = atm_tc_peek, - .drop = atm_tc_drop, .init = atm_tc_init, .reset = atm_tc_reset, .destroy = atm_tc_destroy, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 7f4d6c5a0efe..f2af31be6370 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1025,31 +1025,6 @@ static void cbq_link_class(struct cbq_class *this) } } -static unsigned int cbq_drop(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl, *cl_head; - int prio; - unsigned int len; - - for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { - cl_head = q->active[prio]; - if (!cl_head) - continue; - - cl = cl_head; - do { - if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { - sch->q.qlen--; - if (!cl->q->q.qlen) - cbq_deactivate_class(cl); - return len; - } - } while ((cl = cl->next_alive) != cl_head); - } - return 0; -} - static void cbq_reset(struct Qdisc *sch) { @@ -1791,7 +1766,6 @@ static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { .enqueue = cbq_enqueue, .dequeue = cbq_dequeue, .peek = qdisc_peek_dequeued, - .drop = cbq_drop, .init = cbq_init, .reset = cbq_reset, .destroy = cbq_destroy, diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 0a08c860eee4..04e0b0583e00 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -365,22 +365,6 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch) return skb; } -static unsigned int choke_drop(struct Qdisc *sch) -{ - struct choke_sched_data *q = qdisc_priv(sch); - unsigned int len; - - len = qdisc_queue_drop(sch); - if (len > 0) - q->stats.other++; - else { - if (!red_is_idling(&q->vars)) - red_start_of_idle_period(&q->vars); - } - - return len; -} - static void choke_reset(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); @@ -569,7 +553,6 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = { .enqueue = choke_enqueue, .dequeue = choke_dequeue, .peek = choke_peek_head, - .drop = choke_drop, .init = choke_init, .destroy = choke_destroy, .reset = choke_reset, diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 1b7e1a27773d..2bec94cf697c 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -421,26 +421,6 @@ out: return NULL; } -static unsigned int drr_drop(struct Qdisc *sch) -{ - struct drr_sched *q = qdisc_priv(sch); - struct drr_class *cl; - unsigned int len; - - list_for_each_entry(cl, &q->active, alist) { - if (cl->qdisc->ops->drop) { - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - sch->q.qlen--; - if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); - return len; - } - } - } - return 0; -} - static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct drr_sched *q = qdisc_priv(sch); @@ -509,7 +489,6 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .enqueue = drr_enqueue, .dequeue = drr_dequeue, .peek = qdisc_peek_dequeued, - .drop = drr_drop, .init = drr_init_qdisc, .reset = drr_reset_qdisc, .destroy = drr_destroy_qdisc, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 34b4ddaca27c..b9ba5f658528 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -320,23 +320,6 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch) return p->q->ops->peek(p->q); } -static unsigned int dsmark_drop(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - unsigned int len; - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - if (p->q->ops->drop == NULL) - return 0; - - len = p->q->ops->drop(p->q); - if (len) - sch->q.qlen--; - - return len; -} - static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { struct dsmark_qdisc_data *p = qdisc_priv(sch); @@ -489,7 +472,6 @@ static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { .enqueue = dsmark_enqueue, .dequeue = dsmark_dequeue, .peek = dsmark_peek, - .drop = dsmark_drop, .init = dsmark_init, .reset = dsmark_reset, .destroy = dsmark_destroy, diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 7857f748a6e4..dea70e3ef0ba 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -99,7 +99,6 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .enqueue = pfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, @@ -114,7 +113,6 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .enqueue = bfifo_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, @@ -129,7 +127,6 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .drop = qdisc_queue_drop_head, .init = fifo_init, .reset = qdisc_reset_queue, .change = fifo_init, diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1daa54237f4e..176a7e2561d7 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -184,15 +184,6 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) return idx; } -static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) -{ - unsigned int prev_backlog; - - prev_backlog = sch->qstats.backlog; - fq_codel_drop(sch, 1U); - return prev_backlog - sch->qstats.backlog; -} - static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -704,7 +695,6 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, .peek = qdisc_peek_dequeued, - .drop = fq_codel_qdisc_drop, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 80105109f756..b5fb63c7be02 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -276,40 +276,6 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) return NULL; } -static unsigned int gred_drop(struct Qdisc *sch) -{ - struct sk_buff *skb; - struct gred_sched *t = qdisc_priv(sch); - - skb = qdisc_dequeue_tail(sch); - if (skb) { - unsigned int len = qdisc_pkt_len(skb); - struct gred_sched_data *q; - u16 dp = tc_index_to_dp(skb); - - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", - tc_index_to_dp(skb)); - } else { - q->backlog -= len; - q->stats.other++; - - if (gred_wred_mode(t)) { - if (!sch->qstats.backlog) - red_start_of_idle_period(&t->wred_set); - } else { - if (!q->backlog) - red_start_of_idle_period(&q->vars); - } - } - - qdisc_drop(skb, sch); - return len; - } - - return 0; -} - static void gred_reset(struct Qdisc *sch) { int i; @@ -623,7 +589,6 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .enqueue = gred_enqueue, .dequeue = gred_dequeue, .peek = qdisc_peek_head, - .drop = gred_drop, .init = gred_init, .reset = gred_reset, .destroy = gred_destroy, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 74813dd49053..155e3ce81270 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1677,31 +1677,6 @@ hfsc_dequeue(struct Qdisc *sch) return skb; } -static unsigned int -hfsc_drop(struct Qdisc *sch) -{ - struct hfsc_sched *q = qdisc_priv(sch); - struct hfsc_class *cl; - unsigned int len; - - list_for_each_entry(cl, &q->droplist, dlist) { - if (cl->qdisc->ops->drop != NULL && - (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { - if (cl->qdisc->q.qlen == 0) { - update_vf(cl, 0, 0); - set_passive(cl); - } else { - list_move_tail(&cl->dlist, &q->droplist); - } - cl->qstats.drops++; - qdisc_qstats_drop(sch); - sch->q.qlen--; - return len; - } - } - return 0; -} - static const struct Qdisc_class_ops hfsc_class_ops = { .change = hfsc_change_class, .delete = hfsc_delete_class, @@ -1728,7 +1703,6 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .enqueue = hfsc_enqueue, .dequeue = hfsc_dequeue, .peek = qdisc_peek_dequeued, - .drop = hfsc_drop, .cl_ops = &hfsc_class_ops, .priv_size = sizeof(struct hfsc_sched), .owner = THIS_MODULE diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 13d6f83ec491..c51791848a38 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -368,15 +368,6 @@ static unsigned int hhf_drop(struct Qdisc *sch) return bucket - q->buckets; } -static unsigned int hhf_qdisc_drop(struct Qdisc *sch) -{ - unsigned int prev_backlog; - - prev_backlog = sch->qstats.backlog; - hhf_drop(sch); - return prev_backlog - sch->qstats.backlog; -} - static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); @@ -709,7 +700,6 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, - .drop = hhf_qdisc_drop, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 2b057649f24b..b74d06668ab4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -936,31 +936,6 @@ fin: return skb; } -/* try to drop from each class (by prio) until one succeed */ -static unsigned int htb_drop(struct Qdisc *sch) -{ - struct htb_sched *q = qdisc_priv(sch); - int prio; - - for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { - struct list_head *p; - list_for_each(p, q->drops + prio) { - struct htb_class *cl = list_entry(p, struct htb_class, - un.leaf.drop_list); - unsigned int len; - if (cl->un.leaf.q->ops->drop && - (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { - sch->qstats.backlog -= len; - sch->q.qlen--; - if (!cl->un.leaf.q->q.qlen) - htb_deactivate(q, cl); - return len; - } - } - } - return 0; -} - /* reset all classes */ /* always caled under BH & queue lock */ static void htb_reset(struct Qdisc *sch) @@ -1600,7 +1575,6 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .enqueue = htb_enqueue, .dequeue = htb_dequeue, .peek = qdisc_peek_dequeued, - .drop = htb_drop, .init = htb_init, .reset = htb_reset, .destroy = htb_destroy, diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 21e69d2e8347..5ea93305d705 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -151,27 +151,6 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch) } -static unsigned int multiq_drop(struct Qdisc *sch) -{ - struct multiq_sched_data *q = qdisc_priv(sch); - int band; - unsigned int len; - struct Qdisc *qdisc; - - for (band = q->bands - 1; band >= 0; band--) { - qdisc = q->queues[band]; - if (qdisc->ops->drop) { - len = qdisc->ops->drop(qdisc); - if (len != 0) { - sch->q.qlen--; - return len; - } - } - } - return 0; -} - - static void multiq_reset(struct Qdisc *sch) { @@ -416,7 +395,6 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .enqueue = multiq_enqueue, .dequeue = multiq_dequeue, .peek = multiq_peek, - .drop = multiq_drop, .init = multiq_init, .reset = multiq_reset, .destroy = multiq_destroy, diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 31984c708382..9ca7947ab643 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -576,35 +576,6 @@ finish_segs: return NET_XMIT_SUCCESS; } -static unsigned int netem_drop(struct Qdisc *sch) -{ - struct netem_sched_data *q = qdisc_priv(sch); - unsigned int len; - - len = qdisc_queue_drop(sch); - - if (!len) { - struct rb_node *p = rb_first(&q->t_root); - - if (p) { - struct sk_buff *skb = netem_rb_to_skb(p); - - rb_erase(p, &q->t_root); - sch->q.qlen--; - skb->next = NULL; - skb->prev = NULL; - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - } - } - if (!len && q->qdisc && q->qdisc->ops->drop) - len = q->qdisc->ops->drop(q->qdisc); - if (len) - qdisc_qstats_drop(sch); - - return len; -} - static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); @@ -1143,7 +1114,6 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .enqueue = netem_enqueue, .dequeue = netem_dequeue, .peek = qdisc_peek_dequeued, - .drop = netem_drop, .init = netem_init, .reset = netem_reset, .destroy = netem_destroy, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 06eca7060683..4c77780b55c3 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -125,24 +125,6 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch) } -static unsigned int prio_drop(struct Qdisc *sch) -{ - struct prio_sched_data *q = qdisc_priv(sch); - int prio; - unsigned int len; - struct Qdisc *qdisc; - - for (prio = q->bands-1; prio >= 0; prio--) { - qdisc = q->queues[prio]; - if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { - sch->q.qlen--; - return len; - } - } - return 0; -} - - static void prio_reset(struct Qdisc *sch) { @@ -379,7 +361,6 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .enqueue = prio_enqueue, .dequeue = prio_dequeue, .peek = prio_peek, - .drop = prio_drop, .init = prio_init, .reset = prio_reset, .destroy = prio_destroy, diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 85d41979d825..cdb3966572f9 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1423,52 +1423,6 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) qfq_deactivate_class(q, cl); } -static unsigned int qfq_drop_from_slot(struct qfq_sched *q, - struct hlist_head *slot) -{ - struct qfq_aggregate *agg; - struct qfq_class *cl; - unsigned int len; - - hlist_for_each_entry(agg, slot, next) { - list_for_each_entry(cl, &agg->active, alist) { - - if (!cl->qdisc->ops->drop) - continue; - - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - if (cl->qdisc->q.qlen == 0) - qfq_deactivate_class(q, cl); - - return len; - } - } - } - return 0; -} - -static unsigned int qfq_drop(struct Qdisc *sch) -{ - struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; - unsigned int i, j, len; - - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - len = qfq_drop_from_slot(q, &grp->slots[j]); - if (len > 0) { - sch->q.qlen--; - return len; - } - } - - } - - return 0; -} - static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct qfq_sched *q = qdisc_priv(sch); @@ -1563,7 +1517,6 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { .enqueue = qfq_enqueue, .dequeue = qfq_dequeue, .peek = qdisc_peek_dequeued, - .drop = qfq_drop, .init = qfq_init_qdisc, .reset = qfq_reset_qdisc, .destroy = qfq_destroy_qdisc, diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 8c0508c0e287..235942f3464e 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -134,25 +134,6 @@ static struct sk_buff *red_peek(struct Qdisc *sch) return child->ops->peek(child); } -static unsigned int red_drop(struct Qdisc *sch) -{ - struct red_sched_data *q = qdisc_priv(sch); - struct Qdisc *child = q->qdisc; - unsigned int len; - - if (child->ops->drop && (len = child->ops->drop(child)) > 0) { - q->stats.other++; - qdisc_qstats_drop(sch); - sch->q.qlen--; - return len; - } - - if (!red_is_idling(&q->vars)) - red_start_of_idle_period(&q->vars); - - return 0; -} - static void red_reset(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); @@ -361,7 +342,6 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = { .enqueue = red_enqueue, .dequeue = red_dequeue, .peek = red_peek, - .drop = red_drop, .init = red_init, .reset = red_reset, .destroy = red_destroy, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 498f0a2cb47f..a2e0b855d1c8 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -896,7 +896,6 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, .peek = qdisc_peek_dequeued, - .drop = sfq_drop, .init = sfq_init, .reset = sfq_reset, .destroy = sfq_destroy, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 801148c8b6ac..1342e46574b2 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -211,18 +211,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } -static unsigned int tbf_drop(struct Qdisc *sch) -{ - struct tbf_sched_data *q = qdisc_priv(sch); - unsigned int len = 0; - - if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { - sch->q.qlen--; - qdisc_qstats_drop(sch); - } - return len; -} - static bool tbf_peak_present(const struct tbf_sched_data *q) { return q->peak.rate_bytes_ps; @@ -555,7 +543,6 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .enqueue = tbf_enqueue, .dequeue = tbf_dequeue, .peek = qdisc_peek_dequeued, - .drop = tbf_drop, .init = tbf_init, .reset = tbf_reset, .destroy = tbf_destroy, -- cgit v1.2.3 From c8945043cdc687388b7a43fc6f474bddd9607e80 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 9 Jun 2016 00:27:43 +0200 Subject: sched: place state, next_sched and gso_skb in same cacheline again Earlier commits removed two members from struct Qdisc which places next_sched/gso_skb into a different cacheline than ->state. This restores the struct layout to what it was before the removal. Move the two members, then add an annotation so they all reside in the same cacheline. This adds a 16 byte hole after cpu_qstats. The hole could be closed but as it doesn't decrease total struct size just do it this way. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4dedb7f12ed5..49534e28824b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -71,11 +71,11 @@ struct Qdisc { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - struct Qdisc *next_sched; - struct sk_buff *gso_skb; /* * For performance sake on SMP, we put highly modified fields at the end */ + struct Qdisc *next_sched ____cacheline_aligned_in_smp; + struct sk_buff *gso_skb; unsigned long state; struct sk_buff_head q; struct gnet_stats_basic_packed bstats; -- cgit v1.2.3 From 80a83cfc434b1e3afe38974570b460db4898bec6 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Thu, 19 May 2016 10:37:48 +0200 Subject: mac80211: skip netdev queue control with software queuing Qdiscs are designed with no regard to 802.11 aggregation requirements and hand out packet-by-packet with no guarantee they are destined to the same tid. This does more bad than good no matter how fairly a given qdisc may behave on an ethernet interface. Software queuing used per-AC netdev subqueue congestion control whenever a global AC limit was hit. This meant in practice a single station or tid queue could starve others rather easily. This could resonate with qdiscs in a bad way or could just end up with poor aggregation performance. Increasing the AC limit would increase induced latency which is also bad. Disabling qdiscs by default and performing taildrop instead of netdev subqueue congestion control on the other hand makes it possible for tid queues to fill up "in the meantime" while preventing stations starving each other. This increases aggregation opportunities and should allow software queuing based drivers achieve better performance by utilizing airtime more efficiently with big aggregates. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/iface.c | 18 +++++++++-- net/mac80211/main.c | 3 -- net/mac80211/sta_info.c | 2 +- net/mac80211/tx.c | 75 ++++++++++++++++++++++++++-------------------- net/mac80211/util.c | 11 +++---- 7 files changed, 66 insertions(+), 49 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index be30b0549b88..a8683aec6dbe 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2147,9 +2147,6 @@ enum ieee80211_hw_flags { * @n_cipher_schemes: a size of an array of cipher schemes definitions. * @cipher_schemes: a pointer to an array of cipher scheme definitions * supported by HW. - * - * @txq_ac_max_pending: maximum number of frames per AC pending in all txq - * entries for a vif. */ struct ieee80211_hw { struct ieee80211_conf conf; @@ -2180,7 +2177,6 @@ struct ieee80211_hw { u8 uapsd_max_sp_len; u8 n_cipher_schemes; const struct ieee80211_cipher_scheme *cipher_schemes; - int txq_ac_max_pending; }; static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9438c9406687..634603320374 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -856,7 +856,7 @@ struct ieee80211_sub_if_data { bool control_port_no_encrypt; int encrypt_headroom; - atomic_t txqs_len[IEEE80211_NUM_ACS]; + atomic_t num_tx_queued; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct mac80211_qos_map __rcu *qos_map; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index c59af3eb9fa4..609c5174d798 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -976,13 +976,13 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->vif.txq) { struct txq_info *txqi = to_txq_info(sdata->vif.txq); + int n = skb_queue_len(&txqi->queue); spin_lock_bh(&txqi->queue.lock); ieee80211_purge_tx_queue(&local->hw, &txqi->queue); + atomic_sub(n, &sdata->num_tx_queued); txqi->byte_cnt = 0; spin_unlock_bh(&txqi->queue.lock); - - atomic_set(&sdata->txqs_len[txqi->txq.ac], 0); } if (local->open_count == 0) @@ -1198,6 +1198,12 @@ static void ieee80211_if_setup(struct net_device *dev) dev->destructor = ieee80211_if_free; } +static void ieee80211_if_setup_no_queue(struct net_device *dev) +{ + ieee80211_if_setup(dev); + dev->priv_flags |= IFF_NO_QUEUE; +} + static void ieee80211_iface_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = @@ -1707,6 +1713,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, struct net_device *ndev = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct txq_info *txqi; + void (*if_setup)(struct net_device *dev); int ret, i; int txqs = 1; @@ -1734,12 +1741,17 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; + if (local->ops->wake_tx_queue) + if_setup = ieee80211_if_setup_no_queue; + else + if_setup = ieee80211_if_setup; + if (local->hw.queues >= IEEE80211_NUM_ACS) txqs = IEEE80211_NUM_ACS; ndev = alloc_netdev_mqs(size + txq_size, name, name_assign_type, - ieee80211_if_setup, txqs, 1); + if_setup, txqs, 1); if (!ndev) return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 7ee91d6151d1..160ac6b8b9a1 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1055,9 +1055,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->dynamic_ps_forced_timeout = -1; - if (!local->hw.txq_ac_max_pending) - local->hw.txq_ac_max_pending = 64; - result = ieee80211_wep_init(local); if (result < 0) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 5ccfdbd406bd..177cc6cd6416 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -116,7 +116,7 @@ static void __cleanup_single_sta(struct sta_info *sta) int n = skb_queue_len(&txqi->queue); ieee80211_purge_tx_queue(&local->hw, &txqi->queue); - atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]); + atomic_sub(n, &sdata->num_tx_queued); txqi->byte_cnt = 0; } } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 203044379ce0..3e77da195ce8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1236,27 +1236,21 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static void ieee80211_drv_tx(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, - struct sk_buff *skb) +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_tx_control control = { - .sta = pubsta, - }; struct ieee80211_txq *txq = NULL; - struct txq_info *txqi; - u8 ac; if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - goto tx_normal; + return NULL; if (!ieee80211_is_data(hdr->frame_control)) - goto tx_normal; + return NULL; if (pubsta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; @@ -1267,25 +1261,28 @@ static void ieee80211_drv_tx(struct ieee80211_local *local, } if (!txq) - goto tx_normal; + return NULL; - ac = txq->ac; - txqi = to_txq_info(txq); - atomic_inc(&sdata->txqs_len[ac]); - if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending) - netif_stop_subqueue(sdata->dev, ac); + return to_txq_info(txq); +} - spin_lock_bh(&txqi->queue.lock); - txqi->byte_cnt += skb->len; - __skb_queue_tail(&txqi->queue, skb); - spin_unlock_bh(&txqi->queue.lock); +static void ieee80211_txq_enqueue(struct ieee80211_local *local, + struct txq_info *txqi, + struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txqi->txq.vif); - drv_wake_tx_queue(local, txqi); + lockdep_assert_held(&txqi->queue.lock); - return; + if (atomic_read(&sdata->num_tx_queued) >= TOTAL_MAX_TX_BUFFER || + txqi->queue.qlen >= STA_MAX_TX_BUFFER) { + ieee80211_free_txskb(&local->hw, skb); + return; + } -tx_normal: - drv_tx(local, &control, skb); + atomic_inc(&sdata->num_tx_queued); + txqi->byte_cnt += skb->len; + __skb_queue_tail(&txqi->queue, skb); } struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, @@ -1296,7 +1293,6 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct txq_info *txqi = container_of(txq, struct txq_info, txq); struct ieee80211_hdr *hdr; struct sk_buff *skb = NULL; - u8 ac = txq->ac; spin_lock_bh(&txqi->queue.lock); @@ -1307,12 +1303,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, if (!skb) goto out; + atomic_dec(&sdata->num_tx_queued); txqi->byte_cnt -= skb->len; - atomic_dec(&sdata->txqs_len[ac]); - if (__netif_subqueue_stopped(sdata->dev, ac)) - ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]); - hdr = (struct ieee80211_hdr *)skb->data; if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { struct sta_info *sta = container_of(txq->sta, struct sta_info, @@ -1343,7 +1336,9 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, struct sk_buff_head *skbs, bool txpending) { + struct ieee80211_tx_control control = {}; struct sk_buff *skb, *tmp; + struct txq_info *txqi; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { @@ -1358,6 +1353,21 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, } #endif + txqi = ieee80211_get_txq(local, vif, sta, skb); + if (txqi) { + info->control.vif = vif; + + __skb_unlink(skb, skbs); + + spin_lock_bh(&txqi->queue.lock); + ieee80211_txq_enqueue(local, txqi, skb); + spin_unlock_bh(&txqi->queue.lock); + + drv_wake_tx_queue(local, txqi); + + continue; + } + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { @@ -1400,9 +1410,10 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; + control.sta = sta; __skb_unlink(skb, skbs); - ieee80211_drv_tx(local, vif, sta, skb); + drv_tx(local, &control, skb); } return true; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 905003f75c4d..0db46442bdcf 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -244,6 +244,9 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) struct ieee80211_sub_if_data *sdata; int n_acs = IEEE80211_NUM_ACS; + if (local->ops->wake_tx_queue) + return; + if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; @@ -260,11 +263,6 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; - if (local->ops->wake_tx_queue && - (atomic_read(&sdata->txqs_len[ac]) > - local->hw.txq_ac_max_pending)) - continue; - if (ac_queue == queue || (sdata->vif.cab_queue == queue && local->queue_stop_reasons[ac_queue] == 0 && @@ -352,6 +350,9 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) return; + if (local->ops->wake_tx_queue) + return; + if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; -- cgit v1.2.3 From 5caa328e3811b7cfa33fd02c93280ffa622deb0e Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Thu, 19 May 2016 10:37:51 +0200 Subject: mac80211: implement codel on fair queuing flows There is no other limit other than a global packet count limit when using software queuing. This means a single flow queue can grow insanely long. This is particularly bad for TCP congestion algorithms which requires a little more sophisticated frame dropping scheme than a mere headdrop on limit overflow. Hence apply (a slighly modified, to fit the knobs) CoDel5 on flow queues. This improves TCP convergence and stability when combined with wireless driver which keeps its own tx queue/fifo at a minimum fill level for given link conditions. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- include/net/mac80211.h | 14 +++++- net/mac80211/ieee80211_i.h | 5 +++ net/mac80211/tx.c | 109 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 126 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a8683aec6dbe..a52009ffc19f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -21,6 +21,7 @@ #include #include #include +#include #include /** @@ -895,7 +896,18 @@ struct ieee80211_tx_info { unsigned long jiffies; }; /* NB: vif can be NULL for injected frames */ - struct ieee80211_vif *vif; + union { + /* NB: vif can be NULL for injected frames */ + struct ieee80211_vif *vif; + + /* When packets are enqueued on txq it's easy + * to re-construct the vif pointer. There's no + * more space in tx_info so it can be used to + * store the necessary enqueue time for packet + * sojourn time computation. + */ + codel_time_t enqueue_time; + }; struct ieee80211_key_conf *hw_key; u32 flags; /* 4 bytes free */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6f8375f1df88..54edfb6fc1d1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -812,10 +812,12 @@ enum txq_info_flags { * @tin: contains packets split into multiple flows * @def_flow: used as a fallback flow when a packet destined to @tin hashes to * a fq_flow which is already owned by a different tin + * @def_cvars: codel vars for @def_flow */ struct txq_info { struct fq_tin tin; struct fq_flow def_flow; + struct codel_vars def_cvars; unsigned long flags; /* keep last! */ @@ -1108,6 +1110,9 @@ struct ieee80211_local { struct ieee80211_hw hw; struct fq fq; + struct codel_vars *cvars; + struct codel_params cparams; + struct codel_stats cstats; const struct ieee80211_ops *ops; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1d8343fca6d4..44ec605a5682 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -1267,11 +1269,92 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, return to_txq_info(txq); } +static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) +{ + IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); +} + +static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) +{ + IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; +} + +static u32 codel_skb_len_func(const struct sk_buff *skb) +{ + return skb->len; +} + +static codel_time_t codel_skb_time_func(const struct sk_buff *skb) +{ + const struct ieee80211_tx_info *info; + + info = (const struct ieee80211_tx_info *)skb->cb; + return info->control.enqueue_time; +} + +static struct sk_buff *codel_dequeue_func(struct codel_vars *cvars, + void *ctx) +{ + struct ieee80211_local *local; + struct txq_info *txqi; + struct fq *fq; + struct fq_flow *flow; + + txqi = ctx; + local = vif_to_sdata(txqi->txq.vif)->local; + fq = &local->fq; + + if (cvars == &txqi->def_cvars) + flow = &txqi->def_flow; + else + flow = &fq->flows[cvars - local->cvars]; + + return fq_flow_dequeue(fq, flow); +} + +static void codel_drop_func(struct sk_buff *skb, + void *ctx) +{ + struct ieee80211_local *local; + struct ieee80211_hw *hw; + struct txq_info *txqi; + + txqi = ctx; + local = vif_to_sdata(txqi->txq.vif)->local; + hw = &local->hw; + + ieee80211_free_txskb(hw, skb); +} + static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, struct fq_tin *tin, struct fq_flow *flow) { - return fq_flow_dequeue(fq, flow); + struct ieee80211_local *local; + struct txq_info *txqi; + struct codel_vars *cvars; + struct codel_params *cparams; + struct codel_stats *cstats; + + local = container_of(fq, struct ieee80211_local, fq); + txqi = container_of(tin, struct txq_info, tin); + cparams = &local->cparams; + cstats = &local->cstats; + + if (flow == &txqi->def_flow) + cvars = &txqi->def_cvars; + else + cvars = &local->cvars[flow - fq->flows]; + + return codel_dequeue(txqi, + &flow->backlog, + cparams, + cvars, + cstats, + codel_skb_len_func, + codel_skb_time_func, + codel_drop_func, + codel_dequeue_func); } static void fq_skb_free_func(struct fq *fq, @@ -1303,6 +1386,7 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; + ieee80211_set_skb_enqueue_time(skb); fq_tin_enqueue(fq, tin, skb, fq_skb_free_func, fq_flow_get_default_func); @@ -1314,6 +1398,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, { fq_tin_init(&txqi->tin); fq_flow_init(&txqi->def_flow); + codel_vars_init(&txqi->def_cvars); txqi->txq.vif = &sdata->vif; @@ -1342,6 +1427,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; int ret; + int i; if (!local->ops->wake_tx_queue) return 0; @@ -1350,6 +1436,22 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) if (ret) return ret; + codel_params_init(&local->cparams); + codel_stats_init(&local->cstats); + local->cparams.interval = MS2TIME(100); + local->cparams.target = MS2TIME(20); + local->cparams.ecn = true; + + local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]), + GFP_KERNEL); + if (!local->cvars) { + fq_reset(fq, fq_skb_free_func); + return -ENOMEM; + } + + for (i = 0; i < fq->flows_cnt; i++) + codel_vars_init(&local->cvars[i]); + return 0; } @@ -1360,6 +1462,9 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) if (!local->ops->wake_tx_queue) return; + kfree(local->cvars); + local->cvars = NULL; + fq_reset(fq, fq_skb_free_func); } @@ -1382,6 +1487,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, if (!skb) goto out; + ieee80211_set_skb_vif(skb, txqi); + hdr = (struct ieee80211_hdr *)skb->data; if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { struct sta_info *sta = container_of(txq->sta, struct sta_info, -- cgit v1.2.3 From 52fbb2907988aa0583c6d9d53a56aee090b2df7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Jun 2016 07:45:11 -0700 Subject: net: sched: fix qdisc->running lockdep annotations 1) qdisc_run_begin() is really using the equivalent of a trylock. Instead of using write_seqcount_begin(), use a combination of raw_write_seqcount_begin() and correct lockdep annotation. 2) sch_direct_xmit() should use regular spin_lock(root_lock) Fixes: f9eb8aea2a1e ("net_sched: transform qdisc running bit into a seqcount") Signed-off-by: Eric Dumazet Reported-by: David Ahern Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 +++++- net/sched/sch_generic.c | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 49534e28824b..a4c0f1649e2b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -97,7 +97,11 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc_is_running(qdisc)) return false; - write_seqcount_begin(&qdisc->running); + /* Variant of write_seqcount_begin() telling lockdep a trylock + * was attempted. + */ + raw_write_seqcount_begin(&qdisc->running); + seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); return true; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cebea73e70ac..cad810b45e31 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -137,10 +137,10 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); + spin_lock(root_lock); return qdisc_qlen(q); } - spin_lock_nested(root_lock, SINGLE_DEPTH_NESTING); + spin_lock(root_lock); if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ -- cgit v1.2.3 From 6f094b9ec680209c5b7314feee983b2f4c910b1b Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Wed, 8 Jun 2016 21:16:44 -0700 Subject: tcp: add in_flight to tcp_skb_cb Add in_flight (bytes in flight when packet was sent) field to tx component of tcp_skb_cb and make it available to congestion modules' pkts_acked() function through the ack_sample function argument. Signed-off-by: Lawrence Brakmo Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 5 ++++- net/ipv4/tcp_output.c | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0bcc70f4e1fb..a79894b66726 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -767,6 +767,7 @@ struct tcp_skb_cb { union { struct { /* There is space for up to 20 bytes */ + __u32 in_flight;/* Bytes in flight when packet sent */ } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -859,6 +860,7 @@ union tcp_cc_info; struct ack_sample { u32 pkts_acked; s32 rtt_us; + u32 in_flight; }; struct tcp_congestion_ops { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 89dd8d82826f..94d4aff97523 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3115,6 +3115,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; + u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3154,6 +3155,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!first_ackt.v64) first_ackt = last_ackt; + last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) flag |= FLAG_ORIG_SACK_ACKED; @@ -3250,7 +3252,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = ca_rtt_us }; + .rtt_us = ca_rtt_us, + .in_flight = last_in_flight }; icsk->icsk_ca_ops->pkts_acked(sk, &sample); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8bd9911fdd16..b1bcba0563f2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -911,9 +911,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); + tp = tcp_sk(sk); if (clone_it) { skb_mstamp_get(&skb->skb_mstamp); + TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq + - tp->snd_una; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -924,7 +927,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } inet = inet_sk(sk); - tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); -- cgit v1.2.3 From 45f50bed1d808794e514e9eed0e579a8756ce2ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Jun 2016 16:41:39 -0700 Subject: net_sched: remove generic throttled management __QDISC_STATE_THROTTLED bit manipulation is rather expensive for HTB and few others. I already removed it for sch_fq in commit f2600cf02b5b ("net: sched: avoid costly atomic operation in fq_dequeue()") and so far nobody complained. When one ore more packets are stuck in one or more throttled HTB class, a htb dequeue() performs two atomic operations to clear/set __QDISC_STATE_THROTTLED bit, while root qdisc lock is held. Removing this pair of atomic operations bring me a 8 % performance increase on 200 TCP_RR tests, in presence of throttled classes. This patch has no side effect, since nothing actually uses disc_is_throttled() anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 4 ++-- include/net/sch_generic.h | 16 ---------------- net/sched/sch_api.c | 7 +------ net/sched/sch_cbq.c | 2 -- net/sched/sch_fq.c | 3 +-- net/sched/sch_hfsc.c | 1 - net/sched/sch_htb.c | 3 +-- net/sched/sch_netem.c | 1 - net/sched/sch_tbf.c | 4 +--- 9 files changed, 6 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index fea53f4d92ca..7caa99b482c6 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -67,12 +67,12 @@ struct qdisc_watchdog { }; void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle); +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { - qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires), true); + qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9f3581980c15..9a0d177884c6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -26,7 +26,6 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, - __QDISC_STATE_THROTTLED, }; struct qdisc_size_table { @@ -125,21 +124,6 @@ static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq) #endif } -static inline bool qdisc_is_throttled(const struct Qdisc *qdisc) -{ - return test_bit(__QDISC_STATE_THROTTLED, &qdisc->state) ? true : false; -} - -static inline void qdisc_throttled(struct Qdisc *qdisc) -{ - set_bit(__QDISC_STATE_THROTTLED, &qdisc->state); -} - -static inline void qdisc_unthrottled(struct Qdisc *qdisc) -{ - clear_bit(__QDISC_STATE_THROTTLED, &qdisc->state); -} - struct Qdisc_class_ops { /* Child qdisc manipulation */ struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d4a8bbfcc953..401eda6de682 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -583,7 +583,6 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) timer); rcu_read_lock(); - qdisc_unthrottled(wd->qdisc); __netif_schedule(qdisc_root(wd->qdisc)); rcu_read_unlock(); @@ -598,15 +597,12 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - if (throttle) - qdisc_throttled(wd->qdisc); - if (wd->last_expires == expires) return; @@ -620,7 +616,6 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { hrtimer_cancel(&wd->timer); - qdisc_unthrottled(wd->qdisc); } EXPORT_SYMBOL(qdisc_watchdog_cancel); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6e61f9aa8783..a29fd811d7b9 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -513,7 +513,6 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED); } - qdisc_unthrottled(sch); __netif_schedule(qdisc_root(sch)); return HRTIMER_NORESTART; } @@ -819,7 +818,6 @@ cbq_dequeue(struct Qdisc *sch) if (skb) { qdisc_bstats_update(sch, skb); sch->q.qlen--; - qdisc_unthrottled(sch); return skb; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 3c6a47d66a04..f49c81e91acd 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -445,8 +445,7 @@ begin: if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow, - false); + q->time_next_delayed_flow); return NULL; } } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index eb3d3f5aba80..bd08c363a26d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1664,7 +1664,6 @@ hfsc_dequeue(struct Qdisc *sch) set_passive(cl); } - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index b74d06668ab4..07dcd2933f01 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -889,7 +889,6 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); - qdisc_unthrottled(sch); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; @@ -929,7 +928,7 @@ ok: } qdisc_qstats_overlimit(sch); if (likely(next_event > q->now)) - qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true); + qdisc_watchdog_schedule_ns(&q->watchdog, next_event); else schedule_work(&q->work); fin: diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2dbe732ca135..876df13c745a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -587,7 +587,6 @@ tfifo_dequeue: if (skb) { qdisc_qstats_backlog_dec(sch, skb); deliver: - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 7fa3d6e1291c..c12df84d1078 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -254,14 +254,12 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) q->ptokens = ptoks; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; - qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; } qdisc_watchdog_schedule_ns(&q->watchdog, - now + max_t(long, -toks, -ptoks), - true); + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, -- cgit v1.2.3 From cd2a9e62c8a3c5cae7691982667d79a0edc65283 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 13 Jun 2016 13:44:17 -0700 Subject: net: l3mdev: Remove const from flowi6 arg to get_rt6_dst Allow drivers to pass flow arg to functions where the arg is not const and allow the driver to make updates as needed (eg., setting oif). Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 +- include/net/l3mdev.h | 6 +++--- net/l3mdev/l3mdev.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index b4d746943bc5..d2ce76c9dc64 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -861,7 +861,7 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, - const struct flowi6 *fl6) + struct flowi6 *fl6) { struct dst_entry *dst = NULL; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 34f33eb96a5e..f8a416ec674c 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -38,7 +38,7 @@ struct l3mdev_ops { /* IPv6 ops */ struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, - const struct flowi6 *fl6); + struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -139,7 +139,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6); +struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -225,7 +225,7 @@ static inline int l3mdev_get_saddr(struct net *net, int ifindex, } static inline -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6) +struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) { return NULL; } diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 7da97809a7e8..d90e4ef09e85 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); */ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - const struct flowi6 *fl6) + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; -- cgit v1.2.3 From 9ff74384600aeecba34ebdacbbde0627489ff601 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 13 Jun 2016 13:44:19 -0700 Subject: net: vrf: Handle ipv6 multicast and link-local addresses IPv6 multicast and link-local addresses require special handling by the VRF driver: 1. Rather than using the VRF device index and full FIB lookups, packets to/from these addresses should use direct FIB lookups based on the VRF device table. 2. fail sends/receives on a VRF device to/from a multicast address (e.g, make ping6 ff02::1% fail) 3. move the setting of the flow oif to the first dst lookup and revert the change in icmpv6_echo_reply made in ca254490c8dfd ("net: Add VRF support to IPv6 stack"). Linklocal/mcast addresses require use of the skb->dev. With this change connections into and out of a VRF enslaved device work for multicast and link-local addresses work (icmp, tcp, and udp) e.g., 1. packets into VM with VRF config: ping6 -c3 fe80::e0:f9ff:fe1c:b974%br1 ping6 -c3 ff02::1%br1 ssh -6 fe80::e0:f9ff:fe1c:b974%br1 2. packets going out a VRF enslaved device: ping6 -c3 fe80::18f8:83ff:fe4b:7a2e%eth1 ping6 -c3 ff02::1%eth1 ssh -6 root@fe80::18f8:83ff:fe4b:7a2e%eth1 Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++--- include/net/ip6_route.h | 2 + net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 5 ++- 4 files changed, 99 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index d2ce76c9dc64..0b5b3c258c2b 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -785,9 +785,63 @@ out: return rc; } +static struct rt6_info *vrf_ip6_route_lookup(struct net *net, + const struct net_device *dev, + struct flowi6 *fl6, + int ifindex, + int flags) +{ + struct net_vrf *vrf = netdev_priv(dev); + struct fib6_table *table = NULL; + struct rt6_info *rt6; + + rcu_read_lock(); + + /* fib6_table does not have a refcnt and can not be freed */ + rt6 = rcu_dereference(vrf->rt6); + if (likely(rt6)) + table = rt6->rt6i_table; + + rcu_read_unlock(); + + if (!table) + return NULL; + + return ip6_pol_route(net, table, ifindex, fl6, flags); +} + +static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, + int ifindex) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct flowi6 fl6 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_mark = skb->mark, + .flowi6_proto = iph->nexthdr, + .flowi6_iif = ifindex, + }; + struct net *net = dev_net(vrf_dev); + struct rt6_info *rt6; + + rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, + RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); + if (unlikely(!rt6)) + return; + + if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) + return; + + skb_dst_set(skb, &rt6->dst); +} + static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { + int orig_iif = skb->skb_iif; + bool need_strict; + /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb */ @@ -798,8 +852,11 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, goto out; } - /* if packet is NDISC keep the ingress interface */ - if (!ipv6_ndisc_frame(skb)) { + /* if packet is NDISC or addressed to multicast or link-local + * then keep the ingress interface + */ + need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); + if (!ipv6_ndisc_frame(skb) && !need_strict) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; @@ -810,6 +867,9 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } + if (need_strict) + vrf_ip6_input_dst(skb, vrf_dev, orig_iif); + out: return skb; } @@ -863,11 +923,35 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, struct flowi6 *fl6) { + bool need_strict = rt6_need_strict(&fl6->daddr); + struct net_vrf *vrf = netdev_priv(dev); + struct net *net = dev_net(dev); struct dst_entry *dst = NULL; + struct rt6_info *rt; - if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { - struct net_vrf *vrf = netdev_priv(dev); - struct rt6_info *rt; + /* send to link-local or multicast address */ + if (need_strict) { + int flags = RT6_LOOKUP_F_IFACE; + + /* VRF device does not have a link-local address and + * sending packets to link-local or mcast addresses over + * a VRF device does not make sense + */ + if (fl6->flowi6_oif == dev->ifindex) { + struct dst_entry *dst = &net->ipv6.ip6_null_entry->dst; + + dst_hold(dst); + return dst; + } + + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + if (rt) + dst = &rt->dst; + + } else if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { rcu_read_lock(); @@ -880,6 +964,10 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, rcu_read_unlock(); } + /* make sure oif is set to VRF device for lookup */ + if (!need_strict) + fl6->flowi6_oif = dev->ifindex; + return dst; } #endif diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 54c779416eec..f55bf3d294aa 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -76,6 +76,8 @@ static inline struct dst_entry *ip6_route_output(struct net *net, struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, int flags); +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int ifindex, struct flowi6 *fl6, int flags); int ip6_route_init(void); void ip6_route_cleanup(void); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 40454bfb534e..e32a72fb9982 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -587,7 +587,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); + fl6.flowi6_oif = skb->dev->ifindex; fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c6ae6f9b5fe3..d51a1a48b839 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1042,8 +1042,8 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } -static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, - struct flowi6 *fl6, int flags) +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt; @@ -1139,6 +1139,7 @@ redo_rt6_select: } } +EXPORT_SYMBOL_GPL(ip6_pol_route); static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) -- cgit v1.2.3 From b2313077ed0db35ee186905d8076a737248edd24 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 13 Jun 2016 13:46:28 -0700 Subject: net_sched: make tcf_hash_check() boolean Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 4 ++-- net/sched/act_api.c | 8 ++++---- net/sched/act_ife.c | 3 ++- net/sched/act_ipt.c | 3 ++- net/sched/act_mirred.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_vlan.c | 4 ++-- 8 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index db218a12efb5..fb82b5b5d9e7 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -155,8 +155,8 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct tc_action *a); int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index); u32 tcf_hash_new_index(struct tc_action_net *tn); -int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, - int bind); +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind); int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action *a, int size, int bind, bool cpustats); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b6db56ec8117..f8c61d2a7963 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -224,8 +224,8 @@ int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) } EXPORT_SYMBOL(tcf_hash_search); -int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, - int bind) +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = NULL; @@ -235,9 +235,9 @@ int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, p->tcfc_refcnt++; a->priv = p; a->hinfo = hinfo; - return 1; + return true; } - return 0; + return false; } EXPORT_SYMBOL(tcf_hash_check); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 02f5a8ba95d7..b7fa96926c90 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -423,7 +423,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, u16 ife_type = 0; u8 *daddr = NULL; u8 *saddr = NULL; - int ret = 0, exists = 0; + bool exists = false; + int ret = 0; int err; err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8998a3594e86..6148e323ed93 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -97,7 +97,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; - int ret = 0, err, exists = 0; + bool exists = false; + int ret = 0, err; u32 hook = 0; u32 index = 0; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 787751a7981a..5b135d357e1e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -62,7 +62,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tc_mirred *parm; struct tcf_mirred *m; struct net_device *dev; - int ret, ok_push = 0, exists = 0; + int ret, ok_push = 0; + bool exists = false; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index be5fbb51cfed..318328d34d12 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -86,8 +86,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; + bool exists = false; + int ret = 0, err; char *defdata; - int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7e2bc3c2b6da..53d1486cddf7 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -69,7 +69,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; u16 *queue_mapping = NULL; - int ret = 0, err, exists = 0; + bool exists = false; + int ret = 0, err; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b075d50e0fc3..db9b7ed570ba 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -77,8 +77,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; - int ret = 0, exists = 0; - int err; + bool exists = false; + int ret = 0, err; if (!nla) return -EINVAL; -- cgit v1.2.3 From 1b5c5493e3e68181be344cb51bf9df192d05ffc2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jun 2016 20:21:50 -0700 Subject: net_sched: add the ability to defer skb freeing qdisc are changed under RTNL protection and often while blocking BH and root qdisc spinlock. When lots of skbs need to be dropped, we free them under these locks causing TX/RX freezes, and more generally latency spikes. This commit adds rtnl_kfree_skbs(), used to queue skbs for deferred freeing. Actual freeing happens right after RTNL is released, with appropriate scheduling points. rtnl_qdisc_drop() can also be used in place of disc_drop() when RTNL is held. qdisc_reset_queue() and __qdisc_reset_queue() get the new behavior, so standard qdiscs like pfifo, pfifo_fast... have their ->reset() method automatically handled. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 5 +++-- include/net/sch_generic.h | 16 ++++++++++++---- net/core/rtnetlink.c | 22 ++++++++++++++++++++++ net/sched/sch_generic.c | 2 +- 4 files changed, 38 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c006cc900c44..2daece8979f7 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -89,8 +89,9 @@ void net_inc_egress_queue(void); void net_dec_egress_queue(void); #endif -extern void rtnetlink_init(void); -extern void __rtnl_unlock(void); +void rtnetlink_init(void); +void __rtnl_unlock(void); +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); #define ASSERT_RTNL() do { \ if (unlikely(!rtnl_is_locked())) { \ diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9a0d177884c6..4f7cee8344c4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -683,19 +683,21 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) return skb; } -static inline void __qdisc_reset_queue(struct Qdisc *sch, - struct sk_buff_head *list) +static inline void __qdisc_reset_queue(struct sk_buff_head *list) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ - __skb_queue_purge(list); + if (!skb_queue_empty(list)) { + rtnl_kfree_skbs(list->next, list->prev); + __skb_queue_head_init(list); + } } static inline void qdisc_reset_queue(struct Qdisc *sch) { - __qdisc_reset_queue(sch, &sch->q); + __qdisc_reset_queue(&sch->q); sch->qstats.backlog = 0; } @@ -716,6 +718,12 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, return old; } +static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) +{ + rtnl_kfree_skbs(skb, skb); + qdisc_qstats_drop(sch); +} + static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { kfree_skb(skb); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d69c4644f8f2..eb49ca24274a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -71,9 +71,31 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +static struct sk_buff *defer_kfree_skb_list; +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) +{ + if (head && tail) { + tail->next = defer_kfree_skb_list; + defer_kfree_skb_list = head; + } +} +EXPORT_SYMBOL(rtnl_kfree_skbs); + void __rtnl_unlock(void) { + struct sk_buff *head = defer_kfree_skb_list; + + defer_kfree_skb_list = NULL; + mutex_unlock(&rtnl_mutex); + + while (head) { + struct sk_buff *next = head->next; + + kfree_skb(head); + cond_resched(); + head = next; + } } void rtnl_unlock(void) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0c9cb516f2e3..773b632e1e33 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -493,7 +493,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(qdisc, band2list(priv, prio)); + __qdisc_reset_queue(band2list(priv, prio)); priv->bitmap = 0; qdisc->qstats.backlog = 0; -- cgit v1.2.3 From 8626a0c83b0d471d859bcd908d016874df951fc3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:16 +0200 Subject: 6lowpan: add private neighbour data This patch will introduce a 6lowpan neighbour private data. Like the interface private data we handle private data for generic 6lowpan and for link-layer specific 6lowpan. The current first use case if to save the short address for a 802.15.4 6lowpan neighbour. Cc: David S. Miller Reviewed-by: Stefan Schmidt Acked-by: YOSHIFUJI Hideaki Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +-- include/net/6lowpan.h | 10 ++++++++++ net/ieee802154/6lowpan/core.c | 12 ++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d101e4d904ba..36e43bd422f8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1483,8 +1483,7 @@ enum netdev_priv_flags { * @perm_addr: Permanent hw address * @addr_assign_type: Hw address assignment type * @addr_len: Hardware address length - * @neigh_priv_len; Used in neigh_alloc(), - * initialized only in atm/clip.c + * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address * @dev_port: Used to differentiate devices that share diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index da84cf920b78..2d9b9d39221e 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -141,6 +141,16 @@ struct lowpan_dev { u8 priv[0] __aligned(sizeof(void *)); }; +struct lowpan_802154_neigh { + __le16 short_addr; +}; + +static inline +struct lowpan_802154_neigh *lowpan_802154_neigh(void *neigh_priv) +{ + return neigh_priv; +} + static inline struct lowpan_dev *lowpan_dev(const struct net_device *dev) { diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 4e2b30894224..8c004a0c8d64 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -81,11 +81,21 @@ static int lowpan_stop(struct net_device *dev) return 0; } +static int lowpan_neigh_construct(struct neighbour *n) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + + /* default no short_addr is available for a neighbour */ + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + return 0; +} + static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, + .ndo_neigh_construct = lowpan_neigh_construct, }; static void lowpan_setup(struct net_device *ldev) @@ -150,6 +160,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; + ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); + ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); -- cgit v1.2.3 From 2ad3ed59198c5404c34515cfcfd9a2b3c54d964f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:17 +0200 Subject: 6lowpan: add 802.15.4 short addr slaac This patch adds the autoconfiguration if a valid 802.15.4 short address is available for 802.15.4 6LoWPAN interfaces. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: Hannes Frederic Sowa Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/6lowpan.h | 6 ++++++ include/net/addrconf.h | 3 +++ net/6lowpan/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/addrconf.c | 5 +++-- 4 files changed, 58 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index 2d9b9d39221e..5ab4c9901ccc 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -254,6 +254,12 @@ static inline bool lowpan_fetch_skb(struct sk_buff *skb, void *data, return false; } +static inline bool lowpan_802154_is_valid_src_short_addr(__le16 addr) +{ + /* First bit of addr is multicast, reserved or 802.15.4 specific */ + return !(addr & cpu_to_le16(0x8000)); +} + static inline void lowpan_push_hc_data(u8 **hc_ptr, const void *data, const size_t len) { diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 730d856683e5..b1774eb03f37 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -94,6 +94,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); +void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags); + static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 7a240b3eaed1..801404ceea23 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -14,6 +14,7 @@ #include #include +#include #include "6lowpan_i.h" @@ -72,16 +73,61 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); +static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) +{ + struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + /* Set short_addr autoconfiguration if short_addr is present only */ + if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + return -1; + + /* For either address format, all zero addresses MUST NOT be used */ + if (wpan_dev->pan_id == cpu_to_le16(0x0000) && + wpan_dev->short_addr == cpu_to_le16(0x0000)) + return -1; + + /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ + if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + memset(eui, 0, 2); + else + ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); + + /* The "Universal/Local" (U/L) bit shall be set to zero */ + eui[0] &= ~2; + eui[2] = 0; + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[5] = 0; + ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); + return 0; +} + static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct inet6_dev *idev; + struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; + idev = __in6_dev_get(dev); + if (!idev) + return NOTIFY_DONE; + switch (event) { + case NETDEV_UP: + case NETDEV_CHANGE: + /* (802.15.4 6LoWPAN short address slaac handling */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && + addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { + __ipv6_addr_set_half(&addr.s6_addr32[0], + htonl(0xFE800000), 0); + addrconf_add_linklocal(idev, &addr, 0); + } + break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b12553905e42..1ce4048d1b5e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2947,8 +2947,8 @@ static void init_loopback(struct net_device *dev) } } -static void addrconf_add_linklocal(struct inet6_dev *idev, - const struct in6_addr *addr, u32 flags) +void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags) { struct inet6_ifaddr *ifp; u32 addr_flags = flags | IFA_F_PERMANENT; @@ -2967,6 +2967,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, in6_ifa_put(ifp); } } +EXPORT_SYMBOL_GPL(addrconf_add_linklocal); static bool ipv6_reserved_interfaceid(struct in6_addr address) { -- cgit v1.2.3 From 1e82f961ac8e94c50a933e89ee08071fc81a4bbd Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:19 +0200 Subject: ndisc: add __ndisc_opt_addr_space function This patch adds __ndisc_opt_addr_space as low-level function for ndisc_opt_addr_space which doesn't depend on net_device parameter. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/ndisc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 2d8edaad29cb..4cee82654fe5 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -127,10 +127,15 @@ static inline int ndisc_addr_option_pad(unsigned short type) } } +static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad) +{ + return NDISC_OPT_SPACE(addr_len + pad); +} + static inline int ndisc_opt_addr_space(struct net_device *dev) { - return NDISC_OPT_SPACE(dev->addr_len + - ndisc_addr_option_pad(dev->type)); + return __ndisc_opt_addr_space(dev->addr_len, + ndisc_addr_option_pad(dev->type)); } static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, -- cgit v1.2.3 From 4f36ce84c54c971cd67c882035d9bde7b1a2ee53 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:20 +0200 Subject: ndisc: add __ndisc_opt_addr_data function This patch adds __ndisc_opt_addr_data as low-level function for ndisc_opt_addr_data which doesn't depend on net_device parameter. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/ndisc.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 4cee82654fe5..c8962adf24d0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -138,17 +138,23 @@ static inline int ndisc_opt_addr_space(struct net_device *dev) ndisc_addr_option_pad(dev->type)); } -static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, - struct net_device *dev) +static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p, + unsigned char addr_len, int prepad) { u8 *lladdr = (u8 *)(p + 1); int lladdrlen = p->nd_opt_len << 3; - int prepad = ndisc_addr_option_pad(dev->type); - if (lladdrlen != ndisc_opt_addr_space(dev)) + if (lladdrlen != __ndisc_opt_addr_space(addr_len, prepad)) return NULL; return lladdr + prepad; } +static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, + struct net_device *dev) +{ + return __ndisc_opt_addr_data(p, dev->addr_len, + ndisc_addr_option_pad(dev->type)); +} + static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { const u32 *p32 = pkey; -- cgit v1.2.3 From f997c55c1dc8841b3ee4df0493d0ac7966d42165 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:23 +0200 Subject: ipv6: introduce neighbour discovery ops This patch introduces neighbour discovery ops callback structure. The idea is to separate the handling for 6LoWPAN into the 6lowpan module. These callback offers 6lowpan different handling, such as 802.15.4 short address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over 6LoWPANs). Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ++ include/net/ndisc.h | 197 +++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/addrconf.c | 13 ++- net/ipv6/ndisc.c | 101 ++++++++++++++++-------- net/ipv6/route.c | 8 +- 5 files changed, 284 insertions(+), 40 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 36e43bd422f8..890158e99159 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1456,6 +1456,8 @@ enum netdev_priv_flags { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @ethtool_ops: Management operations + * @ndisc_ops: Includes callbacks for different IPv6 neighbour + * discovery handling. Necessary for e.g. 6LoWPAN. * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * @@ -1672,6 +1674,9 @@ struct net_device { #ifdef CONFIG_NET_L3_MASTER_DEV const struct l3mdev_ops *l3mdev_ops; #endif +#if IS_ENABLED(CONFIG_IPV6) + const struct ndisc_ops *ndisc_ops; +#endif const struct header_ops *header_ops; diff --git a/include/net/ndisc.h b/include/net/ndisc.h index c8962adf24d0..a5e276703cb3 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -58,6 +58,7 @@ struct inet6_dev; struct net_device; struct net_proto_family; struct sk_buff; +struct prefix_info; extern struct neigh_table nd_tbl; @@ -110,9 +111,182 @@ struct ndisc_options { #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, + u8 *opt, int opt_len, struct ndisc_options *ndopts); +#define NDISC_OPS_REDIRECT_DATA_SPACE 2 + +/* + * This structure defines the hooks for IPv6 neighbour discovery. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * int (*is_useropt)(u8 nd_opt_type): + * This function is called when IPv6 decide RA userspace options. if + * this function returns 1 then the option given by nd_opt_type will + * be handled as userspace option additional to the IPv6 options. + * + * int (*parse_options)(const struct net_device *dev, + * struct nd_opt_hdr *nd_opt, + * struct ndisc_options *ndopts): + * This function is called while parsing ndisc ops and put each position + * as pointer into ndopts. If this function return unequal 0, then this + * function took care about the ndisc option, if 0 then the IPv6 ndisc + * option parser will take care about that option. + * + * void (*update)(const struct net_device *dev, struct neighbour *n, + * u32 flags, u8 icmp6_type, + * const struct ndisc_options *ndopts): + * This function is called when IPv6 ndisc updates the neighbour cache + * entry. Additional options which can be updated may be previously + * parsed by parse_opts callback and accessible over ndopts parameter. + * + * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, + * struct neighbour *neigh, u8 *ha_buf, + * u8 **ha): + * This function is called when the necessary option space will be + * calculated before allocating a skb. The parameters neigh, ha_buf + * abd ha are available on NDISC_REDIRECT messages only. + * + * void (*fill_addr_option)(const struct net_device *dev, + * struct sk_buff *skb, u8 icmp6_type, + * const u8 *ha): + * This function is called when the skb will finally fill the option + * fields inside skb. NOTE: this callback should fill the option + * fields to the skb which are previously indicated by opt_space + * parameter. That means the decision to add such option should + * not lost between these two callbacks, e.g. protected by interface + * up state. + * + * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, + * const struct prefix_info *pinfo, + * struct inet6_dev *in6_dev, + * struct in6_addr *addr, + * int addr_type, u32 addr_flags, + * bool sllao, bool tokenized, + * __u32 valid_lft, u32 prefered_lft, + * bool dev_addr_generated): + * This function is called when a RA messages is received with valid + * PIO option fields and an IPv6 address will be added to the interface + * for autoconfiguration. The parameter dev_addr_generated reports about + * if the address was based on dev->dev_addr or not. This can be used + * to add a second address if link-layer operates with two link layer + * addresses. E.g. 802.15.4 6LoWPAN. + */ +struct ndisc_ops { + int (*is_useropt)(u8 nd_opt_type); + int (*parse_options)(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts); + void (*update)(const struct net_device *dev, struct neighbour *n, + u32 flags, u8 icmp6_type, + const struct ndisc_options *ndopts); + int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, + struct neighbour *neigh, u8 *ha_buf, + u8 **ha); + void (*fill_addr_option)(const struct net_device *dev, + struct sk_buff *skb, u8 icmp6_type, + const u8 *ha); + void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft, + bool dev_addr_generated); +}; + +#if IS_ENABLED(CONFIG_IPV6) +static inline int ndisc_ops_is_useropt(const struct net_device *dev, + u8 nd_opt_type) +{ + if (dev->ndisc_ops && dev->ndisc_ops->is_useropt) + return dev->ndisc_ops->is_useropt(nd_opt_type); + else + return 0; +} + +static inline int ndisc_ops_parse_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + if (dev->ndisc_ops && dev->ndisc_ops->parse_options) + return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts); + else + return 0; +} + +static inline void ndisc_ops_update(const struct net_device *dev, + struct neighbour *n, u32 flags, + u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + if (dev->ndisc_ops && dev->ndisc_ops->update) + dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts); +} + +static inline int ndisc_ops_opt_addr_space(const struct net_device *dev, + u8 icmp6_type) +{ + if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space && + icmp6_type != NDISC_REDIRECT) + return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL, + NULL, NULL); + else + return 0; +} + +static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev, + struct neighbour *neigh, + u8 *ha_buf, u8 **ha) +{ + if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space) + return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT, + neigh, ha_buf, ha); + else + return 0; +} + +static inline void ndisc_ops_fill_addr_option(const struct net_device *dev, + struct sk_buff *skb, + u8 icmp6_type) +{ + if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option && + icmp6_type != NDISC_REDIRECT) + dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL); +} + +static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev, + struct sk_buff *skb, + const u8 *ha) +{ + if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option) + dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha); +} + +static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net, + struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, + u32 prefered_lft, + bool dev_addr_generated) +{ + if (dev->ndisc_ops && dev->ndisc_ops->prefix_rcv_add_addr) + dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + addr, addr_type, + addr_flags, sllao, + tokenized, valid_lft, + prefered_lft, + dev_addr_generated); +} +#endif + /* * Return the padding between the option length and the start of the * link addr. Currently only IP-over-InfiniBand needs this, although @@ -132,11 +306,25 @@ static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad) return NDISC_OPT_SPACE(addr_len + pad); } -static inline int ndisc_opt_addr_space(struct net_device *dev) +#if IS_ENABLED(CONFIG_IPV6) +static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type) +{ + return __ndisc_opt_addr_space(dev->addr_len, + ndisc_addr_option_pad(dev->type)) + + ndisc_ops_opt_addr_space(dev, icmp6_type); +} + +static inline int ndisc_redirect_opt_addr_space(struct net_device *dev, + struct neighbour *neigh, + u8 *ops_data_buf, + u8 **ops_data) { return __ndisc_opt_addr_space(dev->addr_len, - ndisc_addr_option_pad(dev->type)); + ndisc_addr_option_pad(dev->type)) + + ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf, + ops_data); } +#endif static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p, unsigned char addr_len, int prepad) @@ -205,6 +393,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target); int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir); +void ndisc_update(const struct net_device *dev, struct neighbour *neigh, + const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, + struct ndisc_options *ndopts); /* * IGMP diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 44c1cefdb299..b6e9bdc610f2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2531,7 +2531,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (pinfo->autoconf && in6_dev->cnf.autoconf) { struct in6_addr addr; - bool tokenized = false; + bool tokenized = false, dev_addr_generated = false; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); @@ -2550,6 +2550,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { goto put; + } else { + dev_addr_generated = true; } goto ok; } @@ -2565,6 +2567,15 @@ ok: prefered_lft); if (err) goto put; + + /* Ignore error case here because previous prefix add addr was + * successful which will be notified. + */ + ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr, + addr_type, addr_flags, sllao, + tokenized, valid_lft, + prefered_lft, + dev_addr_generated); } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); put: diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a7b9468e684a..2f4afd17954b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -172,10 +172,19 @@ static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, } static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, - void *data) + void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); + ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); +} + +static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, + void *ha, + const u8 *ops_data) +{ + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); + ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, @@ -191,24 +200,28 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, return cur <= end && cur->nd_opt_type == type ? cur : NULL; } -static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) +static inline int ndisc_is_useropt(const struct net_device *dev, + struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_RDNSS || - opt->nd_opt_type == ND_OPT_DNSSL; + opt->nd_opt_type == ND_OPT_DNSSL || + ndisc_ops_is_useropt(dev, opt->nd_opt_type); } -static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, +static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, + struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while (cur < end && !ndisc_is_useropt(cur)); - return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; + } while (cur < end && !ndisc_is_useropt(dev, cur)); + return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, + u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -223,6 +236,8 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; + if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) + goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: @@ -249,7 +264,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, break; #endif default: - if (ndisc_is_useropt(nd_opt)) { + if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; @@ -266,6 +281,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, nd_opt->nd_opt_len); } } +next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } @@ -515,7 +531,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (!dev->addr_len) inc_opt = 0; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -534,8 +551,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, - dev->dev_addr); - + dev->dev_addr, + NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } @@ -580,7 +597,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -596,7 +614,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_NEIGHBOUR_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -632,7 +651,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, } #endif if (send_sllao) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -647,7 +666,8 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -708,6 +728,15 @@ static int pndisc_is_router(const void *pkey, return ret; } +void ndisc_update(const struct net_device *dev, struct neighbour *neigh, + const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, + struct ndisc_options *ndopts) +{ + neigh_update(neigh, lladdr, new, flags); + /* report ndisc ops about neighbour update */ + ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); +} + static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); @@ -744,7 +773,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND options\n"); return; } @@ -862,9 +891,10 @@ have_ifp: neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE); + NEIGH_UPDATE_F_OVERRIDE, + NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); @@ -917,7 +947,7 @@ static void ndisc_recv_na(struct sk_buff *skb) idev->cnf.drop_unsolicited_na) return; - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; } @@ -973,12 +1003,13 @@ static void ndisc_recv_na(struct sk_buff *skb) goto out; } - neigh_update(neigh, lladdr, + ndisc_update(dev, neigh, lladdr, msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0)); + (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0), + NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* @@ -1023,7 +1054,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) goto out; /* Parse ND options */ - if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); goto out; } @@ -1037,10 +1068,11 @@ static void ndisc_recv_rs(struct sk_buff *skb) neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + NEIGH_UPDATE_F_OVERRIDE_ISROUTER, + NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); } out: @@ -1141,7 +1173,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - if (!ndisc_parse_options(opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { ND_PRINTK(2, warn, "RA: invalid ND options\n"); return; } @@ -1335,11 +1367,12 @@ skip_linkparms: goto out; } } - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER); + NEIGH_UPDATE_F_ISROUTER, + NDISC_ROUTER_ADVERTISEMENT, &ndopts); } if (!ipv6_accept_ra(in6_dev)) { @@ -1427,7 +1460,8 @@ skip_routeinfo: struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; - p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) { + p = ndisc_next_useropt(skb->dev, p, + ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } @@ -1465,7 +1499,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) + if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return; if (!ndopts.nd_opts_rh) { @@ -1510,7 +1544,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) struct dst_entry *dst; struct flowi6 fl6; int rd_len; - u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, + ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; int oif = l3mdev_fib_oif(dev); bool ret; @@ -1569,7 +1604,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_redirect_opt_addr_space(dev, neigh, + ops_data_buf, + &ops_data); } else read_unlock_bh(&neigh->lock); @@ -1600,7 +1637,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) */ if (ha) - ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha); + ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d51a1a48b839..9e1516785dac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2201,7 +2201,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * first-hop router for the specified ICMP Destination Address. */ - if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } @@ -2236,12 +2236,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * We have finally decided to accept it. */ - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER)) - ); + NEIGH_UPDATE_F_ISROUTER)), + NDISC_REDIRECT, &ndopts); nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) -- cgit v1.2.3 From cc84b3c6b48ae81748c5e25d3558872385196162 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:24 +0200 Subject: ipv6: export several functions This patch exports some neighbour discovery functions which can be used by 6lowpan neighbour discovery ops functionality then. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/addrconf.h | 7 +++++++ include/net/ndisc.h | 12 ++++++++++++ net/ipv6/addrconf.c | 15 +++++++-------- net/ipv6/ndisc.c | 14 +++----------- 4 files changed, 29 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index b1774eb03f37..9826d3a9464c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -97,6 +97,13 @@ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); +int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + const struct in6_addr *addr, int addr_type, + u32 addr_flags, bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft); + static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index a5e276703cb3..3f0f41ddbeb0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -53,6 +53,15 @@ enum { #include +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 + +#define ND_PRINTK(val, level, fmt, ...) \ +do { \ + if (val <= ND_DEBUG) \ + net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ +} while (0) + struct ctl_table; struct inet6_dev; struct net_device; @@ -115,6 +124,9 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts); +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, + int data_len, int pad); + #define NDISC_OPS_REDIRECT_DATA_SPACE 2 /* diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b6e9bdc610f2..6c8fc3f96b11 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2333,14 +2333,12 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev) idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; } -static int addrconf_prefix_rcv_add_addr(struct net *net, - struct net_device *dev, - const struct prefix_info *pinfo, - struct inet6_dev *in6_dev, - const struct in6_addr *addr, - int addr_type, u32 addr_flags, - bool sllao, bool tokenized, - __u32 valid_lft, u32 prefered_lft) +int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + const struct in6_addr *addr, int addr_type, + u32 addr_flags, bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); int create = 0, update_lft = 0; @@ -2430,6 +2428,7 @@ static int addrconf_prefix_rcv_add_addr(struct net *net, return 0; } +EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 2f4afd17954b..fe65cdc28a45 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -73,15 +73,6 @@ #include #include -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) \ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); @@ -150,8 +141,8 @@ struct neigh_table nd_tbl = { }; EXPORT_SYMBOL_GPL(nd_tbl); -static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, - int data_len, int pad) +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, + int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); @@ -170,6 +161,7 @@ static void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, if (space > 0) memset(opt, 0, space); } +EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, u8 icmp6_type) -- cgit v1.2.3 From bbe5f5cefe2818eda0392c178de141ffc5734d90 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 15 Jun 2016 21:20:25 +0200 Subject: 6lowpan: introduce 6lowpan-nd This patch introduce different 6lowpan handling for receive and transmit NS/NA messages for the ipv6 neighbour discovery. The first use-case is for supporting 802.15.4 short addresses inside the option fields and handling for RFC6775 6CO option field as userspace option. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Reviewed-by: Stefan Schmidt Acked-by: YOSHIFUJI Hideaki Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/ndisc.h | 18 ++-- net/6lowpan/6lowpan_i.h | 4 + net/6lowpan/Makefile | 2 +- net/6lowpan/core.c | 4 +- net/6lowpan/ndisc.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 254 insertions(+), 8 deletions(-) create mode 100644 net/6lowpan/ndisc.c (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 3f0f41ddbeb0..be1fe2283254 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -35,6 +35,7 @@ enum { ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ ND_OPT_DNSSL = 31, /* RFC6106 */ + ND_OPT_6CO = 34, /* RFC6775 */ __ND_OPT_MAX }; @@ -109,14 +110,19 @@ struct ndisc_options { #endif struct nd_opt_hdr *nd_useropts; struct nd_opt_hdr *nd_useropts_end; +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1]; +#endif }; -#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] -#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] -#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] -#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] -#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] -#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] +#define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index 97ecc27aeca6..a67caee11929 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -12,6 +12,10 @@ static inline bool lowpan_is_ll(const struct net_device *dev, return lowpan_dev(dev)->lltype == lltype; } +extern const struct ndisc_ops lowpan_ndisc_ops; + +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev); + #ifdef CONFIG_6LOWPAN_DEBUGFS int lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile index e44f3bf2dd42..12d131ab2324 100644 --- a/net/6lowpan/Makefile +++ b/net/6lowpan/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_6LOWPAN) += 6lowpan.o -6lowpan-y := core.o iphc.o nhc.o +6lowpan-y := core.o iphc.o nhc.o ndisc.o 6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o #rfc6282 nhcs diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 1c7a42b48524..5945f7e19c67 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -34,6 +34,8 @@ int lowpan_register_netdevice(struct net_device *dev, for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; + dev->ndisc_ops = &lowpan_ndisc_ops; + ret = register_netdevice(dev); if (ret < 0) return ret; @@ -73,7 +75,7 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); -static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) { struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c new file mode 100644 index 000000000000..ae1d4199aa4c --- /dev/null +++ b/net/6lowpan/ndisc.c @@ -0,0 +1,234 @@ +/* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: + * (C) 2016 Pengutronix, Alexander Aring + */ + +#include +#include +#include + +#include "6lowpan_i.h" + +static int lowpan_ndisc_is_useropt(u8 nd_opt_type) +{ + return nd_opt_type == ND_OPT_6CO; +} + +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) +#define NDISC_802154_SHORT_ADDR_LENGTH 1 +static int lowpan_ndisc_parse_802154_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + switch (nd_opt->nd_opt_len) { + case NDISC_802154_SHORT_ADDR_LENGTH: + if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type]) + ND_PRINTK(2, warn, + "%s: duplicated short addr ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); + else + ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt; + return 1; + default: + /* all others will be handled by ndisc IPv6 option parsing */ + return 0; + } +} + +static int lowpan_ndisc_parse_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts); + default: + return 0; + } +} + +static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, + u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + u8 *lladdr_short = NULL; + + switch (icmp6_type) { + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + if (ndopts->nd_802154_opts_src_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + case NDISC_REDIRECT: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + if (ndopts->nd_802154_opts_tgt_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + default: + break; + } + + write_lock_bh(&n->lock); + if (lladdr_short) + ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); + else + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + write_unlock_bh(&n->lock); +} + +static void lowpan_ndisc_update(const struct net_device *dev, + struct neighbour *n, u32 flags, u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + /* react on overrides only. TODO check if this is really right. */ + if (flags & NEIGH_UPDATE_F_OVERRIDE) + lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts); +} + +static int lowpan_ndisc_opt_addr_space(const struct net_device *dev, + u8 icmp6_type, struct neighbour *neigh, + u8 *ha_buf, u8 **ha) +{ + struct lowpan_802154_neigh *n; + struct wpan_dev *wpan_dev; + int addr_space = 0; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + switch (icmp6_type) { + case NDISC_REDIRECT: + n = lowpan_802154_neigh(neighbour_priv(neigh)); + + read_lock_bh(&neigh->lock); + if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) { + memcpy(ha_buf, &n->short_addr, + IEEE802154_SHORT_ADDR_LEN); + read_unlock_bh(&neigh->lock); + addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + *ha = ha_buf; + } + read_unlock_bh(&neigh->lock); + break; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_ROUTER_SOLICITATION: + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + break; + default: + break; + } + + return addr_space; +} + +static void lowpan_ndisc_fill_addr_option(const struct net_device *dev, + struct sk_buff *skb, u8 icmp6_type, + const u8 *ha) +{ + struct wpan_dev *wpan_dev; + __be16 short_addr; + u8 opt_type; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + switch (icmp6_type) { + case NDISC_REDIRECT: + if (ha) { + ieee802154_le16_to_be16(&short_addr, ha); + __ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, + &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } + return; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + opt_type = ND_OPT_TARGET_LL_ADDR; + break; + case NDISC_ROUTER_SOLICITATION: + case NDISC_NEIGHBOUR_SOLICITATION: + opt_type = ND_OPT_SOURCE_LL_ADDR; + break; + default: + return; + } + + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) { + ieee802154_le16_to_be16(&short_addr, + &wpan_dev->short_addr); + __ndisc_fill_addr_option(skb, opt_type, &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } +} + +static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net, + struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, + u32 prefered_lft, + bool dev_addr_generated) +{ + int err; + + /* generates short based address for RA PIO's */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated && + !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) { + err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + addr, addr_type, addr_flags, + sllao, tokenized, valid_lft, + prefered_lft); + if (err) + ND_PRINTK(2, warn, + "RA: could not add a short address based address for prefix: %pI6c\n", + &pinfo->prefix); + } +} +#endif + +const struct ndisc_ops lowpan_ndisc_ops = { + .is_useropt = lowpan_ndisc_is_useropt, +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + .parse_options = lowpan_ndisc_parse_options, + .update = lowpan_ndisc_update, + .opt_addr_space = lowpan_ndisc_opt_addr_space, + .fill_addr_option = lowpan_ndisc_fill_addr_option, + .prefix_rcv_add_addr = lowpan_ndisc_prefix_rcv_add_addr, +#endif +}; -- cgit v1.2.3 From 22a59be8b7693eb2d0897a9638f5991f2f8e4ddd Mon Sep 17 00:00:00 2001 From: Philip Prindeville Date: Tue, 14 Jun 2016 15:53:02 -0600 Subject: net: ipv4: Add ability to have GRE ignore DF bit in IPv4 payloads In the presence of firewalls which improperly block ICMP Unreachable (including Fragmentation Required) messages, Path MTU Discovery is prevented from working. A workaround is to handle IPv4 payloads opaquely, ignoring the DF bit--as is done for other payloads like AppleTalk--and doing transparent fragmentation and reassembly. Redux includes the enforcement of mutual exclusion between this feature and Path MTU Discovery as suggested by Alexander Duyck. Cc: Alexander Duyck Reviewed-by: Stephen Hemminger Signed-off-by: Philip Prindeville Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 1 + include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_gre.c | 42 +++++++++++++++++++++++++++++++++--------- net/ipv4/ip_tunnel.c | 2 +- 4 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index dbf444428437..9222678426a1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -132,6 +132,7 @@ struct ip_tunnel { int ip_tnl_net_id; struct gro_cells gro_cells; bool collect_md; + bool ignore_df; }; #define TUNNEL_CSUM __cpu_to_be16(0x01) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index af4de90ba27d..1046f5515174 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -113,6 +113,7 @@ enum { IFLA_GRE_ENCAP_SPORT, IFLA_GRE_ENCAP_DPORT, IFLA_GRE_COLLECT_METADATA, + IFLA_GRE_IGNORE_DF, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4d2025f7ec57..0f8ca3fca00a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -841,17 +841,19 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct net_device *dev, +static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm *parms) { + struct ip_tunnel *t = netdev_priv(dev); + memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) - return; + return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); @@ -880,16 +882,26 @@ static void ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); - if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { + if (t->ignore_df) + return -EINVAL; parms->iph.frag_off = htons(IP_DF); + } if (data[IFLA_GRE_COLLECT_METADATA]) { - struct ip_tunnel *t = netdev_priv(dev); - t->collect_md = true; if (dev->type == ARPHRD_IPGRE) dev->type = ARPHRD_NONE; } + + if (data[IFLA_GRE_IGNORE_DF]) { + if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) + && (parms->iph.frag_off & htons(IP_DF))) + return -EINVAL; + t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); + } + + return 0; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -960,16 +972,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_newlink(dev, tb, &p); } @@ -978,16 +993,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_changelink(dev, tb, &p); } @@ -1024,6 +1042,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_GRE_IGNORE_DF */ + nla_total_size(1) + 0; } @@ -1057,6 +1077,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) + goto nla_put_failure; + if (t->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; @@ -1084,6 +1107,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d8f5e0a269f5..95649ebd2874 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) -- cgit v1.2.3 From 86a98057256020e75e1be0f88d7617491a06e8f1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:20:44 -0700 Subject: vxlan/geneve: Include udp_tunnel.h in vxlan/geneve.h and fixup includes This patch makes it so that we add udp_tunnel.h to vxlan.h and geneve.h header files. This is useful as I plan to move the generic handlers for the port offloads into the udp_tunnel header file and leave the vxlan and geneve headers to be a bit more protocol specific. I also went through and cleaned out a number of redundant includes that where in the .h and .c files for these drivers. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/geneve.c | 1 - drivers/net/vxlan.c | 17 ----------------- include/net/geneve.h | 3 --- include/net/udp_tunnel.h | 2 ++ include/net/vxlan.h | 6 +----- 5 files changed, 3 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index cadefe4fdaa2..e5e33cd01082 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f999db2f97b4..10ad41d60652 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -11,32 +11,18 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include -#include -#include -#include -#include -#include #include #include -#include #include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include -#include #include #include #include @@ -44,12 +30,9 @@ #include #if IS_ENABLED(CONFIG_IPV6) -#include -#include #include #include #endif -#include #define VXLAN_VERSION "0.1" diff --git a/include/net/geneve.h b/include/net/geneve.h index cb544a530146..f8aff18d6702 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -1,10 +1,7 @@ #ifndef __NET_GENEVE_H #define __NET_GENEVE_H 1 -#ifdef CONFIG_INET #include -#endif - /* Geneve Header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 9d14f707e534..59019562d14c 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -105,12 +105,14 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, __be16 flags, __be64 tunnel_id, int md_size); +#ifdef CONFIG_INET static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; return iptunnel_handle_offloads(skb, type); } +#endif static inline void udp_tunnel_encap_enable(struct socket *sock) { diff --git a/include/net/vxlan.h b/include/net/vxlan.h index b8803165df91..7d944941f32f 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -1,12 +1,8 @@ #ifndef __NET_VXLAN_H #define __NET_VXLAN_H 1 -#include -#include #include -#include -#include -#include +#include #include /* VXLAN protocol (RFC 7348) header: -- cgit v1.2.3 From e7b3db5e60e8f471c3f5ef93b497bafe5863e56a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:20:52 -0700 Subject: net: Combine GENEVE and VXLAN port notifiers into single functions This patch merges the GENEVE and VXLAN code so that both functions pass through a shared code path. This way we can start the effort of using a single function on the network device drivers to handle both of these tunnel types. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/geneve.c | 58 ++++----------------------- drivers/net/vxlan.c | 60 +++++----------------------- include/net/udp_tunnel.h | 33 +++++++++++++++ net/ipv4/udp_tunnel.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+), 102 deletions(-) (limited to 'include/net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index e5e33cd01082..af6f676ca444 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -396,23 +396,6 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6, return sock; } -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct net_device *dev; - struct sock *sk = gs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = geneve_get_sk_family(gs); - __be16 port = inet_sk(sk)->inet_sport; - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->netdev_ops->ndo_add_geneve_port) - dev->netdev_ops->ndo_add_geneve_port(dev, sa_family, - port); - } - rcu_read_unlock(); -} - static int geneve_hlen(struct genevehdr *gh) { return sizeof(*gh) + gh->opt_len * 4; @@ -532,7 +515,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, INIT_HLIST_HEAD(&gs->vni_list[h]); /* Initialize the geneve udp offloads structure */ - geneve_notify_add_rx_port(gs); + udp_tunnel_notify_add_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); @@ -547,31 +530,13 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, return gs; } -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct net_device *dev; - struct sock *sk = gs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = geneve_get_sk_family(gs); - __be16 port = inet_sk(sk)->inet_sport; - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->netdev_ops->ndo_del_geneve_port) - dev->netdev_ops->ndo_del_geneve_port(dev, sa_family, - port); - } - - rcu_read_unlock(); -} - static void __geneve_sock_release(struct geneve_sock *gs) { if (!gs || --gs->refcnt) return; list_del(&gs->list); - geneve_notify_del_rx_port(gs); + udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE); udp_tunnel_sock_release(gs->sock); kfree_rcu(gs, rcu); } @@ -1164,29 +1129,20 @@ static struct device_type geneve_type = { .name = "geneve", }; -/* Calls the ndo_add_geneve_port of the caller in order to +/* Calls the ndo_add_udp_enc_port of the caller in order to * supply the listening GENEVE udp ports. Callers are expected - * to implement the ndo_add_geneve_port. + * to implement the ndo_add_udp_enc_port. */ static void geneve_push_rx_ports(struct net_device *dev) { struct net *net = dev_net(dev); struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; - sa_family_t sa_family; - struct sock *sk; - __be16 port; - - if (!dev->netdev_ops->ndo_add_geneve_port) - return; rcu_read_lock(); - list_for_each_entry_rcu(gs, &gn->sock_list, list) { - sk = gs->sock->sk; - sa_family = sk->sk_family; - port = inet_sk(sk)->inet_sport; - dev->netdev_ops->ndo_add_geneve_port(dev, sa_family, port); - } + list_for_each_entry_rcu(gs, &gn->sock_list, list) + udp_tunnel_push_rx_port(dev, gs->sock, + UDP_TUNNEL_TYPE_GENEVE); rcu_read_unlock(); } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 10ad41d60652..adbd979da22d 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -602,42 +602,6 @@ static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } -/* Notify netdevs that UDP port started listening */ -static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) -{ - struct net_device *dev; - struct sock *sk = vs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = vxlan_get_sk_family(vs); - __be16 port = inet_sk(sk)->inet_sport; - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->netdev_ops->ndo_add_vxlan_port) - dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, - port); - } - rcu_read_unlock(); -} - -/* Notify netdevs that UDP port is no more listening */ -static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) -{ - struct net_device *dev; - struct sock *sk = vs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = vxlan_get_sk_family(vs); - __be16 port = inet_sk(sk)->inet_sport; - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->netdev_ops->ndo_del_vxlan_port) - dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family, - port); - } - rcu_read_unlock(); -} - /* Add new entry to forwarding table -- assumes lock held */ static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, @@ -1033,7 +997,8 @@ static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); - vxlan_notify_del_rx_port(vs); + udp_tunnel_notify_del_rx_port(vs->sock, + UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); return true; @@ -2508,30 +2473,22 @@ static struct device_type vxlan_type = { .name = "vxlan", }; -/* Calls the ndo_add_vxlan_port of the caller in order to +/* Calls the ndo_add_udp_enc_port of the caller in order to * supply the listening VXLAN udp ports. Callers are expected - * to implement the ndo_add_vxlan_port. + * to implement the ndo_add_udp_enc_port. */ static void vxlan_push_rx_ports(struct net_device *dev) { struct vxlan_sock *vs; struct net *net = dev_net(dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); - sa_family_t sa_family; - __be16 port; unsigned int i; - if (!dev->netdev_ops->ndo_add_vxlan_port) - return; - spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { - hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { - port = inet_sk(vs->sock->sk)->inet_sport; - sa_family = vxlan_get_sk_family(vs); - dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, - port); - } + hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) + udp_tunnel_push_rx_port(dev, vs->sock, + UDP_TUNNEL_TYPE_VXLAN); } spin_unlock(&vn->sock_lock); } @@ -2733,7 +2690,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); - vxlan_notify_add_rx_port(vs); + udp_tunnel_notify_add_rx_port(sock, + UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); /* Mark socket as an encapsulation socket. */ diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 59019562d14c..71afbea873a0 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -84,6 +84,39 @@ struct udp_tunnel_sock_cfg { void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *sock_cfg); +/* -- List of parsable UDP tunnel types -- + * + * Adding to this list will result in serious debate. The main issue is + * that this list is essentially a list of workarounds for either poorly + * designed tunnels, or poorly designed device offloads. + * + * The parsing supported via these types should really be used for Rx + * traffic only as the network stack will have already inserted offsets for + * the location of the headers in the skb. In addition any ports that are + * pushed should be kept within the namespace without leaking to other + * devices such as VFs or other ports on the same device. + * + * It is strongly encouraged to use CHECKSUM_COMPLETE for Rx to avoid the + * need to use this for Rx checksum offload. It should not be necessary to + * call this function to perform Tx offloads on outgoing traffic. + */ +enum udp_parsable_tunnel_type { + UDP_TUNNEL_TYPE_VXLAN, /* RFC 7348 */ + UDP_TUNNEL_TYPE_GENEVE, /* draft-ietf-nvo3-geneve */ +}; + +struct udp_tunnel_info { + unsigned short type; + sa_family_t sa_family; + __be16 port; +}; + +/* Notify network devices of offloadable types */ +void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type); +void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); +void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); + /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 47f12c73d959..8174753e6494 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -76,6 +76,108 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); +static void __udp_tunnel_push_rx_port(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (!dev->netdev_ops->ndo_add_vxlan_port) + break; + + dev->netdev_ops->ndo_add_vxlan_port(dev, + ti->sa_family, + ti->port); + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!dev->netdev_ops->ndo_add_geneve_port) + break; + + dev->netdev_ops->ndo_add_geneve_port(dev, + ti->sa_family, + ti->port); + break; + default: + break; + } +} + +void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + __udp_tunnel_push_rx_port(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); + +/* Notify netdevs that UDP port started listening */ +void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + __udp_tunnel_push_rx_port(dev, &ti); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); + +static void __udp_tunnel_pull_rx_port(struct net_device *dev, + struct udp_tunnel_info *ti) +{ + switch (ti->type) { + case UDP_TUNNEL_TYPE_VXLAN: + if (!dev->netdev_ops->ndo_del_vxlan_port) + break; + + dev->netdev_ops->ndo_del_vxlan_port(dev, + ti->sa_family, + ti->port); + break; + case UDP_TUNNEL_TYPE_GENEVE: + if (!dev->netdev_ops->ndo_del_geneve_port) + break; + + dev->netdev_ops->ndo_del_geneve_port(dev, + ti->sa_family, + ti->port); + break; + default: + break; + } +} + +/* Notify netdevs that UDP port is no more listening */ +void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + __udp_tunnel_pull_rx_port(dev, &ti); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); + void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, -- cgit v1.2.3 From 7c46a640de6fcc4f35d0702710356a024eadf68f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:21:00 -0700 Subject: net: Merge VXLAN and GENEVE push notifiers into a single notifier This patch merges the notifiers for VXLAN and GENEVE into a single UDP tunnel notifier. The idea is that we will want to only have to make one notifier call to receive the list of ports for VXLAN and GENEVE tunnels that need to be offloaded. In addition we add a new set of ndo functions named ndo_udp_tunnel_add and ndo_udp_tunnel_del that are meant to allow us to track the tunnel meta-data such as port and address family as tunnels are added and removed. The tunnel meta-data is now transported in a structure named udp_tunnel_info which for now carries the type, address family, and port number. In the future this could be updated so that we can include a tuple of values including things such as the destination IP address and other fields. I also ended up going with a naming scheme that consisted of using the prefix udp_tunnel on function names. I applied this to the notifier and ndo ops as well so that it hopefully points to the fact that these are primarily used in the udp_tunnel functions. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 2 +- include/linux/netdevice.h | 22 ++++++++++++++++++++-- include/net/geneve.h | 3 +-- include/net/udp_tunnel.h | 6 ++++++ include/net/vxlan.h | 4 ++-- net/ipv4/udp_tunnel.c | 10 ++++++++++ 7 files changed, 41 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index af6f676ca444..aa61708bea69 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1497,7 +1497,7 @@ static int geneve_netdevice_event(struct notifier_block *unused, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (event == NETDEV_OFFLOAD_PUSH_GENEVE) + if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) geneve_push_rx_ports(dev); return NOTIFY_DONE; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index adbd979da22d..31aeec967175 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3239,7 +3239,7 @@ static int vxlan_netdevice_event(struct notifier_block *unused, if (event == NETDEV_UNREGISTER) vxlan_handle_lowerdev_unregister(vn, dev); - else if (event == NETDEV_OFFLOAD_PUSH_VXLAN) + else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) vxlan_push_rx_ports(dev); return NOTIFY_DONE; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 890158e99159..577d2a1814b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -61,6 +61,8 @@ struct wireless_dev; /* 802.15.4 specific */ struct wpan_dev; struct mpls_dev; +/* UDP Tunnel offloads */ +struct udp_tunnel_info; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); @@ -1050,6 +1052,19 @@ struct tc_to_netdev { * address family that vxlan is not listening to anymore. The operation * is protected by the vxlan_net->sock_lock. * + * void (*ndo_udp_tunnel_add)(struct net_device *dev, + * struct udp_tunnel_info *ti); + * Called by UDP tunnel to notify a driver about the UDP port and socket + * address family that a UDP tunnel is listnening to. It is called only + * when a new port starts listening. The operation is protected by the + * RTNL. + * + * void (*ndo_udp_tunnel_del)(struct net_device *dev, + * struct udp_tunnel_info *ti); + * Called by UDP tunnel to notify the driver about a UDP port and socket + * address family that the UDP tunnel is not listening to anymore. The + * operation is protected by the RTNL. + * * void* (*ndo_dfwd_add_station)(struct net_device *pdev, * struct net_device *dev) * Called by upper layer devices to accelerate switching or other @@ -1269,6 +1284,10 @@ struct net_device_ops { void (*ndo_del_geneve_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); + void (*ndo_udp_tunnel_add)(struct net_device *dev, + struct udp_tunnel_info *ti); + void (*ndo_udp_tunnel_del)(struct net_device *dev, + struct udp_tunnel_info *ti); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, @@ -2255,8 +2274,7 @@ struct netdev_lag_lower_state_info { #define NETDEV_BONDING_INFO 0x0019 #define NETDEV_PRECHANGEUPPER 0x001A #define NETDEV_CHANGELOWERSTATE 0x001B -#define NETDEV_OFFLOAD_PUSH_VXLAN 0x001C -#define NETDEV_OFFLOAD_PUSH_GENEVE 0x001D +#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); diff --git a/include/net/geneve.h b/include/net/geneve.h index f8aff18d6702..3410c4b5a382 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -61,8 +61,7 @@ struct genevehdr { static inline void geneve_get_rx_port(struct net_device *netdev) { - ASSERT_RTNL(); - call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_GENEVE, netdev); + udp_tunnel_get_rx_info(netdev); } #ifdef CONFIG_INET diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 71afbea873a0..1c9408a04213 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -117,6 +117,12 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); +static inline void udp_tunnel_get_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); +} + /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 7d944941f32f..c62e2ed1c3af 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -4,6 +4,7 @@ #include #include #include +#include /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -390,8 +391,7 @@ static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) static inline void vxlan_get_rx_port(struct net_device *netdev) { - ASSERT_RTNL(); - call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_VXLAN, netdev); + udp_tunnel_get_rx_info(netdev); } static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 8174753e6494..683e494d9000 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -79,6 +79,11 @@ EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); static void __udp_tunnel_push_rx_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (dev->netdev_ops->ndo_udp_tunnel_add) { + dev->netdev_ops->ndo_udp_tunnel_add(dev, ti); + return; + } + switch (ti->type) { case UDP_TUNNEL_TYPE_VXLAN: if (!dev->netdev_ops->ndo_add_vxlan_port) @@ -137,6 +142,11 @@ EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); static void __udp_tunnel_pull_rx_port(struct net_device *dev, struct udp_tunnel_info *ti) { + if (dev->netdev_ops->ndo_udp_tunnel_del) { + dev->netdev_ops->ndo_udp_tunnel_del(dev, ti); + return; + } + switch (ti->type) { case UDP_TUNNEL_TYPE_VXLAN: if (!dev->netdev_ops->ndo_del_vxlan_port) -- cgit v1.2.3 From 1938ee1fd3de74d761a60806b048df652666afec Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:23:12 -0700 Subject: net: Remove deprecated tunnel specific UDP offload functions Now that we have all the drivers using udp_tunnel_get_rx_ports, ndo_add_udp_enc_rx_port, and ndo_del_udp_enc_rx_port we can drop the function calls that were specific to VXLAN and GENEVE. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 38 ----------------------- include/net/geneve.h | 5 --- include/net/vxlan.h | 5 --- net/ipv4/udp_tunnel.c | 79 +++++++++-------------------------------------- 4 files changed, 14 insertions(+), 113 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 577d2a1814b1..e84d9d23c2d5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1026,32 +1026,6 @@ struct tc_to_netdev { * not implement this, it is assumed that the hw is not able to have * multiple net devices on single physical port. * - * void (*ndo_add_vxlan_port)(struct net_device *dev, - * sa_family_t sa_family, __be16 port); - * Called by vxlan to notify a driver about the UDP port and socket - * address family that vxlan is listening to. It is called only when - * a new port starts listening. The operation is protected by the - * vxlan_net->sock_lock. - * - * void (*ndo_add_geneve_port)(struct net_device *dev, - * sa_family_t sa_family, __be16 port); - * Called by geneve to notify a driver about the UDP port and socket - * address family that geneve is listnening to. It is called only when - * a new port starts listening. The operation is protected by the - * geneve_net->sock_lock. - * - * void (*ndo_del_geneve_port)(struct net_device *dev, - * sa_family_t sa_family, __be16 port); - * Called by geneve to notify the driver about a UDP port and socket - * address family that geneve is not listening to anymore. The operation - * is protected by the geneve_net->sock_lock. - * - * void (*ndo_del_vxlan_port)(struct net_device *dev, - * sa_family_t sa_family, __be16 port); - * Called by vxlan to notify the driver about a UDP port and socket - * address family that vxlan is not listening to anymore. The operation - * is protected by the vxlan_net->sock_lock. - * * void (*ndo_udp_tunnel_add)(struct net_device *dev, * struct udp_tunnel_info *ti); * Called by UDP tunnel to notify a driver about the UDP port and socket @@ -1272,18 +1246,6 @@ struct net_device_ops { struct netdev_phys_item_id *ppid); int (*ndo_get_phys_port_name)(struct net_device *dev, char *name, size_t len); - void (*ndo_add_vxlan_port)(struct net_device *dev, - sa_family_t sa_family, - __be16 port); - void (*ndo_del_vxlan_port)(struct net_device *dev, - sa_family_t sa_family, - __be16 port); - void (*ndo_add_geneve_port)(struct net_device *dev, - sa_family_t sa_family, - __be16 port); - void (*ndo_del_geneve_port)(struct net_device *dev, - sa_family_t sa_family, - __be16 port); void (*ndo_udp_tunnel_add)(struct net_device *dev, struct udp_tunnel_info *ti); void (*ndo_udp_tunnel_del)(struct net_device *dev, diff --git a/include/net/geneve.h b/include/net/geneve.h index 3410c4b5a382..ec0327d4331b 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -59,11 +59,6 @@ struct genevehdr { struct geneve_opt options[]; }; -static inline void geneve_get_rx_port(struct net_device *netdev) -{ - udp_tunnel_get_rx_info(netdev); -} - #ifdef CONFIG_INET struct net_device *geneve_dev_create_fb(struct net *net, const char *name, u8 name_assign_type, u16 dst_port); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index c62e2ed1c3af..b96d0360c095 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -389,11 +389,6 @@ static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) return vni_field; } -static inline void vxlan_get_rx_port(struct net_device *netdev) -{ - udp_tunnel_get_rx_info(netdev); -} - static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) { return vs->sock->sk->sk_family; diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 683e494d9000..58bd39fb14b4 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -76,47 +76,20 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); -static void __udp_tunnel_push_rx_port(struct net_device *dev, - struct udp_tunnel_info *ti) -{ - if (dev->netdev_ops->ndo_udp_tunnel_add) { - dev->netdev_ops->ndo_udp_tunnel_add(dev, ti); - return; - } - - switch (ti->type) { - case UDP_TUNNEL_TYPE_VXLAN: - if (!dev->netdev_ops->ndo_add_vxlan_port) - break; - - dev->netdev_ops->ndo_add_vxlan_port(dev, - ti->sa_family, - ti->port); - break; - case UDP_TUNNEL_TYPE_GENEVE: - if (!dev->netdev_ops->ndo_add_geneve_port) - break; - - dev->netdev_ops->ndo_add_geneve_port(dev, - ti->sa_family, - ti->port); - break; - default: - break; - } -} - void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; + if (!dev->netdev_ops->ndo_udp_tunnel_add) + return; + ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - __udp_tunnel_push_rx_port(dev, &ti); + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); @@ -133,42 +106,15 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); - for_each_netdev_rcu(net, dev) - __udp_tunnel_push_rx_port(dev, &ti); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_add) + continue; + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); -static void __udp_tunnel_pull_rx_port(struct net_device *dev, - struct udp_tunnel_info *ti) -{ - if (dev->netdev_ops->ndo_udp_tunnel_del) { - dev->netdev_ops->ndo_udp_tunnel_del(dev, ti); - return; - } - - switch (ti->type) { - case UDP_TUNNEL_TYPE_VXLAN: - if (!dev->netdev_ops->ndo_del_vxlan_port) - break; - - dev->netdev_ops->ndo_del_vxlan_port(dev, - ti->sa_family, - ti->port); - break; - case UDP_TUNNEL_TYPE_GENEVE: - if (!dev->netdev_ops->ndo_del_geneve_port) - break; - - dev->netdev_ops->ndo_del_geneve_port(dev, - ti->sa_family, - ti->port); - break; - default: - break; - } -} - /* Notify netdevs that UDP port is no more listening */ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) { @@ -182,8 +128,11 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); - for_each_netdev_rcu(net, dev) - __udp_tunnel_pull_rx_port(dev, &ti); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_del) + continue; + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); -- cgit v1.2.3 From b9adcd69bd7b41625201686b4cfec7ff13357afc Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 16 Jun 2016 12:23:19 -0700 Subject: vxlan: Add new UDP encapsulation offload type for VXLAN-GPE The fact is VXLAN with Generic Protocol Extensions cannot be supported by the same hardware parsers that support VXLAN. The protocol extensions allow for things like a Next Protocol field which in turn allows for things other than Ethernet to be passed over the tunnel. Most existing parsers will not know how to interpret this. To resolve this I am giving VXLAN-GPE its own UDP encapsulation offload type. This way hardware that does support GPE can simply add this type to the switch statement for VXLAN, and if they don't support it then this will fix any issues where headers might be interpreted incorrectly. Signed-off-by: Alexander Duyck Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 ++++++ include/net/udp_tunnel.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 31aeec967175..abb9cd2df9e9 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -998,6 +998,8 @@ static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); udp_tunnel_notify_del_rx_port(vs->sock, + (vs->flags & VXLAN_F_GPE) ? + UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); @@ -2488,6 +2490,8 @@ static void vxlan_push_rx_ports(struct net_device *dev) for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) udp_tunnel_push_rx_port(dev, vs->sock, + (vs->flags & VXLAN_F_GPE) ? + UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); } spin_unlock(&vn->sock_lock); @@ -2691,6 +2695,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); udp_tunnel_notify_add_rx_port(sock, + (vs->flags & VXLAN_F_GPE) ? + UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 1c9408a04213..02c5be037451 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -103,6 +103,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, enum udp_parsable_tunnel_type { UDP_TUNNEL_TYPE_VXLAN, /* RFC 7348 */ UDP_TUNNEL_TYPE_GENEVE, /* draft-ietf-nvo3-geneve */ + UDP_TUNNEL_TYPE_VXLAN_GPE, /* draft-ietf-nvo3-vxlan-gpe */ }; struct udp_tunnel_info { -- cgit v1.2.3 From a2e2ff560f5113e8ca31432fbd985f5f1889efdc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:24 -0700 Subject: net: ipv6: Move ip6_route_get_saddr to inline VRF driver needs access to ip6_route_get_saddr code. Since it does little beyond ipv6_dev_get_saddr and ipv6_dev_get_saddr is already exported for modules move ip6_route_get_saddr to the header as an inline. Code move only; no functional change. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 21 ++++++++++++++++++--- net/ipv6/route.c | 17 ----------------- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f55bf3d294aa..d97305d0e71f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -18,6 +18,7 @@ struct route_info { __u8 prefix[0]; /* 0,8 or 16 */ }; +#include #include #include #include @@ -88,9 +89,23 @@ int ip6_route_add(struct fib6_config *cfg); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); -int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, - const struct in6_addr *daddr, unsigned int prefs, - struct in6_addr *saddr); +static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr) +{ + struct inet6_dev *idev = + rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; + int err = 0; + + if (rt && rt->rt6i_prefsrc.plen) + *saddr = rt->rt6i_prefsrc.addr; + else + err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, + daddr, prefs, saddr); + + return err; +} struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int flags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9e1516785dac..08b77f421268 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2586,23 +2586,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, return rt; } -int ip6_route_get_saddr(struct net *net, - struct rt6_info *rt, - const struct in6_addr *daddr, - unsigned int prefs, - struct in6_addr *saddr) -{ - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; - int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); - return err; -} - /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net_device *dev; -- cgit v1.2.3 From 0d240e7811c4ec1965760ee4643b5bbc9cfacbb3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:25 -0700 Subject: net: vrf: Implement get_saddr for IPv6 IPv6 source address selection needs to consider the real egress route. Similar to IPv4 implement a get_saddr6 method which is called if source address has not been set. The get_saddr6 method does a full lookup which means pulling a route from the VRF FIB table and properly considering linklocal/multicast destination addresses. Lookup failures (eg., unreachable) then cause the source address selection to fail which gets propagated back to the caller. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/net/l3mdev.h | 11 +++++++++++ net/ipv6/ip6_output.c | 12 ++++++++++-- net/l3mdev/l3mdev.c | 24 ++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 32173aa9208e..b3762822b653 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -999,6 +999,46 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, return dst; } + +/* called under rcu_read_lock */ +static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, + struct flowi6 *fl6) +{ + struct net *net = dev_net(dev); + struct dst_entry *dst; + struct rt6_info *rt; + int err; + + if (rt6_need_strict(&fl6->daddr)) { + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, + RT6_LOOKUP_F_IFACE); + if (unlikely(!rt)) + return 0; + + dst = &rt->dst; + } else { + __u8 flags = fl6->flowi6_flags; + + fl6->flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; + fl6->flowi6_flags |= FLOWI_FLAG_SKIP_NH_OIF; + + dst = ip6_route_output(net, sk, fl6); + rt = (struct rt6_info *)dst; + + fl6->flowi6_flags = flags; + } + + err = dst->error; + if (!err) { + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? inet6_sk(sk)->srcprefs : 0, + &fl6->saddr); + } + + dst_release(dst); + + return err; +} #endif static const struct l3mdev_ops vrf_l3mdev_ops = { @@ -1008,6 +1048,7 @@ static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_l3_rcv = vrf_l3_rcv, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_get_rt6_dst = vrf_get_rt6_dst, + .l3mdev_get_saddr6 = vrf_get_saddr6, #endif }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index f8a416ec674c..818fd4f100fc 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -39,6 +39,9 @@ struct l3mdev_ops { /* IPv6 ops */ struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, struct flowi6 *fl6); + int (*l3mdev_get_saddr6)(struct net_device *dev, + const struct sock *sk, + struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -140,6 +143,8 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); +int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -230,6 +235,12 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) return NULL; } +static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + return 0; +} + static inline struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fd3217579b65..1dfc402d9ad1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -910,6 +910,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; + if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && + (!*dst || !(*dst)->error)) { + err = l3mdev_get_saddr6(net, sk, fl6); + if (err) + goto out_err; + } + /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -999,10 +1006,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, return 0; out_err_release: - if (err == -ENETUNREACH) - IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); dst_release(*dst); *dst = NULL; +out_err: + if (err == -ENETUNREACH) + IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; } diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index d90e4ef09e85..c4a1c3e84e12 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -162,6 +162,30 @@ int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) } EXPORT_SYMBOL_GPL(l3mdev_get_saddr); +int l3mdev_get_saddr6(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + struct net_device *dev; + int rc = 0; + + if (fl6->flowi6_oif) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_saddr6) + rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); + + rcu_read_unlock(); + } + + return rc; +} +EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); + /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device -- cgit v1.2.3 From afbac6010aec514998214fb19a1f37732b7a1d77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 16 Jun 2016 16:24:26 -0700 Subject: net: ipv6: Address selection needs to consider L3 domains IPv6 version of 3f2fb9a834cb ("net: l3mdev: address selection should only consider devices in L3 domain") and the follow up commit, a17b693cdd876 ("net: l3mdev: prefer VRF master for source address selection"). That is, if outbound device is given then the address preference order is an address from that device, an address from the master device if it is enslaved, and then an address from a device in the same L3 domain. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 31 +++++++++++++++++++++++++++++++ net/ipv6/addrconf.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include/net') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 818fd4f100fc..e90095091aa0 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -79,6 +79,31 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) return rc; } +static inline +const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) +{ + /* netdev_master_upper_dev_get_rcu calls + * list_first_or_null_rcu to walk the upper dev list. + * list_first_or_null_rcu does not handle a const arg. We aren't + * making changes, just want the master device from that list so + * typecast to remove the const + */ + struct net_device *dev = (struct net_device *)_dev; + const struct net_device *master; + + if (!dev) + return NULL; + + if (netif_is_l3_master(dev)) + master = dev; + else if (netif_is_l3_slave(dev)) + master = netdev_master_upper_dev_get_rcu(dev); + else + master = NULL; + + return master; +} + /* get index of an interface to use for FIB lookups. For devices * enslaved to an L3 master device FIB lookups are based on the * master index @@ -190,6 +215,12 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) return 0; } +static inline +const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) +{ + return NULL; +} + static inline int l3mdev_fib_oif_rcu(struct net_device *dev) { return dev ? dev->ifindex : 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c8fc3f96b11..a1f6b7b31531 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1524,6 +1524,28 @@ out: return hiscore_idx; } +static int ipv6_get_saddr_master(struct net *net, + const struct net_device *dst_dev, + const struct net_device *master, + struct ipv6_saddr_dst *dst, + struct ipv6_saddr_score *scores, + int hiscore_idx) +{ + struct inet6_dev *idev; + + idev = __in6_dev_get(dst_dev); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + idev = __in6_dev_get(master); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev, + scores, hiscore_idx); + + return hiscore_idx; +} + int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -1577,13 +1599,39 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, if (idev) hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } else { + const struct net_device *master; + int master_idx = 0; + + /* if dst_dev exists and is enslaved to an L3 device, then + * prefer addresses from dst_dev and then the master over + * any other enslaved devices in the L3 domain. + */ + master = l3mdev_master_dev_rcu(dst_dev); + if (master) { + master_idx = master->ifindex; + + hiscore_idx = ipv6_get_saddr_master(net, dst_dev, + master, &dst, + scores, hiscore_idx); + + if (scores[hiscore_idx].ifa) + goto out; + } + for_each_netdev_rcu(net, dev) { + /* only consider addresses on devices in the + * same L3 domain + */ + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; idev = __in6_dev_get(dev); if (!idev) continue; hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } } + +out: rcu_read_unlock(); hiscore = &scores[hiscore_idx]; -- cgit v1.2.3 From 9b8c6d7bf2e08a7d3eb6660a2bfaf29b8b49c329 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Jun 2016 21:52:05 -0700 Subject: gre: better support for ICMP messages for gre+ipv6 ipgre_err() can call ip6_err_gen_icmpv6_unreach() for proper support of ipv4+gre+icmp+ipv6+... frames, used for example by traceroute/mtr. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 1 + net/ipv4/gre_demux.c | 1 + net/ipv4/ip_gre.c | 6 ++++++ 3 files changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9222678426a1..a5e7035fb93f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -157,6 +157,7 @@ struct tnl_ptk_info { __be16 proto; __be32 key; __be32 seq; + int hdr_len; }; #define PACKET_RCVD 0 diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 4c39f4fd332a..c4c3e439f424 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -117,6 +117,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if ((*(u8 *)options & 0xF0) != 0x40) hdr_len += 4; } + tpi->hdr_len = hdr_len; return hdr_len; } EXPORT_SYMBOL(gre_parse_header); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0f8ca3fca00a..ab4cff8e563d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -187,6 +187,12 @@ static void ipgre_err(struct sk_buff *skb, u32 info, if (!t) return; +#if IS_ENABLED(CONFIG_IPV6) + if (tpi->proto == htons(ETH_P_IPV6) && + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type)) + return; +#endif + if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) return; -- cgit v1.2.3 From 506e65df52f2bf250aa9b4264efd180d1646bdec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 10 Jun 2016 23:09:01 +0200 Subject: netfilter: make comparision helpers stub functions in ZONES=n case Those comparisions are useless in case of ZONES=n; all conntracks will reside in the same zone by definition. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_zones.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 4e32512cef32..bd4692690914 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -68,22 +68,34 @@ static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, enum ip_conntrack_dir dir) { +#ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_matches_dir(zone, dir) ? zone->id : NF_CT_DEFAULT_ZONE_ID; +#else + return NF_CT_DEFAULT_ZONE_ID; +#endif } static inline bool nf_ct_zone_equal(const struct nf_conn *a, const struct nf_conntrack_zone *b, enum ip_conntrack_dir dir) { +#ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_id(nf_ct_zone(a), dir) == nf_ct_zone_id(b, dir); +#else + return true; +#endif } static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, const struct nf_conntrack_zone *b) { +#ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone(a)->id == b->id; +#else + return true; +#endif } #endif /* IS_ENABLED(CONFIG_NF_CONNTRACK) */ #endif /* _NF_CONNTRACK_ZONES_H */ -- cgit v1.2.3 From 6c8dee9842461e6ee6eb46081478999b3d5cb297 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 11 Jun 2016 21:57:35 +0200 Subject: netfilter: move zone info into struct nf_conn Curently we store zone information as a conntrack extension. This has one drawback: for every lookup we need to fetch the zone data from the extension area. This change place the zone data directly into the main conntrack object structure and then removes the zone conntrack extension. The zone data is just 4 bytes, it fits into a padding hole before the tuplehash info, so we do not even increase the nf_conn structure size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +++ include/net/netfilter/nf_conntrack_extend.h | 4 ---- include/net/netfilter/nf_conntrack_zones.h | 33 ++++++++++------------------- net/netfilter/nf_conntrack_core.c | 33 ++--------------------------- 4 files changed, 16 insertions(+), 57 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index dd78bea227c8..9c0ed3d7af89 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -85,6 +85,9 @@ struct nf_conn { spinlock_t lock; u16 cpu; +#ifdef CONFIG_NF_CONNTRACK_ZONES + struct nf_conntrack_zone zone; +#endif /* XXX should I move this to the tail ? - Y.K */ /* These are my tuples; original and reply */ struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 55d15049ab2f..b925395fa5ed 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -15,9 +15,6 @@ enum nf_ct_ext_id { #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, #endif -#ifdef CONFIG_NF_CONNTRACK_ZONES - NF_CT_EXT_ZONE, -#endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_CT_EXT_TSTAMP, #endif @@ -38,7 +35,6 @@ enum nf_ct_ext_id { #define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj #define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache -#define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone #define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index bd4692690914..64a718b60839 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -9,12 +9,11 @@ static inline const struct nf_conntrack_zone * nf_ct_zone(const struct nf_conn *ct) { - const struct nf_conntrack_zone *nf_ct_zone = NULL; - #ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_zone = nf_ct_ext_find(ct, NF_CT_EXT_ZONE); + return &ct->zone; +#else + return &nf_ct_zone_dflt; #endif - return nf_ct_zone ? nf_ct_zone : &nf_ct_zone_dflt; } static inline const struct nf_conntrack_zone * @@ -31,32 +30,22 @@ static inline const struct nf_conntrack_zone * nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb, struct nf_conntrack_zone *tmp) { - const struct nf_conntrack_zone *zone; - +#ifdef CONFIG_NF_CONNTRACK_ZONES if (!tmpl) return &nf_ct_zone_dflt; - zone = nf_ct_zone(tmpl); - if (zone->flags & NF_CT_FLAG_MARK) - zone = nf_ct_zone_init(tmp, skb->mark, zone->dir, 0); - - return zone; + if (tmpl->zone.flags & NF_CT_FLAG_MARK) + return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0); +#endif + return nf_ct_zone(tmpl); } -static inline int nf_ct_zone_add(struct nf_conn *ct, gfp_t flags, - const struct nf_conntrack_zone *info) +static inline void nf_ct_zone_add(struct nf_conn *ct, + const struct nf_conntrack_zone *zone) { #ifdef CONFIG_NF_CONNTRACK_ZONES - struct nf_conntrack_zone *nf_ct_zone; - - nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, flags); - if (!nf_ct_zone) - return -ENOMEM; - - nf_ct_zone_init(nf_ct_zone, info->id, info->dir, - info->flags); + ct->zone = *zone; #endif - return 0; } static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2903bb43547c..a459176c3253 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -327,16 +327,10 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); - - if (nf_ct_zone_add(tmpl, flags, zone) < 0) - goto out_free; - + nf_ct_zone_add(tmpl, zone); atomic_set(&tmpl->ct_general.use, 0); return tmpl; -out_free: - kfree(tmpl); - return NULL; } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); @@ -929,16 +923,13 @@ __nf_conntrack_alloc(struct net *net, offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, __nfct_init_offset[0])); - if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) - goto out_free; + nf_ct_zone_add(ct, zone); /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ atomic_set(&ct->ct_general.use, 0); return ct; -out_free: - kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -1342,14 +1333,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); -#ifdef CONFIG_NF_CONNTRACK_ZONES -static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { - .len = sizeof(struct nf_conntrack_zone), - .align = __alignof__(struct nf_conntrack_zone), - .id = NF_CT_EXT_ZONE, -}; -#endif - #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include @@ -1532,9 +1515,6 @@ void nf_conntrack_cleanup_end(void) nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); -#ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_extend_unregister(&nf_ct_zone_extend); -#endif nf_conntrack_proto_fini(); nf_conntrack_seqadj_fini(); nf_conntrack_labels_fini(); @@ -1771,11 +1751,6 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_seqadj; -#ifdef CONFIG_NF_CONNTRACK_ZONES - ret = nf_ct_extend_register(&nf_ct_zone_extend); - if (ret < 0) - goto err_extend; -#endif ret = nf_conntrack_proto_init(); if (ret < 0) goto err_proto; @@ -1791,10 +1766,6 @@ int nf_conntrack_init_start(void) return 0; err_proto: -#ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_extend_unregister(&nf_ct_zone_extend); -err_extend: -#endif nf_conntrack_seqadj_fini(); err_seqadj: nf_conntrack_labels_fini(); -- cgit v1.2.3 From 7643507fe8b5bd8ab7522f6a81058cc1209d2585 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 21 Jun 2016 14:58:46 -0400 Subject: netfilter: xt_NFLOG: nflog-range does not truncate packets li->u.ulog.copy_len is currently ignored by the kernel, we should truncate the packet to either li->u.ulog.copy_len (if set) or copy_range before sending it to userspace. 0 is a valid input for copy_len, so add a new flag to indicate whether this was option was specified by the user or not. Add two flags to indicate whether nflog-size/copy_len was set or not. XT_NFLOG_F_COPY_LEN is for XT_NFLOG and NFLOG_F_COPY_LEN for nfnetlink_log On the userspace side, this was initially represented by the option nflog-range, this will be replaced by --nflog-size now. --nflog-range would still exist but does not do anything. Reported-by: Joe Dollard Reviewed-by: Josh Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 7 +++++++ include/uapi/linux/netfilter/xt_NFLOG.h | 6 +++++- net/netfilter/nfnetlink_log.c | 9 ++++++--- net/netfilter/xt_NFLOG.c | 3 +++ 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 57639fca223a..83d855ba6af1 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -12,6 +12,9 @@ #define NF_LOG_UID 0x08 /* Log UID owning local socket */ #define NF_LOG_MASK 0x0f +/* This flag indicates that copy_len field in nf_loginfo is set */ +#define NF_LOG_F_COPY_LEN 0x1 + enum nf_log_type { NF_LOG_TYPE_LOG = 0, NF_LOG_TYPE_ULOG, @@ -22,9 +25,13 @@ struct nf_loginfo { u_int8_t type; union { struct { + /* copy_len will be used iff you set + * NF_LOG_F_COPY_LEN in flags + */ u_int32_t copy_len; u_int16_t group; u_int16_t qthreshold; + u_int16_t flags; } ulog; struct { u_int8_t level; diff --git a/include/uapi/linux/netfilter/xt_NFLOG.h b/include/uapi/linux/netfilter/xt_NFLOG.h index 87b58311ce6b..f33070730fc8 100644 --- a/include/uapi/linux/netfilter/xt_NFLOG.h +++ b/include/uapi/linux/netfilter/xt_NFLOG.h @@ -6,9 +6,13 @@ #define XT_NFLOG_DEFAULT_GROUP 0x1 #define XT_NFLOG_DEFAULT_THRESHOLD 0 -#define XT_NFLOG_MASK 0x0 +#define XT_NFLOG_MASK 0x1 + +/* This flag indicates that 'len' field in xt_nflog_info is set*/ +#define XT_NFLOG_F_COPY_LEN 0x1 struct xt_nflog_info { + /* 'len' will be used iff you set XT_NFLOG_F_COPY_LEN in flags */ __u32 len; __u16 group; __u16 threshold; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 11f81c8385fc..cbcfdfb586a6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -700,10 +700,13 @@ nfulnl_log_packet(struct net *net, break; case NFULNL_COPY_PACKET: - if (inst->copy_range > skb->len) + data_len = inst->copy_range; + if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) && + (li->u.ulog.copy_len < data_len)) + data_len = li->u.ulog.copy_len; + + if (data_len > skb->len) data_len = skb->len; - else - data_len = inst->copy_range; size += nla_total_size(data_len); break; diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index a1fa2c800cb9..018eed7e1ff1 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -33,6 +33,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; + if (info->flags & XT_NFLOG_F_COPY_LEN) + li.u.ulog.flags |= NF_LOG_F_COPY_LEN; + nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, &li, info->prefix); return XT_CONTINUE; -- cgit v1.2.3 From 889f7ee7c6e84251215d43cbc856ea116c72d3f2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 12 Jun 2016 18:07:07 +0200 Subject: netfilter: nf_tables: add generic macros to check for generation mask Thus, we can reuse these to check the genmask of any object type, not only rules. This is required now that tables, chain and sets will get a generation mask field too in follow up patches. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 24 ++++++++++++++++++++ net/netfilter/nf_tables_api.c | 46 +++++++-------------------------------- 2 files changed, 32 insertions(+), 38 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 092235458691..d0778cbaf7f1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -969,6 +969,30 @@ static inline u8 nft_genmask_cur(const struct net *net) #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) +/* + * Generic transaction helpers + */ + +/* Check if this object is currently active. */ +#define nft_is_active(__net, __obj) \ + (((__obj)->genmask & nft_genmask_cur(__net)) == 0) + +/* Check if this object is active in the next generation. */ +#define nft_is_active_next(__net, __obj) \ + (((__obj)->genmask & nft_genmask_next(__net)) == 0) + +/* This object becomes active in the next generation. */ +#define nft_activate_next(__net, __obj) \ + (__obj)->genmask = nft_genmask_cur(__net) + +/* This object becomes inactive in the next generation. */ +#define nft_deactivate_next(__net, __obj) \ + (__obj)->genmask = nft_genmask_next(__net) + +/* After committing the ruleset, clear the stale generation bit. */ +#define nft_clear(__net, __obj) \ + (__obj)->genmask &= ~nft_genmask_next(__net) + /* * Set element transaction helpers */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4d292b933b5c..d9f0f0797dec 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -234,42 +234,12 @@ static int nft_delchain(struct nft_ctx *ctx) return err; } -static inline bool -nft_rule_is_active(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & nft_genmask_cur(net)) == 0; -} - -static inline int -nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & nft_genmask_next(net)) == 0; -} - -static inline void -nft_rule_activate_next(struct net *net, struct nft_rule *rule) -{ - /* Now inactive, will be active in the future */ - rule->genmask = nft_genmask_cur(net); -} - -static inline void -nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) -{ - rule->genmask = nft_genmask_next(net); -} - -static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) -{ - rule->genmask &= ~nft_genmask_next(net); -} - static int nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) { /* You cannot delete the same rule twice */ - if (nft_rule_is_active_next(ctx->net, rule)) { - nft_rule_deactivate_next(ctx->net, rule); + if (nft_is_active_next(ctx->net, rule)) { + nft_deactivate_next(ctx->net, rule); ctx->chain->use--; return 0; } @@ -1898,7 +1868,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, list_for_each_entry_rcu(table, &afi->tables, list) { list_for_each_entry_rcu(chain, &table->chains, list) { list_for_each_entry_rcu(rule, &chain->rules, list) { - if (!nft_rule_is_active(net, rule)) + if (!nft_is_active(net, rule)) goto cont; if (idx < s_idx) goto cont; @@ -2102,7 +2072,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (rule == NULL) goto err1; - nft_rule_activate_next(net, rule); + nft_activate_next(net, rule); rule->handle = handle; rule->dlen = size; @@ -2124,14 +2094,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { - if (nft_rule_is_active_next(net, old_rule)) { + if (nft_is_active_next(net, old_rule)) { trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, old_rule); if (trans == NULL) { err = -ENOMEM; goto err2; } - nft_rule_deactivate_next(net, old_rule); + nft_deactivate_next(net, old_rule); chain->use--; list_add_tail_rcu(&rule->list, &old_rule->list); } else { @@ -3980,7 +3950,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) trans->ctx.afi->nops); break; case NFT_MSG_NEWRULE: - nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_clear(trans->ctx.net, nft_trans_rule(trans)); nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), NFT_MSG_NEWRULE); @@ -4116,7 +4086,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; - nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: -- cgit v1.2.3 From f2a6d766765d2794e26e25655d4ffcfe29c3ec2f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 14 Jun 2016 17:29:18 +0200 Subject: netfilter: nf_tables: add generation mask to tables This patch addresses two problems: 1) The netlink dump is inconsistent when interfering with an ongoing transaction update for several reasons: 1.a) We don't honor the internal NFT_TABLE_INACTIVE flag, and we should be skipping these inactive objects in the dump. 1.b) We perform speculative deletion during the preparation phase, that may result in skipping active objects. 1.c) The listing order changes, which generates noise when tracking incremental ruleset update via tools like git or our own testsuite. 2) We don't allow to add and to update the object in the same batch, eg. add table x; add table x { flags dormant\; }. In order to resolve these problems: 1) If the user requests a deletion, the object becomes inactive in the next generation. Then, ignore objects that scheduled to be deleted from the lookup path, as they will be effectively removed in the next generation. 2) From the get/dump path, if the object is not currently active, we skip it. 3) Support 'add X -> update X' sequence from a transaction. After this update, we obtain a consistent list as long as we stay in the same generation. The userspace side can detect interferences through the generation counter so it can restart the dumping. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 ++- net/netfilter/nf_tables_api.c | 101 +++++++++++++++++++++----------------- 2 files changed, 62 insertions(+), 45 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d0778cbaf7f1..05c9a64b39aa 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -838,6 +838,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @hgenerator: handle generator state * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) + * @genmask: generation mask * @name: name of the table */ struct nft_table { @@ -846,7 +847,8 @@ struct nft_table { struct list_head sets; u64 hgenerator; u32 use; - u16 flags; + u16 flags:14, + genmask:2; char name[NFT_TABLE_MAXNAMELEN]; }; @@ -992,6 +994,8 @@ static inline u8 nft_genmask_cur(const struct net *net) /* After committing the ruleset, clear the stale generation bit. */ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) +#define nft_active_genmask(__obj, __genmask) \ + !((__obj)->genmask & __genmask) /* * Set element transaction helpers diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d9f0f0797dec..a4a77d60e631 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -175,9 +175,6 @@ static void nf_tables_unregister_hooks(const struct nft_table *table, nft_unregister_basechain(nft_base_chain(chain), hook_nops); } -/* Internal table flags */ -#define NFT_TABLE_INACTIVE (1 << 15) - static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; @@ -187,7 +184,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) return -ENOMEM; if (msg_type == NFT_MSG_NEWTABLE) - ctx->table->flags |= NFT_TABLE_INACTIVE; + nft_activate_next(ctx->net, ctx->table); list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -201,7 +198,7 @@ static int nft_deltable(struct nft_ctx *ctx) if (err < 0) return err; - list_del_rcu(&ctx->table->list); + nft_deactivate_next(ctx->net, ctx->table); return err; } @@ -334,26 +331,29 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) */ static struct nft_table *nft_table_lookup(const struct nft_af_info *afi, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_table *table; list_for_each_entry(table, &afi->tables, list) { - if (!nla_strcmp(nla, table->name)) + if (!nla_strcmp(nla, table->name) && + nft_active_genmask(table, genmask)) return table; } return NULL; } static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - table = nft_table_lookup(afi, nla); + table = nft_table_lookup(afi, nla, genmask); if (table != NULL) return table; @@ -494,6 +494,8 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, table)) + continue; if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -518,6 +520,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; struct sk_buff *skb2; @@ -535,11 +538,9 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -648,6 +649,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; @@ -661,7 +663,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, return PTR_ERR(afi); name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(afi, name); + table = nf_tables_table_lookup(afi, name, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); @@ -669,8 +671,6 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, } if (table != NULL) { - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) @@ -765,6 +765,9 @@ static int nft_flush(struct nft_ctx *ctx, int family) ctx->afi = afi; list_for_each_entry_safe(table, nt, &afi->tables, list) { + if (!nft_is_active_next(ctx->net, table)) + continue; + if (nla[NFTA_TABLE_NAME] && nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) continue; @@ -785,6 +788,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; int family = nfmsg->nfgen_family; @@ -798,7 +802,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); + table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1074,6 +1078,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1092,11 +1097,9 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) @@ -1201,6 +1204,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; + u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct net_device *dev = NULL; u8 policy = NF_ACCEPT; @@ -1217,7 +1221,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1449,6 +1453,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; @@ -1459,7 +1464,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -1901,6 +1906,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); const struct nft_af_info *afi; const struct nft_table *table; const struct nft_chain *chain; @@ -1920,11 +1926,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); if (IS_ERR(chain)) @@ -1979,6 +1983,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; @@ -1999,7 +2004,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2144,6 +2149,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain = NULL; @@ -2155,7 +2161,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2309,7 +2315,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; @@ -2325,7 +2332,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, if (afi == NULL) return -EAFNOSUPPORT; - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], + genmask); if (IS_ERR(table)) return PTR_ERR(table); } @@ -2586,6 +2594,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; @@ -2593,7 +2602,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -2661,6 +2670,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); const struct nft_set_ops *ops; struct nft_af_info *afi; struct nft_table *table; @@ -2758,7 +2768,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -2863,6 +2873,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); struct nft_set *set; struct nft_ctx ctx; int err; @@ -2872,7 +2883,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -3001,7 +3012,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; @@ -3011,7 +3023,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, if (IS_ERR(afi)) return PTR_ERR(afi); - table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); + table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], + genmask); if (IS_ERR(table)) return PTR_ERR(table); @@ -3108,6 +3121,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3124,11 +3138,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) return err; err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, - (void *)nla); + (void *)nla, genmask); if (err < 0) return err; - if (ctx.table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) @@ -3187,15 +3199,14 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_cur(net); const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; - if (ctx.table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) @@ -3519,6 +3530,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3527,7 +3539,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -3641,6 +3653,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3649,7 +3662,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); if (err < 0) return err; @@ -3926,12 +3939,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } } else { - trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE; + nft_clear(net, trans->ctx.table); } nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE); nft_trans_destroy(trans); break; case NFT_MSG_DELTABLE: + list_del_rcu(&trans->ctx.table->list); nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE); break; case NFT_MSG_NEWCHAIN: @@ -4057,8 +4071,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) } break; case NFT_MSG_DELTABLE: - list_add_tail_rcu(&trans->ctx.table->list, - &trans->ctx.afi->tables); + nft_clear(trans->ctx.net, trans->ctx.table); nft_trans_destroy(trans); break; case NFT_MSG_NEWCHAIN: -- cgit v1.2.3 From 664b0f8cd8c66d02d14168ee7ac6a957cc88177f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 12 Jun 2016 19:21:31 +0200 Subject: netfilter: nf_tables: add generation mask to chains Similar to ("netfilter: nf_tables: add generation mask to tables"). Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 +- net/netfilter/nf_tables_api.c | 89 +++++++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 05c9a64b39aa..b023e287ea92 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -732,7 +732,6 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) enum nft_chain_flags { NFT_BASE_CHAIN = 0x1, - NFT_CHAIN_INACTIVE = 0x2, }; /** @@ -754,7 +753,8 @@ struct nft_chain { u64 handle; u32 use; u16 level; - u8 flags; + u8 flags:6, + genmask:2; char name[NFT_CHAIN_MAXNAMELEN]; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a4a77d60e631..cae88f8a990e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -211,7 +211,7 @@ static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) return -ENOMEM; if (msg_type == NFT_MSG_NEWCHAIN) - ctx->chain->flags |= NFT_CHAIN_INACTIVE; + nft_activate_next(ctx->net, ctx->chain); list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -226,7 +226,7 @@ static int nft_delchain(struct nft_ctx *ctx) return err; ctx->table->use--; - list_del_rcu(&ctx->chain->list); + nft_deactivate_next(ctx->net, ctx->chain); return err; } @@ -559,13 +559,16 @@ err: return err; } -static int nf_tables_table_enable(const struct nft_af_info *afi, +static int nf_tables_table_enable(struct net *net, + const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; int err, i = 0; list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (!(chain->flags & NFT_BASE_CHAIN)) continue; @@ -578,6 +581,8 @@ static int nf_tables_table_enable(const struct nft_af_info *afi, return 0; err: list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (!(chain->flags & NFT_BASE_CHAIN)) continue; @@ -589,12 +594,15 @@ err: return err; } -static void nf_tables_table_disable(const struct nft_af_info *afi, +static void nf_tables_table_disable(struct net *net, + const struct nft_af_info *afi, struct nft_table *table) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; if (chain->flags & NFT_BASE_CHAIN) nft_unregister_basechain(nft_base_chain(chain), afi->nops); @@ -627,7 +635,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) nft_trans_table_enable(trans) = false; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ret = nf_tables_table_enable(ctx->afi, ctx->table); + ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table); if (ret >= 0) { ctx->table->flags &= ~NFT_TABLE_F_DORMANT; nft_trans_table_enable(trans) = true; @@ -722,6 +730,9 @@ static int nft_flush_table(struct nft_ctx *ctx) struct nft_set *set, *ns; list_for_each_entry(chain, &ctx->table->chains, list) { + if (!nft_is_active_next(ctx->net, chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -740,6 +751,9 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { + if (!nft_is_active_next(ctx->net, chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -849,12 +863,14 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type); */ static struct nft_chain * -nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) +nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, + u8 genmask) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { - if (chain->handle == handle) + if (chain->handle == handle && + nft_active_genmask(chain, genmask)) return chain; } @@ -862,7 +878,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle) } static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_chain *chain; @@ -870,7 +887,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, return ERR_PTR(-EINVAL); list_for_each_entry(chain, &table->chains, list) { - if (!nla_strcmp(nla, chain->name)) + if (!nla_strcmp(nla, chain->name) && + nft_active_genmask(chain, genmask)) return chain; } @@ -1053,6 +1071,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); + if (!nft_is_active(net, chain)) + continue; if (nf_tables_fill_chain_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -1101,11 +1121,9 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -1230,11 +1248,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle); + chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } else { - chain = nf_tables_chain_lookup(table, name); + chain = nf_tables_chain_lookup(table, name, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) return PTR_ERR(chain); @@ -1265,16 +1283,20 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_stats *stats = NULL; struct nft_trans *trans; - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - if (nla[NFTA_CHAIN_HANDLE] && name && - !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]))) - return -EEXIST; + if (nla[NFTA_CHAIN_HANDLE] && name) { + struct nft_chain *chain2; + + chain2 = nf_tables_chain_lookup(table, + nla[NFTA_CHAIN_NAME], + genmask); + if (IS_ERR(chain2)) + return PTR_ERR(chain2); + } if (nla[NFTA_CHAIN_COUNTERS]) { if (!(chain->flags & NFT_BASE_CHAIN)) @@ -1468,7 +1490,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); + chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (chain->use > 0) @@ -1930,11 +1952,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) @@ -2008,7 +2028,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (IS_ERR(table)) return PTR_ERR(table); - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); @@ -2166,7 +2186,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); if (nla[NFTA_RULE_CHAIN]) { - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); + chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } @@ -2186,6 +2207,9 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, } } else { list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; + ctx.chain = chain; err = nft_delrule_by_chain(&ctx); if (err < 0) @@ -3934,7 +3958,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (!nft_trans_table_enable(trans)) { - nf_tables_table_disable(trans->ctx.afi, + nf_tables_table_disable(net, + trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -3952,12 +3977,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) nft_chain_commit_update(trans); else - trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE; + nft_clear(net, trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); nft_trans_destroy(trans); break; case NFT_MSG_DELCHAIN: + list_del_rcu(&trans->ctx.chain->list); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); nf_tables_unregister_hooks(trans->ctx.table, trans->ctx.chain, @@ -4061,7 +4087,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { if (nft_trans_table_enable(trans)) { - nf_tables_table_disable(trans->ctx.afi, + nf_tables_table_disable(net, + trans->ctx.afi, trans->ctx.table); trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; } @@ -4089,8 +4116,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELCHAIN: trans->ctx.table->use++; - list_add_tail_rcu(&trans->ctx.chain->list, - &trans->ctx.table->chains); + nft_clear(trans->ctx.net, trans->ctx.chain); nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: @@ -4413,6 +4439,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla) { + u8 genmask = nft_genmask_next(ctx->net); struct nlattr *tb[NFTA_VERDICT_MAX + 1]; struct nft_chain *chain; int err; @@ -4445,7 +4472,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; chain = nf_tables_chain_lookup(ctx->table, - tb[NFTA_VERDICT_CHAIN]); + tb[NFTA_VERDICT_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (chain->flags & NFT_BASE_CHAIN) -- cgit v1.2.3 From 37a9cc52552579f22e18cca401cfc4351b6cbc72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 12 Jun 2016 22:52:45 +0200 Subject: netfilter: nf_tables: add generation mask to sets Similar to ("netfilter: nf_tables: add generation mask to tables"). Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 +++-- net/netfilter/nf_tables_api.c | 68 +++++++++++++++++++++++---------------- net/netfilter/nft_dynset.c | 7 ++-- net/netfilter/nft_lookup.c | 6 ++-- 4 files changed, 54 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b023e287ea92..07a5ba47cbda 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -296,6 +296,7 @@ void nft_unregister_set(struct nft_set_ops *ops); * @ops: set ops * @pnet: network namespace * @flags: set flags + * @genmask: generation mask * @klen: key length * @dlen: data length * @data: private set data @@ -317,7 +318,8 @@ struct nft_set { /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; possible_net_t pnet; - u16 flags; + u16 flags:14, + genmask:2; u8 klen; u8 dlen; unsigned char data[] @@ -335,9 +337,9 @@ static inline struct nft_set *nft_set_container_of(const void *priv) } struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla); + const struct nlattr *nla, u8 genmask); struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla); + const struct nlattr *nla, u8 genmask); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cae88f8a990e..3316bce0a878 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -289,9 +289,6 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -/* Internal set flag */ -#define NFT_SET_INACTIVE (1 << 15) - static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, struct nft_set *set) { @@ -304,7 +301,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { nft_trans_set_id(trans) = ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); - set->flags |= NFT_SET_INACTIVE; + nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -320,7 +317,7 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; - list_del_rcu(&set->list); + nft_deactivate_next(ctx->net, set); ctx->table->use--; return err; @@ -741,6 +738,9 @@ static int nft_flush_table(struct nft_ctx *ctx) } list_for_each_entry_safe(set, ns, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, set)) + continue; + if (set->flags & NFT_SET_ANONYMOUS && !list_empty(&set->bindings)) continue; @@ -2367,7 +2367,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, } struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2375,22 +2375,27 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-EINVAL); list_for_each_entry(set, &table->sets, list) { - if (!nla_strcmp(nla, set->name)) + if (!nla_strcmp(nla, set->name) && + nft_active_genmask(set, genmask)) return set; } return ERR_PTR(-ENOENT); } struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla) + const struct nlattr *nla, + u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_set *set = nft_trans_set(trans); + if (trans->msg_type == NFT_MSG_NEWSET && - id == nft_trans_set_id(trans)) - return nft_trans_set(trans); + id == nft_trans_set_id(trans) && + nft_active_genmask(set, genmask)) + return set; } return ERR_PTR(-ENOENT); } @@ -2415,6 +2420,8 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { int tmp; + if (!nft_is_active_next(ctx->net, set)) + continue; if (!sscanf(i->name, name, &tmp)) continue; if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE) @@ -2434,6 +2441,8 @@ cont: snprintf(set->name, sizeof(set->name), name, min + n); list_for_each_entry(i, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, i)) + continue; if (!strcmp(set->name, i->name)) return -ENFILE; } @@ -2582,6 +2591,8 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) list_for_each_entry_rcu(set, &table->sets, list) { if (idx < s_idx) goto cont; + if (!nft_is_active(net, set)) + goto cont; ctx_set = *ctx; ctx_set.table = table; @@ -2651,11 +2662,9 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb2 == NULL) @@ -2798,7 +2807,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); - set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) return PTR_ERR(set); @@ -2911,7 +2920,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings)) @@ -2980,7 +2989,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, list_del_rcu(&binding->list); if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && - !(set->flags & NFT_SET_INACTIVE)) + nft_is_active(ctx->net, set)) nf_tables_set_destroy(ctx, set); } @@ -3166,11 +3175,10 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; event = NFT_MSG_NEWSETELEM; event |= NFNL_SUBSYS_NFTABLES << 8; @@ -3232,11 +3240,10 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -3567,11 +3574,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) { if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { set = nf_tables_set_lookup_byid(net, - nla[NFTA_SET_ELEM_LIST_SET_ID]); + nla[NFTA_SET_ELEM_LIST_SET_ID], + genmask); } if (IS_ERR(set)) return PTR_ERR(set); @@ -3690,7 +3699,8 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); + set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) @@ -4003,7 +4013,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NFT_MSG_DELRULE); break; case NFT_MSG_NEWSET: - nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE; + nft_clear(net, nft_trans_set(trans)); /* This avoids hitting -EBUSY when deleting the table * from the transaction. */ @@ -4016,6 +4026,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_DELSET, GFP_KERNEL); break; @@ -4134,8 +4145,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSET: trans->ctx.table->use++; - list_add_tail_rcu(&nft_trans_set(trans)->list, - &trans->ctx.table->sets); + nft_clear(trans->ctx.net, nft_trans_set(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -4282,6 +4292,8 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, } list_for_each_entry(set, &ctx->table->sets, list) { + if (!nft_is_active_next(ctx->net, set)) + continue; if (!(set->flags & NFT_SET_MAP) || set->dtype != NFT_DATA_VERDICT) continue; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 78d4914fb39c..0af26699bf04 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -103,6 +103,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_dynset *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; int err; @@ -112,11 +113,13 @@ static int nft_dynset_init(const struct nft_ctx *ctx, tb[NFTA_DYNSET_SREG_KEY] == NULL) return -EINVAL; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]); + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], + genmask); if (IS_ERR(set)) { if (tb[NFTA_DYNSET_SET_ID]) set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_DYNSET_SET_ID]); + tb[NFTA_DYNSET_SET_ID], + genmask); if (IS_ERR(set)) return PTR_ERR(set); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index b3c31ef8015d..8a102cf855d0 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -54,6 +54,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_lookup *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; int err; @@ -61,11 +62,12 @@ static int nft_lookup_init(const struct nft_ctx *ctx, tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]); + set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET], genmask); if (IS_ERR(set)) { if (tb[NFTA_LOOKUP_SET_ID]) { set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_LOOKUP_SET_ID]); + tb[NFTA_LOOKUP_SET_ID], + genmask); } if (IS_ERR(set)) return PTR_ERR(set); -- cgit v1.2.3 From 3183ab8997a477c8d9ad175a1cef70dff77c6dbc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 22 Jun 2016 13:26:10 +0200 Subject: netfilter: conntrack: allow increasing bucket size via sysctl too No need to restrict this to module parameter. We export a copy of the real hash size -- when user alters the value we allocate the new table, copy entries etc before we update the real size to the requested one. This is also needed because the real size is used by concurrent readers and cannot be changed without synchronizing the conntrack generation seqcnt. We only allow changing this value from the initial net namespace. Tested using http-client-benchmark vs. httpterm with concurrent while true;do echo $RANDOM > /proc/sys/net/netfilter/nf_conntrack_buckets done Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_conntrack-sysctl.txt | 3 +- include/net/netfilter/nf_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 41 ++++++++++++++++-------- net/netfilter/nf_conntrack_standalone.c | 36 ++++++++++++++++++--- 4 files changed, 62 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt index f55599c62c9d..4fb51d32fccc 100644 --- a/Documentation/networking/nf_conntrack-sysctl.txt +++ b/Documentation/networking/nf_conntrack-sysctl.txt @@ -7,12 +7,13 @@ nf_conntrack_acct - BOOLEAN Enable connection tracking flow accounting. 64-bit byte and packet counters per flow are added. -nf_conntrack_buckets - INTEGER (read-only) +nf_conntrack_buckets - INTEGER Size of hash table. If not specified as parameter during module loading, the default size is calculated by dividing total memory by 16384 to determine the number of buckets but the hash table will never have fewer than 32 and limited to 16384 buckets. For systems with more than 4GB of memory it will be 65536 buckets. + This sysctl is only writeable in the initial net namespace. nf_conntrack_checksum - BOOLEAN 0 - disabled diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 9c0ed3d7af89..5d3397f34583 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -290,6 +290,7 @@ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) struct kernel_param; int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); +int nf_conntrack_hash_resize(unsigned int hashsize); extern unsigned int nf_conntrack_htable_size; extern unsigned int nf_conntrack_max; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a459176c3253..e17d5c7faca0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1595,24 +1595,14 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) } EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +int nf_conntrack_hash_resize(unsigned int hashsize) { - int i, bucket, rc; - unsigned int hashsize, old_size; + int i, bucket; + unsigned int old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - if (current->nsproxy->net_ns != &init_net) - return -EOPNOTSUPP; - - /* On boot, we can set this without any fancy locking. */ - if (!nf_conntrack_htable_size) - return param_set_uint(val, kp); - - rc = kstrtouint(val, 0, &hashsize); - if (rc) - return rc; if (!hashsize) return -EINVAL; @@ -1620,6 +1610,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hash) return -ENOMEM; + old_size = nf_conntrack_htable_size; + if (old_size == hashsize) { + nf_ct_free_hashtable(hash, hashsize); + return 0; + } + local_bh_disable(); nf_conntrack_all_lock(); write_seqcount_begin(&nf_conntrack_generation); @@ -1655,6 +1651,25 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) nf_ct_free_hashtable(old_hash, old_size); return 0; } + +int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +{ + unsigned int hashsize; + int rc; + + if (current->nsproxy->net_ns != &init_net) + return -EOPNOTSUPP; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + rc = kstrtouint(val, 0, &hashsize); + if (rc) + return rc; + + return nf_conntrack_hash_resize(hashsize); +} EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index f87e84ebcec3..a0cc1919f081 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -434,8 +434,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) #ifdef CONFIG_SYSCTL /* Log invalid packets of a given protocol */ -static int log_invalid_proto_min = 0; -static int log_invalid_proto_max = 255; +static int log_invalid_proto_min __read_mostly; +static int log_invalid_proto_max __read_mostly = 255; + +/* size the user *wants to set */ +static unsigned int nf_conntrack_htable_size_user __read_mostly; + +static int +nf_conntrack_hash_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret < 0 || !write) + return ret; + + /* update ret, we might not be able to satisfy request */ + ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user); + + /* update it to the actual value used by conntrack */ + nf_conntrack_htable_size_user = nf_conntrack_htable_size; + return ret; +} static struct ctl_table_header *nf_ct_netfilter_header; @@ -456,10 +477,10 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = nf_conntrack_hash_sysctl, }, { .procname = "nf_conntrack_checksum", @@ -517,6 +538,9 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) if (net->user_ns != &init_user_ns) table[0].procname = NULL; + if (!net_eq(&init_net, net)) + table[2].mode = 0444; + net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.sysctl_header) goto out_unregister_netfilter; @@ -606,6 +630,8 @@ static int __init nf_conntrack_standalone_init(void) ret = -ENOMEM; goto out_sysctl; } + + nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif ret = register_pernet_subsys(&nf_conntrack_net_ops); -- cgit v1.2.3 From 82bec71d46b83f39860e2838ff8394e4fcd6efab Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Jun 2016 14:26:33 +0200 Subject: netfilter: nf_tables: get rid of NFT_BASECHAIN_DISABLED This flag was introduced to restore rulesets from the new netdev family, but since 5ebe0b0eec9d6f7 ("netfilter: nf_tables: destroy basechain and rules on netdevice removal") the ruleset is released once the netdev is gone. This also removes nft_register_basechain() and nft_unregister_basechain() since they have no clients anymore after this rework. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - net/netfilter/nf_tables_api.c | 62 ++++++++++++++++----------------------- 2 files changed, 25 insertions(+), 38 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 07a5ba47cbda..1ea19a6e72e6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -798,7 +798,6 @@ struct nft_stats { }; #define NFT_HOOK_OPS_MAX 2 -#define NFT_BASECHAIN_DISABLED (1 << 0) /** * struct nft_base_chain - nf_tables base chain diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3316bce0a878..92c9faeb2bf8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -131,29 +131,8 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -static int nft_register_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) -{ - struct net *net = read_pnet(&basechain->pnet); - - if (basechain->flags & NFT_BASECHAIN_DISABLED) - return 0; - - return nf_register_net_hooks(net, basechain->ops, hook_nops); -} - -static void nft_unregister_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) -{ - struct net *net = read_pnet(&basechain->pnet); - - if (basechain->flags & NFT_BASECHAIN_DISABLED) - return; - - nf_unregister_net_hooks(net, basechain->ops, hook_nops); -} - -static int nf_tables_register_hooks(const struct nft_table *table, +static int nf_tables_register_hooks(struct net *net, + const struct nft_table *table, struct nft_chain *chain, unsigned int hook_nops) { @@ -161,10 +140,12 @@ static int nf_tables_register_hooks(const struct nft_table *table, !(chain->flags & NFT_BASE_CHAIN)) return 0; - return nft_register_basechain(nft_base_chain(chain), hook_nops); + return nf_register_net_hooks(net, nft_base_chain(chain)->ops, + hook_nops); } -static void nf_tables_unregister_hooks(const struct nft_table *table, +static void nf_tables_unregister_hooks(struct net *net, + const struct nft_table *table, struct nft_chain *chain, unsigned int hook_nops) { @@ -172,7 +153,7 @@ static void nf_tables_unregister_hooks(const struct nft_table *table, !(chain->flags & NFT_BASE_CHAIN)) return; - nft_unregister_basechain(nft_base_chain(chain), hook_nops); + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -569,7 +550,8 @@ static int nf_tables_table_enable(struct net *net, if (!(chain->flags & NFT_BASE_CHAIN)) continue; - err = nft_register_basechain(nft_base_chain(chain), afi->nops); + err = nf_register_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); if (err < 0) goto err; @@ -586,7 +568,8 @@ err: if (i-- <= 0) break; - nft_unregister_basechain(nft_base_chain(chain), afi->nops); + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); } return err; } @@ -600,9 +583,11 @@ static void nf_tables_table_disable(struct net *net, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; - if (chain->flags & NFT_BASE_CHAIN) - nft_unregister_basechain(nft_base_chain(chain), - afi->nops); + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, + afi->nops); } } @@ -1451,7 +1436,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, chain->table = table; nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); - err = nf_tables_register_hooks(table, chain, afi->nops); + err = nf_tables_register_hooks(net, table, chain, afi->nops); if (err < 0) goto err1; @@ -1464,7 +1449,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: - nf_tables_unregister_hooks(table, chain, afi->nops); + nf_tables_unregister_hooks(net, table, chain, afi->nops); err1: nf_tables_chain_destroy(chain); return err; @@ -3995,7 +3980,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DELCHAIN: list_del_rcu(&trans->ctx.chain->list); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); - nf_tables_unregister_hooks(trans->ctx.table, + nf_tables_unregister_hooks(trans->ctx.net, + trans->ctx.table, trans->ctx.chain, trans->ctx.afi->nops); break; @@ -4120,7 +4106,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) } else { trans->ctx.table->use--; list_del_rcu(&trans->ctx.chain->list); - nf_tables_unregister_hooks(trans->ctx.table, + nf_tables_unregister_hooks(trans->ctx.net, + trans->ctx.table, trans->ctx.chain, trans->ctx.afi->nops); } @@ -4662,7 +4649,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN)); - nf_tables_unregister_hooks(ctx->chain->table, ctx->chain, + nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain, ctx->afi->nops); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); @@ -4691,7 +4678,8 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) list_for_each_entry_safe(table, nt, &afi->tables, list) { list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hooks(table, chain, afi->nops); + nf_tables_unregister_hooks(net, table, chain, + afi->nops); /* No packets are walking on these chains anymore. */ ctx.table = table; list_for_each_entry(chain, &table->chains, list) { -- cgit v1.2.3 From 520ac30f45519b0a82dd92117c181d1d6144677b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jun 2016 23:16:49 -0700 Subject: net_sched: drop packets after root qdisc lock is released Qdisc performance suffers when packets are dropped at enqueue() time because drops (kfree_skb()) are done while qdisc lock is held, delaying a dequeue() draining the queue. Nominal throughput can be reduced by 50 % when this happens, at a time we would like the dequeue() to proceed as fast as possible. Even FQ is vulnerable to this problem, while one of FQ goals was to provide some flow isolation. This patch adds a 'struct sk_buff **to_free' parameter to all qdisc->enqueue(), and in qdisc_drop() helper. I measured a performance increase of up to 12 %, but this patch is a prereq so that future batches in enqueue() can fly. Signed-off-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/sch_generic.h | 41 ++++++++++++++++++++++++++++++----------- net/core/dev.c | 7 +++++-- net/sched/sch_atm.c | 9 +++++---- net/sched/sch_blackhole.c | 5 +++-- net/sched/sch_cbq.c | 7 ++++--- net/sched/sch_choke.c | 16 +++++++++------- net/sched/sch_codel.c | 8 +++++--- net/sched/sch_drr.c | 7 ++++--- net/sched/sch_dsmark.c | 9 +++++---- net/sched/sch_fifo.c | 15 +++++++++------ net/sched/sch_fq.c | 7 ++++--- net/sched/sch_fq_codel.c | 15 +++++++++------ net/sched/sch_generic.c | 10 ++++++---- net/sched/sch_gred.c | 7 ++++--- net/sched/sch_hfsc.c | 6 +++--- net/sched/sch_hhf.c | 10 +++++----- net/sched/sch_htb.c | 10 ++++++---- net/sched/sch_multiq.c | 7 ++++--- net/sched/sch_netem.c | 25 +++++++++++++++---------- net/sched/sch_pie.c | 5 +++-- net/sched/sch_plug.c | 5 +++-- net/sched/sch_prio.c | 4 ++-- net/sched/sch_qfq.c | 7 ++++--- net/sched/sch_red.c | 7 ++++--- net/sched/sch_sfb.c | 7 ++++--- net/sched/sch_sfq.c | 8 ++++---- net/sched/sch_tbf.c | 16 +++++++++------- net/sched/sch_teql.c | 4 ++-- 28 files changed, 170 insertions(+), 114 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4f7cee8344c4..04e84c07c94f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -37,8 +37,10 @@ struct qdisc_size_table { }; struct Qdisc { - int (*enqueue)(struct sk_buff *skb, struct Qdisc *dev); - struct sk_buff * (*dequeue)(struct Qdisc *dev); + int (*enqueue)(struct sk_buff *skb, + struct Qdisc *sch, + struct sk_buff **to_free); + struct sk_buff * (*dequeue)(struct Qdisc *sch); unsigned int flags; #define TCQ_F_BUILTIN 1 #define TCQ_F_INGRESS 2 @@ -160,7 +162,9 @@ struct Qdisc_ops { char id[IFNAMSIZ]; int priv_size; - int (*enqueue)(struct sk_buff *, struct Qdisc *); + int (*enqueue)(struct sk_buff *skb, + struct Qdisc *sch, + struct sk_buff **to_free); struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); @@ -498,10 +502,11 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, #endif } -static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { qdisc_calculate_pkt_len(skb, sch); - return sch->enqueue(skb, sch); + return sch->enqueue(skb, sch, to_free); } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) @@ -626,24 +631,36 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) return __qdisc_dequeue_head(sch, &sch->q); } +/* Instead of calling kfree_skb() while root qdisc lock is held, + * queue the skb for future freeing at end of __dev_xmit_skb() + */ +static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free) +{ + skb->next = *to_free; + *to_free = skb; +} + static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct sk_buff **to_free) { struct sk_buff *skb = __skb_dequeue(list); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return len; } return 0; } -static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch) +static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch, + struct sk_buff **to_free) { - return __qdisc_queue_drop_head(sch, &sch->q); + return __qdisc_queue_drop_head(sch, &sch->q, to_free); } static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) @@ -724,9 +741,11 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) qdisc_qstats_drop(sch); } -static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) + +static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { - kfree_skb(skb); + __qdisc_drop(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_DROP; diff --git a/net/core/dev.c b/net/core/dev.c index d40593b3b9fb..aba10d2a8bc3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3070,6 +3070,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); + struct sk_buff *to_free = NULL; bool contended; int rc; @@ -3086,7 +3087,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - kfree_skb(skb); + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { @@ -3109,7 +3110,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - rc = q->enqueue(skb, q) & NET_XMIT_MASK; + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -3119,6 +3120,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } } spin_unlock(root_lock); + if (unlikely(to_free)) + kfree_skb_list(to_free); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index e04ea6994d1c..481e4f12aeb4 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -357,7 +357,8 @@ static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, /* --------------------------- Qdisc operations ---------------------------- */ -static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; @@ -398,10 +399,10 @@ done: switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: - kfree_skb(skb); + __qdisc_drop(skb, to_free); goto drop; case TC_ACT_RECLASSIFY: if (flow->excess) @@ -413,7 +414,7 @@ done: #endif } - ret = qdisc_enqueue(skb, flow->q); + ret = qdisc_enqueue(skb, flow->q, to_free); if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index 3fee70d9814f..c98a61e980ba 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -17,9 +17,10 @@ #include #include -static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index a29fd811d7b9..beb554aa8cfb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -358,7 +358,8 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) } static int -cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct cbq_sched_data *q = qdisc_priv(sch); int uninitialized_var(ret); @@ -370,11 +371,11 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } - ret = qdisc_enqueue(skb, cl->q); + ret = qdisc_enqueue(skb, cl->q, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; cbq_mark_toplevel(q, cl); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 789b69ee9e51..3b6d5bd69101 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -115,7 +115,8 @@ static void choke_zap_tail_holes(struct choke_sched_data *q) } /* Drop packet from queue array by creating a "hole" */ -static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, + struct sk_buff **to_free) { struct choke_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = q->tab[idx]; @@ -129,7 +130,7 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) qdisc_qstats_backlog_dec(sch, skb); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); --sch->q.qlen; } @@ -261,7 +262,8 @@ static bool choke_match_random(const struct choke_sched_data *q, return choke_match_flow(oskb, nskb); } -static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; struct choke_sched_data *q = qdisc_priv(sch); @@ -288,7 +290,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Draw a packet at random from queue and compare flow */ if (choke_match_random(q, skb, &idx)) { q->stats.matched++; - choke_drop_by_idx(sch, idx); + choke_drop_by_idx(sch, idx, to_free); goto congestion_drop; } @@ -331,16 +333,16 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q->stats.pdrop++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c5bc424e3b3c..4002df3c7d9f 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -82,7 +82,8 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_drop(skb, sch); + kfree_skb(skb); + qdisc_qstats_drop(sch); } static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) @@ -107,7 +108,8 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) return skb; } -static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct codel_sched_data *q; @@ -117,7 +119,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q = qdisc_priv(sch); q->drop_overlimit++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 22609e4e845f..8af5c59eef84 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -350,7 +350,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, return NULL; } -static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; @@ -360,11 +361,11 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index b9ba5f658528..1308bbf460f7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -191,7 +191,8 @@ static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, /* --------------------------- Qdisc operations ---------------------------- */ -static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; @@ -234,7 +235,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT case TC_ACT_QUEUED: case TC_ACT_STOLEN: - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; case TC_ACT_SHOT: @@ -251,7 +252,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } - err = qdisc_enqueue(skb, p->q); + err = qdisc_enqueue(skb, p->q, to_free); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); @@ -264,7 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index dea70e3ef0ba..6ea0db427f91 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,29 +19,32 @@ /* 1 band FIFO pseudo-"scheduler" */ -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } -static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { if (likely(skb_queue_len(&sch->q) < sch->limit)) return qdisc_enqueue_tail(skb, sch); /* queue full, remove one skb to fulfill the limit */ - __qdisc_queue_drop_head(sch, &sch->q); + __qdisc_queue_drop_head(sch, &sch->q, to_free); qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 6eb06674f778..e5458b99e09c 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -368,18 +368,19 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) } } -static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct fq_sched_data *q = qdisc_priv(sch); struct fq_flow *f; if (unlikely(sch->q.qlen >= sch->limit)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); f = fq_classify(skb, q); if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { q->stat_flows_plimit++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } f->qlen++; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2dc0a849515a..f715195459c9 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -139,7 +139,8 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, skb->next = NULL; } -static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) +static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, + struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -172,7 +173,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += skb->truesize; - kfree_skb(skb); + __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); flow->dropped += i; @@ -184,7 +185,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) return idx; } -static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct fq_codel_sched_data *q = qdisc_priv(sch); unsigned int idx, prev_backlog, prev_qlen; @@ -197,7 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } idx--; @@ -229,7 +231,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) * So instead of dropping a single packet, drop half of its backlog * with a 64 packets limit to not add a too big cpu spike here. */ - ret = fq_codel_drop(sch, q->drop_batch_size); + ret = fq_codel_drop(sch, q->drop_batch_size, to_free); prev_qlen -= sch->q.qlen; prev_backlog -= sch->qstats.backlog; @@ -276,7 +278,8 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - qdisc_drop(skb, sch); + kfree_skb(skb); + qdisc_qstats_drop(sch); } static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 773b632e1e33..ff86606954f2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -348,9 +348,10 @@ EXPORT_SYMBOL(netif_carrier_off); cheaper. */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, + struct sk_buff **to_free) { - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_CN; } @@ -439,7 +440,8 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, return priv->q + band; } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, + struct sk_buff **to_free) { if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; @@ -451,7 +453,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) return __qdisc_enqueue_tail(skb, qdisc, list); } - return qdisc_drop(skb, qdisc); + return qdisc_drop(skb, qdisc, to_free); } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index b5fb63c7be02..c78a093c551a 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -149,7 +149,8 @@ static inline int gred_use_harddrop(struct gred_sched *t) return t->red_flags & TC_RED_HARDDROP; } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct gred_sched_data *q = NULL; struct gred_sched *t = qdisc_priv(sch); @@ -237,10 +238,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->stats.pdrop++; drop: - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index bd08c363a26d..8cb5eff7b79c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1572,7 +1572,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) } static int -hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hfsc_class *cl; int uninitialized_var(err); @@ -1581,11 +1581,11 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c44593b8e65a..e3d0458af17b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -345,7 +345,7 @@ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) skb->next = NULL; } -static unsigned int hhf_drop(struct Qdisc *sch) +static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; @@ -359,16 +359,16 @@ static unsigned int hhf_drop(struct Qdisc *sch) struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; - qdisc_qstats_drop(sch); qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); + qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } -static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; @@ -406,7 +406,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Return Congestion Notification only if we dropped a packet from this * bucket. */ - if (hhf_drop(sch) == idx) + if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index a454605ab5cb..f3882259c385 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -569,7 +569,8 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } -static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { int uninitialized_var(ret); struct htb_sched *q = qdisc_priv(sch); @@ -581,16 +582,17 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) __skb_queue_tail(&q->direct_queue, skb); q->direct_pkts++; } else { - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; #endif - } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { + } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q, + to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); cl->qstats.drops++; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 5ea93305d705..9ffbb025b37e 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -65,7 +65,8 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } static int -multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; @@ -76,12 +77,12 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } #endif - ret = qdisc_enqueue(skb, qdisc); + ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e271967439bf..ccca8ca4c722 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -397,7 +397,8 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) * when we statistically choose to corrupt one, we instead segment it, returning * the first packet to be corrupted, and re-enqueue the remaining frames */ -static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) +static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); @@ -405,7 +406,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NULL; } consume_skb(skb); @@ -418,7 +419,8 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) * NET_XMIT_DROP: queue length didn't change. * NET_XMIT_SUCCESS: one skb was queued. */ -static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ @@ -443,7 +445,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (count == 0) { qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -463,7 +465,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; - rootq->enqueue(skb2, rootq); + rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; } @@ -475,7 +477,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { if (skb_is_gso(skb)) { - segs = netem_segment(skb, sch); + segs = netem_segment(skb, sch, to_free); if (!segs) return NET_XMIT_DROP; } else { @@ -488,7 +490,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))) { - rc = qdisc_drop(skb, sch); + rc = qdisc_drop(skb, sch, to_free); goto finish_segs; } @@ -497,7 +499,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); @@ -557,7 +559,7 @@ finish_segs: segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; - rc = qdisc_enqueue(segs, sch); + rc = qdisc_enqueue(segs, sch, to_free); if (rc != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(rc)) qdisc_qstats_drop(sch); @@ -615,8 +617,11 @@ deliver: #endif if (q->qdisc) { - int err = qdisc_enqueue(skb, q->qdisc); + struct sk_buff *to_free = NULL; + int err; + err = qdisc_enqueue(skb, q->qdisc, &to_free); + kfree_skb_list(to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { qdisc_qstats_drop(sch); diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 912a46a5d02e..a570b0bb254c 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -134,7 +134,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) return false; } -static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -166,7 +167,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) out: q->stats.dropped++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index a12cd37680f8..1c6cbab3e7b9 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -88,7 +88,8 @@ struct plug_sched_data { u32 pkts_to_release; }; -static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); @@ -98,7 +99,7 @@ static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) return qdisc_enqueue_tail(skb, sch); } - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index de492682caee..f4d443aeae54 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -67,7 +67,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } static int -prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) +prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; @@ -83,7 +83,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) } #endif - ret = qdisc_enqueue(skb, qdisc); + ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 0427fa8b23f2..f27ffee106f6 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1217,7 +1217,8 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) return agg; } -static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; @@ -1240,11 +1241,11 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) qdisc_pkt_len(skb)); if (err) { cl->qstats.drops++; - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } } - err = qdisc_enqueue(skb, cl->qdisc); + err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index a0d57530335e..249b2a18acbd 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -56,7 +56,8 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } -static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; @@ -95,7 +96,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; } - ret = qdisc_enqueue(skb, child); + ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -106,7 +107,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; congestion_drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index c69611640fa5..add3cc7d37ec 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -275,7 +275,8 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, return false; } -static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct sfb_sched_data *q = qdisc_priv(sch); @@ -397,7 +398,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } enqueue: - ret = qdisc_enqueue(skb, child); + ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { sch->q.qlen++; increment_qlen(skb, q); @@ -408,7 +409,7 @@ enqueue: return ret; drop: - qdisc_drop(skb, sch); + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 57d118b41cad..7f195ed4d568 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -343,7 +343,7 @@ static int sfq_headdrop(const struct sfq_sched_data *q) } static int -sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int hash, dropped; @@ -367,7 +367,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; @@ -424,14 +424,14 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (slot->qlen >= q->maxdepth) { congestion_drop: if (!sfq_headdrop(q)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; slot->backlog -= delta; - qdisc_drop(head, sch); + qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); return NET_XMIT_CN; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index c12df84d1078..303355c449ab 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -155,7 +155,8 @@ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ -static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; @@ -166,7 +167,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); nb = 0; while (segs) { @@ -174,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) segs->next = NULL; qdisc_skb_cb(segs)->pkt_len = segs->len; len += segs->len; - ret = qdisc_enqueue(segs, q->qdisc); + ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); @@ -190,17 +191,18 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) - return tbf_segment(skb, sch); - return qdisc_drop(skb, sch); + return tbf_segment(skb, sch, to_free); + return qdisc_drop(skb, sch, to_free); } - ret = qdisc_enqueue(skb, q->qdisc); + ret = qdisc_enqueue(skb, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index e02687185a59..2cd9b4478b92 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -77,7 +77,7 @@ struct teql_sched_data { /* "teql*" qdisc routines */ static int -teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); @@ -87,7 +87,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } - return qdisc_drop(skb, sch); + return qdisc_drop(skb, sch, to_free); } static struct sk_buff * -- cgit v1.2.3 From 008830bc321c0fc22c0db8d5b0b56f854ed90a5c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jun 2016 23:16:50 -0700 Subject: net_sched: fq_codel: cache skb->truesize into skb->cb Now we defer skb drops, it makes sense to keep a copy of skb->truesize in struct codel_skb_cb to avoid one cache line miss per dropped skb in fq_codel_drop(), to reduce latencies a bit further. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/codel_qdisc.h | 1 + net/sched/sch_fq_codel.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/codel_qdisc.h b/include/net/codel_qdisc.h index 8144d9cd2908..098630f83a55 100644 --- a/include/net/codel_qdisc.h +++ b/include/net/codel_qdisc.h @@ -52,6 +52,7 @@ /* Qdiscs using codel plugin must use codel_skb_cb in their own cb[] */ struct codel_skb_cb { codel_time_t enqueue_time; + unsigned int mem_usage; }; static struct codel_skb_cb *get_codel_cb(const struct sk_buff *skb) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index f715195459c9..a5ea0e9b6be4 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -172,7 +172,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, do { skb = dequeue_head(flow); len += qdisc_pkt_len(skb); - mem += skb->truesize; + mem += get_codel_cb(skb)->mem_usage; __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); @@ -216,7 +216,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->deficit = q->quantum; flow->dropped = 0; } - q->memory_usage += skb->truesize; + get_codel_cb(skb)->mem_usage = skb->truesize; + q->memory_usage += get_codel_cb(skb)->mem_usage; memory_limited = q->memory_usage > q->memory_limit; if (++sch->q.qlen <= sch->limit && !memory_limited) return NET_XMIT_SUCCESS; @@ -267,7 +268,7 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) if (flow->head) { skb = dequeue_head(flow); q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); - q->memory_usage -= skb->truesize; + q->memory_usage -= get_codel_cb(skb)->mem_usage; sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); } -- cgit v1.2.3 From 4d202a0d31b96ab3324b21e7500d9a2da9ef57dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jun 2016 23:16:52 -0700 Subject: net_sched: generalize bulk dequeue When qdisc bulk dequeue was added in linux-3.18 (commit 5772e9a3463b "qdisc: bulk dequeue support for qdiscs with TCQ_F_ONETXQUEUE"), it was constrained to some specific qdiscs. With some extra care, we can extend this to all qdiscs, so that typical traffic shaping solutions can benefit from small batches (8 packets in this patch). For example, HTB is often used on some multi queue device. And bonding/team are multi queue devices... Idea is to bulk-dequeue packets mapping to the same transmit queue. This brings between 35 and 80 % performance increase in HTB setup under pressure on a bonding setup : 1) NUMA node contention : 610,000 pps -> 1,110,000 pps 2) No node contention : 1,380,000 pps -> 1,930,000 pps Now we should work to add batches on the enqueue() side ;) Signed-off-by: Eric Dumazet Cc: John Fastabend Cc: Jesper Dangaard Brouer Cc: Hannes Frederic Sowa Cc: Florian Westphal Cc: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/sch_generic.h | 7 ++--- net/sched/sch_generic.c | 68 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 62 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 04e84c07c94f..909aff2db2b3 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -75,13 +75,14 @@ struct Qdisc { /* * For performance sake on SMP, we put highly modified fields at the end */ - struct Qdisc *next_sched ____cacheline_aligned_in_smp; - struct sk_buff *gso_skb; - unsigned long state; + struct sk_buff *gso_skb ____cacheline_aligned_in_smp; struct sk_buff_head q; struct gnet_stats_basic_packed bstats; seqcount_t running; struct gnet_stats_queue qstats; + unsigned long state; + struct Qdisc *next_sched; + struct sk_buff *skb_bad_txq; struct rcu_head rcu_head; int padded; atomic_t refcnt; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff86606954f2..e95b67cd5718 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -77,6 +77,34 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, skb->next = NULL; } +/* This variant of try_bulk_dequeue_skb() makes sure + * all skbs in the chain are for the same txq + */ +static void try_bulk_dequeue_skb_slow(struct Qdisc *q, + struct sk_buff *skb, + int *packets) +{ + int mapping = skb_get_queue_mapping(skb); + struct sk_buff *nskb; + int cnt = 0; + + do { + nskb = q->dequeue(q); + if (!nskb) + break; + if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { + q->skb_bad_txq = nskb; + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + break; + } + skb->next = nskb; + skb = nskb; + } while (++cnt < 8); + (*packets) += cnt; + skb->next = NULL; +} + /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ @@ -87,8 +115,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, const struct netdev_queue *txq = q->dev_queue; *packets = 1; - *validate = true; if (unlikely(skb)) { + /* skb in gso_skb were already validated */ + *validate = false; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { @@ -97,15 +126,30 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, q->q.qlen--; } else skb = NULL; - /* skb in gso_skb were already validated */ - *validate = false; - } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || - !netif_xmit_frozen_or_stopped(txq)) { - skb = q->dequeue(q); - if (skb && qdisc_may_bulk(q)) - try_bulk_dequeue_skb(q, skb, txq, packets); + return skb; + } + *validate = true; + skb = q->skb_bad_txq; + if (unlikely(skb)) { + /* check the reason of requeuing without tx lock first */ + txq = skb_get_tx_queue(txq->dev, skb); + if (!netif_xmit_frozen_or_stopped(txq)) { + q->skb_bad_txq = NULL; + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + goto bulk; } + return NULL; + } + if (!(q->flags & TCQ_F_ONETXQUEUE) || + !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); + if (skb) { +bulk: + if (qdisc_may_bulk(q)) + try_bulk_dequeue_skb(q, skb, txq, packets); + else + try_bulk_dequeue_skb_slow(q, skb, packets); } return skb; } @@ -624,11 +668,14 @@ void qdisc_reset(struct Qdisc *qdisc) if (ops->reset) ops->reset(qdisc); + kfree_skb(qdisc->skb_bad_txq); + qdisc->skb_bad_txq = NULL; + if (qdisc->gso_skb) { kfree_skb_list(qdisc->gso_skb); qdisc->gso_skb = NULL; - qdisc->q.qlen = 0; } + qdisc->q.qlen = 0; } EXPORT_SYMBOL(qdisc_reset); @@ -667,6 +714,7 @@ void qdisc_destroy(struct Qdisc *qdisc) dev_put(qdisc_dev(qdisc)); kfree_skb_list(qdisc->gso_skb); + kfree_skb(qdisc->skb_bad_txq); /* * gen_estimator est_timer() might access qdisc->q.lock, * wait a RCU grace period before freeing qdisc. -- cgit v1.2.3 From cb72d38211eacda2dd90b09540542b6582da614e Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:02:46 -0400 Subject: netlabel: Initial support for the CALIPSO netlink protocol. CALIPSO is a packet labelling protocol for IPv6 which is very similar to CIPSO. It is specified in RFC 5570. Much of the code is based on the current CIPSO code. This adds support for adding passthrough-type CALIPSO DOIs through the NLBL_CALIPSO_C_ADD command. It requires attributes: NLBL_CALIPSO_A_TYPE which must be CALIPSO_MAP_PASS. NLBL_CALIPSO_A_DOI. In passthrough mode the CALIPSO engine will map MLS secattr levels and categories directly to the packet label. At this stage, the major difference between this and the CIPSO code is that IPv6 may be compiled as a module. To allow for this the CALIPSO functions are registered at module init time. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/calipso.h | 79 +++++++++++++++ include/net/netlabel.h | 23 +++++ include/uapi/linux/audit.h | 2 + net/ipv6/Makefile | 1 + net/ipv6/af_inet6.c | 9 +- net/ipv6/calipso.c | 169 +++++++++++++++++++++++++++++++ net/netlabel/Makefile | 2 +- net/netlabel/netlabel_calipso.c | 216 ++++++++++++++++++++++++++++++++++++++++ net/netlabel/netlabel_calipso.h | 90 +++++++++++++++++ net/netlabel/netlabel_kapi.c | 1 + net/netlabel/netlabel_mgmt.c | 9 ++ net/netlabel/netlabel_user.c | 5 + 12 files changed, 604 insertions(+), 2 deletions(-) create mode 100644 include/net/calipso.h create mode 100644 net/ipv6/calipso.c create mode 100644 net/netlabel/netlabel_calipso.c create mode 100644 net/netlabel/netlabel_calipso.h (limited to 'include/net') diff --git a/include/net/calipso.h b/include/net/calipso.h new file mode 100644 index 000000000000..38dbb4707150 --- /dev/null +++ b/include/net/calipso.h @@ -0,0 +1,79 @@ +/* + * CALIPSO - Common Architecture Label IPv6 Security Option + * + * This is an implementation of the CALIPSO protocol as specified in + * RFC 5570. + * + * Authors: Paul Moore + * Huw Davies + * + */ + +/* + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Huw Davies , 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +#ifndef _CALIPSO_H +#define _CALIPSO_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* known doi values */ +#define CALIPSO_DOI_UNKNOWN 0x00000000 + +/* doi mapping types */ +#define CALIPSO_MAP_UNKNOWN 0 +#define CALIPSO_MAP_PASS 2 + +/* + * CALIPSO DOI definitions + */ + +/* DOI definition struct */ +struct calipso_doi { + u32 doi; + u32 type; + + atomic_t refcount; + struct list_head list; + struct rcu_head rcu; +}; + +#ifdef CONFIG_NETLABEL +int __init calipso_init(void); +void calipso_exit(void); +#else +static inline int __init calipso_init(void) +{ + return 0; +} + +static inline void calipso_exit(void) +{ +} +#endif /* CONFIG_NETLABEL */ + +#endif /* _CALIPSO_H */ diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 7b5a300de7f5..6af1bb6df4ab 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -40,6 +40,7 @@ #include struct cipso_v4_doi; +struct calipso_doi; /* * NetLabel - A management interface for maintaining network packet label @@ -94,6 +95,8 @@ struct cipso_v4_doi; #define NETLBL_NLTYPE_UNLABELED_NAME "NLBL_UNLBL" #define NETLBL_NLTYPE_ADDRSELECT 6 #define NETLBL_NLTYPE_ADDRSELECT_NAME "NLBL_ADRSEL" +#define NETLBL_NLTYPE_CALIPSO 7 +#define NETLBL_NLTYPE_CALIPSO_NAME "NLBL_CALIPSO" /* * NetLabel - Kernel API for accessing the network packet label mappings. @@ -216,6 +219,23 @@ struct netlbl_lsm_secattr { } attr; }; +/** + * struct netlbl_calipso_ops - NetLabel CALIPSO operations + * @doi_add: add a CALIPSO DOI + * @doi_free: free a CALIPSO DOI + * + * Description: + * This structure is filled out by the CALIPSO engine and passed + * to the NetLabel core via a call to netlbl_calipso_ops_register(). + * It enables the CALIPSO engine (and hence IPv6) to be compiled + * as a module. + */ +struct netlbl_calipso_ops { + int (*doi_add)(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info); + void (*doi_free)(struct calipso_doi *doi_def); +}; + /* * LSM security attribute operations (inline) */ @@ -598,4 +618,7 @@ static inline struct audit_buffer *netlbl_audit_start(int type, } #endif /* CONFIG_NETLABEL */ +const struct netlbl_calipso_ops * +netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops); + #endif /* _NETLABEL_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index d820aa979620..82e8aa59446b 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -130,6 +130,8 @@ #define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ #define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */ #define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */ +#define AUDIT_MAC_CALIPSO_ADD 1418 /* NetLabel: add CALIPSO DOI entry */ +#define AUDIT_MAC_CALIPSO_DEL 1419 /* NetLabel: del CALIPSO DOI entry */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2fbd90bf8d33..88ad4477705c 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -21,6 +21,7 @@ ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o +ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b11c37cfd67c..c241c1805728 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #ifdef CONFIG_IPV6_TUNNEL #include #endif +#include #include #include @@ -970,6 +971,10 @@ static int __init inet6_init(void) if (err) goto pingv6_fail; + err = calipso_init(); + if (err) + goto calipso_fail; + #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) @@ -980,8 +985,10 @@ out: #ifdef CONFIG_SYSCTL sysctl_fail: - pingv6_exit(); + calipso_exit(); #endif +calipso_fail: + pingv6_exit(); pingv6_fail: ipv6_packet_cleanup(); ipv6_packet_fail: diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c new file mode 100644 index 000000000000..aa44310120be --- /dev/null +++ b/net/ipv6/calipso.c @@ -0,0 +1,169 @@ +/* + * CALIPSO - Common Architecture Label IPv6 Security Option + * + * This is an implementation of the CALIPSO protocol as specified in + * RFC 5570. + * + * Authors: Paul Moore + * Huw Davies + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * (c) Copyright Huw Davies , 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* List of available DOI definitions */ +static DEFINE_SPINLOCK(calipso_doi_list_lock); +static LIST_HEAD(calipso_doi_list); + +/* DOI List Functions + */ + +/** + * calipso_doi_search - Searches for a DOI definition + * @doi: the DOI to search for + * + * Description: + * Search the DOI definition list for a DOI definition with a DOI value that + * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). + * Returns a pointer to the DOI definition on success and NULL on failure. + */ +static struct calipso_doi *calipso_doi_search(u32 doi) +{ + struct calipso_doi *iter; + + list_for_each_entry_rcu(iter, &calipso_doi_list, list) + if (iter->doi == doi && atomic_read(&iter->refcount)) + return iter; + return NULL; +} + +/** + * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine + * @doi_def: the DOI structure + * @audit_info: NetLabel audit information + * + * Description: + * The caller defines a new DOI for use by the CALIPSO engine and calls this + * function to add it to the list of acceptable domains. The caller must + * ensure that the mapping table specified in @doi_def->map meets all of the + * requirements of the mapping type (see calipso.h for details). Returns + * zero on success and non-zero on failure. + * + */ +static int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val = -EINVAL; + u32 doi; + u32 doi_type; + struct audit_buffer *audit_buf; + + doi = doi_def->doi; + doi_type = doi_def->type; + + if (doi_def->doi == CALIPSO_DOI_UNKNOWN) + goto doi_add_return; + + atomic_set(&doi_def->refcount, 1); + + spin_lock(&calipso_doi_list_lock); + if (calipso_doi_search(doi_def->doi)) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -EEXIST; + goto doi_add_return; + } + list_add_tail_rcu(&doi_def->list, &calipso_doi_list); + spin_unlock(&calipso_doi_list_lock); + ret_val = 0; + +doi_add_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); + if (audit_buf) { + const char *type_str; + + switch (doi_type) { + case CALIPSO_MAP_PASS: + type_str = "pass"; + break; + default: + type_str = "(unknown)"; + } + audit_log_format(audit_buf, + " calipso_doi=%u calipso_type=%s res=%u", + doi, type_str, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * calipso_doi_free - Frees a DOI definition + * @doi_def: the DOI definition + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +static void calipso_doi_free(struct calipso_doi *doi_def) +{ + kfree(doi_def); +} + +static const struct netlbl_calipso_ops ops = { + .doi_add = calipso_doi_add, + .doi_free = calipso_doi_free, +}; + +/** + * calipso_init - Initialize the CALIPSO module + * + * Description: + * Initialize the CALIPSO module and prepare it for use. Returns zero on + * success and negative values on failure. + * + */ +int __init calipso_init(void) +{ + netlbl_calipso_ops_register(&ops); + return 0; +} + +void calipso_exit(void) +{ + netlbl_calipso_ops_register(NULL); +} diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile index d2732fc952e2..d341ede0dca5 100644 --- a/net/netlabel/Makefile +++ b/net/netlabel/Makefile @@ -12,4 +12,4 @@ obj-y += netlabel_mgmt.o # protocol modules obj-y += netlabel_unlabeled.o obj-y += netlabel_cipso_v4.o - +obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c new file mode 100644 index 000000000000..8a113f91b7a1 --- /dev/null +++ b/net/netlabel/netlabel_calipso.c @@ -0,0 +1,216 @@ +/* + * NetLabel CALIPSO/IPv6 Support + * + * This file defines the CALIPSO/IPv6 functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and CALIPSO. + * + * Authors: Paul Moore + * Huw Davies + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Huw Davies , 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netlabel_user.h" +#include "netlabel_calipso.h" +#include "netlabel_mgmt.h" +#include "netlabel_domainhash.h" + +/* NetLabel Generic NETLINK CALIPSO family */ +static struct genl_family netlbl_calipso_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_CALIPSO_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CALIPSO_A_MAX, +}; + +/* NetLabel Netlink attribute policy */ +static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { + [NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 }, + [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 }, +}; + +/* NetLabel Command Handlers + */ +/** + * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition + * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information + * + * Description: + * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message + * and add it to the CALIPSO engine. Return zero on success and non-zero on + * error. + * + */ +static int netlbl_calipso_add_pass(struct genl_info *info, + struct netlbl_audit *audit_info) +{ + int ret_val; + struct calipso_doi *doi_def = NULL; + + doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + if (!doi_def) + return -ENOMEM; + doi_def->type = CALIPSO_MAP_PASS; + doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + ret_val = calipso_doi_add(doi_def, audit_info); + if (ret_val != 0) + calipso_doi_free(doi_def); + + return ret_val; +} + +/** + * netlbl_calipso_add - Handle an ADD message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Create a new DOI definition based on the given ADD message and add it to the + * CALIPSO engine. Returns zero on success, negative values on failure. + * + */ +static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) + +{ + int ret_val = -EINVAL; + struct netlbl_audit audit_info; + + if (!info->attrs[NLBL_CALIPSO_A_DOI] || + !info->attrs[NLBL_CALIPSO_A_MTYPE]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { + case CALIPSO_MAP_PASS: + ret_val = netlbl_calipso_add_pass(info, &audit_info); + break; + } + if (ret_val == 0) + atomic_inc(&netlabel_mgmt_protocount); + + return ret_val; +} + +/* NetLabel Generic NETLINK Command Definitions + */ + +static const struct genl_ops netlbl_calipso_ops[] = { + { + .cmd = NLBL_CALIPSO_C_ADD, + .flags = GENL_ADMIN_PERM, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_add, + .dumpit = NULL, + }, +}; + +/* NetLabel Generic NETLINK Protocol Functions + */ + +/** + * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component + * + * Description: + * Register the CALIPSO packet NetLabel component with the Generic NETLINK + * mechanism. Returns zero on success, negative values on failure. + * + */ +int __init netlbl_calipso_genl_init(void) +{ + return genl_register_family_with_ops(&netlbl_calipso_gnl_family, + netlbl_calipso_ops); +} + +static const struct netlbl_calipso_ops *calipso_ops; + +/** + * netlbl_calipso_ops_register - Register the CALIPSO operations + * + * Description: + * Register the CALIPSO packet engine operations. + * + */ +const struct netlbl_calipso_ops * +netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) +{ + return xchg(&calipso_ops, ops); +} +EXPORT_SYMBOL(netlbl_calipso_ops_register); + +static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) +{ + return ACCESS_ONCE(calipso_ops); +} + +/** + * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine + * @doi_def: the DOI structure + * @audit_info: NetLabel audit information + * + * Description: + * The caller defines a new DOI for use by the CALIPSO engine and calls this + * function to add it to the list of acceptable domains. The caller must + * ensure that the mapping table specified in @doi_def->map meets all of the + * requirements of the mapping type (see calipso.h for details). Returns + * zero on success and non-zero on failure. + * + */ +int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_add(doi_def, audit_info); + return ret_val; +} + +/** + * calipso_doi_free - Frees a DOI definition + * @doi_def: the DOI definition + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +void calipso_doi_free(struct calipso_doi *doi_def) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->doi_free(doi_def); +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h new file mode 100644 index 000000000000..f78790a6ce4f --- /dev/null +++ b/net/netlabel/netlabel_calipso.h @@ -0,0 +1,90 @@ +/* + * NetLabel CALIPSO Support + * + * This file defines the CALIPSO functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and RIPSO. + * + * Authors: Paul Moore + * Huw Davies + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Huw Davies , 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + */ + +#ifndef _NETLABEL_CALIPSO +#define _NETLABEL_CALIPSO + +#include +#include + +/* The following NetLabel payloads are supported by the CALIPSO subsystem. + * + * o ADD: + * Sent by an application to add a new DOI mapping table. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * NLBL_CALIPSO_A_MTYPE + * + * If using CALIPSO_MAP_PASS no additional attributes are required. + * + */ + +/* NetLabel CALIPSO commands */ +enum { + NLBL_CALIPSO_C_UNSPEC, + NLBL_CALIPSO_C_ADD, + NLBL_CALIPSO_C_REMOVE, + NLBL_CALIPSO_C_LIST, + NLBL_CALIPSO_C_LISTALL, + __NLBL_CALIPSO_C_MAX, +}; + +/* NetLabel CALIPSO attributes */ +enum { + NLBL_CALIPSO_A_UNSPEC, + NLBL_CALIPSO_A_DOI, + /* (NLA_U32) + * the DOI value */ + NLBL_CALIPSO_A_MTYPE, + /* (NLA_U32) + * the mapping table type (defined in the calipso.h header as + * CALIPSO_MAP_*) */ + __NLBL_CALIPSO_A_MAX, +}; + +#define NLBL_CALIPSO_A_MAX (__NLBL_CALIPSO_A_MAX - 1) + +/* NetLabel protocol functions */ +#if IS_ENABLED(CONFIG_IPV6) +int netlbl_calipso_genl_init(void); +#else +static inline int netlbl_calipso_genl_init(void) +{ + return 0; +} +#endif + +int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info); +void calipso_doi_free(struct calipso_doi *doi_def); + +#endif diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 7e2a68f9d165..8d2e483167a8 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1170,6 +1170,7 @@ struct audit_buffer *netlbl_audit_start(int type, { return netlbl_audit_start_common(type, audit_info); } +EXPORT_SYMBOL(netlbl_audit_start); /* * Setup Functions diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index b2aeb5d44601..975b1e93271e 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -674,6 +674,15 @@ static int netlbl_mgmt_protocols(struct sk_buff *skb, goto protocols_return; protos_sent++; } +#if IS_ENABLED(CONFIG_IPV6) + if (protos_sent == 2) { + if (netlbl_mgmt_protocols_cb(skb, + cb, + NETLBL_NLTYPE_CALIPSO) < 0) + goto protocols_return; + protos_sent++; + } +#endif protocols_return: cb->args[0] = protos_sent; diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index adf8b7900da2..58495f44c62a 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -44,6 +44,7 @@ #include "netlabel_mgmt.h" #include "netlabel_unlabeled.h" #include "netlabel_cipso_v4.h" +#include "netlabel_calipso.h" #include "netlabel_user.h" /* @@ -71,6 +72,10 @@ int __init netlbl_netlink_init(void) if (ret_val != 0) return ret_val; + ret_val = netlbl_calipso_genl_init(); + if (ret_val != 0) + return ret_val; + return netlbl_unlabel_genl_init(); } -- cgit v1.2.3 From a5e34490c3160e09814403d040765b0ae0003121 Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:02:47 -0400 Subject: netlabel: Add support for querying a CALIPSO DOI. Query a specified DOI through the NLBL_CALIPSO_C_LIST command. It requires the attribute: NLBL_CALIPSO_A_DOI. The reply will contain: NLBL_CALIPSO_A_MTYPE Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 4 ++ net/ipv6/calipso.c | 68 +++++++++++++++++++++++++++ net/netlabel/netlabel_calipso.c | 102 ++++++++++++++++++++++++++++++++++++++++ net/netlabel/netlabel_calipso.h | 19 ++++++++ 4 files changed, 193 insertions(+) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 6af1bb6df4ab..0f05b83851e2 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -223,6 +223,8 @@ struct netlbl_lsm_secattr { * struct netlbl_calipso_ops - NetLabel CALIPSO operations * @doi_add: add a CALIPSO DOI * @doi_free: free a CALIPSO DOI + * @doi_getdef: returns a reference to a DOI + * @doi_putdef: releases a reference of a DOI * * Description: * This structure is filled out by the CALIPSO engine and passed @@ -234,6 +236,8 @@ struct netlbl_calipso_ops { int (*doi_add)(struct calipso_doi *doi_def, struct netlbl_audit *audit_info); void (*doi_free)(struct calipso_doi *doi_def); + struct calipso_doi *(*doi_getdef)(u32 doi); + void (*doi_putdef)(struct calipso_doi *doi_def); }; /* diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index aa44310120be..128cc6945e34 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -144,9 +144,77 @@ static void calipso_doi_free(struct calipso_doi *doi_def) kfree(doi_def); } +/** + * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that the memory allocated to the DOI definition can be released + * safely. + * + */ +static void calipso_doi_free_rcu(struct rcu_head *entry) +{ + struct calipso_doi *doi_def; + + doi_def = container_of(entry, struct calipso_doi, rcu); + calipso_doi_free(doi_def); +} + +/** + * calipso_doi_getdef - Returns a reference to a valid DOI definition + * @doi: the DOI value + * + * Description: + * Searches for a valid DOI definition and if one is found it is returned to + * the caller. Otherwise NULL is returned. The caller must ensure that + * calipso_doi_putdef() is called when the caller is done. + * + */ +static struct calipso_doi *calipso_doi_getdef(u32 doi) +{ + struct calipso_doi *doi_def; + + rcu_read_lock(); + doi_def = calipso_doi_search(doi); + if (!doi_def) + goto doi_getdef_return; + if (!atomic_inc_not_zero(&doi_def->refcount)) + doi_def = NULL; + +doi_getdef_return: + rcu_read_unlock(); + return doi_def; +} + +/** + * calipso_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from calipso_doi_getdef(). + * + */ +static void calipso_doi_putdef(struct calipso_doi *doi_def) +{ + if (!doi_def) + return; + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; + spin_lock(&calipso_doi_list_lock); + list_del_rcu(&doi_def->list); + spin_unlock(&calipso_doi_list_lock); + + call_rcu(&doi_def->rcu, calipso_doi_free_rcu); +} + static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, + .doi_getdef = calipso_doi_getdef, + .doi_putdef = calipso_doi_putdef, }; /** diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 8a113f91b7a1..6161170dc5f4 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -124,6 +124,65 @@ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) return ret_val; } +/** + * netlbl_calipso_list - Handle a LIST message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated LIST message and respond accordingly. + * Returns zero on success and negative values on error. + * + */ +static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val; + struct sk_buff *ans_skb = NULL; + void *data; + u32 doi; + struct calipso_doi *doi_def; + + if (!info->attrs[NLBL_CALIPSO_A_DOI]) { + ret_val = -EINVAL; + goto list_failure; + } + + doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + + doi_def = calipso_doi_getdef(doi); + if (!doi_def) { + ret_val = -EINVAL; + goto list_failure; + } + + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ans_skb) { + ret_val = -ENOMEM; + goto list_failure_put; + } + data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family, + 0, NLBL_CALIPSO_C_LIST); + if (!data) { + ret_val = -ENOMEM; + goto list_failure_put; + } + + ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type); + if (ret_val != 0) + goto list_failure_put; + + calipso_doi_putdef(doi_def); + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +list_failure_put: + calipso_doi_putdef(doi_def); +list_failure: + kfree_skb(ans_skb); + return ret_val; +} + /* NetLabel Generic NETLINK Command Definitions */ @@ -135,6 +194,13 @@ static const struct genl_ops netlbl_calipso_ops[] = { .doit = netlbl_calipso_add, .dumpit = NULL, }, + { + .cmd = NLBL_CALIPSO_C_LIST, + .flags = 0, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_list, + .dumpit = NULL, + }, }; /* NetLabel Generic NETLINK Protocol Functions @@ -214,3 +280,39 @@ void calipso_doi_free(struct calipso_doi *doi_def) if (ops) ops->doi_free(doi_def); } + +/** + * calipso_doi_getdef - Returns a reference to a valid DOI definition + * @doi: the DOI value + * + * Description: + * Searches for a valid DOI definition and if one is found it is returned to + * the caller. Otherwise NULL is returned. The caller must ensure that + * calipso_doi_putdef() is called when the caller is done. + * + */ +struct calipso_doi *calipso_doi_getdef(u32 doi) +{ + struct calipso_doi *ret_val = NULL; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_getdef(doi); + return ret_val; +} + +/** + * calipso_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from calipso_doi_getdef(). + * + */ +void calipso_doi_putdef(struct calipso_doi *doi_def) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->doi_putdef(doi_def); +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h index f78790a6ce4f..6da737a9f37a 100644 --- a/net/netlabel/netlabel_calipso.h +++ b/net/netlabel/netlabel_calipso.h @@ -46,6 +46,23 @@ * * If using CALIPSO_MAP_PASS no additional attributes are required. * + * o LIST: + * Sent by an application to list the details of a DOI definition. On + * success the kernel should send a response using the following format. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * + * The valid response message format depends on the type of the DOI mapping, + * the defined formats are shown below. + * + * Required attributes: + * + * NLBL_CALIPSO_A_MTYPE + * + * If using CALIPSO_MAP_PASS no additional attributes are required. + * */ /* NetLabel CALIPSO commands */ @@ -86,5 +103,7 @@ static inline int netlbl_calipso_genl_init(void) int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info); void calipso_doi_free(struct calipso_doi *doi_def); +struct calipso_doi *calipso_doi_getdef(u32 doi); +void calipso_doi_putdef(struct calipso_doi *doi_def); #endif -- cgit v1.2.3 From e1ce69df7e6e8cbdca78ae831ecf435b12b4c168 Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:02:48 -0400 Subject: netlabel: Add support for enumerating the CALIPSO DOI list. Enumerate the DOI list through the NLBL_CALIPSO_C_LISTALL command. It takes no attributes. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 4 ++ net/ipv6/calipso.c | 41 ++++++++++++++++ net/netlabel/netlabel_calipso.c | 106 ++++++++++++++++++++++++++++++++++++++++ net/netlabel/netlabel_calipso.h | 14 ++++++ 4 files changed, 165 insertions(+) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 0f05b83851e2..2653d3a8cde8 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -225,6 +225,7 @@ struct netlbl_lsm_secattr { * @doi_free: free a CALIPSO DOI * @doi_getdef: returns a reference to a DOI * @doi_putdef: releases a reference of a DOI + * @doi_walk: enumerate the DOI list * * Description: * This structure is filled out by the CALIPSO engine and passed @@ -238,6 +239,9 @@ struct netlbl_calipso_ops { void (*doi_free)(struct calipso_doi *doi_def); struct calipso_doi *(*doi_getdef)(u32 doi); void (*doi_putdef)(struct calipso_doi *doi_def); + int (*doi_walk)(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, void *arg), + void *cb_arg); }; /* diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 128cc6945e34..fa17c7a7f4be 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -210,11 +210,52 @@ static void calipso_doi_putdef(struct calipso_doi *doi_def) call_rcu(&doi_def->rcu, calipso_doi_free_rcu); } +/** + * calipso_doi_walk - Iterate through the DOI definitions + * @skip_cnt: skip past this number of DOI definitions, updated + * @callback: callback for each DOI definition + * @cb_arg: argument for the callback function + * + * Description: + * Iterate over the DOI definition list, skipping the first @skip_cnt entries. + * For each entry call @callback, if @callback returns a negative value stop + * 'walking' through the list and return. Updates the value in @skip_cnt upon + * return. Returns zero on success, negative values on failure. + * + */ +static int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, + void *arg), + void *cb_arg) +{ + int ret_val = -ENOENT; + u32 doi_cnt = 0; + struct calipso_doi *iter_doi; + + rcu_read_lock(); + list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) + if (atomic_read(&iter_doi->refcount) > 0) { + if (doi_cnt++ < *skip_cnt) + continue; + ret_val = callback(iter_doi, cb_arg); + if (ret_val < 0) { + doi_cnt--; + goto doi_walk_return; + } + } + +doi_walk_return: + rcu_read_unlock(); + *skip_cnt = doi_cnt; + return ret_val; +} + static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, .doi_getdef = calipso_doi_getdef, .doi_putdef = calipso_doi_putdef, + .doi_walk = calipso_doi_walk, }; /** diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 6161170dc5f4..214114575812 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -46,6 +46,13 @@ #include "netlabel_mgmt.h" #include "netlabel_domainhash.h" +/* Argument struct for calipso_doi_walk() */ +struct netlbl_calipso_doiwalk_arg { + struct netlink_callback *nl_cb; + struct sk_buff *skb; + u32 seq; +}; + /* NetLabel Generic NETLINK CALIPSO family */ static struct genl_family netlbl_calipso_gnl_family = { .id = GENL_ID_GENERATE, @@ -183,6 +190,73 @@ list_failure: return ret_val; } +/** + * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL + * @doi_def: the CALIPSO DOI definition + * @arg: the netlbl_calipso_doiwalk_arg structure + * + * Description: + * This function is designed to be used as a callback to the + * calipso_doi_walk() function for use in generating a response for a LISTALL + * message. Returns the size of the message on success, negative values on + * failure. + * + */ +static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg) +{ + int ret_val = -ENOMEM; + struct netlbl_calipso_doiwalk_arg *cb_arg = arg; + void *data; + + data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, + cb_arg->seq, &netlbl_calipso_gnl_family, + NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL); + if (!data) + goto listall_cb_failure; + + ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi); + if (ret_val != 0) + goto listall_cb_failure; + ret_val = nla_put_u32(cb_arg->skb, + NLBL_CALIPSO_A_MTYPE, + doi_def->type); + if (ret_val != 0) + goto listall_cb_failure; + + genlmsg_end(cb_arg->skb, data); + return 0; + +listall_cb_failure: + genlmsg_cancel(cb_arg->skb, data); + return ret_val; +} + +/** + * netlbl_calipso_listall - Handle a LISTALL message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated LISTALL message and respond accordingly. Returns + * zero on success and negative values on error. + * + */ +static int netlbl_calipso_listall(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netlbl_calipso_doiwalk_arg cb_arg; + u32 doi_skip = cb->args[0]; + + cb_arg.nl_cb = cb; + cb_arg.skb = skb; + cb_arg.seq = cb->nlh->nlmsg_seq; + + calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg); + + cb->args[0] = doi_skip; + return skb->len; +} + /* NetLabel Generic NETLINK Command Definitions */ @@ -201,6 +275,13 @@ static const struct genl_ops netlbl_calipso_ops[] = { .doit = netlbl_calipso_list, .dumpit = NULL, }, + { + .cmd = NLBL_CALIPSO_C_LISTALL, + .flags = 0, + .policy = calipso_genl_policy, + .doit = NULL, + .dumpit = netlbl_calipso_listall, + }, }; /* NetLabel Generic NETLINK Protocol Functions @@ -316,3 +397,28 @@ void calipso_doi_putdef(struct calipso_doi *doi_def) if (ops) ops->doi_putdef(doi_def); } + +/** + * calipso_doi_walk - Iterate through the DOI definitions + * @skip_cnt: skip past this number of DOI definitions, updated + * @callback: callback for each DOI definition + * @cb_arg: argument for the callback function + * + * Description: + * Iterate over the DOI definition list, skipping the first @skip_cnt entries. + * For each entry call @callback, if @callback returns a negative value stop + * 'walking' through the list and return. Updates the value in @skip_cnt upon + * return. Returns zero on success, negative values on failure. + * + */ +int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, void *arg), + void *cb_arg) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_walk(skip_cnt, callback, cb_arg); + return ret_val; +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h index 6da737a9f37a..1f24174466e4 100644 --- a/net/netlabel/netlabel_calipso.h +++ b/net/netlabel/netlabel_calipso.h @@ -63,6 +63,17 @@ * * If using CALIPSO_MAP_PASS no additional attributes are required. * + * o LISTALL: + * This message is sent by an application to list the valid DOIs on the + * system. When sent by an application there is no payload and the + * NLM_F_DUMP flag should be set. The kernel should respond with a series of + * the following messages. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * NLBL_CALIPSO_A_MTYPE + * */ /* NetLabel CALIPSO commands */ @@ -105,5 +116,8 @@ int calipso_doi_add(struct calipso_doi *doi_def, void calipso_doi_free(struct calipso_doi *doi_def); struct calipso_doi *calipso_doi_getdef(u32 doi); void calipso_doi_putdef(struct calipso_doi *doi_def); +int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, void *arg), + void *cb_arg); #endif -- cgit v1.2.3 From d7cce01504a0ccb95b5007d846560cfccbc1947f Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:02:49 -0400 Subject: netlabel: Add support for removing a CALIPSO DOI. Remove a specified DOI through the NLBL_CALIPSO_C_REMOVE command. It requires the attribute: NLBL_CALIPSO_A_DOI. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 1 + net/ipv6/calipso.c | 48 +++++++++++++++++++++ net/netlabel/netlabel_calipso.c | 92 +++++++++++++++++++++++++++++++++++++++++ net/netlabel/netlabel_calipso.h | 9 ++++ 4 files changed, 150 insertions(+) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 2653d3a8cde8..2c0513b0cc88 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -237,6 +237,7 @@ struct netlbl_calipso_ops { int (*doi_add)(struct calipso_doi *doi_def, struct netlbl_audit *audit_info); void (*doi_free)(struct calipso_doi *doi_def); + int (*doi_remove)(u32 doi, struct netlbl_audit *audit_info); struct calipso_doi *(*doi_getdef)(u32 doi); void (*doi_putdef)(struct calipso_doi *doi_def); int (*doi_walk)(u32 *skip_cnt, diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index fa17c7a7f4be..d7df7a4bd32e 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -162,6 +162,53 @@ static void calipso_doi_free_rcu(struct rcu_head *entry) calipso_doi_free(doi_def); } +/** + * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine + * @doi: the DOI value + * @audit_secid: the LSM secid to use in the audit message + * + * Description: + * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. + * + */ +static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) +{ + int ret_val; + struct calipso_doi *doi_def; + struct audit_buffer *audit_buf; + + spin_lock(&calipso_doi_list_lock); + doi_def = calipso_doi_search(doi); + if (!doi_def) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -ENOENT; + goto doi_remove_return; + } + if (!atomic_dec_and_test(&doi_def->refcount)) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -EBUSY; + goto doi_remove_return; + } + list_del_rcu(&doi_def->list); + spin_unlock(&calipso_doi_list_lock); + + call_rcu(&doi_def->rcu, calipso_doi_free_rcu); + ret_val = 0; + +doi_remove_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); + if (audit_buf) { + audit_log_format(audit_buf, + " calipso_doi=%u res=%u", + doi, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value @@ -253,6 +300,7 @@ doi_walk_return: static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, + .doi_remove = calipso_doi_remove, .doi_getdef = calipso_doi_getdef, .doi_putdef = calipso_doi_putdef, .doi_walk = calipso_doi_walk, diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 214114575812..2857673e8802 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -53,6 +53,12 @@ struct netlbl_calipso_doiwalk_arg { u32 seq; }; +/* Argument struct for netlbl_domhsh_walk() */ +struct netlbl_domhsh_walk_arg { + struct netlbl_audit *audit_info; + u32 doi; +}; + /* NetLabel Generic NETLINK CALIPSO family */ static struct genl_family netlbl_calipso_gnl_family = { .id = GENL_ID_GENERATE, @@ -257,6 +263,64 @@ static int netlbl_calipso_listall(struct sk_buff *skb, return skb->len; } +/** + * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE + * @entry: LSM domain mapping entry + * @arg: the netlbl_domhsh_walk_arg structure + * + * Description: + * This function is intended for use by netlbl_calipso_remove() as the callback + * for the netlbl_domhsh_walk() function; it removes LSM domain map entries + * which are associated with the CALIPSO DOI specified in @arg. Returns zero on + * success, negative values on failure. + * + */ +static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg) +{ + struct netlbl_domhsh_walk_arg *cb_arg = arg; + + if (entry->def.type == NETLBL_NLTYPE_CALIPSO && + entry->def.calipso->doi == cb_arg->doi) + return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); + + return 0; +} + +/** + * netlbl_calipso_remove - Handle a REMOVE message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated REMOVE message and respond accordingly. Returns + * zero on success, negative values on failure. + * + */ +static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val = -EINVAL; + struct netlbl_domhsh_walk_arg cb_arg; + struct netlbl_audit audit_info; + u32 skip_bkt = 0; + u32 skip_chain = 0; + + if (!info->attrs[NLBL_CALIPSO_A_DOI]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + cb_arg.audit_info = &audit_info; + ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, + netlbl_calipso_remove_cb, &cb_arg); + if (ret_val == 0 || ret_val == -ENOENT) { + ret_val = calipso_doi_remove(cb_arg.doi, &audit_info); + if (ret_val == 0) + atomic_dec(&netlabel_mgmt_protocount); + } + + return ret_val; +} + /* NetLabel Generic NETLINK Command Definitions */ @@ -269,6 +333,13 @@ static const struct genl_ops netlbl_calipso_ops[] = { .dumpit = NULL, }, { + .cmd = NLBL_CALIPSO_C_REMOVE, + .flags = GENL_ADMIN_PERM, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_remove, + .dumpit = NULL, + }, + { .cmd = NLBL_CALIPSO_C_LIST, .flags = 0, .policy = calipso_genl_policy, @@ -362,6 +433,27 @@ void calipso_doi_free(struct calipso_doi *doi_def) ops->doi_free(doi_def); } +/** + * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine + * @doi: the DOI value + * @audit_secid: the LSM secid to use in the audit message + * + * Description: + * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. + * + */ +int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_remove(doi, audit_info); + return ret_val; +} + /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h index 1f24174466e4..ed78554ba472 100644 --- a/net/netlabel/netlabel_calipso.h +++ b/net/netlabel/netlabel_calipso.h @@ -46,6 +46,14 @@ * * If using CALIPSO_MAP_PASS no additional attributes are required. * + * o REMOVE: + * Sent by an application to remove a specific DOI mapping table from the + * CALIPSO system. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * * o LIST: * Sent by an application to list the details of a DOI definition. On * success the kernel should send a response using the following format. @@ -114,6 +122,7 @@ static inline int netlbl_calipso_genl_init(void) int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info); void calipso_doi_free(struct calipso_doi *doi_def); +int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info); struct calipso_doi *calipso_doi_getdef(u32 doi); void calipso_doi_putdef(struct calipso_doi *doi_def); int calipso_doi_walk(u32 *skip_cnt, -- cgit v1.2.3 From e67ae213c72f72be50561c060ae17e92426651da Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:02:50 -0400 Subject: ipv6: Add ipv6_renew_options_kern() that accepts a kernel mem pointer. The functionality is equivalent to ipv6_renew_options() except that the newopt pointer is in kernel, not user, memory The kernel memory implementation will be used by the CALIPSO network labelling engine, which needs to be able to set IPv6 hop-by-hop options. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/ipv6.h | 6 ++++++ net/ipv6/exthdrs.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d0aeb97aec5d..887313d978d0 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -308,6 +308,12 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, int newtype, struct ipv6_opt_hdr __user *newopt, int newoptlen); +struct ipv6_txoptions * +ipv6_renew_options_kern(struct sock *sk, + struct ipv6_txoptions *opt, + int newtype, + struct ipv6_opt_hdr *newopt, + int newoptlen); struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index ea7c4d64a00a..d5fd3e799f86 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -758,6 +758,27 @@ static int ipv6_renew_option(void *ohdr, return 0; } +/** + * ipv6_renew_options - replace a specific ext hdr with a new one. + * + * @sk: sock from which to allocate memory + * @opt: original options + * @newtype: option type to replace in @opt + * @newopt: new option of type @newtype to replace (user-mem) + * @newoptlen: length of @newopt + * + * Returns a new set of options which is a copy of @opt with the + * option type @newtype replaced with @newopt. + * + * @opt may be NULL, in which case a new set of options is returned + * containing just @newopt. + * + * @newopt may be NULL, in which case the specified option type is + * not copied into the new set of options. + * + * The new set of options is allocated from the socket option memory + * buffer of @sk. + */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, @@ -830,6 +851,34 @@ out: return ERR_PTR(err); } +/** + * ipv6_renew_options_kern - replace a specific ext hdr with a new one. + * + * @sk: sock from which to allocate memory + * @opt: original options + * @newtype: option type to replace in @opt + * @newopt: new option of type @newtype to replace (kernel-mem) + * @newoptlen: length of @newopt + * + * See ipv6_renew_options(). The difference is that @newopt is + * kernel memory, rather than user memory. + */ +struct ipv6_txoptions * +ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt, + int newtype, struct ipv6_opt_hdr *newopt, + int newoptlen) +{ + struct ipv6_txoptions *ret_val; + const mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret_val = ipv6_renew_options(sk, opt, newtype, + (struct ipv6_opt_hdr __user *)newopt, + newoptlen); + set_fs(old_fs); + return ret_val; +} + struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { -- cgit v1.2.3 From 3faa8f982f958961fda68b8d63e682fe77a032d4 Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:02:51 -0400 Subject: netlabel: Move bitmap manipulation functions to the NetLabel core. This is to allow the CALIPSO labelling engine to use these. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 6 +++ net/ipv4/cipso_ipv4.c | 88 +++++--------------------------------------- net/netlabel/netlabel_kapi.c | 70 +++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 79 deletions(-) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 2c0513b0cc88..9fc2cab9be98 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -434,6 +434,12 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, unsigned long bitmap, gfp_t flags); +/* Bitmap functions + */ +int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, + u32 offset, u8 state); +void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state); + /* * LSM protocol operations (NetLabel LSM/kernel API) */ diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index bdb2a07ec363..d710d4eb6a29 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -134,76 +134,6 @@ int cipso_v4_rbm_strictvalid = 1; * Helper Functions */ -/** - * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit - * @bitmap: the bitmap - * @bitmap_len: length in bits - * @offset: starting offset - * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit - * - * Description: - * Starting at @offset, walk the bitmap from left to right until either the - * desired bit is found or we reach the end. Return the bit offset, -1 if - * not found, or -2 if error. - */ -static int cipso_v4_bitmap_walk(const unsigned char *bitmap, - u32 bitmap_len, - u32 offset, - u8 state) -{ - u32 bit_spot; - u32 byte_offset; - unsigned char bitmask; - unsigned char byte; - - /* gcc always rounds to zero when doing integer division */ - byte_offset = offset / 8; - byte = bitmap[byte_offset]; - bit_spot = offset; - bitmask = 0x80 >> (offset % 8); - - while (bit_spot < bitmap_len) { - if ((state && (byte & bitmask) == bitmask) || - (state == 0 && (byte & bitmask) == 0)) - return bit_spot; - - bit_spot++; - bitmask >>= 1; - if (bitmask == 0) { - byte = bitmap[++byte_offset]; - bitmask = 0x80; - } - } - - return -1; -} - -/** - * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap - * @bitmap: the bitmap - * @bit: the bit - * @state: if non-zero, set the bit (1) else clear the bit (0) - * - * Description: - * Set a single bit in the bitmask. Returns zero on success, negative values - * on error. - */ -static void cipso_v4_bitmap_setbit(unsigned char *bitmap, - u32 bit, - u8 state) -{ - u32 byte_spot; - u8 bitmask; - - /* gcc always rounds to zero when doing integer division */ - byte_spot = bit / 8; - bitmask = 0x80 >> (bit % 8); - if (state) - bitmap[byte_spot] |= bitmask; - else - bitmap[byte_spot] &= ~bitmask; -} - /** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free @@ -840,10 +770,10 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { - cat = cipso_v4_bitmap_walk(bitmap, - bitmap_len_bits, - cat + 1, - 1); + cat = netlbl_bitmap_walk(bitmap, + bitmap_len_bits, + cat + 1, + 1); if (cat < 0) break; if (cat >= cipso_cat_size || @@ -909,7 +839,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, } if (net_spot >= net_clen_bits) return -ENOSPC; - cipso_v4_bitmap_setbit(net_cat, net_spot, 1); + netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; @@ -951,10 +881,10 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, } for (;;) { - net_spot = cipso_v4_bitmap_walk(net_cat, - net_clen_bits, - net_spot + 1, - 1); + net_spot = netlbl_bitmap_walk(net_cat, + net_clen_bits, + net_spot + 1, + 1); if (net_spot < 0) { if (net_spot == -2) return -EFAULT; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 8d2e483167a8..54f13a33b52c 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -728,6 +728,76 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, return 0; } +/* Bitmap functions + */ + +/** + * netlbl_bitmap_walk - Walk a bitmap looking for a bit + * @bitmap: the bitmap + * @bitmap_len: length in bits + * @offset: starting offset + * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit + * + * Description: + * Starting at @offset, walk the bitmap from left to right until either the + * desired bit is found or we reach the end. Return the bit offset, -1 if + * not found, or -2 if error. + */ +int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, + u32 offset, u8 state) +{ + u32 bit_spot; + u32 byte_offset; + unsigned char bitmask; + unsigned char byte; + + byte_offset = offset / 8; + byte = bitmap[byte_offset]; + bit_spot = offset; + bitmask = 0x80 >> (offset % 8); + + while (bit_spot < bitmap_len) { + if ((state && (byte & bitmask) == bitmask) || + (state == 0 && (byte & bitmask) == 0)) + return bit_spot; + + bit_spot++; + bitmask >>= 1; + if (bitmask == 0) { + byte = bitmap[++byte_offset]; + bitmask = 0x80; + } + } + + return -1; +} +EXPORT_SYMBOL(netlbl_bitmap_walk); + +/** + * netlbl_bitmap_setbit - Sets a single bit in a bitmap + * @bitmap: the bitmap + * @bit: the bit + * @state: if non-zero, set the bit (1) else clear the bit (0) + * + * Description: + * Set a single bit in the bitmask. Returns zero on success, negative values + * on error. + */ +void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state) +{ + u32 byte_spot; + u8 bitmask; + + /* gcc always rounds to zero when doing integer division */ + byte_spot = bit / 8; + bitmask = 0x80 >> (bit % 8); + if (state) + bitmap[byte_spot] |= bitmask; + else + bitmap[byte_spot] &= ~bitmask; +} +EXPORT_SYMBOL(netlbl_bitmap_setbit); + /* * LSM Functions */ -- cgit v1.2.3 From ceba1832b1b2da0149c51de62a847c00bca1677a Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:02:51 -0400 Subject: calipso: Set the calipso socket label to match the secattr. CALIPSO is a hop-by-hop IPv6 option. A lot of this patch is based on the equivalent CISPO code. The main difference is due to manipulating the options in the hop-by-hop header. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/ipv6.h | 2 + include/net/netlabel.h | 9 + include/uapi/linux/in6.h | 1 + net/ipv6/calipso.c | 595 ++++++++++++++++++++++++++++++++++++++++ net/ipv6/ipv6_sockglue.c | 1 - net/netlabel/Kconfig | 1 + net/netlabel/netlabel_calipso.c | 64 +++++ net/netlabel/netlabel_calipso.h | 5 + net/netlabel/netlabel_kapi.c | 58 +++- security/selinux/netlabel.c | 2 +- 10 files changed, 728 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 887313d978d0..4e279a83cdd0 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -319,6 +319,8 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); +struct ipv6_txoptions *ipv6_update_options(struct sock *sk, + struct ipv6_txoptions *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 9fc2cab9be98..918a6044c89c 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -226,6 +226,9 @@ struct netlbl_lsm_secattr { * @doi_getdef: returns a reference to a DOI * @doi_putdef: releases a reference of a DOI * @doi_walk: enumerate the DOI list + * @sock_getattr: retrieve the socket's attr + * @sock_setattr: set the socket's attr + * @sock_delattr: remove the socket's attr * * Description: * This structure is filled out by the CALIPSO engine and passed @@ -243,6 +246,12 @@ struct netlbl_calipso_ops { int (*doi_walk)(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg); + int (*sock_getattr)(struct sock *sk, + struct netlbl_lsm_secattr *secattr); + int (*sock_setattr)(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); + void (*sock_delattr)(struct sock *sk); }; /* diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 318a4828bf98..b39ea4f2e701 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -143,6 +143,7 @@ struct in6_flowlabel_req { #define IPV6_TLV_PAD1 0 #define IPV6_TLV_PADN 1 #define IPV6_TLV_ROUTERALERT 5 +#define IPV6_TLV_CALIPSO 7 /* RFC 5570 */ #define IPV6_TLV_JUMBO 194 #define IPV6_TLV_HAO 201 /* home address option */ diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index d7df7a4bd32e..959db5268b38 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -44,6 +44,24 @@ #include #include #include +#include + +/* Maximium size of the calipso option including + * the two-byte TLV header. + */ +#define CALIPSO_OPT_LEN_MAX (2 + 252) + +/* Size of the minimum calipso option including + * the two-byte TLV header. + */ +#define CALIPSO_HDR_LEN (2 + 8) + +/* Maximium size of the calipso option including + * the two-byte TLV header and upto 3 bytes of + * leading pad and 7 bytes of trailing pad. + */ +#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) + /* List of available DOI definitions */ static DEFINE_SPINLOCK(calipso_doi_list_lock); @@ -297,6 +315,580 @@ doi_walk_return: return ret_val; } +/** + * calipso_map_cat_hton - Perform a category mapping from host to network + * @doi_def: the DOI definition + * @secattr: the security attributes + * @net_cat: the zero'd out category bitmap in network/CALIPSO format + * @net_cat_len: the length of the CALIPSO bitmap in bytes + * + * Description: + * Perform a label mapping to translate a local MLS category bitmap to the + * correct CALIPSO bitmap using the given DOI definition. Returns the minimum + * size in bytes of the network bitmap on success, negative values otherwise. + * + */ +static int calipso_map_cat_hton(const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *net_cat, + u32 net_cat_len) +{ + int spot = -1; + u32 net_spot_max = 0; + u32 net_clen_bits = net_cat_len * 8; + + for (;;) { + spot = netlbl_catmap_walk(secattr->attr.mls.cat, + spot + 1); + if (spot < 0) + break; + if (spot >= net_clen_bits) + return -ENOSPC; + netlbl_bitmap_setbit(net_cat, spot, 1); + + if (spot > net_spot_max) + net_spot_max = spot; + } + + return (net_spot_max / 32 + 1) * 4; +} + +/** + * calipso_map_cat_ntoh - Perform a category mapping from network to host + * @doi_def: the DOI definition + * @net_cat: the category bitmap in network/CALIPSO format + * @net_cat_len: the length of the CALIPSO bitmap in bytes + * @secattr: the security attributes + * + * Description: + * Perform a label mapping to translate a CALIPSO bitmap to the correct local + * MLS category bitmap using the given DOI definition. Returns zero on + * success, negative values on failure. + * + */ +static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, + const unsigned char *net_cat, + u32 net_cat_len, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + int spot = -1; + u32 net_clen_bits = net_cat_len * 8; + + for (;;) { + spot = netlbl_bitmap_walk(net_cat, + net_clen_bits, + spot + 1, + 1); + if (spot < 0) { + if (spot == -2) + return -EFAULT; + return 0; + } + + ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, + spot, + GFP_ATOMIC); + if (ret_val != 0) + return ret_val; + } + + return -EINVAL; +} + +/** + * calipso_pad_write - Writes pad bytes in TLV format + * @buf: the buffer + * @offset: offset from start of buffer to write padding + * @count: number of pad bytes to write + * + * Description: + * Write @count bytes of TLV padding into @buffer starting at offset @offset. + * @count should be less than 8 - see RFC 4942. + * + */ +static int calipso_pad_write(unsigned char *buf, unsigned int offset, + unsigned int count) +{ + if (WARN_ON_ONCE(count >= 8)) + return -EINVAL; + + switch (count) { + case 0: + break; + case 1: + buf[offset] = IPV6_TLV_PAD1; + break; + default: + buf[offset] = IPV6_TLV_PADN; + buf[offset + 1] = count - 2; + if (count > 2) + memset(buf + offset + 2, 0, count - 2); + break; + } + return 0; +} + +/** + * calipso_genopt - Generate a CALIPSO option + * @buf: the option buffer + * @start: offset from which to write + * @buf_len: the size of opt_buf + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Generate a CALIPSO option using the DOI definition and security attributes + * passed to the function. This also generates upto three bytes of leading + * padding that ensures that the option is 4n + 2 aligned. It returns the + * number of bytes written (including any initial padding). + */ +static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u32 len, pad; + u16 crc; + static const unsigned char padding[4] = {2, 1, 0, 3}; + unsigned char *calipso; + + /* CALIPSO has 4n + 2 alignment */ + pad = padding[start & 3]; + if (buf_len <= start + pad + CALIPSO_HDR_LEN) + return -ENOSPC; + + if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) + return -EPERM; + + len = CALIPSO_HDR_LEN; + + if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { + ret_val = calipso_map_cat_hton(doi_def, + secattr, + buf + start + pad + len, + buf_len - start - pad - len); + if (ret_val < 0) + return ret_val; + len += ret_val; + } + + calipso_pad_write(buf, start, pad); + calipso = buf + start + pad; + + calipso[0] = IPV6_TLV_CALIPSO; + calipso[1] = len - 2; + *(__be32 *)(calipso + 2) = htonl(doi_def->doi); + calipso[6] = (len - CALIPSO_HDR_LEN) / 4; + calipso[7] = secattr->attr.mls.lvl, + crc = ~crc_ccitt(0xffff, calipso, len); + calipso[8] = crc & 0xff; + calipso[9] = (crc >> 8) & 0xff; + return pad + len; +} + +/* Hop-by-hop hdr helper functions + */ + +/** + * calipso_opt_update - Replaces socket's hop options with a new set + * @sk: the socket + * @hop: new hop options + * + * Description: + * Replaces @sk's hop options with @hop. @hop may be NULL to leave + * the socket with no hop options. + * + */ +static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) +{ + struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; + + txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, + hop, hop ? ipv6_optlen(hop) : 0); + txopt_put(old); + if (IS_ERR(txopts)) + return PTR_ERR(txopts); + + txopts = ipv6_update_options(sk, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + + return 0; +} + +/** + * calipso_tlv_len - Returns the length of the TLV + * @opt: the option header + * @offset: offset of the TLV within the header + * + * Description: + * Returns the length of the TLV option at offset @offset within + * the option header @opt. Checks that the entire TLV fits inside + * the option header, returns a negative value if this is not the case. + */ +static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) +{ + unsigned char *tlv = (unsigned char *)opt; + unsigned int opt_len = ipv6_optlen(opt), tlv_len; + + if (offset < sizeof(*opt) || offset >= opt_len) + return -EINVAL; + if (tlv[offset] == IPV6_TLV_PAD1) + return 1; + if (offset + 1 >= opt_len) + return -EINVAL; + tlv_len = tlv[offset + 1] + 2; + if (offset + tlv_len > opt_len) + return -EINVAL; + return tlv_len; +} + +/** + * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header + * @hop: the hop options header + * @start: on return holds the offset of any leading padding + * @end: on return holds the offset of the first non-pad TLV after CALIPSO + * + * Description: + * Finds the space occupied by a CALIPSO option (including any leading and + * trailing padding). + * + * If a CALIPSO option exists set @start and @end to the + * offsets within @hop of the start of padding before the first + * CALIPSO option and the end of padding after the first CALIPSO + * option. In this case the function returns 0. + * + * In the absence of a CALIPSO option, @start and @end will be + * set to the start and end of any trailing padding in the header. + * This is useful when appending a new option, as the caller may want + * to overwrite some of this padding. In this case the function will + * return -ENOENT. + */ +static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start, + unsigned int *end) +{ + int ret_val = -ENOENT, tlv_len; + unsigned int opt_len, offset, offset_s = 0, offset_e = 0; + unsigned char *opt = (unsigned char *)hop; + + opt_len = ipv6_optlen(hop); + offset = sizeof(*hop); + + while (offset < opt_len) { + tlv_len = calipso_tlv_len(hop, offset); + if (tlv_len < 0) + return tlv_len; + + switch (opt[offset]) { + case IPV6_TLV_PAD1: + case IPV6_TLV_PADN: + if (offset_e) + offset_e = offset; + break; + case IPV6_TLV_CALIPSO: + ret_val = 0; + offset_e = offset; + break; + default: + if (offset_e == 0) + offset_s = offset; + else + goto out; + } + offset += tlv_len; + } + +out: + if (offset_s) + *start = offset_s + calipso_tlv_len(hop, offset_s); + else + *start = sizeof(*hop); + if (offset_e) + *end = offset_e + calipso_tlv_len(hop, offset_e); + else + *end = opt_len; + + return ret_val; +} + +/** + * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr + * @hop: the original hop options header + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Creates a new hop options header based on @hop with a + * CALIPSO option added to it. If @hop already contains a CALIPSO + * option this is overwritten, otherwise the new option is appended + * after any existing options. If @hop is NULL then the new header + * will contain just the CALIPSO option and any needed padding. + * + */ +static struct ipv6_opt_hdr * +calipso_opt_insert(struct ipv6_opt_hdr *hop, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + unsigned int start, end, buf_len, pad, hop_len; + struct ipv6_opt_hdr *new; + int ret_val; + + if (hop) { + hop_len = ipv6_optlen(hop); + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val && ret_val != -ENOENT) + return ERR_PTR(ret_val); + } else { + hop_len = 0; + start = sizeof(*hop); + end = 0; + } + + buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD; + new = kzalloc(buf_len, GFP_ATOMIC); + if (!new) + return ERR_PTR(-ENOMEM); + + if (start > sizeof(*hop)) + memcpy(new, hop, start); + ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def, + secattr); + if (ret_val < 0) + return ERR_PTR(ret_val); + + buf_len = start + ret_val; + /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */ + pad = ((buf_len & 4) + (end & 7)) & 7; + calipso_pad_write((unsigned char *)new, buf_len, pad); + buf_len += pad; + + if (end != hop_len) { + memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end); + buf_len += hop_len - end; + } + new->nexthdr = 0; + new->hdrlen = buf_len / 8 - 1; + + return new; +} + +/** + * calipso_opt_del - Removes the CALIPSO option from an option header + * @hop: the original header + * @new: the new header + * + * Description: + * Creates a new header based on @hop without any CALIPSO option. If @hop + * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains + * no other non-padding options, it returns zero with @new set to NULL. + * Otherwise it returns zero, creates a new header without the CALIPSO + * option (and removing as much padding as possible) and returns with + * @new set to that header. + * + */ +static int calipso_opt_del(struct ipv6_opt_hdr *hop, + struct ipv6_opt_hdr **new) +{ + int ret_val; + unsigned int start, end, delta, pad, hop_len; + + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val) + return ret_val; + + hop_len = ipv6_optlen(hop); + if (start == sizeof(*hop) && end == hop_len) { + /* There's no other option in the header so return NULL */ + *new = NULL; + return 0; + } + + delta = (end - start) & ~7; + *new = kzalloc(hop_len - delta, GFP_ATOMIC); + if (!*new) + return -ENOMEM; + + memcpy(*new, hop, start); + (*new)->hdrlen -= delta / 8; + pad = (end - start) & 7; + calipso_pad_write((unsigned char *)*new, start, pad); + if (end != hop_len) + memcpy((char *)*new + start + pad, (char *)hop + end, + hop_len - end); + + return 0; +} + +/** + * calipso_opt_getattr - Get the security attributes from a memory block + * @calipso: the CALIPSO option + * @secattr: the security attributes + * + * Description: + * Inspect @calipso and return the security attributes in @secattr. + * Returns zero on success and negative values on failure. + * + */ +static int calipso_opt_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + u32 doi, len = calipso[1], cat_len = calipso[6] * 4; + struct calipso_doi *doi_def; + + if (cat_len + 8 > len) + return -EINVAL; + + doi = get_unaligned_be32(calipso + 2); + rcu_read_lock(); + doi_def = calipso_doi_search(doi); + if (!doi_def) + goto getattr_return; + + secattr->attr.mls.lvl = calipso[7]; + secattr->flags |= NETLBL_SECATTR_MLS_LVL; + + if (cat_len) { + ret_val = calipso_map_cat_ntoh(doi_def, + calipso + 10, + cat_len, + secattr); + if (ret_val != 0) { + netlbl_catmap_free(secattr->attr.mls.cat); + goto getattr_return; + } + + secattr->flags |= NETLBL_SECATTR_MLS_CAT; + } + + secattr->type = NETLBL_NLTYPE_CALIPSO; + +getattr_return: + rcu_read_unlock(); + return ret_val; +} + +/* sock functions. + */ + +/** + * calipso_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CALIPSO option attached to the sock and if + * there is return the CALIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. + * + */ +static int calipso_sock_getattr(struct sock *sk, + struct netlbl_lsm_secattr *secattr) +{ + struct ipv6_opt_hdr *hop; + int opt_len, len, ret_val = -ENOMSG, offset; + unsigned char *opt; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + if (!txopts || !txopts->hopopt) + goto done; + + hop = txopts->hopopt; + opt = (unsigned char *)hop; + opt_len = ipv6_optlen(hop); + offset = sizeof(*hop); + while (offset < opt_len) { + len = calipso_tlv_len(hop, offset); + if (len < 0) { + ret_val = len; + goto done; + } + switch (opt[offset]) { + case IPV6_TLV_CALIPSO: + if (len < CALIPSO_HDR_LEN) + ret_val = -EINVAL; + else + ret_val = calipso_opt_getattr(&opt[offset], + secattr); + goto done; + default: + offset += len; + break; + } + } +done: + txopt_put(txopts); + return ret_val; +} + +/** + * calipso_sock_setattr - Add a CALIPSO option to a socket + * @sk: the socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +static int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct ipv6_opt_hdr *old, *new; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + old = NULL; + if (txopts) + old = txopts->hopopt; + + new = calipso_opt_insert(old, doi_def, secattr); + txopt_put(txopts); + if (IS_ERR(new)) + return PTR_ERR(new); + + ret_val = calipso_opt_update(sk, new); + + kfree(new); + return ret_val; +} + +/** + * calipso_sock_delattr - Delete the CALIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CALIPSO option from a socket, if present. + * + */ +static void calipso_sock_delattr(struct sock *sk) +{ + struct ipv6_opt_hdr *new_hop; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + if (!txopts || !txopts->hopopt) + goto done; + + if (calipso_opt_del(txopts->hopopt, &new_hop)) + goto done; + + calipso_opt_update(sk, new_hop); + kfree(new_hop); + +done: + txopt_put(txopts); +} + static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, @@ -304,6 +896,9 @@ static const struct netlbl_calipso_ops ops = { .doi_getdef = calipso_doi_getdef, .doi_putdef = calipso_doi_putdef, .doi_walk = calipso_doi_walk, + .sock_getattr = calipso_sock_getattr, + .sock_setattr = calipso_sock_setattr, + .sock_delattr = calipso_sock_delattr, }; /** diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4449ad1f8114..8a80d59bed34 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -98,7 +98,6 @@ int ip6_ra_control(struct sock *sk, int sel) return 0; } -static struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig index 56958c85f2b4..d9eaa30ffe3f 100644 --- a/net/netlabel/Kconfig +++ b/net/netlabel/Kconfig @@ -5,6 +5,7 @@ config NETLABEL bool "NetLabel subsystem support" depends on SECURITY + select CRC_CCITT if IPV6 default n ---help--- NetLabel provides support for explicit network packet labeling diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 2857673e8802..6f9c6589a022 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -514,3 +514,67 @@ int calipso_doi_walk(u32 *skip_cnt, ret_val = ops->doi_walk(skip_cnt, callback, cb_arg); return ret_val; } + +/** + * calipso_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CALIPSO option attached to the sock and if + * there is return the CALIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. + * + */ +int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->sock_getattr(sk, secattr); + return ret_val; +} + +/** + * calipso_sock_setattr - Add a CALIPSO option to a socket + * @sk: the socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->sock_setattr(sk, doi_def, secattr); + return ret_val; +} + +/** + * calipso_sock_delattr - Delete the CALIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CALIPSO option from a socket, if present. + * + */ +void calipso_sock_delattr(struct sock *sk) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->sock_delattr(sk); +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h index ed78554ba472..49bc116705c4 100644 --- a/net/netlabel/netlabel_calipso.h +++ b/net/netlabel/netlabel_calipso.h @@ -128,5 +128,10 @@ void calipso_doi_putdef(struct calipso_doi *doi_def); int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg); +int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); +int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +void calipso_sock_delattr(struct sock *sk); #endif diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 54f13a33b52c..00bab51c291e 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -37,12 +37,14 @@ #include #include #include +#include #include #include #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_cipso_v4.h" +#include "netlabel_calipso.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" @@ -521,6 +523,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) return -ENOENT; } +EXPORT_SYMBOL(netlbl_catmap_walk); /** * netlbl_catmap_walkrng - Find the end of a string of set bits @@ -656,6 +659,7 @@ int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap, return 0; } +EXPORT_SYMBOL(netlbl_catmap_setbit); /** * netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap @@ -870,9 +874,21 @@ int netlbl_sock_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + switch (dom_entry->def.type) { + case NETLBL_NLTYPE_ADDRSELECT: + ret_val = -EDESTADDRREQ; + break; + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_sock_setattr(sk, + dom_entry->def.calipso, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -899,6 +915,11 @@ void netlbl_sock_delattr(struct sock *sk) case AF_INET: cipso_v4_sock_delattr(sk); break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + calipso_sock_delattr(sk); + break; +#endif /* IPv6 */ } } @@ -925,7 +946,7 @@ int netlbl_sock_getattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - ret_val = -ENOMSG; + ret_val = calipso_sock_getattr(sk, secattr); break; #endif /* IPv6 */ default: @@ -953,6 +974,9 @@ int netlbl_conn_setattr(struct sock *sk, { int ret_val; struct sockaddr_in *addr4; +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *addr6; +#endif struct netlbl_dommap_def *entry; rcu_read_lock(); @@ -973,7 +997,7 @@ int netlbl_conn_setattr(struct sock *sk, case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now * but we could remove other protocols if needed */ - cipso_v4_sock_delattr(sk); + netlbl_sock_delattr(sk); ret_val = 0; break; default: @@ -982,9 +1006,27 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + addr6 = (struct sockaddr_in6 *)addr; + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &addr6->sin6_addr); + if (entry == NULL) { + ret_val = -ENOENT; + goto conn_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_sock_setattr(sk, + entry->calipso, secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + netlbl_sock_delattr(sk); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 1f989a539fd4..5470f32eca54 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -333,7 +333,7 @@ int selinux_netlbl_socket_post_create(struct sock *sk, u16 family) struct sk_security_struct *sksec = sk->sk_security; struct netlbl_lsm_secattr *secattr; - if (family != PF_INET) + if (family != PF_INET && family != PF_INET6) return 0; secattr = selinux_netlbl_sock_genattr(sk); -- cgit v1.2.3 From 56ac42bc94b18d45b6c484edeac33be86bfb3efa Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:05:28 -0400 Subject: ipv6: Allow request socks to contain IPv6 options. If set, these will take precedence over the parent's options during both sending and child creation. If they're not set, the parent's options (if any) will be used. This is to allow the security_inet_conn_request() hook to modify the IPv6 options in just the same way that it already may do for IPv4. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/inet_sock.h | 7 ++++++- net/dccp/ipv6.c | 12 +++++++++--- net/ipv4/tcp_input.c | 3 +++ net/ipv6/tcp_ipv6.c | 12 +++++++++--- 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 012b1f91f3ec..236a81034fef 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -97,7 +97,12 @@ struct inet_request_sock { u32 ir_mark; union { struct ip_options_rcu *opt; - struct sk_buff *pktopts; +#if IS_ENABLED(CONFIG_IPV6) + struct { + struct ipv6_txoptions *ipv6_opt; + struct sk_buff *pktopts; + }; +#endif }; }; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4663a01d5039..3381748bd0f3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -216,14 +216,17 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req skb = dccp_make_response(sk, dst, req); if (skb != NULL) { struct dccp_hdr *dh = dccp_hdr(skb); + struct ipv6_txoptions *opt; dh->dccph_checksum = dccp_v6_csum_finish(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; rcu_read_lock(); - err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), - np->tclass); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); + err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -236,6 +239,7 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); + kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); } @@ -494,7 +498,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. */ - opt = rcu_dereference(np->opt); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e6e65f79ade8..071174c13bf9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6146,6 +6146,9 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, kmemcheck_annotate_bitfield(ireq, flags); ireq->opt = NULL; +#if IS_ENABLED(CONFIG_IPV6) + ireq->pktopts = NULL; +#endif atomic64_set(&ireq->ir_cookie, 0); ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 711d209f9124..18daddc6001c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -443,6 +443,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; @@ -463,8 +464,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); rcu_read_lock(); - err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), - np->tclass); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); + err = ip6_xmit(sk, skb, fl6, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -476,6 +479,7 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { + kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); } @@ -1107,7 +1111,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * but we make one more one thing there: reattach optmem to newsk. */ - opt = rcu_dereference(np->opt); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); -- cgit v1.2.3 From e1adea927080821ebfa7505bff752a4015955660 Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:05:29 -0400 Subject: calipso: Allow request sockets to be relabelled by the lsm. Request sockets need to have a label that takes into account the incoming connection as well as their parent's label. This is used for the outgoing SYN-ACK and for their child full-socket. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 6 +++ net/ipv6/calipso.c | 86 +++++++++++++++++++++++++++++++++++++++++ net/netlabel/netlabel_calipso.c | 40 +++++++++++++++++++ net/netlabel/netlabel_calipso.h | 4 ++ net/netlabel/netlabel_kapi.c | 33 ++++++++++++---- security/selinux/netlabel.c | 2 +- 6 files changed, 163 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 918a6044c89c..a2408c30a7f7 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -229,6 +229,8 @@ struct netlbl_lsm_secattr { * @sock_getattr: retrieve the socket's attr * @sock_setattr: set the socket's attr * @sock_delattr: remove the socket's attr + * @req_setattr: set the req socket's attr + * @req_delattr: remove the req socket's attr * * Description: * This structure is filled out by the CALIPSO engine and passed @@ -252,6 +254,10 @@ struct netlbl_calipso_ops { const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); void (*sock_delattr)(struct sock *sk); + int (*req_setattr)(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); + void (*req_delattr)(struct request_sock *req); }; /* diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 959db5268b38..067a5640ef17 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -889,6 +889,90 @@ done: txopt_put(txopts); } +/* request sock functions. + */ + +/** + * calipso_req_setattr - Add a CALIPSO option to a connection request socket + * @req: the connection request socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. Returns zero on success and + * negative values on failure. + * + */ +static int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + struct ipv6_txoptions *txopts; + struct inet_request_sock *req_inet = inet_rsk(req); + struct ipv6_opt_hdr *old, *new; + struct sock *sk = sk_to_full_sk(req_to_sk(req)); + + if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) + old = req_inet->ipv6_opt->hopopt; + else + old = NULL; + + new = calipso_opt_insert(old, doi_def, secattr); + if (IS_ERR(new)) + return PTR_ERR(new); + + txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, + new, new ? ipv6_optlen(new) : 0); + + kfree(new); + + if (IS_ERR(txopts)) + return PTR_ERR(txopts); + + txopts = xchg(&req_inet->ipv6_opt, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + + return 0; +} + +/** + * calipso_req_delattr - Delete the CALIPSO option from a request socket + * @reg: the request socket + * + * Description: + * Removes the CALIPSO option from a request socket, if present. + * + */ +static void calipso_req_delattr(struct request_sock *req) +{ + struct inet_request_sock *req_inet = inet_rsk(req); + struct ipv6_opt_hdr *new; + struct ipv6_txoptions *txopts; + struct sock *sk = sk_to_full_sk(req_to_sk(req)); + + if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) + return; + + if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) + return; /* Nothing to do */ + + txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, + new, new ? ipv6_optlen(new) : 0); + + if (!IS_ERR(txopts)) { + txopts = xchg(&req_inet->ipv6_opt, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + } + kfree(new); +} + static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, @@ -899,6 +983,8 @@ static const struct netlbl_calipso_ops ops = { .sock_getattr = calipso_sock_getattr, .sock_setattr = calipso_sock_setattr, .sock_delattr = calipso_sock_delattr, + .req_setattr = calipso_req_setattr, + .req_delattr = calipso_req_delattr, }; /** diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 6f9c6589a022..79519e3a6a93 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -578,3 +578,43 @@ void calipso_sock_delattr(struct sock *sk) if (ops) ops->sock_delattr(sk); } + +/** + * calipso_req_setattr - Add a CALIPSO option to a connection request socket + * @req: the connection request socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. Returns zero on success and + * negative values on failure. + * + */ +int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->req_setattr(req, doi_def, secattr); + return ret_val; +} + +/** + * calipso_req_delattr - Delete the CALIPSO option from a request socket + * @reg: the request socket + * + * Description: + * Removes the CALIPSO option from a request socket, if present. + * + */ +void calipso_req_delattr(struct request_sock *req) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->req_delattr(req); +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h index 49bc116705c4..1372fdd86588 100644 --- a/net/netlabel/netlabel_calipso.h +++ b/net/netlabel/netlabel_calipso.h @@ -133,5 +133,9 @@ int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); void calipso_sock_delattr(struct sock *sk); +int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +void calipso_req_delattr(struct request_sock *req); #endif diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 00bab51c291e..9b725f75c750 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1053,12 +1053,13 @@ int netlbl_req_setattr(struct request_sock *req, { int ret_val; struct netlbl_dommap_def *entry; + struct inet_request_sock *ireq = inet_rsk(req); rcu_read_lock(); switch (req->rsk_ops->family) { case AF_INET: entry = netlbl_domhsh_getentry_af4(secattr->domain, - inet_rsk(req)->ir_rmt_addr); + ireq->ir_rmt_addr); if (entry == NULL) { ret_val = -ENOENT; goto req_setattr_return; @@ -1069,9 +1070,7 @@ int netlbl_req_setattr(struct request_sock *req, entry->cipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: - /* just delete the protocols we support for right now - * but we could remove other protocols if needed */ - cipso_v4_req_delattr(req); + netlbl_req_delattr(req); ret_val = 0; break; default: @@ -1080,9 +1079,24 @@ int netlbl_req_setattr(struct request_sock *req, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &ireq->ir_v6_rmt_addr); + if (entry == NULL) { + ret_val = -ENOENT; + goto req_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_req_setattr(req, + entry->calipso, secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + netlbl_req_delattr(req); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -1108,6 +1122,11 @@ void netlbl_req_delattr(struct request_sock *req) case AF_INET: cipso_v4_req_delattr(req); break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + calipso_req_delattr(req); + break; +#endif /* IPv6 */ } } diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 2477a75f16e7..ca220c3fbcf9 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -284,7 +284,7 @@ int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family) int rc; struct netlbl_lsm_secattr secattr; - if (family != PF_INET) + if (family != PF_INET && family != PF_INET6) return 0; netlbl_secattr_init(&secattr); -- cgit v1.2.3 From 0868383b822e4d8ebde980c7aac973a6aa81a3ec Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:06:15 -0400 Subject: ipv6: constify the skb pointer of ipv6_find_tlv(). Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/ipv6.h | 2 +- net/ipv6/exthdrs_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4e279a83cdd0..24a5ebecaf1f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -945,7 +945,7 @@ enum { int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *fragflg); -int ipv6_find_tlv(struct sk_buff *skb, int offset, int type); +int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 9508a20fbf61..305e2ed730bf 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -112,7 +112,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, } EXPORT_SYMBOL(ipv6_skip_exthdr); -int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) +int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); -- cgit v1.2.3 From 2917f57b6bc15cc6787496ee5f2fdf17f0e9b7d3 Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:06:15 -0400 Subject: calipso: Allow the lsm to label the skbuff directly. In some cases, the lsm needs to add the label to the skbuff directly. A NF_INET_LOCAL_OUT IPv6 hook is added to selinux to match the IPv4 behaviour. This allows selinux to label the skbuffs that it requires. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 11 +++ net/ipv6/calipso.c | 165 ++++++++++++++++++++++++++++++++++++++++ net/netlabel/netlabel_calipso.c | 82 ++++++++++++++++++++ net/netlabel/netlabel_calipso.h | 7 ++ net/netlabel/netlabel_kapi.c | 32 +++++++- security/selinux/hooks.c | 15 ++++ 6 files changed, 308 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index a2408c30a7f7..e0e4ce8f22af 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -231,6 +231,10 @@ struct netlbl_lsm_secattr { * @sock_delattr: remove the socket's attr * @req_setattr: set the req socket's attr * @req_delattr: remove the req socket's attr + * @opt_getattr: retrieve attr from memory block + * @skbuff_optptr: find option in packet + * @skbuff_setattr: set the skbuff's attr + * @skbuff_delattr: remove the skbuff's attr * * Description: * This structure is filled out by the CALIPSO engine and passed @@ -258,6 +262,13 @@ struct netlbl_calipso_ops { const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); void (*req_delattr)(struct request_sock *req); + int (*opt_getattr)(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr); + unsigned char *(*skbuff_optptr)(const struct sk_buff *skb); + int (*skbuff_setattr)(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); + int (*skbuff_delattr)(struct sk_buff *skb); }; /* diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 067a5640ef17..fa371a8827cf 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -62,6 +62,11 @@ */ #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) + /* Maximium size of u32 aligned buffer required to hold calipso + * option. Max of 3 initial pad bytes starting from buffer + 3. + * i.e. the worst case is when the previous tlv finishes on 4n + 3. + */ +#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX) /* List of available DOI definitions */ static DEFINE_SPINLOCK(calipso_doi_list_lock); @@ -973,6 +978,162 @@ static void calipso_req_delattr(struct request_sock *req) kfree(new); } +/* skbuff functions. + */ + +/** + * calipso_skbuff_optptr - Find the CALIPSO option in the packet + * @skb: the packet + * + * Description: + * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer + * to the start of the CALIPSO option on success, NULL if one if not found. + * + */ +static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb); + int offset; + + if (ip6_hdr->nexthdr != NEXTHDR_HOP) + return NULL; + + offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO); + if (offset >= 0) + return (unsigned char *)ip6_hdr + offset; + + return NULL; +} + +/** + * calipso_skbuff_setattr - Set the CALIPSO option on a packet + * @skb: the packet + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Set the CALIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. + * + */ +static int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct ipv6hdr *ip6_hdr; + struct ipv6_opt_hdr *hop; + unsigned char buf[CALIPSO_MAX_BUFFER]; + int len_delta, new_end, pad; + unsigned int start, end; + + ip6_hdr = ipv6_hdr(skb); + if (ip6_hdr->nexthdr == NEXTHDR_HOP) { + hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val && ret_val != -ENOENT) + return ret_val; + } else { + start = 0; + end = 0; + } + + memset(buf, 0, sizeof(buf)); + ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr); + if (ret_val < 0) + return ret_val; + + new_end = start + ret_val; + /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ + pad = ((new_end & 4) + (end & 7)) & 7; + len_delta = new_end - (int)end + pad; + ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + if (ret_val < 0) + return ret_val; + + if (len_delta) { + if (len_delta > 0) + skb_push(skb, len_delta); + else + skb_pull(skb, -len_delta); + memmove((char *)ip6_hdr - len_delta, ip6_hdr, + sizeof(*ip6_hdr) + start); + skb_reset_network_header(skb); + ip6_hdr = ipv6_hdr(skb); + } + + hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + if (start == 0) { + struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf; + + new_hop->nexthdr = ip6_hdr->nexthdr; + new_hop->hdrlen = len_delta / 8 - 1; + ip6_hdr->nexthdr = NEXTHDR_HOP; + } else { + hop->hdrlen += len_delta / 8; + } + memcpy((char *)hop + start, buf + (start & 3), new_end - start); + calipso_pad_write((unsigned char *)hop, new_end, pad); + + return 0; +} + +/** + * calipso_skbuff_delattr - Delete any CALIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CALIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +static int calipso_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val; + struct ipv6hdr *ip6_hdr; + struct ipv6_opt_hdr *old_hop; + u32 old_hop_len, start = 0, end = 0, delta, size, pad; + + if (!calipso_skbuff_optptr(skb)) + return 0; + + /* since we are changing the packet we should make a copy */ + ret_val = skb_cow(skb, skb_headroom(skb)); + if (ret_val < 0) + return ret_val; + + ip6_hdr = ipv6_hdr(skb); + old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + old_hop_len = ipv6_optlen(old_hop); + + ret_val = calipso_opt_find(old_hop, &start, &end); + if (ret_val) + return ret_val; + + if (start == sizeof(*old_hop) && end == old_hop_len) { + /* There's no other option in the header so we delete + * the whole thing. */ + delta = old_hop_len; + size = sizeof(*ip6_hdr); + ip6_hdr->nexthdr = old_hop->nexthdr; + } else { + delta = (end - start) & ~7; + if (delta) + old_hop->hdrlen -= delta / 8; + pad = (end - start) & 7; + size = sizeof(*ip6_hdr) + start + pad; + calipso_pad_write((unsigned char *)old_hop, start, pad); + } + + if (delta) { + skb_pull(skb, delta); + memmove((char *)ip6_hdr + delta, ip6_hdr, size); + skb_reset_network_header(skb); + } + + return 0; +} + static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, @@ -985,6 +1146,10 @@ static const struct netlbl_calipso_ops ops = { .sock_delattr = calipso_sock_delattr, .req_setattr = calipso_req_setattr, .req_delattr = calipso_req_delattr, + .opt_getattr = calipso_opt_getattr, + .skbuff_optptr = calipso_skbuff_optptr, + .skbuff_setattr = calipso_skbuff_setattr, + .skbuff_delattr = calipso_skbuff_delattr, }; /** diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 79519e3a6a93..0d02c262dbf6 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -618,3 +618,85 @@ void calipso_req_delattr(struct request_sock *req) if (ops) ops->req_delattr(req); } + +/** + * calipso_optptr - Find the CALIPSO option in the packet + * @skb: the packet + * + * Description: + * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer + * to the start of the CALIPSO option on success, NULL if one if not found. + * + */ +unsigned char *calipso_optptr(const struct sk_buff *skb) +{ + unsigned char *ret_val = NULL; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_optptr(skb); + return ret_val; +} + +/** + * calipso_getattr - Get the security attributes from a memory block. + * @calipso: the CALIPSO option + * @secattr: the security attributes + * + * Description: + * Inspect @calipso and return the security attributes in @secattr. + * Returns zero on success and negative values on failure. + * + */ +int calipso_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->opt_getattr(calipso, secattr); + return ret_val; +} + +/** + * calipso_skbuff_setattr - Set the CALIPSO option on a packet + * @skb: the packet + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Set the CALIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. + * + */ +int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_setattr(skb, doi_def, secattr); + return ret_val; +} + +/** + * calipso_skbuff_delattr - Delete any CALIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CALIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +int calipso_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_delattr(skb); + return ret_val; +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h index 1372fdd86588..66ba92e9289f 100644 --- a/net/netlabel/netlabel_calipso.h +++ b/net/netlabel/netlabel_calipso.h @@ -137,5 +137,12 @@ int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); void calipso_req_delattr(struct request_sock *req); +unsigned char *calipso_optptr(const struct sk_buff *skb); +int calipso_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr); +int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +int calipso_skbuff_delattr(struct sk_buff *skb); #endif diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 9b725f75c750..c50a1c5d1e1a 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1147,13 +1147,17 @@ int netlbl_skbuff_setattr(struct sk_buff *skb, { int ret_val; struct iphdr *hdr4; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *hdr6; +#endif struct netlbl_dommap_def *entry; rcu_read_lock(); switch (family) { case AF_INET: hdr4 = ip_hdr(skb); - entry = netlbl_domhsh_getentry_af4(secattr->domain,hdr4->daddr); + entry = netlbl_domhsh_getentry_af4(secattr->domain, + hdr4->daddr); if (entry == NULL) { ret_val = -ENOENT; goto skbuff_setattr_return; @@ -1174,9 +1178,26 @@ int netlbl_skbuff_setattr(struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + hdr6 = ipv6_hdr(skb); + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &hdr6->daddr); + if (entry == NULL) { + ret_val = -ENOENT; + goto skbuff_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_skbuff_setattr(skb, entry->calipso, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + ret_val = calipso_skbuff_delattr(skb); + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -1215,6 +1236,9 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + ptr = calipso_optptr(skb); + if (ptr && calipso_getattr(ptr, secattr) == 0) + return 0; break; #endif /* IPv6 */ } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a00ab81ab719..cb7c5c8028e7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5063,6 +5063,15 @@ static unsigned int selinux_ipv4_output(void *priv, return selinux_ip_output(skb, PF_INET); } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static unsigned int selinux_ipv6_output(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return selinux_ip_output(skb, PF_INET6); +} +#endif /* IPV6 */ + static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, int ifindex, u16 family) @@ -6297,6 +6306,12 @@ static struct nf_hook_ops selinux_nf_ops[] = { .hooknum = NF_INET_FORWARD, .priority = NF_IP6_PRI_SELINUX_FIRST, }, + { + .hook = selinux_ipv6_output, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_SELINUX_FIRST, + }, #endif /* IPV6 */ }; -- cgit v1.2.3 From a04e71f631fa3d2fd2aa0404c11484739d1e9073 Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:06:16 -0400 Subject: netlabel: Pass a family parameter to netlbl_skbuff_err(). This makes it possible to route the error to the appropriate labelling engine. CALIPSO is far less verbose than CIPSO when encountering a bogus packet, so there is no need for a CALIPSO error handler. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 2 +- net/netlabel/netlabel_kapi.c | 11 ++++++++--- security/selinux/hooks.c | 6 +++--- security/selinux/include/netlabel.h | 4 +++- security/selinux/netlabel.c | 6 +++--- security/smack/smack_lsm.c | 2 +- 6 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index e0e4ce8f22af..d8a46a8ed512 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -488,7 +488,7 @@ int netlbl_skbuff_setattr(struct sk_buff *skb, int netlbl_skbuff_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr); -void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway); +void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway); /* * LSM label mapping cache operations diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index c50a1c5d1e1a..a42ce3c15d70 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1249,6 +1249,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, /** * netlbl_skbuff_err - Handle a LSM error on a sk_buff * @skb: the packet + * @family: the family * @error: the error code * @gateway: true if host is acting as a gateway, false otherwise * @@ -1258,10 +1259,14 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, * according to the packet's labeling protocol. * */ -void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) +void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway) { - if (cipso_v4_optptr(skb)) - cipso_v4_error(skb, error, gateway); + switch (family) { + case AF_INET: + if (cipso_v4_optptr(skb)) + cipso_v4_error(skb, error, gateway); + break; + } } /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index cb7c5c8028e7..51eafe5d3bf4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4603,13 +4603,13 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) err = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif, addrp, family, peer_sid, &ad); if (err) { - selinux_netlbl_err(skb, err, 0); + selinux_netlbl_err(skb, family, err, 0); return err; } err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, &ad); if (err) { - selinux_netlbl_err(skb, err, 0); + selinux_netlbl_err(skb, family, err, 0); return err; } } @@ -4977,7 +4977,7 @@ static unsigned int selinux_ip_forward(struct sk_buff *skb, err = selinux_inet_sys_rcv_skb(dev_net(indev), indev->ifindex, addrp, family, peer_sid, &ad); if (err) { - selinux_netlbl_err(skb, err, 1); + selinux_netlbl_err(skb, family, err, 1); return NF_DROP; } } diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h index 8c59b8f150e8..75686d53df07 100644 --- a/security/selinux/include/netlabel.h +++ b/security/selinux/include/netlabel.h @@ -40,7 +40,8 @@ #ifdef CONFIG_NETLABEL void selinux_netlbl_cache_invalidate(void); -void selinux_netlbl_err(struct sk_buff *skb, int error, int gateway); +void selinux_netlbl_err(struct sk_buff *skb, u16 family, int error, + int gateway); void selinux_netlbl_sk_security_free(struct sk_security_struct *sksec); void selinux_netlbl_sk_security_reset(struct sk_security_struct *sksec); @@ -72,6 +73,7 @@ static inline void selinux_netlbl_cache_invalidate(void) } static inline void selinux_netlbl_err(struct sk_buff *skb, + u16 family, int error, int gateway) { diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index ca220c3fbcf9..dfca50dc292a 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -151,9 +151,9 @@ void selinux_netlbl_cache_invalidate(void) * present on the packet, NetLabel is smart enough to only act when it should. * */ -void selinux_netlbl_err(struct sk_buff *skb, int error, int gateway) +void selinux_netlbl_err(struct sk_buff *skb, u16 family, int error, int gateway) { - netlbl_skbuff_err(skb, error, gateway); + netlbl_skbuff_err(skb, family, error, gateway); } /** @@ -405,7 +405,7 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, return 0; if (nlbl_sid != SECINITSID_UNLABELED) - netlbl_skbuff_err(skb, rc, 0); + netlbl_skbuff_err(skb, family, rc, 0); return rc; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 11f79013ae1f..292fdeaaec7e 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3992,7 +3992,7 @@ access_check: rc = smk_bu_note("IPv4 delivery", skp, ssp->smk_in, MAY_WRITE, rc); if (rc != 0) - netlbl_skbuff_err(skb, rc, 0); + netlbl_skbuff_err(skb, sk->sk_family, rc, 0); break; #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: -- cgit v1.2.3 From 2e532b702834c07f614caf4489feb691e713232a Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:06:17 -0400 Subject: calipso: Add validation of CALIPSO option. Lengths, checksum and the DOI are checked. Checking of the level and categories are left for the socket layer. CRC validation is performed in the calipso module to avoid unconditionally linking crc_ccitt() into ipv6. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/calipso.h | 6 ++++++ net/ipv6/calipso.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/ipv6/exthdrs.c | 27 +++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) (limited to 'include/net') diff --git a/include/net/calipso.h b/include/net/calipso.h index 38dbb4707150..85404e2375d8 100644 --- a/include/net/calipso.h +++ b/include/net/calipso.h @@ -65,6 +65,7 @@ struct calipso_doi { #ifdef CONFIG_NETLABEL int __init calipso_init(void); void calipso_exit(void); +bool calipso_validate(const struct sk_buff *skb, const unsigned char *option); #else static inline int __init calipso_init(void) { @@ -74,6 +75,11 @@ static inline int __init calipso_init(void) static inline void calipso_exit(void) { } +static inline bool calipso_validate(const struct sk_buff *skb, + const unsigned char *option) +{ + return true; +} #endif /* CONFIG_NETLABEL */ #endif /* _CALIPSO_H */ diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index fa371a8827cf..ea80450efe56 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -320,6 +320,47 @@ doi_walk_return: return ret_val; } +/** + * calipso_validate - Validate a CALIPSO option + * @skb: the packet + * @option: the start of the option + * + * Description: + * This routine is called to validate a CALIPSO option. + * If the option is valid then %true is returned, otherwise + * %false is returned. + * + * The caller should have already checked that the length of the + * option (including the TLV header) is >= 10 and that the catmap + * length is consistent with the option length. + * + * We leave checks on the level and categories to the socket layer. + */ +bool calipso_validate(const struct sk_buff *skb, const unsigned char *option) +{ + struct calipso_doi *doi_def; + bool ret_val; + u16 crc, len = option[1] + 2; + static const u8 zero[2]; + + /* The original CRC runs over the option including the TLV header + * with the CRC-16 field (at offset 8) zeroed out. */ + crc = crc_ccitt(0xffff, option, 8); + crc = crc_ccitt(crc, zero, sizeof(zero)); + if (len > 10) + crc = crc_ccitt(crc, option + 10, len - 10); + crc = ~crc; + if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff)) + return false; + + rcu_read_lock(); + doi_def = calipso_doi_search(get_unaligned_be32(option + 2)); + ret_val = !!doi_def; + rcu_read_unlock(); + + return ret_val; +} + /** * calipso_map_cat_hton - Perform a category mapping from host to network * @doi_def: the DOI definition diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index d5fd3e799f86..0f69cab39986 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -43,6 +43,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6_MIP6) #include #endif @@ -603,6 +604,28 @@ drop: return false; } +/* CALIPSO RFC 5570 */ + +static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) +{ + const unsigned char *nh = skb_network_header(skb); + + if (nh[optoff + 1] < 8) + goto drop; + + if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) + goto drop; + + if (!calipso_validate(skb, nh + optoff)) + goto drop; + + return true; + +drop: + kfree_skb(skb); + return false; +} + static const struct tlvtype_proc tlvprochopopt_lst[] = { { .type = IPV6_TLV_ROUTERALERT, @@ -612,6 +635,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { .type = IPV6_TLV_JUMBO, .func = ipv6_hop_jumbo, }, + { + .type = IPV6_TLV_CALIPSO, + .func = ipv6_hop_calipso, + }, { -1, } }; -- cgit v1.2.3 From 4fee5242bf41d9ad641d4c1b821e36eb7ba37fbf Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:06:17 -0400 Subject: calipso: Add a label cache. This works in exactly the same way as the CIPSO label cache. The idea is to allow the lsm to cache the result of a secattr lookup so that it doesn't need to perform the lookup for every skbuff. It introduces two sysctl controls: calipso_cache_enable - enables/disables the cache. calipso_cache_bucket_size - sets the size of a cache bucket. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/calipso.h | 6 + include/net/netlabel.h | 9 +- net/ipv6/calipso.c | 264 +++++++++++++++++++++++++++++++++++++++- net/ipv6/sysctl_net_ipv6.c | 19 +++ net/netlabel/netlabel_calipso.c | 38 ++++++ net/netlabel/netlabel_calipso.h | 3 + net/netlabel/netlabel_kapi.c | 24 +++- security/selinux/netlabel.c | 9 +- 8 files changed, 360 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/calipso.h b/include/net/calipso.h index 85404e2375d8..b1b30cd36601 100644 --- a/include/net/calipso.h +++ b/include/net/calipso.h @@ -62,6 +62,12 @@ struct calipso_doi { struct rcu_head rcu; }; +/* + * Sysctl Variables + */ +extern int calipso_cache_enabled; +extern int calipso_cache_bucketsize; + #ifdef CONFIG_NETLABEL int __init calipso_init(void); void calipso_exit(void); diff --git a/include/net/netlabel.h b/include/net/netlabel.h index d8a46a8ed512..a306bc7d2642 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -235,6 +235,8 @@ struct netlbl_lsm_secattr { * @skbuff_optptr: find option in packet * @skbuff_setattr: set the skbuff's attr * @skbuff_delattr: remove the skbuff's attr + * @cache_invalidate: invalidate cache + * @cache_add: add cache entry * * Description: * This structure is filled out by the CALIPSO engine and passed @@ -269,6 +271,9 @@ struct netlbl_calipso_ops { const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); int (*skbuff_delattr)(struct sk_buff *skb); + void (*cache_invalidate)(void); + int (*cache_add)(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr); }; /* @@ -494,7 +499,7 @@ void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway); * LSM label mapping cache operations */ void netlbl_cache_invalidate(void); -int netlbl_cache_add(const struct sk_buff *skb, +int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr); /* @@ -647,7 +652,7 @@ static inline void netlbl_cache_invalidate(void) { return; } -static inline int netlbl_cache_add(const struct sk_buff *skb, +static inline int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { return 0; diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index ea80450efe56..c53b92c617c5 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -72,6 +72,255 @@ static DEFINE_SPINLOCK(calipso_doi_list_lock); static LIST_HEAD(calipso_doi_list); +/* Label mapping cache */ +int calipso_cache_enabled = 1; +int calipso_cache_bucketsize = 10; +#define CALIPSO_CACHE_BUCKETBITS 7 +#define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS) +#define CALIPSO_CACHE_REORDERLIMIT 10 +struct calipso_map_cache_bkt { + spinlock_t lock; + u32 size; + struct list_head list; +}; + +struct calipso_map_cache_entry { + u32 hash; + unsigned char *key; + size_t key_len; + + struct netlbl_lsm_cache *lsm_data; + + u32 activity; + struct list_head list; +}; + +static struct calipso_map_cache_bkt *calipso_cache; + +/* Label Mapping Cache Functions + */ + +/** + * calipso_cache_entry_free - Frees a cache entry + * @entry: the entry to free + * + * Description: + * This function frees the memory associated with a cache entry including the + * LSM cache data if there are no longer any users, i.e. reference count == 0. + * + */ +static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry) +{ + if (entry->lsm_data) + netlbl_secattr_cache_free(entry->lsm_data); + kfree(entry->key); + kfree(entry); +} + +/** + * calipso_map_cache_hash - Hashing function for the CALIPSO cache + * @key: the hash key + * @key_len: the length of the key in bytes + * + * Description: + * The CALIPSO tag hashing function. Returns a 32-bit hash value. + * + */ +static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +/** + * calipso_cache_init - Initialize the CALIPSO cache + * + * Description: + * Initializes the CALIPSO label mapping cache, this function should be called + * before any of the other functions defined in this file. Returns zero on + * success, negative values on error. + * + */ +static int __init calipso_cache_init(void) +{ + u32 iter; + + calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, + sizeof(struct calipso_map_cache_bkt), + GFP_KERNEL); + if (!calipso_cache) + return -ENOMEM; + + for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { + spin_lock_init(&calipso_cache[iter].lock); + calipso_cache[iter].size = 0; + INIT_LIST_HEAD(&calipso_cache[iter].list); + } + + return 0; +} + +/** + * calipso_cache_invalidate - Invalidates the current CALIPSO cache + * + * Description: + * Invalidates and frees any entries in the CALIPSO cache. Returns zero on + * success and negative values on failure. + * + */ +static void calipso_cache_invalidate(void) +{ + struct calipso_map_cache_entry *entry, *tmp_entry; + u32 iter; + + for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { + spin_lock_bh(&calipso_cache[iter].lock); + list_for_each_entry_safe(entry, + tmp_entry, + &calipso_cache[iter].list, list) { + list_del(&entry->list); + calipso_cache_entry_free(entry); + } + calipso_cache[iter].size = 0; + spin_unlock_bh(&calipso_cache[iter].lock); + } +} + +/** + * calipso_cache_check - Check the CALIPSO cache for a label mapping + * @key: the buffer to check + * @key_len: buffer length in bytes + * @secattr: the security attribute struct to use + * + * Description: + * This function checks the cache to see if a label mapping already exists for + * the given key. If there is a match then the cache is adjusted and the + * @secattr struct is populated with the correct LSM security attributes. The + * cache is adjusted in the following manner if the entry is not already the + * first in the cache bucket: + * + * 1. The cache entry's activity counter is incremented + * 2. The previous (higher ranking) entry's activity counter is decremented + * 3. If the difference between the two activity counters is geater than + * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped + * + * Returns zero on success, -ENOENT for a cache miss, and other negative values + * on error. + * + */ +static int calipso_cache_check(const unsigned char *key, + u32 key_len, + struct netlbl_lsm_secattr *secattr) +{ + u32 bkt; + struct calipso_map_cache_entry *entry; + struct calipso_map_cache_entry *prev_entry = NULL; + u32 hash; + + if (!calipso_cache_enabled) + return -ENOENT; + + hash = calipso_map_cache_hash(key, key_len); + bkt = hash & (CALIPSO_CACHE_BUCKETS - 1); + spin_lock_bh(&calipso_cache[bkt].lock); + list_for_each_entry(entry, &calipso_cache[bkt].list, list) { + if (entry->hash == hash && + entry->key_len == key_len && + memcmp(entry->key, key, key_len) == 0) { + entry->activity += 1; + atomic_inc(&entry->lsm_data->refcount); + secattr->cache = entry->lsm_data; + secattr->flags |= NETLBL_SECATTR_CACHE; + secattr->type = NETLBL_NLTYPE_CALIPSO; + if (!prev_entry) { + spin_unlock_bh(&calipso_cache[bkt].lock); + return 0; + } + + if (prev_entry->activity > 0) + prev_entry->activity -= 1; + if (entry->activity > prev_entry->activity && + entry->activity - prev_entry->activity > + CALIPSO_CACHE_REORDERLIMIT) { + __list_del(entry->list.prev, entry->list.next); + __list_add(&entry->list, + prev_entry->list.prev, + &prev_entry->list); + } + + spin_unlock_bh(&calipso_cache[bkt].lock); + return 0; + } + prev_entry = entry; + } + spin_unlock_bh(&calipso_cache[bkt].lock); + + return -ENOENT; +} + +/** + * calipso_cache_add - Add an entry to the CALIPSO cache + * @calipso_ptr: the CALIPSO option + * @secattr: the packet's security attributes + * + * Description: + * Add a new entry into the CALIPSO label mapping cache. Add the new entry to + * head of the cache bucket's list, if the cache bucket is out of room remove + * the last entry in the list first. It is important to note that there is + * currently no checking for duplicate keys. Returns zero on success, + * negative values on failure. The key stored starts at calipso_ptr + 2, + * i.e. the type and length bytes are not stored, this corresponds to + * calipso_ptr[1] bytes of data. + * + */ +static int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + u32 bkt; + struct calipso_map_cache_entry *entry = NULL; + struct calipso_map_cache_entry *old_entry = NULL; + u32 calipso_ptr_len; + + if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0) + return 0; + + calipso_ptr_len = calipso_ptr[1]; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); + if (!entry->key) { + ret_val = -ENOMEM; + goto cache_add_failure; + } + entry->key_len = calipso_ptr_len; + entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len); + atomic_inc(&secattr->cache->refcount); + entry->lsm_data = secattr->cache; + + bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1); + spin_lock_bh(&calipso_cache[bkt].lock); + if (calipso_cache[bkt].size < calipso_cache_bucketsize) { + list_add(&entry->list, &calipso_cache[bkt].list); + calipso_cache[bkt].size += 1; + } else { + old_entry = list_entry(calipso_cache[bkt].list.prev, + struct calipso_map_cache_entry, list); + list_del(&old_entry->list); + list_add(&entry->list, &calipso_cache[bkt].list); + calipso_cache_entry_free(old_entry); + } + spin_unlock_bh(&calipso_cache[bkt].lock); + + return 0; + +cache_add_failure: + if (entry) + calipso_cache_entry_free(entry); + return ret_val; +} + /* DOI List Functions */ @@ -789,6 +1038,9 @@ static int calipso_opt_getattr(const unsigned char *calipso, if (cat_len + 8 > len) return -EINVAL; + if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) + return 0; + doi = get_unaligned_be32(calipso + 2); rcu_read_lock(); doi_def = calipso_doi_search(doi); @@ -1191,6 +1443,8 @@ static const struct netlbl_calipso_ops ops = { .skbuff_optptr = calipso_skbuff_optptr, .skbuff_setattr = calipso_skbuff_setattr, .skbuff_delattr = calipso_skbuff_delattr, + .cache_invalidate = calipso_cache_invalidate, + .cache_add = calipso_cache_add }; /** @@ -1203,11 +1457,17 @@ static const struct netlbl_calipso_ops ops = { */ int __init calipso_init(void) { - netlbl_calipso_ops_register(&ops); - return 0; + int ret_val; + + ret_val = calipso_cache_init(); + if (!ret_val) + netlbl_calipso_ops_register(&ops); + return ret_val; } void calipso_exit(void) { netlbl_calipso_ops_register(NULL); + calipso_cache_invalidate(); + kfree(calipso_cache); } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 45243bbe5253..69c50e737c54 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -15,6 +15,9 @@ #include #include #include +#ifdef CONFIG_NETLABEL +#include +#endif static int one = 1; static int auto_flowlabels_min; @@ -106,6 +109,22 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one }, +#ifdef CONFIG_NETLABEL + { + .procname = "calipso_cache_enable", + .data = &calipso_cache_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "calipso_cache_bucket_size", + .data = &calipso_cache_bucketsize, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NETLABEL */ { } }; diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 0d02c262dbf6..2ec93c5e77bb 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -700,3 +700,41 @@ int calipso_skbuff_delattr(struct sk_buff *skb) ret_val = ops->skbuff_delattr(skb); return ret_val; } + +/** + * calipso_cache_invalidate - Invalidates the current CALIPSO cache + * + * Description: + * Invalidates and frees any entries in the CALIPSO cache. Returns zero on + * success and negative values on failure. + * + */ +void calipso_cache_invalidate(void) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->cache_invalidate(); +} + +/** + * calipso_cache_add - Add an entry to the CALIPSO cache + * @calipso_ptr: the CALIPSO option + * @secattr: the packet's security attributes + * + * Description: + * Add a new entry into the CALIPSO label mapping cache. + * Returns zero on success, negative values on failure. + * + */ +int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr) + +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->cache_add(calipso_ptr, secattr); + return ret_val; +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h index 66ba92e9289f..9fd291cd0fc5 100644 --- a/net/netlabel/netlabel_calipso.h +++ b/net/netlabel/netlabel_calipso.h @@ -144,5 +144,8 @@ int calipso_skbuff_setattr(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); int calipso_skbuff_delattr(struct sk_buff *skb); +void calipso_cache_invalidate(void); +int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr); #endif diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index a42ce3c15d70..fbad7187d4fc 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1281,11 +1281,15 @@ void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway) void netlbl_cache_invalidate(void) { cipso_v4_cache_invalidate(); +#if IS_ENABLED(CONFIG_IPV6) + calipso_cache_invalidate(); +#endif /* IPv6 */ } /** * netlbl_cache_add - Add an entry to a NetLabel protocol cache * @skb: the packet + * @family: the family * @secattr: the packet's security attributes * * Description: @@ -1294,7 +1298,7 @@ void netlbl_cache_invalidate(void) * values on error. * */ -int netlbl_cache_add(const struct sk_buff *skb, +int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { unsigned char *ptr; @@ -1302,10 +1306,20 @@ int netlbl_cache_add(const struct sk_buff *skb, if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0) return -ENOMSG; - ptr = cipso_v4_optptr(skb); - if (ptr) - return cipso_v4_cache_add(ptr, secattr); - + switch (family) { + case AF_INET: + ptr = cipso_v4_optptr(skb); + if (ptr) + return cipso_v4_cache_add(ptr, secattr); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ptr = calipso_optptr(skb); + if (ptr) + return calipso_cache_add(ptr, secattr); + break; +#endif /* IPv6 */ + } return -ENOMSG; } diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index dfca50dc292a..aaba6677ee2e 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -54,6 +54,7 @@ * */ static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb, + u16 family, struct netlbl_lsm_secattr *secattr, u32 *sid) { @@ -63,7 +64,7 @@ static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb, if (rc == 0 && (secattr->flags & NETLBL_SECATTR_CACHEABLE) && (secattr->flags & NETLBL_SECATTR_CACHE)) - netlbl_cache_add(skb, secattr); + netlbl_cache_add(skb, family, secattr); return rc; } @@ -214,7 +215,8 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, netlbl_secattr_init(&secattr); rc = netlbl_skbuff_getattr(skb, family, &secattr); if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid); + rc = selinux_netlbl_sidlookup_cached(skb, family, + &secattr, sid); else *sid = SECSID_NULL; *type = secattr.type; @@ -382,7 +384,8 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, netlbl_secattr_init(&secattr); rc = netlbl_skbuff_getattr(skb, family, &secattr); if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid); + rc = selinux_netlbl_sidlookup_cached(skb, family, + &secattr, &nlbl_sid); else nlbl_sid = SECINITSID_UNLABELED; netlbl_secattr_destroy(&secattr); -- cgit v1.2.3 From 3f09354ac84c6904787189d85fb306bf60f714b8 Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Mon, 27 Jun 2016 15:06:18 -0400 Subject: netlabel: Implement CALIPSO config functions for SMACK. SMACK uses similar functions to control CIPSO, these are the equivalent functions for CALIPSO and follow exactly the same semantics. int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) Adds a CALIPSO doi. void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info) Removes a CALIPSO doi. int netlbl_cfg_calipso_map_add(u32 doi, const char *domain, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) Creates a mapping between a domain and a CALIPSO doi. If addr and mask are non-NULL this creates an address-selector type mapping. This also extends netlbl_cfg_map_del() to remove IPv6 address-selector mappings. Signed-off-by: Huw Davies Signed-off-by: Paul Moore --- include/net/netlabel.h | 26 +++++++ net/netlabel/netlabel_domainhash.c | 66 ++++++++++++++++++ net/netlabel/netlabel_domainhash.h | 8 +++ net/netlabel/netlabel_kapi.c | 138 +++++++++++++++++++++++++++++++++++++ 4 files changed, 238 insertions(+) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index a306bc7d2642..efe98068880f 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -445,6 +445,14 @@ int netlbl_cfg_cipsov4_map_add(u32 doi, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info); +int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info); +void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info); +int netlbl_cfg_calipso_map_add(u32 doi, + const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info); /* * LSM security attribute operations */ @@ -561,6 +569,24 @@ static inline int netlbl_cfg_cipsov4_map_add(u32 doi, { return -ENOSYS; } +static inline int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ + return -ENOSYS; +} +static inline void netlbl_cfg_calipso_del(u32 doi, + struct netlbl_audit *audit_info) +{ + return; +} +static inline int netlbl_cfg_calipso_map_add(u32 doi, + const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ + return -ENOSYS; +} static inline int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) { diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index f1dee0d5e5e2..41d0e95d171e 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -725,6 +725,72 @@ remove_af4_failure: return -ENOENT; } +#if IS_ENABLED(CONFIG_IPV6) +/** + * netlbl_domhsh_remove_af6 - Removes an address selector entry + * @domain: the domain + * @addr: IPv6 address + * @mask: IPv6 address mask + * @audit_info: NetLabel audit information + * + * Description: + * Removes an individual address selector from a domain mapping and potentially + * the entire mapping if it is empty. Returns zero on success, negative values + * on failure. + * + */ +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ + struct netlbl_dom_map *entry_map; + struct netlbl_af6list *entry_addr; + struct netlbl_af4list *iter4; + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *entry; + + rcu_read_lock(); + + if (domain) + entry_map = netlbl_domhsh_search(domain, AF_INET6); + else + entry_map = netlbl_domhsh_search_def(domain, AF_INET6); + if (entry_map == NULL || + entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT) + goto remove_af6_failure; + + spin_lock(&netlbl_domhsh_lock); + entry_addr = netlbl_af6list_remove(addr, mask, + &entry_map->def.addrsel->list6); + spin_unlock(&netlbl_domhsh_lock); + + if (entry_addr == NULL) + goto remove_af6_failure; + netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4) + goto remove_af6_single_addr; + netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6) + goto remove_af6_single_addr; + /* the domain mapping is empty so remove it from the mapping table */ + netlbl_domhsh_remove_entry(entry_map, audit_info); + +remove_af6_single_addr: + rcu_read_unlock(); + /* yick, we can't use call_rcu here because we don't have a rcu head + * pointer but hopefully this should be a rare case so the pause + * shouldn't be a problem */ + synchronize_rcu(); + entry = netlbl_domhsh_addr6_entry(entry_addr); + calipso_doi_putdef(entry->def.calipso); + kfree(entry); + return 0; + +remove_af6_failure: + rcu_read_unlock(); + return -ENOENT; +} +#endif /* IPv6 */ + /** * netlbl_domhsh_remove - Removes an entry from the domain hash table * @domain: the domain to remove diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h index b7ddb6e5c53f..1f9247781927 100644 --- a/net/netlabel/netlabel_domainhash.h +++ b/net/netlabel/netlabel_domainhash.h @@ -93,6 +93,10 @@ int netlbl_domhsh_remove_af4(const char *domain, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info); +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info); int netlbl_domhsh_remove(const char *domain, u16 family, struct netlbl_audit *audit_info); int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info); @@ -102,6 +106,10 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain, #if IS_ENABLED(CONFIG_IPV6) struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, const struct in6_addr *addr); +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info); #endif /* IPv6 */ int netlbl_domhsh_walk(u32 *skip_bkt, diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index fbad7187d4fc..28c56b95fb7f 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -80,6 +80,11 @@ int netlbl_cfg_map_del(const char *domain, case AF_INET: return netlbl_domhsh_remove_af4(domain, addr, mask, audit_info); +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return netlbl_domhsh_remove_af6(domain, addr, mask, + audit_info); +#endif /* IPv6 */ default: return -EPFNOSUPPORT; } @@ -403,6 +408,139 @@ out_entry: return ret_val; } +/** + * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition + * @doi_def: CALIPSO DOI definition + * @audit_info: NetLabel audit information + * + * Description: + * Add a new CALIPSO DOI definition as defined by @doi_def. Returns zero on + * success and negative values on failure. + * + */ +int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + return calipso_doi_add(doi_def, audit_info); +#else /* IPv6 */ + return -ENOSYS; +#endif /* IPv6 */ +} + +/** + * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition + * @doi: CALIPSO DOI + * @audit_info: NetLabel audit information + * + * Description: + * Remove an existing CALIPSO DOI definition matching @doi. Returns zero on + * success and negative values on failure. + * + */ +void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + calipso_doi_remove(doi, audit_info); +#endif /* IPv6 */ +} + +/** + * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping + * @doi: the CALIPSO DOI + * @domain: the domain mapping to add + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the + * NetLabel subsystem. A @domain value of NULL adds a new default domain + * mapping. Returns zero on success, negative values on failure. + * + */ +int netlbl_cfg_calipso_map_add(u32 doi, + const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + int ret_val = -ENOMEM; + struct calipso_doi *doi_def; + struct netlbl_dom_map *entry; + struct netlbl_domaddr_map *addrmap = NULL; + struct netlbl_domaddr6_map *addrinfo = NULL; + + doi_def = calipso_doi_getdef(doi); + if (doi_def == NULL) + return -ENOENT; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + goto out_entry; + entry->family = AF_INET6; + if (domain != NULL) { + entry->domain = kstrdup(domain, GFP_ATOMIC); + if (entry->domain == NULL) + goto out_domain; + } + + if (addr == NULL && mask == NULL) { + entry->def.calipso = doi_def; + entry->def.type = NETLBL_NLTYPE_CALIPSO; + } else if (addr != NULL && mask != NULL) { + addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + if (addrmap == NULL) + goto out_addrmap; + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); + if (addrinfo == NULL) + goto out_addrinfo; + addrinfo->def.calipso = doi_def; + addrinfo->def.type = NETLBL_NLTYPE_CALIPSO; + addrinfo->list.addr = *addr; + addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; + addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; + addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; + addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; + addrinfo->list.mask = *mask; + addrinfo->list.valid = 1; + ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6); + if (ret_val != 0) + goto cfg_calipso_map_add_failure; + + entry->def.addrsel = addrmap; + entry->def.type = NETLBL_NLTYPE_ADDRSELECT; + } else { + ret_val = -EINVAL; + goto out_addrmap; + } + + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto cfg_calipso_map_add_failure; + + return 0; + +cfg_calipso_map_add_failure: + kfree(addrinfo); +out_addrinfo: + kfree(addrmap); +out_addrmap: + kfree(entry->domain); +out_domain: + kfree(entry); +out_entry: + calipso_doi_putdef(doi_def); + return ret_val; +#else /* IPv6 */ + return -ENOSYS; +#endif /* IPv6 */ +} + /* * Security Attribute Functions */ -- cgit v1.2.3 From 80e73cc563c4359be809a03bcb8e7e28141a813a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 28 Jun 2016 16:57:05 +0200 Subject: net: rtnetlink: add support for the IFLA_STATS_LINK_XSTATS_SLAVE attribute This patch adds support for the IFLA_STATS_LINK_XSTATS_SLAVE attribute which allows to export per-slave statistics if the master device supports the linkxstats callback. The attribute is passed down to the linkxstats callback and it is up to the callback user to use it (an example has been added to the only current user - the bridge). This allows us to query only specific slaves of master devices like bridge ports and export only what we're interested in instead of having to dump all ports and searching only for a single one. This will be used to export per-port IGMP/MLD stats and also per-port vlan stats in the future, possibly other statistics as well. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 5 ++-- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 59 +++++++++++++++++++++++++++++++++++++++++--- net/core/rtnetlink.c | 50 +++++++++++++++++++++++++++++++++++-- 4 files changed, 108 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 006a7b81d758..4113916cc1bb 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -98,10 +98,11 @@ struct rtnl_link_ops { const struct net_device *dev, const struct net_device *slave_dev); struct net *(*get_link_net)(const struct net_device *dev); - size_t (*get_linkxstats_size)(const struct net_device *dev); + size_t (*get_linkxstats_size)(const struct net_device *dev, + int attr); int (*fill_linkxstats)(struct sk_buff *skb, const struct net_device *dev, - int *prividx); + int *prividx, int attr); }; int __rtnl_link_register(struct rtnl_link_ops *ops); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bb36bd5675a7..db2458ade81c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -822,6 +822,7 @@ enum { IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ IFLA_STATS_LINK_64, IFLA_STATS_LINK_XSTATS, + IFLA_STATS_LINK_XSTATS_SLAVE, __IFLA_STATS_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 85e89f693589..ed75ff9ff9e6 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1234,7 +1234,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return 0; } -static size_t br_get_linkxstats_size(const struct net_device *dev) +static size_t bridge_get_linkxstats_size(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_vlan_group *vg; @@ -1254,8 +1254,30 @@ static size_t br_get_linkxstats_size(const struct net_device *dev) nla_total_size(0); } -static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, - int *prividx) +static size_t brport_get_linkxstats_size(const struct net_device *dev) +{ + return nla_total_size(0); +} + +static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) +{ + size_t retsize = 0; + + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + retsize = bridge_get_linkxstats_size(dev); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + retsize = brport_get_linkxstats_size(dev); + break; + } + + return retsize; +} + +static int bridge_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_vlan_group *vg; @@ -1298,6 +1320,37 @@ nla_put_failure: return -EMSGSIZE; } +static int brport_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); + if (!nest) + return -EMSGSIZE; + nla_nest_end(skb, nest); + + return 0; +} + +static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, + int *prividx, int attr) +{ + int ret = -EINVAL; + + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + ret = bridge_fill_linkxstats(skb, dev, prividx); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + ret = brport_fill_linkxstats(skb, dev, prividx); + break; + } + + return ret; +} + static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eb49ca24274a..cfed7bc14ee6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3519,7 +3519,32 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (!attr) goto nla_put_failure; - err = ops->fill_linkxstats(skb, dev, prividx); + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, + *idxattr)) { + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + master = netdev_master_upper_dev_get(dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->fill_linkxstats) { + int err; + + *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; + attr = nla_nest_start(skb, + IFLA_STATS_LINK_XSTATS_SLAVE); + if (!attr) + goto nla_put_failure; + + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; @@ -3555,14 +3580,35 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { - size += nla_total_size(ops->get_linkxstats_size(dev)); + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { + struct net_device *_dev = (struct net_device *)dev; + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + /* netdev_master_upper_dev_get can't take const */ + master = netdev_master_upper_dev_get(_dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->get_linkxstats_size) { + int attr = IFLA_STATS_LINK_XSTATS_SLAVE; + + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); + /* for IFLA_STATS_LINK_XSTATS_SLAVE */ + size += nla_total_size(0); + } + } + return size; } -- cgit v1.2.3 From 2631b79f6cb8b634fe41b77de9c4add0ec6b3cae Mon Sep 17 00:00:00 2001 From: "Seymour, Shane M" Date: Tue, 28 Jun 2016 23:06:48 +0000 Subject: tcp: increase size at which tcp_bound_to_half_wnd bounds to > TCP_MSS_DEFAULT In previous commit 01f83d69844d307be2aa6fea88b0e8fe5cbdb2f4 the following comments were added: "When peer uses tiny windows, there is no use in packetizing to sub-MSS pieces for the sake of SWS or making sure there are enough packets in the pipe for fast recovery." The test should be > TCP_MSS_DEFAULT not >= 512. This allows low end devices that send an MSS of 536 (TCP_MSS_DEFAULT) to see better network performance by sending it 536 bytes of data at a time instead of bounding to half window size (268). Other network stacks work this way, e.g. HP-UX. Signed-off-by: Shane Seymour Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a79894b66726..d825858fe4f1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -589,7 +589,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) * On the other hand, for extremely large MSS devices, handling * smaller than MSS windows in this way does make sense. */ - if (tp->max_window >= 512) + if (tp->max_window > TCP_MSS_DEFAULT) cutoff = (tp->max_window >> 1); else cutoff = tp->max_window; -- cgit v1.2.3 From 19689e38eca5d7b32755182d4e62efd7a5376c45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Jun 2016 18:51:53 +0200 Subject: tcp: md5: use kmalloc() backed scratch areas Some arches have virtually mapped kernel stacks, or will soon have. tcp_md5_hash_header() uses an automatic variable to copy tcp header before mangling th->check and calling crypto function, which might be problematic on such arches. David says that using percpu storage is also problematic on non SMP builds. Just use kmalloc() to allocate scratch areas. Signed-off-by: Eric Dumazet Reported-by: Andy Lutomirski Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- net/ipv4/tcp.c | 10 ++++++++++ net/ipv4/tcp_ipv4.c | 31 ++++++++++++++----------------- net/ipv6/tcp_ipv6.c | 29 ++++++++++++++++------------- 4 files changed, 41 insertions(+), 32 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d825858fe4f1..c00e7d51bb18 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1384,7 +1384,7 @@ union tcp_md5sum_block { /* - pool: digest algorithm, hash description and scratch buffer */ struct tcp_md5sig_pool { struct ahash_request *md5_req; - union tcp_md5sum_block md5_blk; + void *scratch; }; /* - functions */ @@ -1420,7 +1420,6 @@ static inline void tcp_put_md5sig_pool(void) local_bh_enable(); } -int tcp_md5_hash_header(struct tcp_md5sig_pool *, const struct tcphdr *); int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *, unsigned int header_len); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 108ef2a6665c..032a96d78c99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3026,8 +3026,18 @@ static void __tcp_alloc_md5sig_pool(void) return; for_each_possible_cpu(cpu) { + void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch; struct ahash_request *req; + if (!scratch) { + scratch = kmalloc_node(sizeof(union tcp_md5sum_block) + + sizeof(struct tcphdr), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!scratch) + return; + per_cpu(tcp_md5sig_pool, cpu).scratch = scratch; + } if (per_cpu(tcp_md5sig_pool, cpu).md5_req) continue; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3708de2a6683..32b048e524d6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1018,27 +1018,28 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, GFP_KERNEL); } -static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - __be32 daddr, __be32 saddr, int nbytes) +static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, + __be32 daddr, __be32 saddr, + const struct tcphdr *th, int nbytes) { struct tcp4_pseudohdr *bp; struct scatterlist sg; + struct tcphdr *_th; - bp = &hp->md5_blk.ip4; - - /* - * 1. the TCP pseudo-header (in the order: source IP address, - * destination IP address, zero-padded protocol number, and - * segment length) - */ + bp = hp->scratch; bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; bp->protocol = IPPROTO_TCP; bp->len = cpu_to_be16(nbytes); - sg_init_one(&sg, bp, sizeof(*bp)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + _th = (struct tcphdr *)(bp + 1); + memcpy(_th, th, sizeof(*th)); + _th->check = 0; + + sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, + sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->md5_req); } @@ -1055,9 +1056,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; @@ -1101,9 +1100,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2255d2bf5f6b..37cf91323319 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -526,26 +526,33 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } -static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - const struct in6_addr *daddr, - const struct in6_addr *saddr, int nbytes) +static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; + struct tcphdr *_th; - bp = &hp->md5_blk.ip6; + bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); - sg_init_one(&sg, bp, sizeof(*bp)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + _th = (struct tcphdr *)(bp + 1); + memcpy(_th, th, sizeof(*th)); + _th->check = 0; + + sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, + sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->md5_req); } -static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, +static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { @@ -559,9 +566,7 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; @@ -606,9 +611,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, if (crypto_ahash_init(req)) goto clear_hash; - if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) - goto clear_hash; - if (tcp_md5_hash_header(hp, th)) + if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) goto clear_hash; -- cgit v1.2.3 From 08f4b5918b2d6b491f0403cc1886f5cdccef89bb Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Fri, 1 Jul 2016 14:51:01 +0300 Subject: net/devlink: Add E-Switch mode control Add the commands to set and show the mode of SRIOV E-Switch, two modes are supported: * legacy: operating in the "old" L2 based mode (DMAC --> VF vport) * switchdev: the E-Switch is referred to as whitebox switch configured using standard tools such as tc, bridge, openvswitch etc. To allow working with the tools, for each VF, a VF representor netdevice is created by the E-Switch manager vendor device driver instance (e.g PF). Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/devlink.h | 3 ++ include/uapi/linux/devlink.h | 8 ++++ net/core/devlink.c | 87 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 1d45b61cb320..c99ffe8cef3c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -90,6 +90,9 @@ struct devlink_ops { u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); + + int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode); + int (*eswitch_mode_set)(struct devlink *devlink, u16 mode); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ba0073b26fa6..915bfa74458c 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -57,6 +57,8 @@ enum devlink_command { DEVLINK_CMD_SB_OCC_SNAPSHOT, DEVLINK_CMD_SB_OCC_MAX_CLEAR, + DEVLINK_CMD_ESWITCH_MODE_GET, + DEVLINK_CMD_ESWITCH_MODE_SET, /* add new commands above here */ __DEVLINK_CMD_MAX, @@ -95,6 +97,11 @@ enum devlink_sb_threshold_type { #define DEVLINK_SB_THRESHOLD_TO_ALPHA_MAX 20 +enum devlink_eswitch_mode { + DEVLINK_ESWITCH_MODE_LEGACY, + DEVLINK_ESWITCH_MODE_SWITCHDEV, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -125,6 +132,7 @@ enum devlink_attr { DEVLINK_ATTR_SB_TC_INDEX, /* u16 */ DEVLINK_ATTR_SB_OCC_CUR, /* u32 */ DEVLINK_ATTR_SB_OCC_MAX, /* u32 */ + DEVLINK_ATTR_ESWITCH_MODE, /* u16 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 933e8d4d3968..b2e592a198c0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1394,6 +1394,78 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, return -EOPNOTSUPP; } +static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, u16 mode) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + struct sk_buff *msg; + u16 mode; + int err; + + if (!ops || !ops->eswitch_mode_get) + return -EOPNOTSUPP; + + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, + info->snd_portid, info->snd_seq, 0, mode); + + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + u16 mode; + + if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) + return -EINVAL; + + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + + if (ops && ops->eswitch_mode_set) + return ops->eswitch_mode_set(devlink, mode); + return -EOPNOTSUPP; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -1407,6 +1479,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1525,6 +1598,20 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_NEED_SB | DEVLINK_NL_FLAG_LOCK_PORTS, }, + { + .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, + .doit = devlink_nl_cmd_eswitch_mode_get_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, + .doit = devlink_nl_cmd_eswitch_mode_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; /** -- cgit v1.2.3 From f86dec94e3a86c992a637df1c301a4df25a85801 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 15 Apr 2016 18:14:25 +0200 Subject: NFC: hci: delete unused nfc_llc_get_rx_head_tail_room() It used to be EXPORTed, but then EXPORT usage was cleaned up (in 2012), without noticing that the function has no users at all (and curiously, never had any users). Delete it. While at it, remove non-static "inline" hints on nearby functions: these hints don't work across compilation units anyway, and these functions are not used in their .c file, thus they are never inlined. IOW: "inline" here does not help in any way. Signed-off-by: Denys Vlasenko CC: Samuel Ortiz CC: Christophe Ricard CC: linux-wireless@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Samuel Ortiz --- include/net/nfc/llc.h | 4 ---- net/nfc/hci/llc.c | 17 +++++------------ 2 files changed, 5 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/nfc/llc.h b/include/net/nfc/llc.h index c25fbdee0d61..7ecb45757897 100644 --- a/include/net/nfc/llc.h +++ b/include/net/nfc/llc.h @@ -37,10 +37,6 @@ struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev, int tx_tailroom, llc_failure_t llc_failure); void nfc_llc_free(struct nfc_llc *llc); -void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom, - int *rx_tailroom); - - int nfc_llc_start(struct nfc_llc *llc); int nfc_llc_stop(struct nfc_llc *llc); void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb); diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 1399a03fa6e6..3d699cbc7435 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -133,36 +133,29 @@ void nfc_llc_free(struct nfc_llc *llc) kfree(llc); } -inline void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom, - int *rx_tailroom) -{ - *rx_headroom = llc->rx_headroom; - *rx_tailroom = llc->rx_tailroom; -} - -inline int nfc_llc_start(struct nfc_llc *llc) +int nfc_llc_start(struct nfc_llc *llc) { return llc->ops->start(llc); } EXPORT_SYMBOL(nfc_llc_start); -inline int nfc_llc_stop(struct nfc_llc *llc) +int nfc_llc_stop(struct nfc_llc *llc) { return llc->ops->stop(llc); } EXPORT_SYMBOL(nfc_llc_stop); -inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb) +void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb) { llc->ops->rcv_from_drv(llc, skb); } -inline int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) +int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) { return llc->ops->xmit_from_hci(llc, skb); } -inline void *nfc_llc_get_data(struct nfc_llc *llc) +void *nfc_llc_get_data(struct nfc_llc *llc) { return llc->data; } -- cgit v1.2.3 From 7854a44526de84142e367f08288c9f3a33c4c8ee Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Tue, 7 Jun 2016 16:21:52 +0200 Subject: NFC: digital: Add a delay between poll cycles This replaces the polling work struct with a delayed work struct and add a 10 ms delay between 2 poll cycles. This avoids to flood the device with 'switch off'/'switch on' commands. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- include/net/nfc/digital.h | 2 +- net/nfc/digital_core.c | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h index 0ae101eef0f4..506e3f6eabef 100644 --- a/include/net/nfc/digital.h +++ b/include/net/nfc/digital.h @@ -220,7 +220,7 @@ struct nfc_digital_dev { struct list_head cmd_queue; struct mutex cmd_lock; - struct work_struct poll_work; + struct delayed_work poll_work; u8 curr_protocol; u8 curr_rf_tech; diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index dd9003f38822..27769ac89d27 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -30,6 +30,9 @@ #define DIGITAL_PROTO_ISO15693_RF_TECH NFC_PROTO_ISO15693_MASK +/* Delay between each poll frame (ms) */ +#define DIGITAL_POLL_INTERVAL 10 + struct digital_cmd { struct list_head queue; @@ -419,7 +422,8 @@ void digital_poll_next_tech(struct nfc_digital_dev *ddev) mutex_unlock(&ddev->poll_lock); - schedule_work(&ddev->poll_work); + schedule_delayed_work(&ddev->poll_work, + msecs_to_jiffies(DIGITAL_POLL_INTERVAL)); } static void digital_wq_poll(struct work_struct *work) @@ -428,7 +432,7 @@ static void digital_wq_poll(struct work_struct *work) struct digital_poll_tech *poll_tech; struct nfc_digital_dev *ddev = container_of(work, struct nfc_digital_dev, - poll_work); + poll_work.work); mutex_lock(&ddev->poll_lock); if (!ddev->poll_tech_count) { @@ -543,7 +547,7 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols, return -EINVAL; } - schedule_work(&ddev->poll_work); + schedule_delayed_work(&ddev->poll_work, 0); return 0; } @@ -564,7 +568,7 @@ static void digital_stop_poll(struct nfc_dev *nfc_dev) mutex_unlock(&ddev->poll_lock); - cancel_work_sync(&ddev->poll_work); + cancel_delayed_work_sync(&ddev->poll_work); digital_abort_cmd(ddev); } @@ -770,7 +774,7 @@ struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, INIT_WORK(&ddev->cmd_complete_work, digital_wq_cmd_complete); mutex_init(&ddev->poll_lock); - INIT_WORK(&ddev->poll_work, digital_wq_poll); + INIT_DELAYED_WORK(&ddev->poll_work, digital_wq_poll); if (supported_protocols & NFC_PROTO_JEWEL_MASK) ddev->protocols |= NFC_PROTO_JEWEL_MASK; @@ -832,7 +836,7 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev) ddev->poll_tech_count = 0; mutex_unlock(&ddev->poll_lock); - cancel_work_sync(&ddev->poll_work); + cancel_delayed_work_sync(&ddev->poll_work); cancel_work_sync(&ddev->cmd_work); cancel_work_sync(&ddev->cmd_complete_work); -- cgit v1.2.3 From ff202ee1ed8f032f05b80b541664cf02e75d7080 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 2 Jul 2016 06:43:15 -0400 Subject: net sched actions: skbedit add support for mod-ing skb pkt_type Extremely useful for setting packet type to host so i dont have to modify the dst mac address using pedit (which requires that i know the mac address) Example usage: tc filter add dev eth0 parent ffff: protocol ip pref 9 u32 \ match ip src 5.5.5.5/32 \ flowid 1:5 action skbedit ptype host This will tag all packets incoming from 5.5.5.5 with type PACKET_HOST Signed-off-by: Jamal Hadi Salim Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 10 +++++----- include/uapi/linux/tc_act/tc_skbedit.h | 2 ++ net/sched/act_skbedit.c | 18 +++++++++++++++++- 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index b496d5ad7d42..d01a5d40cfb5 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -24,11 +24,11 @@ struct tcf_skbedit { struct tcf_common common; - u32 flags; - u32 priority; - u32 mark; - u16 queue_mapping; - /* XXX: 16-bit pad here? */ + u32 flags; + u32 priority; + u32 mark; + u16 queue_mapping; + u16 ptype; }; #define to_skbedit(a) \ container_of(a->priv, struct tcf_skbedit, common) diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index fecb5cc48c40..a4d00c608d8f 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -27,6 +27,7 @@ #define SKBEDIT_F_PRIORITY 0x1 #define SKBEDIT_F_QUEUE_MAPPING 0x2 #define SKBEDIT_F_MARK 0x4 +#define SKBEDIT_F_PTYPE 0x8 struct tc_skbedit { tc_gen; @@ -40,6 +41,7 @@ enum { TCA_SKBEDIT_QUEUE_MAPPING, TCA_SKBEDIT_MARK, TCA_SKBEDIT_PAD, + TCA_SKBEDIT_PTYPE, __TCA_SKBEDIT_MAX }; #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 53d1486cddf7..1c4c9240a3f8 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -47,6 +47,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, skb_set_queue_mapping(skb, d->queue_mapping); if (d->flags & SKBEDIT_F_MARK) skb->mark = d->mark; + if (d->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = d->ptype; spin_unlock(&d->tcf_lock); return d->tcf_action; @@ -57,6 +59,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, @@ -68,7 +71,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tc_skbedit *parm; struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; - u16 *queue_mapping = NULL; + u16 *queue_mapping = NULL, *ptype = NULL; bool exists = false; int ret = 0, err; @@ -92,6 +95,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } + if (tb[TCA_SKBEDIT_PTYPE] != NULL) { + ptype = nla_data(tb[TCA_SKBEDIT_PTYPE]); + if (!skb_pkt_type_ok(*ptype)) + return -EINVAL; + flags |= SKBEDIT_F_PTYPE; + } + if (tb[TCA_SKBEDIT_MARK] != NULL) { flags |= SKBEDIT_F_MARK; mark = nla_data(tb[TCA_SKBEDIT_MARK]); @@ -132,6 +142,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, d->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) d->mark = *mark; + if (flags & SKBEDIT_F_PTYPE) + d->ptype = *ptype; d->tcf_action = parm->action; @@ -169,6 +181,10 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), &d->mark)) goto nla_put_failure; + if ((d->flags & SKBEDIT_F_PTYPE) && + nla_put(skb, TCA_SKBEDIT_PTYPE, sizeof(d->ptype), + &d->ptype)) + goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) -- cgit v1.2.3 From 2a4501ae18b52fcdf553404286e6cefabd1d17ec Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 5 Jul 2016 11:27:42 +0200 Subject: neigh: Send a notification when DELAY_PROBE_TIME changes When the data plane is offloaded the traffic doesn't go through the networking stack. Therefore, after first resolving a neighbour the NUD state machine will transition it from REACHABLE to STALE until it's finally deleted by the garbage collector. To prevent such situations the offloading driver should notify the NUD state machine on any neighbours that were recently used. The driver's polling interval should be set so that the NUD state machine can function as if the traffic wasn't offloaded. Currently, there are no in-tree drivers that can report confirmation for a neighbour, but only 'used' indication. Therefore, the polling interval should be set according to DELAY_FIRST_PROBE_TIME, as a neighbour will transition from REACHABLE state to DELAY (instead of STALE) if "a packet was sent within the last DELAY_FIRST_PROBE_TIME seconds" (RFC 4861). Send a netevent whenever the DELAY_FIRST_PROBE_TIME changes - either via netlink or sysctl - so that offloading drivers can correctly set their polling interval. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + net/core/neighbour.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netevent.h b/include/net/netevent.h index d8bbb38584b6..f440df172b56 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -24,6 +24,7 @@ struct netevent_redirect { enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ + NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 952aabb5aa56..5cdc62a8eb84 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2047,6 +2047,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, @@ -2930,6 +2931,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } -- cgit v1.2.3 From c6e6a0c8be575c830a97b1942dabeab70f423fe0 Mon Sep 17 00:00:00 2001 From: Aviya Erenfeld Date: Tue, 5 Jul 2016 15:23:08 +0300 Subject: nl80211: Add API to support VHT MU-MIMO air sniffer add API to support VHT MU-MIMO air sniffer. in MU-MIMO there are parallel frames on the air while the HW has only one RX. add the capability to sniff one of the MU-MIMO parallel frames by giving the sniffer additional information so it'll know which of the parallel frames it shall follow. Add attribute - NL80211_ATTR_MU_MIMO_GROUP_DATA - for getting a MU-MIMO groupID in order to monitor packets from that group using VHT MU-MIMO. And add attribute -NL80211_ATTR_MU_MIMO_FOLLOW_ADDR - for passing MAC address to monitor mode. that option will be used by VHT MU-MIMO air sniffer to follow a station according to it's MAC address using VHT MU-MIMO. Signed-off-by: Aviya Erenfeld Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ++++++++-- include/uapi/linux/nl80211.h | 29 +++++++++++++++++++++++++++++ net/wireless/nl80211.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7bbb00d8b2cd..fa4f0f793817 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -330,6 +330,9 @@ struct ieee80211_supported_band { * in a separate chapter. */ +#define VHT_MUMIMO_GROUPS_DATA_LEN (WLAN_MEMBERSHIP_LEN +\ + WLAN_USER_POSITION_LEN) + /** * struct vif_params - describes virtual interface parameters * @use_4addr: use 4-address frames @@ -339,10 +342,13 @@ struct ieee80211_supported_band { * This feature is only fully supported by drivers that enable the * %NL80211_FEATURE_MAC_ON_CREATE flag. Others may support creating ** only p2p devices with specified MAC. + * @vht_mumimo_groups: MU-MIMO groupID. used for monitoring only + * packets belonging to that MU-MIMO groupID. */ struct vif_params { - int use_4addr; - u8 macaddr[ETH_ALEN]; + int use_4addr; + u8 macaddr[ETH_ALEN]; + u8 vht_mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN]; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 53c8278827a0..1d7da7888dcf 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1829,6 +1829,25 @@ enum nl80211_commands { * %NL80211_ATTR_EXT_CAPA_MASK, to specify the extended capabilities per * interface type. * + * @NL80211_ATTR_MU_MIMO_GROUP_DATA: array of 24 bytes that defines a MU-MIMO + * groupID for monitor mode. + * The first 8 bytes are a mask that defines the membership in each + * group (there are 64 groups, group 0 and 63 are reserved), + * each bit represents a group and set to 1 for being a member in + * that group and 0 for not being a member. + * The remaining 16 bytes define the position in each group: 2 bits for + * each group. + * (smaller group numbers represented on most significant bits and bigger + * group numbers on least significant bits.) + * This attribute is used only if all interfaces are in monitor mode. + * Set this attribute in order to monitor packets using the given MU-MIMO + * groupID data. + * to turn off that feature set all the bits of the groupID to zero. + * @NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR: mac address for the sniffer to follow + * when using MU-MIMO air sniffer. + * to turn that feature off set an invalid mac address + * (e.g. FF:FF:FF:FF:FF:FF) + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2213,6 +2232,9 @@ enum nl80211_attrs { NL80211_ATTR_IFTYPE_EXT_CAPA, + NL80211_ATTR_MU_MIMO_GROUP_DATA, + NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4479,6 +4501,12 @@ enum nl80211_feature_flags { * %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set * the ASSOC_REQ_USE_RRM flag in the association request even if * NL80211_FEATURE_QUIET is not advertized. + * @NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER: This device supports MU-MIMO air + * sniffer which means that it can be configured to hear packets from + * certain groups which can be configured by the + * %NL80211_ATTR_MU_MIMO_GROUP_DATA attribute, + * or can be configured to follow a station by configuring the + * %NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR attribute. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4486,6 +4514,7 @@ enum nl80211_feature_flags { enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_VHT_IBSS, NL80211_EXT_FEATURE_RRM, + NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 244d552d5647..447026f8cc76 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -405,6 +405,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED }, [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 }, + [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { + .len = VHT_MUMIMO_GROUPS_DATA_LEN + }, + [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, }; /* policy for the key attributes */ @@ -2695,6 +2699,38 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) change = true; } + if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) { + const u8 *mumimo_groups; + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + mumimo_groups = + nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]); + + /* bits 0 and 63 are reserved and must be zero */ + if ((mumimo_groups[0] & BIT(7)) || + (mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(0))) + return -EINVAL; + + memcpy(params.vht_mumimo_groups, mumimo_groups, + VHT_MUMIMO_GROUPS_DATA_LEN); + change = true; + } + + if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) { + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + nla_memcpy(params.macaddr, + info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR], + ETH_ALEN); + change = true; + } + if (flags && (*flags & MONITOR_FLAG_ACTIVE) && !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) return -EOPNOTSUPP; -- cgit v1.2.3 From 1d76250bd34af86c6498fc51e50cab3bfbbeceaa Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Tue, 5 Jul 2016 17:10:13 +0300 Subject: nl80211: support beacon report scanning Beacon report radio measurement requires reporting observed BSSs on the channels specified in the beacon request. If the measurement mode is set to passive or active, it requires actually performing a scan (passive or active, accordingly), and reporting the time that the scan was started and the time each beacon/probe was received (both in terms of TSF of the BSS of the requesting AP). If the request mode is table, this information is optional. In addition, the radio measurement request specifies the channel dwell time for the measurement. In order to use scan for beacon report when the mode is active or passive, add a parameter to scan request that specifies the channel dwell time, and add scan start time and beacon received time to scan results information. Supporting beacon report is required for Multi Band Operation (MBO). Signed-off-by: Assaf Krauss Signed-off-by: David Spinadel Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 17 +++++++-- drivers/net/wireless/ath/wil6210/cfg80211.c | 6 +++- drivers/net/wireless/ath/wil6210/main.c | 12 +++++-- drivers/net/wireless/ath/wil6210/p2p.c | 6 +++- drivers/net/wireless/ath/wil6210/wmi.c | 8 +++-- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 6 +++- drivers/net/wireless/intersil/orinoco/scan.c | 12 +++++-- drivers/net/wireless/marvell/libertas/cfg.c | 11 ++++-- drivers/net/wireless/marvell/mwifiex/cmdevt.c | 12 +++++-- drivers/net/wireless/marvell/mwifiex/main.c | 6 +++- drivers/net/wireless/marvell/mwifiex/scan.c | 12 +++++-- drivers/net/wireless/rndis_wlan.c | 10 ++++-- drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c | 11 ++++-- drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 12 +++++-- drivers/staging/wlan-ng/cfg80211.c | 5 ++- include/net/cfg80211.h | 40 ++++++++++++++++++--- include/uapi/linux/nl80211.h | 42 ++++++++++++++++++++++ net/mac80211/scan.c | 9 +++-- net/wireless/core.c | 4 +-- net/wireless/core.h | 12 +++++++ net/wireless/nl80211.c | 27 ++++++++++++++ net/wireless/scan.c | 18 ++++++---- net/wireless/trace.h | 33 ++++++++++++----- 23 files changed, 279 insertions(+), 52 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 4e11ba06f089..ef5b40ef6d67 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -859,7 +859,11 @@ void ath6kl_cfg80211_disconnect_event(struct ath6kl_vif *vif, u8 reason, struct ath6kl *ar = vif->ar; if (vif->scan_req) { - cfg80211_scan_done(vif->scan_req, true); + struct cfg80211_scan_info info = { + .aborted = true, + }; + + cfg80211_scan_done(vif->scan_req, &info); vif->scan_req = NULL; } @@ -1069,6 +1073,9 @@ static int ath6kl_cfg80211_scan(struct wiphy *wiphy, void ath6kl_cfg80211_scan_complete_event(struct ath6kl_vif *vif, bool aborted) { struct ath6kl *ar = vif->ar; + struct cfg80211_scan_info info = { + .aborted = aborted, + }; int i; ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: status%s\n", __func__, @@ -1089,7 +1096,7 @@ void ath6kl_cfg80211_scan_complete_event(struct ath6kl_vif *vif, bool aborted) } out: - cfg80211_scan_done(vif->scan_req, aborted); + cfg80211_scan_done(vif->scan_req, &info); vif->scan_req = NULL; } @@ -3614,7 +3621,11 @@ void ath6kl_cfg80211_vif_stop(struct ath6kl_vif *vif, bool wmi_ready) } if (vif->scan_req) { - cfg80211_scan_done(vif->scan_req, true); + struct cfg80211_scan_info info = { + .aborted = true, + }; + + cfg80211_scan_done(vif->scan_req, &info); vif->scan_req = NULL; } diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 62bf9331bd7f..f0e1175fb76a 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1369,7 +1369,11 @@ static void wil_cfg80211_stop_p2p_device(struct wiphy *wiphy, mutex_lock(&wil->mutex); started = wil_p2p_stop_discovery(wil); if (started && wil->scan_request) { - cfg80211_scan_done(wil->scan_request, 1); + struct cfg80211_scan_info info = { + .aborted = true, + }; + + cfg80211_scan_done(wil->scan_request, &info); wil->scan_request = NULL; wil->radio_wdev = wil->wdev; } diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c index 8e31d755bbee..4bc92e54984a 100644 --- a/drivers/net/wireless/ath/wil6210/main.c +++ b/drivers/net/wireless/ath/wil6210/main.c @@ -850,10 +850,14 @@ int wil_reset(struct wil6210_priv *wil, bool load_fw) mutex_unlock(&wil->wmi_mutex); if (wil->scan_request) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + wil_dbg_misc(wil, "Abort scan_request 0x%p\n", wil->scan_request); del_timer_sync(&wil->scan_timer); - cfg80211_scan_done(wil->scan_request, true); + cfg80211_scan_done(wil->scan_request, &info); wil->scan_request = NULL; } @@ -1049,10 +1053,14 @@ int __wil_down(struct wil6210_priv *wil) (void)wil_p2p_stop_discovery(wil); if (wil->scan_request) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + wil_dbg_misc(wil, "Abort scan_request 0x%p\n", wil->scan_request); del_timer_sync(&wil->scan_timer); - cfg80211_scan_done(wil->scan_request, true); + cfg80211_scan_done(wil->scan_request, &info); wil->scan_request = NULL; } diff --git a/drivers/net/wireless/ath/wil6210/p2p.c b/drivers/net/wireless/ath/wil6210/p2p.c index 213b8259638c..e0f8aa0ebfac 100644 --- a/drivers/net/wireless/ath/wil6210/p2p.c +++ b/drivers/net/wireless/ath/wil6210/p2p.c @@ -252,8 +252,12 @@ void wil_p2p_search_expired(struct work_struct *work) mutex_unlock(&wil->mutex); if (started) { + struct cfg80211_scan_info info = { + .aborted = false, + }; + mutex_lock(&wil->p2p_wdev_mutex); - cfg80211_scan_done(wil->scan_request, 0); + cfg80211_scan_done(wil->scan_request, &info); wil->scan_request = NULL; wil->radio_wdev = wil->wdev; mutex_unlock(&wil->p2p_wdev_mutex); diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index b80c5d850e1e..4d92541913c0 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -426,15 +426,17 @@ static void wmi_evt_scan_complete(struct wil6210_priv *wil, int id, { if (wil->scan_request) { struct wmi_scan_complete_event *data = d; - bool aborted = (data->status != WMI_SCAN_SUCCESS); + struct cfg80211_scan_info info = { + .aborted = (data->status != WMI_SCAN_SUCCESS), + }; wil_dbg_wmi(wil, "SCAN_COMPLETE(0x%08x)\n", data->status); wil_dbg_misc(wil, "Complete scan_request 0x%p aborted %d\n", - wil->scan_request, aborted); + wil->scan_request, info.aborted); del_timer_sync(&wil->scan_timer); mutex_lock(&wil->p2p_wdev_mutex); - cfg80211_scan_done(wil->scan_request, aborted); + cfg80211_scan_done(wil->scan_request, &info); wil->radio_wdev = wil->wdev; mutex_unlock(&wil->p2p_wdev_mutex); wil->scan_request = NULL; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 264bd638a3d9..afe2b202040a 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -775,9 +775,13 @@ s32 brcmf_notify_escan_complete(struct brcmf_cfg80211_info *cfg, if (!aborted) cfg80211_sched_scan_results(cfg_to_wiphy(cfg)); } else if (scan_request) { + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + brcmf_dbg(SCAN, "ESCAN Completed scan: %s\n", aborted ? "Aborted" : "Done"); - cfg80211_scan_done(scan_request, aborted); + cfg80211_scan_done(scan_request, &info); } if (!test_and_clear_bit(BRCMF_SCAN_STATUS_BUSY, &cfg->scan_status)) brcmf_dbg(SCAN, "Scan complete, probably P2P scan\n"); diff --git a/drivers/net/wireless/intersil/orinoco/scan.c b/drivers/net/wireless/intersil/orinoco/scan.c index d0ceb06c72d0..6d1d084854fb 100644 --- a/drivers/net/wireless/intersil/orinoco/scan.c +++ b/drivers/net/wireless/intersil/orinoco/scan.c @@ -237,7 +237,11 @@ void orinoco_add_hostscan_results(struct orinoco_private *priv, scan_abort: if (priv->scan_request) { - cfg80211_scan_done(priv->scan_request, abort); + struct cfg80211_scan_info info = { + .aborted = abort, + }; + + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } } @@ -245,7 +249,11 @@ void orinoco_add_hostscan_results(struct orinoco_private *priv, void orinoco_scan_done(struct orinoco_private *priv, bool abort) { if (priv->scan_request) { - cfg80211_scan_done(priv->scan_request, abort); + struct cfg80211_scan_info info = { + .aborted = abort, + }; + + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } } diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index 776b44bfd93a..ea4802446618 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -796,10 +796,15 @@ void lbs_scan_done(struct lbs_private *priv) { WARN_ON(!priv->scan_req); - if (priv->internal_scan) + if (priv->internal_scan) { kfree(priv->scan_req); - else - cfg80211_scan_done(priv->scan_req, false); + } else { + struct cfg80211_scan_info info = { + .aborted = false, + }; + + cfg80211_scan_done(priv->scan_req, &info); + } priv->scan_req = NULL; } diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c index 6bc2011d8609..e7a21443647e 100644 --- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c +++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c @@ -1057,8 +1057,12 @@ mwifiex_cancel_all_pending_cmd(struct mwifiex_adapter *adapter) if (!priv) continue; if (priv->scan_request) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + mwifiex_dbg(adapter, WARN, "info: aborting scan\n"); - cfg80211_scan_done(priv->scan_request, 1); + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } } @@ -1112,8 +1116,12 @@ mwifiex_cancel_pending_ioctl(struct mwifiex_adapter *adapter) if (!priv) continue; if (priv->scan_request) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + mwifiex_dbg(adapter, WARN, "info: aborting scan\n"); - cfg80211_scan_done(priv->scan_request, 1); + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } } diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 0e280f879b58..db4925db39aa 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -697,9 +697,13 @@ mwifiex_close(struct net_device *dev) struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev); if (priv->scan_request) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + mwifiex_dbg(priv->adapter, INFO, "aborting scan on ndo_stop\n"); - cfg80211_scan_done(priv->scan_request, 1); + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; priv->scan_aborting = true; } diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c b/drivers/net/wireless/marvell/mwifiex/scan.c index bc5e52cebce1..fdd749110fcb 100644 --- a/drivers/net/wireless/marvell/mwifiex/scan.c +++ b/drivers/net/wireless/marvell/mwifiex/scan.c @@ -1956,9 +1956,13 @@ static void mwifiex_check_next_scan_command(struct mwifiex_private *priv) mwifiex_complete_scan(priv); if (priv->scan_request) { + struct cfg80211_scan_info info = { + .aborted = false, + }; + mwifiex_dbg(adapter, INFO, "info: notifying scan done\n"); - cfg80211_scan_done(priv->scan_request, 0); + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } else { priv->scan_aborting = false; @@ -1977,9 +1981,13 @@ static void mwifiex_check_next_scan_command(struct mwifiex_private *priv) if (!adapter->active_scan_triggered) { if (priv->scan_request) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + mwifiex_dbg(adapter, INFO, "info: aborting scan\n"); - cfg80211_scan_done(priv->scan_request, 1); + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } else { priv->scan_aborting = false; diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index 569918c485b4..603c90470225 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -2134,6 +2134,7 @@ static void rndis_get_scan_results(struct work_struct *work) struct rndis_wlan_private *priv = container_of(work, struct rndis_wlan_private, scan_work.work); struct usbnet *usbdev = priv->usbdev; + struct cfg80211_scan_info info = {}; int ret; netdev_dbg(usbdev->net, "get_scan_results\n"); @@ -2143,7 +2144,8 @@ static void rndis_get_scan_results(struct work_struct *work) ret = rndis_check_bssid_list(usbdev, NULL, NULL); - cfg80211_scan_done(priv->scan_request, ret < 0); + info.aborted = ret < 0; + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } @@ -3574,7 +3576,11 @@ static int rndis_wlan_stop(struct usbnet *usbdev) flush_workqueue(priv->workqueue); if (priv->scan_request) { - cfg80211_scan_done(priv->scan_request, true); + struct cfg80211_scan_info info = { + .aborted = true, + }; + + cfg80211_scan_done(priv->scan_request, &info); priv->scan_request = NULL; } diff --git a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c index 0da559d929bc..d0ba3778990e 100644 --- a/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723au/os_dep/ioctl_cfg80211.c @@ -1256,10 +1256,15 @@ void rtw_cfg80211_indicate_scan_done(struct rtw_wdev_priv *pwdev_priv, DBG_8723A("%s with scan req\n", __func__); if (pwdev_priv->scan_request->wiphy != - pwdev_priv->rtw_wdev->wiphy) + pwdev_priv->rtw_wdev->wiphy) { DBG_8723A("error wiphy compare\n"); - else - cfg80211_scan_done(pwdev_priv->scan_request, aborted); + } else { + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + + cfg80211_scan_done(pwdev_priv->scan_request, &info); + } pwdev_priv->scan_request = NULL; } else { diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c index 51aff4ff7d7c..a0d8e22e575b 100644 --- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c +++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c @@ -454,7 +454,11 @@ static void CfgScanResult(enum scan_event scan_event, mutex_lock(&priv->scan_req_lock); if (priv->pstrScanReq) { - cfg80211_scan_done(priv->pstrScanReq, false); + struct cfg80211_scan_info info = { + .aborted = false, + }; + + cfg80211_scan_done(priv->pstrScanReq, &info); priv->u32RcvdChCount = 0; priv->bCfgScanning = false; priv->pstrScanReq = NULL; @@ -464,10 +468,14 @@ static void CfgScanResult(enum scan_event scan_event, mutex_lock(&priv->scan_req_lock); if (priv->pstrScanReq) { + struct cfg80211_scan_info info = { + .aborted = false, + }; + update_scan_time(); refresh_scan(priv, 1, false); - cfg80211_scan_done(priv->pstrScanReq, false); + cfg80211_scan_done(priv->pstrScanReq, &info); priv->bCfgScanning = false; priv->pstrScanReq = NULL; } diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c index a6e6fb9f42e1..f46dfe6b24e8 100644 --- a/drivers/staging/wlan-ng/cfg80211.c +++ b/drivers/staging/wlan-ng/cfg80211.c @@ -338,6 +338,8 @@ static int prism2_scan(struct wiphy *wiphy, struct p80211msg_dot11req_scan msg1; struct p80211msg_dot11req_scan_results msg2; struct cfg80211_bss *bss; + struct cfg80211_scan_info info = {}; + int result; int err = 0; int numbss = 0; @@ -440,7 +442,8 @@ static int prism2_scan(struct wiphy *wiphy, err = prism2_result2err(msg2.resultcode.data); exit: - cfg80211_scan_done(request, err ? 1 : 0); + info.aborted = !!(err); + cfg80211_scan_done(request, &info); priv->scan_request = NULL; return err; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fa4f0f793817..e2658e392a1f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1423,6 +1423,21 @@ struct cfg80211_ssid { u8 ssid_len; }; +/** + * struct cfg80211_scan_info - information about completed scan + * @scan_start_tsf: scan start time in terms of the TSF of the BSS that the + * wireless device that requested the scan is connected to. If this + * information is not available, this field is left zero. + * @tsf_bssid: the BSSID according to which %scan_start_tsf is set. + * @aborted: set to true if the scan was aborted for any reason, + * userspace will be notified of that + */ +struct cfg80211_scan_info { + u64 scan_start_tsf; + u8 tsf_bssid[ETH_ALEN] __aligned(2); + bool aborted; +}; + /** * struct cfg80211_scan_request - scan request description * @@ -1433,12 +1448,17 @@ struct cfg80211_ssid { * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets + * @duration: how long to listen on each channel, in TUs. If + * %duration_mandatory is not set, this is the maximum dwell time and + * the actual dwell time may be shorter. + * @duration_mandatory: if set, the scan duration must be as specified by the + * %duration field. * @flags: bit field of flags controlling operation * @rates: bitmap of rates to advertise for each band * @wiphy: the wiphy this was for * @scan_start: time (in jiffies) when the scan started * @wdev: the wireless device to scan for - * @aborted: (internal) scan request was notified as aborted + * @info: (internal) information about completed scan * @notified: (internal) scan request was notified as done or aborted * @no_cck: used to send probe requests at non CCK rate in 2GHz band * @mac_addr: MAC address used with randomisation @@ -1454,6 +1474,8 @@ struct cfg80211_scan_request { enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; + u16 duration; + bool duration_mandatory; u32 flags; u32 rates[NUM_NL80211_BANDS]; @@ -1467,7 +1489,8 @@ struct cfg80211_scan_request { /* internal */ struct wiphy *wiphy; unsigned long scan_start; - bool aborted, notified; + struct cfg80211_scan_info info; + bool notified; bool no_cck; /* keep last */ @@ -1600,12 +1623,19 @@ enum cfg80211_signal_type { * buffered on the device) and be accurate to about 10ms. * If the frame isn't buffered, just passing the return value of * ktime_get_boot_ns() is likely appropriate. + * @parent_tsf: the time at the start of reception of the first octet of the + * timestamp field of the frame. The time is the TSF of the BSS specified + * by %parent_bssid. + * @parent_bssid: the BSS according to which %parent_tsf is set. This is set to + * the BSS that requested the scan in which the beacon/probe was received. */ struct cfg80211_inform_bss { struct ieee80211_channel *chan; enum nl80211_bss_scan_width scan_width; s32 signal; u64 boottime_ns; + u64 parent_tsf; + u8 parent_bssid[ETH_ALEN] __aligned(2); }; /** @@ -4067,10 +4097,10 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); * cfg80211_scan_done - notify that scan finished * * @request: the corresponding scan request - * @aborted: set to true if the scan was aborted for any reason, - * userspace will be notified of that + * @info: information about the completed scan */ -void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted); +void cfg80211_scan_done(struct cfg80211_scan_request *request, + struct cfg80211_scan_info *info); /** * cfg80211_sched_scan_results - notify that new scan results are available diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1d7da7888dcf..b39ccab45333 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1848,6 +1848,22 @@ enum nl80211_commands { * to turn that feature off set an invalid mac address * (e.g. FF:FF:FF:FF:FF:FF) * + * @NL80211_ATTR_SCAN_START_TIME_TSF: The time at which the scan was actually + * started (u64). The time is the TSF of the BSS the interface that + * requested the scan is connected to (if available, otherwise this + * attribute must not be included). + * @NL80211_ATTR_SCAN_START_TIME_TSF_BSSID: The BSS according to which + * %NL80211_ATTR_SCAN_START_TIME_TSF is set. + * @NL80211_ATTR_MEASUREMENT_DURATION: measurement duration in TUs (u16). If + * %NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY is not set, this is the + * maximum measurement duration allowed. This attribute is used with + * measurement requests. It can also be used with %NL80211_CMD_TRIGGER_SCAN + * if the scan is used for beacon report radio measurement. + * @NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY: flag attribute that indicates + * that the duration specified with %NL80211_ATTR_MEASUREMENT_DURATION is + * mandatory. If this flag is not set, the duration is the maximum duration + * and the actual measurement duration may be shorter. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2235,6 +2251,11 @@ enum nl80211_attrs { NL80211_ATTR_MU_MIMO_GROUP_DATA, NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR, + NL80211_ATTR_SCAN_START_TIME_TSF, + NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, + NL80211_ATTR_MEASUREMENT_DURATION, + NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3496,6 +3517,12 @@ enum nl80211_bss_scan_width { * was last updated by a received frame. The value is expected to be * accurate to about 10ms. (u64, nanoseconds) * @NL80211_BSS_PAD: attribute used for padding for 64-bit alignment + * @NL80211_BSS_PARENT_TSF: the time at the start of reception of the first + * octet of the timestamp field of the last beacon/probe received for + * this BSS. The time is the TSF of the BSS specified by + * @NL80211_BSS_PARENT_BSSID. (u64). + * @NL80211_BSS_PARENT_BSSID: the BSS according to which @NL80211_BSS_PARENT_TSF + * is set. * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -3517,6 +3544,8 @@ enum nl80211_bss { NL80211_BSS_PRESP_DATA, NL80211_BSS_LAST_SEEN_BOOTTIME, NL80211_BSS_PAD, + NL80211_BSS_PARENT_TSF, + NL80211_BSS_PARENT_BSSID, /* keep last */ __NL80211_BSS_AFTER_LAST, @@ -4507,6 +4536,16 @@ enum nl80211_feature_flags { * %NL80211_ATTR_MU_MIMO_GROUP_DATA attribute, * or can be configured to follow a station by configuring the * %NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR attribute. + * @NL80211_EXT_FEATURE_SCAN_START_TIME: This driver includes the actual + * time the scan started in scan results event. The time is the TSF of + * the BSS that the interface that requested the scan is connected to + * (if available). + * @NL80211_EXT_FEATURE_BSS_PARENT_TSF: Per BSS, this driver reports the + * time the last beacon/probe was received. The time is the TSF of the + * BSS that the interface that requested the scan is connected to + * (if available). + * @NL80211_EXT_FEATURE_SET_SCAN_DWELL: This driver supports configuration of + * channel dwell time. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4515,6 +4554,9 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_VHT_IBSS, NL80211_EXT_FEATURE_RRM, NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER, + NL80211_EXT_FEATURE_SCAN_START_TIME, + NL80211_EXT_FEATURE_BSS_PARENT_TSF, + NL80211_EXT_FEATURE_SET_SCAN_DWELL, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f9648ef9e31f..4ec1c52a1549 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -353,8 +353,13 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); - if (scan_req != local->int_scan_req) - cfg80211_scan_done(scan_req, aborted); + if (scan_req != local->int_scan_req) { + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + + cfg80211_scan_done(scan_req, &info); + } RCU_INIT_POINTER(local->scan_req, NULL); scan_sdata = rcu_dereference_protected(local->scan_sdata, diff --git a/net/wireless/core.c b/net/wireless/core.c index 39d9abd309ea..7645e97362c0 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -220,7 +220,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) - rdev->scan_req->aborted = true; + rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } } @@ -1087,7 +1087,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) - rdev->scan_req->aborted = true; + rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } diff --git a/net/wireless/core.h b/net/wireless/core.h index a4d547f99f8d..eee91443924d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -141,6 +141,18 @@ struct cfg80211_internal_bss { unsigned long refcount; atomic_t hold; + /* time at the start of the reception of the first octet of the + * timestamp field of the last beacon/probe received for this BSS. + * The time is the TSF of the BSS specified by %parent_bssid. + */ + u64 parent_tsf; + + /* the BSS according to which %parent_tsf is set. This is set to + * the BSS that the interface that requested the scan was connected to + * when the beacon/probe was received. + */ + u8 parent_bssid[ETH_ALEN] __aligned(2); + /* must be last because of priv member */ struct cfg80211_bss pub; }; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 447026f8cc76..c53b5462ed00 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6223,6 +6223,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SET_SCAN_DWELL)) { + err = -EOPNOTSUPP; + goto out_free; + } + + request->duration = + nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]); + request->duration_mandatory = + nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); + } + if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) { request->flags = nla_get_u32( info->attrs[NL80211_ATTR_SCAN_FLAGS]); @@ -7056,6 +7069,13 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, jiffies_to_msecs(jiffies - intbss->ts))) goto nla_put_failure; + if (intbss->parent_tsf && + (nla_put_u64_64bit(msg, NL80211_BSS_PARENT_TSF, + intbss->parent_tsf, NL80211_BSS_PAD) || + nla_put(msg, NL80211_BSS_PARENT_BSSID, ETH_ALEN, + intbss->parent_bssid))) + goto nla_put_failure; + if (intbss->ts_boottime && nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME, intbss->ts_boottime, NL80211_BSS_PAD)) @@ -11829,6 +11849,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg, nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags)) goto nla_put_failure; + if (req->info.scan_start_tsf && + (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF, + req->info.scan_start_tsf, NL80211_BSS_PAD) || + nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN, + req->info.tsf_bssid))) + goto nla_put_failure; + return 0; nla_put_failure: return -ENOBUFS; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ef2955c89a00..0358e12be54b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3,6 +3,7 @@ * * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2016 Intel Deutschland GmbH */ #include #include @@ -194,7 +195,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); - if (!request->aborted && + if (!request->info.aborted && request->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); @@ -202,10 +203,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, spin_unlock_bh(&rdev->bss_lock); } - msg = nl80211_build_scan_msg(rdev, wdev, request->aborted); + msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted); #ifdef CONFIG_CFG80211_WEXT - if (wdev->netdev && !request->aborted) { + if (wdev->netdev && !request->info.aborted) { memset(&wrqu, 0, sizeof(wrqu)); wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL); @@ -236,12 +237,13 @@ void __cfg80211_scan_done(struct work_struct *wk) rtnl_unlock(); } -void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted) +void cfg80211_scan_done(struct cfg80211_scan_request *request, + struct cfg80211_scan_info *info) { - trace_cfg80211_scan_done(request, aborted); + trace_cfg80211_scan_done(request, info); WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req); - request->aborted = aborted; + request->info = *info; request->notified = true; queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } @@ -843,6 +845,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, found->pub.capability = tmp->pub.capability; found->ts = tmp->ts; found->ts_boottime = tmp->ts_boottime; + found->parent_tsf = tmp->parent_tsf; + ether_addr_copy(found->parent_bssid, tmp->parent_bssid); } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; @@ -1086,6 +1090,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); tmp.ts_boottime = data->boottime_ns; + tmp.parent_tsf = data->parent_tsf; + ether_addr_copy(tmp.parent_bssid, data->parent_bssid); signal_valid = abs(data->chan->center_freq - channel->center_freq) <= wiphy->max_adj_channel_rssi_comp; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 3c1091ae6c36..72b5255cefe2 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2642,8 +2642,9 @@ TRACE_EVENT(cfg80211_tdls_oper_request, ); TRACE_EVENT(cfg80211_scan_done, - TP_PROTO(struct cfg80211_scan_request *request, bool aborted), - TP_ARGS(request, aborted), + TP_PROTO(struct cfg80211_scan_request *request, + struct cfg80211_scan_info *info), + TP_ARGS(request, info), TP_STRUCT__entry( __field(u32, n_channels) __dynamic_array(u8, ie, request ? request->ie_len : 0) @@ -2652,6 +2653,8 @@ TRACE_EVENT(cfg80211_scan_done, MAC_ENTRY(wiphy_mac) __field(bool, no_cck) __field(bool, aborted) + __field(u64, scan_start_tsf) + MAC_ENTRY(tsf_bssid) ), TP_fast_assign( if (request) { @@ -2666,9 +2669,16 @@ TRACE_EVENT(cfg80211_scan_done, request->wiphy->perm_addr); __entry->no_cck = request->no_cck; } - __entry->aborted = aborted; + if (info) { + __entry->aborted = info->aborted; + __entry->scan_start_tsf = info->scan_start_tsf; + MAC_ASSIGN(tsf_bssid, info->tsf_bssid); + } ), - TP_printk("aborted: %s", BOOL_TO_STR(__entry->aborted)) + TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: " MAC_PR_FMT, + BOOL_TO_STR(__entry->aborted), + (unsigned long long)__entry->scan_start_tsf, + MAC_PR_ARG(tsf_bssid)) ); DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_results, @@ -2721,6 +2731,8 @@ TRACE_EVENT(cfg80211_inform_bss_frame, __dynamic_array(u8, mgmt, len) __field(s32, signal) __field(u64, ts_boottime) + __field(u64, parent_tsf) + MAC_ENTRY(parent_bssid) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2730,10 +2742,15 @@ TRACE_EVENT(cfg80211_inform_bss_frame, memcpy(__get_dynamic_array(mgmt), mgmt, len); __entry->signal = data->signal; __entry->ts_boottime = data->boottime_ns; - ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d, tsb:%llu", - WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, - __entry->signal, (unsigned long long)__entry->ts_boottime) + __entry->parent_tsf = data->parent_tsf; + MAC_ASSIGN(parent_bssid, data->parent_bssid); + ), + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT + "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: " + MAC_PR_FMT, WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, + __entry->signal, (unsigned long long)__entry->ts_boottime, + (unsigned long long)__entry->parent_tsf, + MAC_PR_ARG(parent_bssid)) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, -- cgit v1.2.3 From 7947d3e075cde1a18e538f2dafbc850aa356ff79 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Tue, 5 Jul 2016 15:23:12 +0300 Subject: mac80211: Add support for beacon report radio measurement Add the following to support beacon report radio measurement with the measurement mode field set to passive or active: 1. Propagate the required scan duration to the device 2. Report the scan start time (in terms of TSF) 3. Report each BSS's detection time (also in terms of TSF) TSF times refer to the BSS that the interface that requested the scan is connected to. Signed-off-by: Assaf Krauss Signed-off-by: Avraham Stern [changed ath9k/10k, at76c59x-usb, iwlegacy, wl1251 and wlcore to match the new API] Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 14 +++++---- drivers/net/wireless/ath/ath9k/channel.c | 5 +++- drivers/net/wireless/atmel/at76c50x-usb.c | 5 +++- drivers/net/wireless/intel/iwlegacy/common.c | 6 +++- drivers/net/wireless/intel/iwlwifi/dvm/scan.c | 6 +++- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 37 ++++++++++++++++++----- drivers/net/wireless/mac80211_hwsim.c | 11 +++++-- drivers/net/wireless/st/cw1200/scan.c | 6 +++- drivers/net/wireless/ti/wl1251/event.c | 6 +++- drivers/net/wireless/ti/wl1251/main.c | 6 +++- drivers/net/wireless/ti/wlcore/main.c | 11 +++++-- drivers/net/wireless/ti/wlcore/scan.c | 5 +++- include/net/mac80211.h | 5 ++-- net/mac80211/ieee80211_i.h | 1 + net/mac80211/scan.c | 42 ++++++++++++++++++++++----- 15 files changed, 131 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index d4b7a168f7c0..ebc12c521fe0 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -3858,12 +3858,16 @@ void __ath10k_scan_finish(struct ath10k *ar) break; case ATH10K_SCAN_RUNNING: case ATH10K_SCAN_ABORTING: - if (!ar->scan.is_roc) - ieee80211_scan_completed(ar->hw, - (ar->scan.state == - ATH10K_SCAN_ABORTING)); - else if (ar->scan.roc_notify) + if (!ar->scan.is_roc) { + struct cfg80211_scan_info info = { + .aborted = (ar->scan.state == + ATH10K_SCAN_ABORTING), + }; + + ieee80211_scan_completed(ar->hw, &info); + } else if (ar->scan.roc_notify) { ieee80211_remain_on_channel_expired(ar->hw); + } /* fall through */ case ATH10K_SCAN_STARTING: ar->scan.state = ATH10K_SCAN_IDLE; diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c index e56bafcf5864..57e26a640477 100644 --- a/drivers/net/wireless/ath/ath9k/channel.c +++ b/drivers/net/wireless/ath/ath9k/channel.c @@ -960,6 +960,9 @@ void ath_roc_complete(struct ath_softc *sc, enum ath_roc_complete_reason reason) void ath_scan_complete(struct ath_softc *sc, bool abort) { struct ath_common *common = ath9k_hw_common(sc->sc_ah); + struct cfg80211_scan_info info = { + .aborted = abort, + }; if (abort) ath_dbg(common, CHAN_CTX, "HW scan aborted\n"); @@ -969,7 +972,7 @@ void ath_scan_complete(struct ath_softc *sc, bool abort) sc->offchannel.scan_req = NULL; sc->offchannel.scan_vif = NULL; sc->offchannel.state = ATH_OFFCHANNEL_IDLE; - ieee80211_scan_completed(sc->hw, abort); + ieee80211_scan_completed(sc->hw, &info); clear_bit(ATH_OP_SCANNING, &common->op_flags); spin_lock_bh(&sc->chan_lock); if (test_bit(ATH_OP_MULTI_CHANNEL, &common->op_flags)) diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c index 7c108047fb46..0e180677c7fc 100644 --- a/drivers/net/wireless/atmel/at76c50x-usb.c +++ b/drivers/net/wireless/atmel/at76c50x-usb.c @@ -1922,6 +1922,9 @@ static void at76_dwork_hw_scan(struct work_struct *work) { struct at76_priv *priv = container_of(work, struct at76_priv, dwork_hw_scan.work); + struct cfg80211_scan_info info = { + .aborted = false, + }; int ret; if (priv->device_unplugged) @@ -1948,7 +1951,7 @@ static void at76_dwork_hw_scan(struct work_struct *work) mutex_unlock(&priv->mtx); - ieee80211_scan_completed(priv->hw, false); + ieee80211_scan_completed(priv->hw, &info); ieee80211_wake_queues(priv->hw); } diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c index eb24b9241bb2..140b6ea8f7cc 100644 --- a/drivers/net/wireless/intel/iwlegacy/common.c +++ b/drivers/net/wireless/intel/iwlegacy/common.c @@ -1305,10 +1305,14 @@ il_send_scan_abort(struct il_priv *il) static void il_complete_scan(struct il_priv *il, bool aborted) { + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + /* check if scan was requested from mac80211 */ if (il->scan_request) { D_SCAN("Complete scan in mac80211\n"); - ieee80211_scan_completed(il->hw, aborted); + ieee80211_scan_completed(il->hw, &info); } il->scan_vif = NULL; diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/scan.c b/drivers/net/wireless/intel/iwlwifi/dvm/scan.c index d01766f16175..17e6a32384d3 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/scan.c @@ -94,10 +94,14 @@ static int iwl_send_scan_abort(struct iwl_priv *priv) static void iwl_complete_scan(struct iwl_priv *priv, bool aborted) { + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + /* check if scan was requested from mac80211 */ if (priv->scan_request) { IWL_DEBUG_SCAN(priv, "Complete scan in mac80211\n"); - ieee80211_scan_completed(priv->hw, aborted); + ieee80211_scan_completed(priv->hw, &info); } priv->scan_type = IWL_SCAN_NORMAL; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index e78fc567ff7d..1cac10c5d818 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -391,13 +391,16 @@ void iwl_mvm_rx_lmac_scan_complete_notif(struct iwl_mvm *mvm, ieee80211_sched_scan_stopped(mvm->hw); mvm->sched_scan_pass_all = SCHED_SCAN_PASS_ALL_DISABLED; } else if (mvm->scan_status & IWL_MVM_SCAN_REGULAR) { + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + IWL_DEBUG_SCAN(mvm, "Regular scan %s, EBS status %s (FW)\n", aborted ? "aborted" : "completed", iwl_mvm_ebs_status_str(scan_notif->ebs_status)); mvm->scan_status &= ~IWL_MVM_SCAN_REGULAR; - ieee80211_scan_completed(mvm->hw, - scan_notif->status == IWL_SCAN_OFFLOAD_ABORTED); + ieee80211_scan_completed(mvm->hw, &info); iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN); del_timer(&mvm->scan_timer); } else { @@ -1430,7 +1433,11 @@ void iwl_mvm_rx_umac_scan_complete_notif(struct iwl_mvm *mvm, /* if the scan is already stopping, we don't need to notify mac80211 */ if (mvm->scan_uid_status[uid] == IWL_MVM_SCAN_REGULAR) { - ieee80211_scan_completed(mvm->hw, aborted); + struct cfg80211_scan_info info = { + .aborted = aborted, + }; + + ieee80211_scan_completed(mvm->hw, &info); iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN); del_timer(&mvm->scan_timer); } else if (mvm->scan_uid_status[uid] == IWL_MVM_SCAN_SCHED) { @@ -1564,7 +1571,11 @@ void iwl_mvm_report_scan_aborted(struct iwl_mvm *mvm) uid = iwl_mvm_scan_uid_by_status(mvm, IWL_MVM_SCAN_REGULAR); if (uid >= 0) { - ieee80211_scan_completed(mvm->hw, true); + struct cfg80211_scan_info info = { + .aborted = true, + }; + + ieee80211_scan_completed(mvm->hw, &info); mvm->scan_uid_status[uid] = 0; } uid = iwl_mvm_scan_uid_by_status(mvm, IWL_MVM_SCAN_SCHED); @@ -1585,8 +1596,13 @@ void iwl_mvm_report_scan_aborted(struct iwl_mvm *mvm) mvm->scan_uid_status[i] = 0; } } else { - if (mvm->scan_status & IWL_MVM_SCAN_REGULAR) - ieee80211_scan_completed(mvm->hw, true); + if (mvm->scan_status & IWL_MVM_SCAN_REGULAR) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + + ieee80211_scan_completed(mvm->hw, &info); + } /* Sched scan will be restarted by mac80211 in * restart_hw, so do not report if FW is about to be @@ -1629,8 +1645,13 @@ out: */ iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN); del_timer(&mvm->scan_timer); - if (notify) - ieee80211_scan_completed(mvm->hw, true); + if (notify) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + + ieee80211_scan_completed(mvm->hw, &info); + } } else if (notify) { ieee80211_sched_scan_stopped(mvm->hw); mvm->sched_scan_pass_all = SCHED_SCAN_PASS_ALL_DISABLED; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 337794f9506e..8c35ac838fce 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1941,8 +1941,12 @@ static void hw_scan_work(struct work_struct *work) mutex_lock(&hwsim->mutex); if (hwsim->scan_chan_idx >= req->n_channels) { + struct cfg80211_scan_info info = { + .aborted = false, + }; + wiphy_debug(hwsim->hw->wiphy, "hw scan complete\n"); - ieee80211_scan_completed(hwsim->hw, false); + ieee80211_scan_completed(hwsim->hw, &info); hwsim->hw_scan_request = NULL; hwsim->hw_scan_vif = NULL; hwsim->tmp_chan = NULL; @@ -2027,13 +2031,16 @@ static void mac80211_hwsim_cancel_hw_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct mac80211_hwsim_data *hwsim = hw->priv; + struct cfg80211_scan_info info = { + .aborted = true, + }; wiphy_debug(hw->wiphy, "hwsim cancel_hw_scan\n"); cancel_delayed_work_sync(&hwsim->hw_scan); mutex_lock(&hwsim->mutex); - ieee80211_scan_completed(hwsim->hw, true); + ieee80211_scan_completed(hwsim->hw, &info); hwsim->tmp_chan = NULL; hwsim->hw_scan_request = NULL; hwsim->hw_scan_vif = NULL; diff --git a/drivers/net/wireless/st/cw1200/scan.c b/drivers/net/wireless/st/cw1200/scan.c index 983788156bb0..0a0ff7e31f5b 100644 --- a/drivers/net/wireless/st/cw1200/scan.c +++ b/drivers/net/wireless/st/cw1200/scan.c @@ -167,6 +167,10 @@ void cw1200_scan_work(struct work_struct *work) } if (!priv->scan.req || (priv->scan.curr == priv->scan.end)) { + struct cfg80211_scan_info info = { + .aborted = priv->scan.status ? 1 : 0, + }; + if (priv->scan.output_power != priv->output_power) wsm_set_output_power(priv, priv->output_power * 10); if (priv->join_status == CW1200_JOIN_STATUS_STA && @@ -188,7 +192,7 @@ void cw1200_scan_work(struct work_struct *work) cw1200_scan_restart_delayed(priv); wsm_unlock_tx(priv); mutex_unlock(&priv->conf_mutex); - ieee80211_scan_completed(priv->hw, priv->scan.status ? 1 : 0); + ieee80211_scan_completed(priv->hw, &info); up(&priv->scan.lock); return; } else { diff --git a/drivers/net/wireless/ti/wl1251/event.c b/drivers/net/wireless/ti/wl1251/event.c index c98630394a1a..d0593bc1f1a9 100644 --- a/drivers/net/wireless/ti/wl1251/event.c +++ b/drivers/net/wireless/ti/wl1251/event.c @@ -36,7 +36,11 @@ static int wl1251_event_scan_complete(struct wl1251 *wl, mbox->scheduled_scan_channels); if (wl->scanning) { - ieee80211_scan_completed(wl->hw, false); + struct cfg80211_scan_info info = { + .aborted = false, + }; + + ieee80211_scan_completed(wl->hw, &info); wl1251_debug(DEBUG_MAC80211, "mac80211 hw scan completed"); wl->scanning = false; if (wl->hw->conf.flags & IEEE80211_CONF_IDLE) diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c index 56384a4e2a35..bbf7604889b7 100644 --- a/drivers/net/wireless/ti/wl1251/main.c +++ b/drivers/net/wireless/ti/wl1251/main.c @@ -448,7 +448,11 @@ static void wl1251_op_stop(struct ieee80211_hw *hw) WARN_ON(wl->state != WL1251_STATE_ON); if (wl->scanning) { - ieee80211_scan_completed(wl->hw, true); + struct cfg80211_scan_info info = { + .aborted = true, + }; + + ieee80211_scan_completed(wl->hw, &info); wl->scanning = false; } diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 10fd24c28ece..69267d592504 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -2615,6 +2615,10 @@ static void __wl1271_op_remove_interface(struct wl1271 *wl, if (wl->scan.state != WL1271_SCAN_STATE_IDLE && wl->scan_wlvif == wlvif) { + struct cfg80211_scan_info info = { + .aborted = true, + }; + /* * Rearm the tx watchdog just before idling scan. This * prevents just-finished scans from triggering the watchdog @@ -2625,7 +2629,7 @@ static void __wl1271_op_remove_interface(struct wl1271 *wl, memset(wl->scan.scanned_ch, 0, sizeof(wl->scan.scanned_ch)); wl->scan_wlvif = NULL; wl->scan.req = NULL; - ieee80211_scan_completed(wl->hw, true); + ieee80211_scan_completed(wl->hw, &info); } if (wl->sched_vif == wlvif) @@ -3649,6 +3653,9 @@ static void wl1271_op_cancel_hw_scan(struct ieee80211_hw *hw, { struct wl1271 *wl = hw->priv; struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif); + struct cfg80211_scan_info info = { + .aborted = true, + }; int ret; wl1271_debug(DEBUG_MAC80211, "mac80211 cancel hw scan"); @@ -3681,7 +3688,7 @@ static void wl1271_op_cancel_hw_scan(struct ieee80211_hw *hw, memset(wl->scan.scanned_ch, 0, sizeof(wl->scan.scanned_ch)); wl->scan_wlvif = NULL; wl->scan.req = NULL; - ieee80211_scan_completed(wl->hw, true); + ieee80211_scan_completed(wl->hw, &info); out_sleep: wl1271_ps_elp_sleep(wl); diff --git a/drivers/net/wireless/ti/wlcore/scan.c b/drivers/net/wireless/ti/wlcore/scan.c index 23343643207a..5612f5916b4e 100644 --- a/drivers/net/wireless/ti/wlcore/scan.c +++ b/drivers/net/wireless/ti/wlcore/scan.c @@ -36,6 +36,9 @@ void wl1271_scan_complete_work(struct work_struct *work) struct delayed_work *dwork; struct wl1271 *wl; struct wl12xx_vif *wlvif; + struct cfg80211_scan_info info = { + .aborted = false, + }; int ret; dwork = to_delayed_work(work); @@ -82,7 +85,7 @@ void wl1271_scan_complete_work(struct work_struct *work) wlcore_cmd_regdomain_config_locked(wl); - ieee80211_scan_completed(wl->hw, false); + ieee80211_scan_completed(wl->hw, &info); out: mutex_unlock(&wl->mutex); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a52009ffc19f..b4faadbb4e01 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4697,9 +4697,10 @@ void ieee80211_wake_queues(struct ieee80211_hw *hw); * any context, including hardirq context. * * @hw: the hardware that finished the scan - * @aborted: set to true if scan was aborted + * @info: information about the completed scan */ -void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted); +void ieee80211_scan_completed(struct ieee80211_hw *hw, + struct cfg80211_scan_info *info); /** * ieee80211_sched_scan_results - got results from scheduled scan diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 54edfb6fc1d1..f56d342c31b8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1250,6 +1250,7 @@ struct ieee80211_local { int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; + struct cfg80211_scan_info scan_info; struct work_struct sched_scan_stopped_work; struct ieee80211_sub_if_data __rcu *sched_scan_sdata; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4ec1c52a1549..8d4a9cd8a39a 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -7,6 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2015 Intel Mobile Communications GmbH + * Copyright 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -70,6 +71,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, .boottime_ns = rx_status->boottime_ns, }; bool signal_valid; + struct ieee80211_sub_if_data *scan_sdata; if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; @@ -83,6 +85,20 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10; bss_meta.chan = channel; + + rcu_read_lock(); + scan_sdata = rcu_dereference(local->scan_sdata); + if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && + scan_sdata->vif.bss_conf.assoc && + ieee80211_have_rx_timestamp(rx_status)) { + bss_meta.parent_tsf = + ieee80211_calculate_rx_timestamp(local, rx_status, + len + FCS_LEN, 24); + ether_addr_copy(bss_meta.parent_bssid, + scan_sdata->vif.bss_conf.bssid); + } + rcu_read_unlock(); + cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) @@ -345,6 +361,11 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (rc == 0) return; + + /* HW scan failed and is going to be reported as done, so clear + * old scan info. + */ + memset(&local->scan_info, 0, sizeof(local->scan_info)); } kfree(local->hw_scan_req); @@ -354,11 +375,8 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) lockdep_is_held(&local->mtx)); if (scan_req != local->int_scan_req) { - struct cfg80211_scan_info info = { - .aborted = aborted, - }; - - cfg80211_scan_done(scan_req, &info); + local->scan_info.aborted = aborted; + cfg80211_scan_done(scan_req, &local->scan_info); } RCU_INIT_POINTER(local->scan_req, NULL); @@ -396,15 +414,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_start_next_roc(local); } -void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +void ieee80211_scan_completed(struct ieee80211_hw *hw, + struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); - trace_api_scan_completed(local, aborted); + trace_api_scan_completed(local, info); set_bit(SCAN_COMPLETED, &local->scanning); - if (aborted) + if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); + + memcpy(&local->scan_info, info, sizeof(*info)); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); @@ -571,6 +593,9 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); + local->hw_scan_req->req.duration = req->duration; + local->hw_scan_req->req.duration_mandatory = + req->duration_mandatory; local->hw_scan_band = 0; @@ -1078,6 +1103,7 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) */ cancel_delayed_work(&local->scan_work); /* and clean up */ + memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); out: mutex_unlock(&local->mtx); -- cgit v1.2.3 From 7d27a0ba7adc8ef30c2aae7592fce4c162aee4df Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Fri, 1 Jul 2016 10:19:34 +0900 Subject: cfg80211: Add mesh peer AID setting API Previously, mesh power management functionality works only with kernel MPM. Because user space MPM did not report mesh peer AID to kernel, the kernel could not identify the bit in TIM element. So this patch adds mesh peer AID setting API. Signed-off-by: Masashi Honma Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/mac80211/cfg.c | 1 + net/wireless/nl80211.c | 6 ++++++ 4 files changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e2658e392a1f..9c23f4d33e06 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -780,6 +780,7 @@ enum station_parameters_apply_mask { * (bitmask of BIT(NL80211_STA_FLAG_...)) * @listen_interval: listen interval or -1 for no change * @aid: AID or zero for no change + * @peer_aid: mesh peer AID or zero for no change * @plink_action: plink action to take * @plink_state: set the peer link state for a station * @ht_capa: HT capabilities of station @@ -811,6 +812,7 @@ struct station_parameters { u32 sta_modify_mask; int listen_interval; u16 aid; + u16 peer_aid; u8 supported_rates_len; u8 plink_action; u8 plink_state; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b39ccab45333..220694151434 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1864,6 +1864,9 @@ enum nl80211_commands { * mandatory. If this flag is not set, the duration is the maximum duration * and the actual measurement duration may be shorter. * + * @NL80211_ATTR_MESH_PEER_AID: Association ID for the mesh peer (u16). This is + * used to pull the stored data for mesh peer in power save state. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2256,6 +2259,8 @@ enum nl80211_attrs { NL80211_ATTR_MEASUREMENT_DURATION, NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY, + NL80211_ATTR_MESH_PEER_AID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0c12e4001f19..47e99ab8d97a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -997,6 +997,7 @@ static void sta_apply_mesh_params(struct ieee80211_local *local, if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) changed = mesh_plink_inc_estab_count(sdata); sta->mesh->plink_state = params->plink_state; + sta->mesh->aid = params->peer_aid; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c53b5462ed00..5782f718d567 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4446,6 +4446,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]); if (params.plink_state >= NUM_NL80211_PLINK_STATES) return -EINVAL; + if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) { + params.peer_aid = nla_get_u16( + info->attrs[NL80211_ATTR_MESH_PEER_AID]); + if (params.peer_aid > IEEE80211_MAX_AID) + return -EINVAL; + } params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE; } -- cgit v1.2.3 From aece0c3fe1f06962a591268b8c236df0ae5d9e4e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 18 Jun 2016 10:45:33 +0200 Subject: nl802154: move PAD to right position The PAD define should be above the experimental support. We don't care about if we break userspace in experimental stuff but PAD is part of the existing UAPI. Cc: Nicolas Dichtel Acked-by: Nicolas Dichtel Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/nl802154.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/nl802154.h b/include/net/nl802154.h index fcab4de49951..7aad2fdfd16a 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -124,6 +124,8 @@ enum nl802154_attrs { NL802154_ATTR_ACKREQ_DEFAULT, + NL802154_ATTR_PAD, + /* add attributes here, update the policy in nl802154.c */ #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL @@ -138,8 +140,6 @@ enum nl802154_attrs { NL802154_ATTR_SEC_KEY, #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ - NL802154_ATTR_PAD, - __NL802154_ATTR_AFTER_LAST, NL802154_ATTR_MAX = __NL802154_ATTR_AFTER_LAST - 1 }; -- cgit v1.2.3 From 66e5c2672cd11b9310008099faf6a4ffb9dfb6d0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 18 Jun 2016 10:45:34 +0200 Subject: ieee802154: add netns support This patch adds netns support for 802.15.4 subsystem. Most parts are copy&pasted from wireless subsystem, it has the identically userspace API. Cc: Nicolas Dichtel Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 13 +++++++++ include/net/nl802154.h | 5 ++++ net/ieee802154/core.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++- net/ieee802154/core.h | 2 ++ net/ieee802154/nl802154.c | 54 ++++++++++++++++++++++++++++++++---- 5 files changed, 138 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 171cd76558fb..795ca4008f72 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -219,9 +219,22 @@ struct wpan_phy { struct device dev; + /* the network namespace this phy lives in currently */ + possible_net_t _net; + char priv[0] __aligned(NETDEV_ALIGN); }; +static inline struct net *wpan_phy_net(struct wpan_phy *wpan_phy) +{ + return read_pnet(&wpan_phy->_net); +} + +static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net) +{ + write_pnet(&wpan_phy->_net, net); +} + struct ieee802154_addr { u8 mode; __le16 pan_id; diff --git a/include/net/nl802154.h b/include/net/nl802154.h index 7aad2fdfd16a..ddcee128f5d9 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -54,6 +54,8 @@ enum nl802154_commands { NL802154_CMD_SET_ACKREQ_DEFAULT, + NL802154_CMD_SET_WPAN_PHY_NETNS, + /* add new commands above here */ #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL @@ -126,6 +128,9 @@ enum nl802154_attrs { NL802154_ATTR_PAD, + NL802154_ATTR_PID, + NL802154_ATTR_NETNS_FD, + /* add attributes here, update the policy in nl802154.c */ #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index c35fdfa6d04e..cb7176cd4cd6 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -140,6 +140,8 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) rdev->wpan_phy.dev.class = &wpan_phy_class; rdev->wpan_phy.dev.platform_data = rdev; + wpan_phy_net_set(&rdev->wpan_phy, &init_net); + init_waitqueue_head(&rdev->dev_wait); return &rdev->wpan_phy; @@ -207,6 +209,49 @@ void wpan_phy_free(struct wpan_phy *phy) } EXPORT_SYMBOL(wpan_phy_free); +int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, + struct net *net) +{ + struct wpan_dev *wpan_dev; + int err = 0; + + list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { + if (!wpan_dev->netdev) + continue; + wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); + if (err) + break; + wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + if (err) { + /* failed -- clean up to old netns */ + net = wpan_phy_net(&rdev->wpan_phy); + + list_for_each_entry_continue_reverse(wpan_dev, + &rdev->wpan_dev_list, + list) { + if (!wpan_dev->netdev) + continue; + wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + err = dev_change_net_namespace(wpan_dev->netdev, net, + "wpan%d"); + WARN_ON(err); + wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + } + + return err; + } + + wpan_phy_net_set(&rdev->wpan_phy, net); + + err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev)); + WARN_ON(err); + + return 0; +} + void cfg802154_dev_free(struct cfg802154_registered_device *rdev) { kfree(rdev); @@ -286,14 +331,34 @@ static struct notifier_block cfg802154_netdev_notifier = { .notifier_call = cfg802154_netdev_notifier_call, }; +static void __net_exit cfg802154_pernet_exit(struct net *net) +{ + struct cfg802154_registered_device *rdev; + + rtnl_lock(); + list_for_each_entry(rdev, &cfg802154_rdev_list, list) { + if (net_eq(wpan_phy_net(&rdev->wpan_phy), net)) + WARN_ON(cfg802154_switch_netns(rdev, &init_net)); + } + rtnl_unlock(); +} + +static struct pernet_operations cfg802154_pernet_ops = { + .exit = cfg802154_pernet_exit, +}; + static int __init wpan_phy_class_init(void) { int rc; - rc = wpan_phy_sysfs_init(); + rc = register_pernet_device(&cfg802154_pernet_ops); if (rc) goto err; + rc = wpan_phy_sysfs_init(); + if (rc) + goto err_sysfs; + rc = register_netdevice_notifier(&cfg802154_netdev_notifier); if (rc) goto err_nl; @@ -315,6 +380,8 @@ err_notifier: unregister_netdevice_notifier(&cfg802154_netdev_notifier); err_nl: wpan_phy_sysfs_exit(); +err_sysfs: + unregister_pernet_device(&cfg802154_pernet_ops); err: return rc; } @@ -326,6 +393,7 @@ static void __exit wpan_phy_class_exit(void) ieee802154_nl_exit(); unregister_netdevice_notifier(&cfg802154_netdev_notifier); wpan_phy_sysfs_exit(); + unregister_pernet_device(&cfg802154_pernet_ops); } module_exit(wpan_phy_class_exit); diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h index 231fade959f3..81141f58d079 100644 --- a/net/ieee802154/core.h +++ b/net/ieee802154/core.h @@ -38,6 +38,8 @@ wpan_phy_to_rdev(struct wpan_phy *wpan_phy) extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; +int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, + struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 116187b5c267..d90a4ed5b8a0 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -80,7 +80,8 @@ __cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs) list_for_each_entry(rdev, &cfg802154_rdev_list, list) { struct wpan_dev *wpan_dev; - /* TODO netns compare */ + if (wpan_phy_net(&rdev->wpan_phy) != netns) + continue; if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx) continue; @@ -175,7 +176,8 @@ __cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs) if (!rdev) return ERR_PTR(-ENODEV); - /* TODO netns compare */ + if (netns != wpan_phy_net(&rdev->wpan_phy)) + return ERR_PTR(-ENODEV); return rdev; } @@ -233,6 +235,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, + [NL802154_ATTR_PID] = { .type = NLA_U32 }, + [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, @@ -590,7 +594,6 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct cfg802154_registered_device *rdev; int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]); - /* TODO netns */ netdev = __dev_get_by_index(&init_net, ifidx); if (!netdev) return -ENODEV; @@ -629,7 +632,8 @@ nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb) } list_for_each_entry(rdev, &cfg802154_rdev_list, list) { - /* TODO net ns compare */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) + continue; if (++idx <= state->start) continue; if (state->filter_wpan_phy != -1 && @@ -871,7 +875,8 @@ nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { - /* TODO netns compare */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) + continue; if (wp_idx < wp_start) { wp_idx++; continue; @@ -1271,6 +1276,37 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } +static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net *net; + int err; + + if (info->attrs[NL802154_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]); + + net = get_net_ns_by_pid(pid); + } else if (info->attrs[NL802154_ATTR_NETNS_FD]) { + u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]); + + net = get_net_ns_by_fd(fd); + } else { + return -EINVAL; + } + + if (IS_ERR(net)) + return PTR_ERR(net); + + err = 0; + + /* check if anything to do */ + if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net)) + err = cfg802154_switch_netns(rdev, net); + + put_net(net); + return err; +} + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, @@ -2261,6 +2297,14 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS, + .doit = nl802154_wpan_phy_netns, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, { .cmd = NL802154_CMD_SET_PAN_ID, .doit = nl802154_set_pan_id, -- cgit v1.2.3 From 9cc577dd25b9762df7f353658426bb2e048c480a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 6 Jul 2016 23:32:24 +0200 Subject: ieee802154: add ieee802154_skb_dst_pan helper This patch adds ieee802154_skb_dst_pan function to get the pointer address of the destination pan id at skb mac pointer. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/linux/ieee802154.h | 16 ++++++++++++++++ include/net/mac802154.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/net') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index acedbb68a5a3..91f4665fea63 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -31,6 +31,8 @@ #define IEEE802154_MIN_PSDU_LEN 9 #define IEEE802154_FCS_LEN 2 #define IEEE802154_MAX_AUTH_TAG_LEN 16 +#define IEEE802154_FC_LEN 2 +#define IEEE802154_SEQ_LEN 1 /* General MAC frame format: * 2 bytes: Frame Control @@ -221,9 +223,14 @@ enum { #define IEEE802154_FCTL_ACKREQ 0x0020 #define IEEE802154_FCTL_SECEN 0x0004 #define IEEE802154_FCTL_INTRA_PAN 0x0040 +#define IEEE802154_FCTL_DADDR 0x0c00 #define IEEE802154_FTYPE_DATA 0x0001 +#define IEEE802154_FCTL_ADDR_NONE 0x0000 +#define IEEE802154_FCTL_DADDR_SHORT 0x0800 +#define IEEE802154_FCTL_DADDR_EXTENDED 0x0c00 + /* * ieee802154_is_data - check if type is IEEE802154_FTYPE_DATA * @fc: frame control bytes in little-endian byteorder @@ -261,6 +268,15 @@ static inline bool ieee802154_is_intra_pan(__le16 fc) return fc & cpu_to_le16(IEEE802154_FCTL_INTRA_PAN); } +/* + * ieee802154_daddr_mode - get daddr mode from fc + * @fc: frame control bytes in little-endian byteorder + */ +static inline __le16 ieee802154_daddr_mode(__le16 fc) +{ + return fc & cpu_to_le16(IEEE802154_FCTL_DADDR); +} + /** * ieee802154_is_valid_psdu_len - check if psdu len is valid * available lengths: diff --git a/include/net/mac802154.h b/include/net/mac802154.h index e465c8551ac3..b3f7cd868fe9 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -257,6 +257,35 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) return get_unaligned_le16(skb_mac_header(skb)); } +/** + * ieee802154_skb_dst_pan - get the pointer to destination pan field + * @fc: mac header frame control field + * @skb: skb where the destination pan pointer will be get from + */ +static inline unsigned char *ieee802154_skb_dst_pan(__le16 fc, + const struct sk_buff *skb) +{ + unsigned char *dst_pan; + + switch (ieee802154_daddr_mode(fc)) { + case cpu_to_le16(IEEE802154_FCTL_ADDR_NONE): + dst_pan = NULL; + break; + case cpu_to_le16(IEEE802154_FCTL_DADDR_SHORT): + case cpu_to_le16(IEEE802154_FCTL_DADDR_EXTENDED): + dst_pan = skb_mac_header(skb) + + IEEE802154_FC_LEN + + IEEE802154_SEQ_LEN; + break; + default: + WARN_ONCE(1, "invalid addr mode detected"); + dst_pan = NULL; + break; + } + + return dst_pan; +} + /** * ieee802154_be64_to_le64 - copies and convert be64 to le64 * @le64_dst: le64 destination pointer -- cgit v1.2.3 From 19580cc1ed299c736b56b45c7576b477f185f8f5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 6 Jul 2016 23:32:25 +0200 Subject: ieee802154: add ieee802154_skb_src_pan helper This patch adds ieee802154_skb_src_pan function to get the pointer address of the source pan id at skb mac pointer. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/linux/ieee802154.h | 13 ++++++++++ include/net/mac802154.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'include/net') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index 91f4665fea63..ddb890174a0e 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -50,6 +50,7 @@ #define IEEE802154_EXTENDED_ADDR_LEN 8 #define IEEE802154_SHORT_ADDR_LEN 2 +#define IEEE802154_PAN_ID_LEN 2 #define IEEE802154_LIFS_PERIOD 40 #define IEEE802154_SIFS_PERIOD 12 @@ -224,12 +225,15 @@ enum { #define IEEE802154_FCTL_SECEN 0x0004 #define IEEE802154_FCTL_INTRA_PAN 0x0040 #define IEEE802154_FCTL_DADDR 0x0c00 +#define IEEE802154_FCTL_SADDR 0xc000 #define IEEE802154_FTYPE_DATA 0x0001 #define IEEE802154_FCTL_ADDR_NONE 0x0000 #define IEEE802154_FCTL_DADDR_SHORT 0x0800 #define IEEE802154_FCTL_DADDR_EXTENDED 0x0c00 +#define IEEE802154_FCTL_SADDR_SHORT 0x8000 +#define IEEE802154_FCTL_SADDR_EXTENDED 0xc000 /* * ieee802154_is_data - check if type is IEEE802154_FTYPE_DATA @@ -277,6 +281,15 @@ static inline __le16 ieee802154_daddr_mode(__le16 fc) return fc & cpu_to_le16(IEEE802154_FCTL_DADDR); } +/* + * ieee802154_saddr_mode - get saddr mode from fc + * @fc: frame control bytes in little-endian byteorder + */ +static inline __le16 ieee802154_saddr_mode(__le16 fc) +{ + return fc & cpu_to_le16(IEEE802154_FCTL_SADDR); +} + /** * ieee802154_is_valid_psdu_len - check if psdu len is valid * available lengths: diff --git a/include/net/mac802154.h b/include/net/mac802154.h index b3f7cd868fe9..ec01b35bc969 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -286,6 +286,65 @@ static inline unsigned char *ieee802154_skb_dst_pan(__le16 fc, return dst_pan; } +/** + * ieee802154_skb_src_pan - get the pointer to source pan field + * @fc: mac header frame control field + * @skb: skb where the source pan pointer will be get from + */ +static inline unsigned char *ieee802154_skb_src_pan(__le16 fc, + const struct sk_buff *skb) +{ + unsigned char *src_pan; + + switch (ieee802154_saddr_mode(fc)) { + case cpu_to_le16(IEEE802154_FCTL_ADDR_NONE): + src_pan = NULL; + break; + case cpu_to_le16(IEEE802154_FCTL_SADDR_SHORT): + case cpu_to_le16(IEEE802154_FCTL_SADDR_EXTENDED): + /* if intra-pan and source addr mode is non none, + * then source pan id is equal destination pan id. + */ + if (ieee802154_is_intra_pan(fc)) { + src_pan = ieee802154_skb_dst_pan(fc, skb); + break; + } + + switch (ieee802154_daddr_mode(fc)) { + case cpu_to_le16(IEEE802154_FCTL_ADDR_NONE): + src_pan = skb_mac_header(skb) + + IEEE802154_FC_LEN + + IEEE802154_SEQ_LEN; + break; + case cpu_to_le16(IEEE802154_FCTL_DADDR_SHORT): + src_pan = skb_mac_header(skb) + + IEEE802154_FC_LEN + + IEEE802154_SEQ_LEN + + IEEE802154_PAN_ID_LEN + + IEEE802154_SHORT_ADDR_LEN; + break; + case cpu_to_le16(IEEE802154_FCTL_DADDR_EXTENDED): + src_pan = skb_mac_header(skb) + + IEEE802154_FC_LEN + + IEEE802154_SEQ_LEN + + IEEE802154_PAN_ID_LEN + + IEEE802154_EXTENDED_ADDR_LEN; + break; + default: + WARN_ONCE(1, "invalid addr mode detected"); + src_pan = NULL; + break; + } + break; + default: + WARN_ONCE(1, "invalid addr mode detected"); + src_pan = NULL; + break; + } + + return src_pan; +} + /** * ieee802154_be64_to_le64 - copies and convert be64 to le64 * @le64_dst: le64 destination pointer -- cgit v1.2.3 From 0ea0b9af9b7599ada307258dc841f4300873e8a1 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 6 Jul 2016 23:32:26 +0200 Subject: ieee802154: 6lowpan: fix intra pan id check The RIOT-OS stack does send intra-pan frames but don't set the intra pan flag inside the mac header. It seems this is valid frame addressing but inefficient. Anyway this patch adds a new function for intra pan addressing, doesn't matter if intra pan flag or source and destination are the same. The newly introduction function will be used to check on intra pan addressing for 6lowpan. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 19 +++++++++++++++++++ net/ieee802154/6lowpan/rx.c | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index ec01b35bc969..d757edd0b0b7 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -345,6 +345,25 @@ static inline unsigned char *ieee802154_skb_src_pan(__le16 fc, return src_pan; } +/** + * ieee802154_skb_is_intra_pan_addressing - checks whenever the mac addressing + * is an intra pan communication + * @fc: mac header frame control field + * @skb: skb where the source and destination pan should be get from + */ +static inline bool ieee802154_skb_is_intra_pan_addressing(__le16 fc, + const struct sk_buff *skb) +{ + unsigned char *dst_pan = ieee802154_skb_dst_pan(fc, skb), + *src_pan = ieee802154_skb_src_pan(fc, skb); + + /* if one is NULL is no intra pan addressing */ + if (!dst_pan || !src_pan) + return false; + + return !memcmp(dst_pan, src_pan, IEEE802154_PAN_ID_LEN); +} + /** * ieee802154_be64_to_le64 - copies and convert be64 to le64 * @le64_dst: le64 destination pointer diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index ef185dd4110d..649e7d45e88f 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -262,7 +262,7 @@ static inline bool lowpan_rx_h_check(struct sk_buff *skb) /* check on ieee802154 conform 6LoWPAN header */ if (!ieee802154_is_data(fc) || - !ieee802154_is_intra_pan(fc)) + !ieee802154_skb_is_intra_pan_addressing(fc, skb)) return false; /* check if we can dereference the dispatch */ -- cgit v1.2.3 From aaa7088eb29a0ffe6cf8d8a443695df41e5a62f3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 6 Jul 2016 23:32:27 +0200 Subject: ieee802154: fix skb get fc on big endian This patch fixes ieee802154_get_fc_from_skb function on big endian machines. The function get_unaligned_le16 converts the byte order to host byte order but we want to keep the byte order like in mac header. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index d757edd0b0b7..bb7bfecc5ab3 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -247,6 +247,8 @@ struct ieee802154_ops { */ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) { + __le16 fc; + /* check if we can fc at skb_mac_header of sk buffer */ if (unlikely(!skb_mac_header_was_set(skb) || (skb_tail_pointer(skb) - skb_mac_header(skb)) < 2)) { @@ -254,7 +256,8 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) return cpu_to_le16(0); } - return get_unaligned_le16(skb_mac_header(skb)); + memcpy(&fc, skb_mac_header(skb), IEEE802154_FC_LEN); + return fc; } /** -- cgit v1.2.3 From 048e7f7e66a8693fe1707997bffd19d08cde08f5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 6 Jul 2016 23:32:30 +0200 Subject: ieee802154: cleanup WARN_ON for fc fetch This patch cleanups the WARN_ON which occurs when the sk buffer has insufficient buffer space by moving the WARN_ON into if condition. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index bb7bfecc5ab3..286824acd008 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -250,11 +250,10 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) __le16 fc; /* check if we can fc at skb_mac_header of sk buffer */ - if (unlikely(!skb_mac_header_was_set(skb) || - (skb_tail_pointer(skb) - skb_mac_header(skb)) < 2)) { - WARN_ON(1); + if (WARN_ON(!skb_mac_header_was_set(skb) || + (skb_tail_pointer(skb) - + skb_mac_header(skb)) < IEEE802154_FC_LEN)) return cpu_to_le16(0); - } memcpy(&fc, skb_mac_header(skb), IEEE802154_FC_LEN); return fc; -- cgit v1.2.3 From d390238c4fba7c87a3bcd859ce3373c864eb7b02 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 6 Jul 2016 20:03:54 -0400 Subject: net: dsa: initialize the routing table The routing table of every switch in a tree is currently initialized to all zeros. This is an issue since 0 is a valid port number. Add a DSA_RTABLE_NONE=-1 constant to initialize the signed values of the routing table pointing to other switches. This fixes the device mapping of the mv88e6xxx driver where the port pointing to the switch itself and to non-existent switches was wrongly configured to be 0. It is now set to the expected 0xf value. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ net/dsa/dsa.c | 6 ++++++ net/dsa/dsa2.c | 7 ++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 20b3087ad193..52ab18bc2b0d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -32,6 +32,8 @@ enum dsa_tag_protocol { #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 +#define DSA_RTABLE_NONE -1 + struct dsa_chip_data { /* * How to access the switch configuration registers. diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 766d2a525ada..7e68bc6bc853 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -774,11 +774,17 @@ static int dsa_of_probe(struct device *dev) chip_index = -1; for_each_available_child_of_node(np, child) { + int i; + chip_index++; cd = &pd->chip[chip_index]; cd->of_node = child; + /* Initialize the routing table */ + for (i = 0; i < DSA_MAX_SWITCHES; ++i) + cd->rtable[i] = DSA_RTABLE_NONE; + /* When assigning the host device, increment its refcount */ cd->host_dev = get_device(&mdio_bus->dev); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 83b95fc4cede..78e4c0131c30 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -595,7 +595,7 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) struct device_node *ports = dsa_get_ports(ds, np); struct dsa_switch_tree *dst; u32 tree, index; - int err; + int i, err; err = dsa_parse_member(np, &tree, &index); if (err) @@ -622,6 +622,11 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) ds->dst = dst; ds->index = index; + + /* Initialize the routing table */ + for (i = 0; i < DSA_MAX_SWITCHES; ++i) + ds->rtable[i] = DSA_RTABLE_NONE; + dsa_dst_add_ds(dst, ds, index); err = dsa_dst_complete(dst); -- cgit v1.2.3 From ca8bee5dde1f02c2dbe8c8453dce27f2dfafb21c Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 5 Jul 2016 14:30:14 +0200 Subject: Bluetooth: Rename HCI_BREDR into HCI_PRIMARY The HCI_BREDR naming is confusing since it actually stands for Primary Bluetooth Controller. Which is a term that has been used in the latest standard. However from a legacy point of view there only really have been Basic Rate (BR) and Enhanced Data Rate (EDR). Recent versions of Bluetooth introduced Low Energy (LE) and made this terminology a little bit confused since Dual Mode Controllers include BR/EDR and LE. To simplify this the name HCI_PRIMARY stands for the Primary Controller which can be a single mode or dual mode controller. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- drivers/bluetooth/btmrvl_main.c | 2 +- drivers/bluetooth/btsdio.c | 2 +- drivers/bluetooth/btusb.c | 2 +- drivers/bluetooth/hci_ldisc.c | 2 +- drivers/bluetooth/hci_vhci.c | 6 +++--- include/net/bluetooth/hci.h | 2 +- net/bluetooth/hci_conn.c | 2 +- net/bluetooth/hci_core.c | 28 +++++++++++++--------------- net/bluetooth/hci_event.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/l2cap_core.c | 2 +- net/bluetooth/mgmt.c | 16 ++++++++-------- 12 files changed, 33 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c index 7ad8d61c0c61..e6a85f0e6309 100644 --- a/drivers/bluetooth/btmrvl_main.c +++ b/drivers/bluetooth/btmrvl_main.c @@ -138,7 +138,7 @@ int btmrvl_process_event(struct btmrvl_private *priv, struct sk_buff *skb) if (event->length > 3 && event->data[3]) priv->btmrvl_dev.dev_type = HCI_AMP; else - priv->btmrvl_dev.dev_type = HCI_BREDR; + priv->btmrvl_dev.dev_type = HCI_PRIMARY; BT_DBG("dev_type: %d", priv->btmrvl_dev.dev_type); } else if (priv->btmrvl_dev.sendcmdflag && diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c index 2b05661e3818..1cb958e199eb 100644 --- a/drivers/bluetooth/btsdio.c +++ b/drivers/bluetooth/btsdio.c @@ -311,7 +311,7 @@ static int btsdio_probe(struct sdio_func *func, if (id->class == SDIO_CLASS_BT_AMP) hdev->dev_type = HCI_AMP; else - hdev->dev_type = HCI_BREDR; + hdev->dev_type = HCI_PRIMARY; data->hdev = hdev; diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 880bb5546b25..f2e8fd701bf6 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2832,7 +2832,7 @@ static int btusb_probe(struct usb_interface *intf, if (id->driver_info & BTUSB_AMP) hdev->dev_type = HCI_AMP; else - hdev->dev_type = HCI_BREDR; + hdev->dev_type = HCI_PRIMARY; data->hdev = hdev; diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 49b3e1e2d236..dda97398c59a 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -609,7 +609,7 @@ static int hci_uart_register_dev(struct hci_uart *hu) if (test_bit(HCI_UART_CREATE_AMP, &hu->hdev_flags)) hdev->dev_type = HCI_AMP; else - hdev->dev_type = HCI_BREDR; + hdev->dev_type = HCI_PRIMARY; if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return 0; diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index aba31210c802..3ff229b2e7f3 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -97,10 +97,10 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode) if (data->hdev) return -EBADFD; - /* bits 0-1 are dev_type (BR/EDR or AMP) */ + /* bits 0-1 are dev_type (Primary or AMP) */ dev_type = opcode & 0x03; - if (dev_type != HCI_BREDR && dev_type != HCI_AMP) + if (dev_type != HCI_PRIMARY && dev_type != HCI_AMP) return -EINVAL; /* bits 2-5 are reserved (must be zero) */ @@ -316,7 +316,7 @@ static void vhci_open_timeout(struct work_struct *work) struct vhci_data *data = container_of(work, struct vhci_data, open_timeout.work); - vhci_create_device(data, amp ? HCI_AMP : HCI_BREDR); + vhci_create_device(data, amp ? HCI_AMP : HCI_PRIMARY); } static int vhci_open(struct inode *inode, struct file *file) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index eefcf3e96421..a3f86de6f100 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -65,7 +65,7 @@ #define HCI_I2C 8 /* HCI controller types */ -#define HCI_BREDR 0x00 +#define HCI_PRIMARY 0x00 #define HCI_AMP 0x01 /* First BR/EDR Controller shall have ID = 0 */ diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index bf9f8a801a2e..3809617aa98d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -625,7 +625,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || hci_dev_test_flag(d, HCI_USER_CHANNEL) || - d->dev_type != HCI_BREDR) + d->dev_type != HCI_PRIMARY) continue; /* Simple routing: diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 45a9fc68c677..98f6c3770736 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -260,14 +260,12 @@ static int hci_init1_req(struct hci_request *req, unsigned long opt) hci_reset_req(req, 0); switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: bredr_init(req); break; - case HCI_AMP: amp_init1(req); break; - default: BT_ERR("Unknown device type %d", hdev->dev_type); break; @@ -791,11 +789,11 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; - /* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode + /* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode * BR/EDR/LE type controllers. AMP controllers only need the * first two stages of init. */ - if (hdev->dev_type != HCI_BREDR) + if (hdev->dev_type != HCI_PRIMARY) return 0; err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL); @@ -1202,7 +1200,7 @@ int hci_inquiry(void __user *arg) goto done; } - if (hdev->dev_type != HCI_BREDR) { + if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } @@ -1307,7 +1305,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) * since AMP controllers do not have an address. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - hdev->dev_type == HCI_BREDR && + hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; @@ -1402,7 +1400,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT) && - hdev->dev_type == HCI_BREDR) { + hdev->dev_type == HCI_PRIMARY) { ret = __hci_req_hci_power_on(hdev); mgmt_power_on(hdev, ret); } @@ -1563,7 +1561,7 @@ int hci_dev_do_close(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); - if (!auto_off && hdev->dev_type == HCI_BREDR && + if (!auto_off && hdev->dev_type == HCI_PRIMARY && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); @@ -1802,7 +1800,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) goto done; } - if (hdev->dev_type != HCI_BREDR) { + if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } @@ -2043,7 +2041,7 @@ static void hci_power_on(struct work_struct *work) */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || - (hdev->dev_type == HCI_BREDR && + (hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); @@ -3030,7 +3028,7 @@ int hci_register_dev(struct hci_dev *hdev) * so the index can be used as the AMP controller ID. */ switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL); break; case HCI_AMP: @@ -3090,7 +3088,7 @@ int hci_register_dev(struct hci_dev *hdev) hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); - if (hdev->dev_type == HCI_BREDR) { + if (hdev->dev_type == HCI_PRIMARY) { /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ @@ -3415,7 +3413,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: hci_add_acl_hdr(skb, conn->handle, flags); break; case HCI_AMP: @@ -3826,7 +3824,7 @@ static void hci_sched_acl(struct hci_dev *hdev) BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ - if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_BREDR) + if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY) return; /* No AMP link over AMP controller */ diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d4b3dd5413be..3fb95c47243c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3249,7 +3249,7 @@ static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev, struct hci_chan *chan; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: return hci_conn_hash_lookup_handle(hdev, handle); case HCI_AMP: chan = hci_chan_lookup_handle(hdev, handle); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 12e9294b02e0..6ef8a01a9ad4 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -676,7 +676,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; - if (hdev->dev_type != HCI_BREDR) + if (hdev->dev_type != HCI_PRIMARY) return -EOPNOTSUPP; switch (cmd) { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index eb4f5f24cbe3..54ceb1f2cc9a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7468,7 +7468,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) int len; /* For AMP controller do not create l2cap conn */ - if (!conn && hcon->hdev->dev_type != HCI_BREDR) + if (!conn && hcon->hdev->dev_type != HCI_PRIMARY) goto drop; if (!conn) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9e4b931588cf..7983ec8d4c60 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -359,7 +359,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -384,7 +384,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); @@ -419,7 +419,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -444,7 +444,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR && + if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); @@ -479,7 +479,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR || d->dev_type == HCI_AMP) + if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP) count++; } @@ -503,7 +503,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR) { + if (d->dev_type == HCI_PRIMARY) { if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) rp->entry[count].type = 0x01; else @@ -6366,7 +6366,7 @@ void mgmt_index_added(struct hci_dev *hdev) return; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS); @@ -6399,7 +6399,7 @@ void mgmt_index_removed(struct hci_dev *hdev) return; switch (hdev->dev_type) { - case HCI_BREDR: + case HCI_PRIMARY: mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { -- cgit v1.2.3 From 1d984c2e03c1fb21539a9f50627e312788512013 Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:39 +0200 Subject: NFC: digital: Fix handling of saved PDU sk_buff pointers This patch fixes the way an I-PDU is saved in case it needs to be sent again. It is now copied using pskb_copy() and not simply referenced using skb_get() since it could be modified by the driver. digital_in_send_saved_skb() and digital_tg_send_saved_skb() still get a reference on the saved skb which is re-sent but release it if the send operation fails. That way the caller doesn't have to take care about skb ref in case of error. RTOX supervisor PDU must not be saved as this can override a previously saved I-PDU that should be re-sent later on. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- include/net/nfc/digital.h | 1 - net/nfc/digital_dep.c | 59 +++++++++++++++++++++++------------------------ 2 files changed, 29 insertions(+), 31 deletions(-) (limited to 'include/net') diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h index 506e3f6eabef..f9a4e4771861 100644 --- a/include/net/nfc/digital.h +++ b/include/net/nfc/digital.h @@ -237,7 +237,6 @@ struct nfc_digital_dev { int nack_count; struct sk_buff *saved_skb; - unsigned int saved_skb_len; u16 target_fsc; diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index b62c85dc12a2..804585cb3f8e 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -524,8 +524,7 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; + ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, data_exch); @@ -627,16 +626,10 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, data_exch); - if (rc) { + if (rc) kfree_skb(skb); - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; - } return rc; } @@ -644,11 +637,19 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev, struct digital_data_exch *data_exch) { + int rc; + + if (!ddev->saved_skb) + return -EINVAL; + skb_get(ddev->saved_skb); - skb_push(ddev->saved_skb, ddev->saved_skb_len); - return digital_in_send_cmd(ddev, ddev->saved_skb, 1500, - digital_in_recv_dep_res, data_exch); + rc = digital_in_send_cmd(ddev, ddev->saved_skb, 1500, + digital_in_recv_dep_res, data_exch); + if (rc) + kfree_skb(ddev->saved_skb); + + return rc; } static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, @@ -812,17 +813,12 @@ static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg, case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: if (!DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { /* ATN */ rc = digital_in_send_saved_skb(ddev, data_exch); - if (rc) { - kfree_skb(ddev->saved_skb); + if (rc) goto error; - } return; } - kfree_skb(ddev->saved_skb); - ddev->saved_skb = NULL; - rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]); if (rc) goto error; @@ -876,8 +872,7 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(tmp_skb); - ddev->saved_skb = skb_get(tmp_skb); - ddev->saved_skb_len = tmp_skb->len; + ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res, data_exch); @@ -956,8 +951,7 @@ static int digital_tg_send_ack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - ddev->saved_skb = skb_get(skb); - ddev->saved_skb_len = skb->len; + ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); rc = digital_tg_send_cmd(ddev, skb, 1500, digital_tg_recv_dep_req, data_exch); @@ -1009,11 +1003,19 @@ static int digital_tg_send_atn(struct nfc_digital_dev *ddev) static int digital_tg_send_saved_skb(struct nfc_digital_dev *ddev) { + int rc; + + if (!ddev->saved_skb) + return -EINVAL; + skb_get(ddev->saved_skb); - skb_push(ddev->saved_skb, ddev->saved_skb_len); - return digital_tg_send_cmd(ddev, ddev->saved_skb, 1500, - digital_tg_recv_dep_req, NULL); + rc = digital_tg_send_cmd(ddev, ddev->saved_skb, 1500, + digital_tg_recv_dep_req, NULL); + if (rc) + kfree_skb(ddev->saved_skb); + + return rc; } static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, @@ -1163,10 +1165,8 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, ddev->atn_count = 0; rc = digital_tg_send_saved_skb(ddev); - if (rc) { - kfree_skb(ddev->saved_skb); + if (rc) goto exit; - } } return; @@ -1235,8 +1235,7 @@ int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb) ddev->skb_add_crc(tmp_skb); - ddev->saved_skb = skb_get(tmp_skb); - ddev->saved_skb_len = tmp_skb->len; + ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); rc = digital_tg_send_cmd(ddev, tmp_skb, 1500, digital_tg_recv_dep_req, NULL); -- cgit v1.2.3 From 1a09c56f545c8ff8d338a38c7c40d79f4165a94c Mon Sep 17 00:00:00 2001 From: Thierry Escande Date: Fri, 8 Jul 2016 15:52:45 +0200 Subject: NFC: digital: Add support for NFC DEP Response Waiting Time When sending an ATR_REQ, the initiator must wait for the ATR_RES at least 'RWT(nfcdep,activation) + dRWT(nfcdep)' and no more than 'RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator)'. This gives a timeout value between 1237 ms and 1337 ms. This patch defines DIGITAL_ATR_RES_RWT to 1337 used for the timeout value of ATR_REQ command. For other DEP PDUs, the initiator must wait between 'RWT + dRWT(nfcdep)' and 'RWT + dRWT(nfcdep) + dT(nfcdep,initiator)' where RWT is given by the following formula: '(256 * 16 / f(c)) * 2^wt' where wt is the value of the TO field in the ATR_RES response and is in the range between 0 and 14. This patch declares a mapping table for wt values and gives RWT max values between 100 ms and 5049 ms. This patch also defines DIGITAL_ATR_RES_TO_WT, the maximum wt value in target mode, to 8. Signed-off-by: Thierry Escande Signed-off-by: Samuel Ortiz --- include/net/nfc/digital.h | 1 + net/nfc/digital_dep.c | 71 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/nfc/digital.h b/include/net/nfc/digital.h index f9a4e4771861..74fa7eb94e72 100644 --- a/include/net/nfc/digital.h +++ b/include/net/nfc/digital.h @@ -226,6 +226,7 @@ struct nfc_digital_dev { u8 curr_rf_tech; u8 curr_nfc_dep_pni; u8 did; + u16 dep_rwt; u8 local_payload_max; u8 remote_payload_max; diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index ba52a5dbf3cc..6cf2eeb2e865 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -35,6 +35,8 @@ #define DIGITAL_ATR_REQ_MIN_SIZE 16 #define DIGITAL_ATR_REQ_MAX_SIZE 64 +#define DIGITAL_ATR_RES_TO_WT(s) ((s) & 0xF) + #define DIGITAL_DID_MAX 14 #define DIGITAL_PAYLOAD_SIZE_MAX 254 @@ -122,6 +124,37 @@ static const u8 digital_payload_bits_map[4] = { [3] = 254 }; +/* Response Waiting Time for ATR_RES PDU in ms + * + * RWT(ATR_RES) = RWT(nfcdep,activation) + dRWT(nfcdep) + dT(nfcdep,initiator) + * + * with: + * RWT(nfcdep,activation) = 4096 * 2^12 / f(c) s + * dRWT(nfcdep) = 16 / f(c) s + * dT(nfcdep,initiator) = 100 ms + * f(c) = 13560000 Hz + */ +#define DIGITAL_ATR_RES_RWT 1337 + +/* Response Waiting Time for other DEP PDUs in ms + * + * max_rwt = rwt + dRWT(nfcdep) + dT(nfcdep,initiator) + * + * with: + * rwt = (256 * 16 / f(c)) * 2^wt s + * dRWT(nfcdep) = 16 / f(c) s + * dT(nfcdep,initiator) = 100 ms + * f(c) = 13560000 Hz + * 0 <= wt <= 14 (given by the target by the TO field of ATR_RES response) + */ +#define DIGITAL_NFC_DEP_IN_MAX_WT 14 +#define DIGITAL_NFC_DEP_TG_MAX_WT 8 +static const u16 digital_rwt_map[DIGITAL_NFC_DEP_IN_MAX_WT + 1] = { + 100, 101, 101, 102, 105, + 110, 119, 139, 177, 255, + 409, 719, 1337, 2575, 5049, +}; + static u8 digital_payload_bits_to_size(u8 payload_bits) { if (payload_bits >= ARRAY_SIZE(digital_payload_bits_map)) @@ -366,8 +399,8 @@ static int digital_in_send_psl_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res, - target); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_psl_res, target); if (rc) kfree_skb(skb); @@ -380,6 +413,7 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, struct nfc_target *target = arg; struct digital_atr_res *atr_res; u8 gb_len, payload_bits; + u8 wt; int rc; if (IS_ERR(resp)) { @@ -409,6 +443,11 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, atr_res = (struct digital_atr_res *)resp->data; + wt = DIGITAL_ATR_RES_TO_WT(atr_res->to); + if (wt > DIGITAL_NFC_DEP_IN_MAX_WT) + wt = DIGITAL_NFC_DEP_IN_MAX_WT; + ddev->dep_rwt = digital_rwt_map[wt]; + payload_bits = DIGITAL_PAYLOAD_PP_TO_BITS(atr_res->pp); ddev->remote_payload_max = digital_payload_bits_to_size(payload_bits); @@ -490,8 +529,8 @@ int digital_in_send_atr_req(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res, - target); + rc = digital_in_send_cmd(ddev, skb, DIGITAL_ATR_RES_RWT, + digital_in_recv_atr_res, target); if (rc) kfree_skb(skb); @@ -524,8 +563,8 @@ static int digital_in_send_ack(struct nfc_digital_dev *ddev, ddev->saved_skb = pskb_copy(skb, GFP_KERNEL); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) { kfree_skb(skb); kfree_skb(ddev->saved_skb); @@ -559,8 +598,8 @@ static int digital_in_send_nack(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -590,8 +629,8 @@ static int digital_in_send_atn(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -624,8 +663,8 @@ static int digital_in_send_rtox(struct nfc_digital_dev *ddev, ddev->skb_add_crc(skb); - rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(skb); @@ -642,7 +681,7 @@ static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev, skb_get(ddev->saved_skb); - rc = digital_in_send_cmd(ddev, ddev->saved_skb, 1500, + rc = digital_in_send_cmd(ddev, ddev->saved_skb, ddev->dep_rwt, digital_in_recv_dep_res, data_exch); if (rc) kfree_skb(ddev->saved_skb); @@ -885,8 +924,8 @@ int digital_in_send_dep_req(struct nfc_digital_dev *ddev, ddev->saved_skb = pskb_copy(tmp_skb, GFP_KERNEL); - rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res, - data_exch); + rc = digital_in_send_cmd(ddev, tmp_skb, ddev->dep_rwt, + digital_in_recv_dep_res, data_exch); if (rc) { if (tmp_skb != skb) kfree_skb(tmp_skb); @@ -1465,7 +1504,7 @@ static int digital_tg_send_atr_res(struct nfc_digital_dev *ddev, atr_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN; atr_res->cmd = DIGITAL_CMD_ATR_RES; memcpy(atr_res->nfcid3, atr_req->nfcid3, sizeof(atr_req->nfcid3)); - atr_res->to = 8; + atr_res->to = DIGITAL_NFC_DEP_TG_MAX_WT; ddev->local_payload_max = DIGITAL_PAYLOAD_SIZE_MAX; payload_bits = digital_payload_size_to_bits(ddev->local_payload_max); -- cgit v1.2.3 From 64b87639c9cbeb03e26bc65528416c961b1dde96 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 3 Jul 2016 13:18:43 +0800 Subject: netfilter: conntrack: fix race between nf_conntrack proc read and hash resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we do "cat /proc/net/nf_conntrack", and meanwhile resize the conntrack hash table via /sys/module/nf_conntrack/parameters/hashsize, race will happen, because reader can observe a newly allocated hash but the old size (or vice versa). So oops will happen like follows: BUG: unable to handle kernel NULL pointer dereference at 0000000000000017 IP: [] seq_print_acct+0x11/0x50 [nf_conntrack] Call Trace: [] ? ct_seq_show+0x14e/0x340 [nf_conntrack] [] seq_read+0x2cc/0x390 [] proc_reg_read+0x42/0x70 [] __vfs_read+0x37/0x130 [] ? security_file_permission+0xa0/0xc0 [] vfs_read+0x95/0x140 [] SyS_read+0x55/0xc0 [] entry_SYSCALL_64_fastpath+0x1a/0xa4 It is very easy to reproduce this kernel crash. 1. open one shell and input the following cmds: while : ; do echo $RANDOM > /sys/module/nf_conntrack/parameters/hashsize done 2. open more shells and input the following cmds: while : ; do cat /proc/net/nf_conntrack done 3. just wait a monent, oops will happen soon. The solution in this patch is based on Florian's Commit 5e3c61f98175 ("netfilter: conntrack: fix lookup race during hash resize"). And add a wrapper function nf_conntrack_get_ht to get hash and hsize suggested by Florian Westphal. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 2 ++ net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 14 ++++++++++---- net/netfilter/nf_conntrack_core.c | 17 +++++++++++++++++ net/netfilter/nf_conntrack_standalone.c | 14 +++++++++----- 4 files changed, 38 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 3e2f3328945c..79d7ac5c9740 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -51,6 +51,8 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto); +void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize); + /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index c6f3c406f707..63923710f325 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -26,6 +26,8 @@ struct ct_iter_state { struct seq_net_private p; + struct hlist_nulls_head *hash; + unsigned int htable_size; unsigned int bucket; }; @@ -35,10 +37,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < st->htable_size; st->bucket++) { n = rcu_dereference( - hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -53,11 +55,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } @@ -75,7 +77,11 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ct_iter_state *st = seq->private; + rcu_read_lock(); + + nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 153e33ffeeaa..1289e7e5e0de 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -460,6 +460,23 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } +/* must be called with rcu read lock held */ +void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) +{ + struct hlist_nulls_head *hptr; + unsigned int sequence, hsz; + + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hsz = nf_conntrack_htable_size; + hptr = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + *hash = hptr; + *hsize = hsz; +} +EXPORT_SYMBOL_GPL(nf_conntrack_get_ht); + /* * Warning : * - Caller must take a reference on returned object diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 2aaa188ee961..958a1455ca7f 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -48,6 +48,8 @@ EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; + struct hlist_nulls_head *hash; + unsigned int htable_size; unsigned int bucket; u_int64_t time_now; }; @@ -58,9 +60,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < st->htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); + n = rcu_dereference( + hlist_nulls_first_rcu(&st->hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -75,12 +78,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= st->htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu( - &nf_conntrack_hash[st->bucket])); + hlist_nulls_first_rcu(&st->hash[st->bucket])); } return head; } @@ -102,6 +104,8 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos) st->time_now = ktime_get_real_ns(); rcu_read_lock(); + + nf_conntrack_get_ht(&st->hash, &st->htable_size); return ct_get_idx(seq, *pos); } -- cgit v1.2.3 From 242922a027176cd260c5adce4ba6bbfa3a05190c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 3 Jul 2016 20:44:01 +0200 Subject: netfilter: conntrack: simplify early_drop We don't need to acquire the bucket lock during early drop, we can use lockless traveral just like ____nf_conntrack_find. The timer deletion serves as synchronization point, if another cpu attempts to evict same entry, only one will succeed with timer deletion. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 95 ++++++++++++++++++------------------ 2 files changed, 48 insertions(+), 48 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 5d3397f34583..2a5133e214c9 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -301,6 +301,7 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) +#define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) #define MODULE_ALIAS_NFCT_HELPER(helper) \ MODULE_ALIAS("nfct-helper-" helper) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1289e7e5e0de..e0e9c9a0f5ba 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -834,67 +834,66 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(struct net *net, unsigned int _hash) +static unsigned int early_drop_list(struct net *net, + struct hlist_nulls_head *head) { - /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i, hash, sequence; - struct nf_conn *ct = NULL; - spinlock_t *lockp; - bool ret = false; + unsigned int drops = 0; + struct nf_conn *tmp; - i = 0; + hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { + tmp = nf_ct_tuplehash_to_ctrack(h); - local_bh_disable(); -restart: - sequence = read_seqcount_begin(&nf_conntrack_generation); - for (; i < NF_CT_EVICTION_RANGE; i++) { - hash = scale_hash(_hash++); - lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; - nf_conntrack_lock(lockp); - if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { - spin_unlock(lockp); - goto restart; - } - hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], - hnnode) { - tmp = nf_ct_tuplehash_to_ctrack(h); - - if (test_bit(IPS_ASSURED_BIT, &tmp->status) || - !net_eq(nf_ct_net(tmp), net) || - nf_ct_is_dying(tmp)) - continue; - - if (atomic_inc_not_zero(&tmp->ct_general.use)) { - ct = tmp; - break; - } - } + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; - spin_unlock(lockp); - if (ct) - break; + if (!atomic_inc_not_zero(&tmp->ct_general.use)) + continue; + + /* kill only if still in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules. + * + * We steal the timer reference. If that fails timer has + * already fired or someone else deleted it. Just drop ref + * and move to next entry. + */ + if (net_eq(nf_ct_net(tmp), net) && + nf_ct_is_confirmed(tmp) && + del_timer(&tmp->timeout) && + nf_ct_delete(tmp, 0, 0)) + drops++; + + nf_ct_put(tmp); } - local_bh_enable(); + return drops; +} - if (!ct) - return false; +static noinline int early_drop(struct net *net, unsigned int _hash) +{ + unsigned int i; - /* kill only if in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules - */ - if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { - if (nf_ct_delete(ct, 0, 0)) { - NF_CT_STAT_INC_ATOMIC(net, early_drop); - ret = true; + for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { + struct hlist_nulls_head *ct_hash; + unsigned hash, sequence, drops; + + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = scale_hash(_hash++); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + drops = early_drop_list(net, &ct_hash[hash]); + if (drops) { + NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); + return true; } } - nf_ct_put(ct); - return ret; + return false; } static struct nf_conn * -- cgit v1.2.3 From 7c9664351980aaa6a4b8837a314360b3a4ad382a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Jul 2016 12:07:23 +0200 Subject: netfilter: move nat hlist_head to nf_conn The nat extension structure is 32bytes in size on x86_64: struct nf_conn_nat { struct hlist_node bysource; /* 0 16 */ struct nf_conn * ct; /* 16 8 */ union nf_conntrack_nat_help help; /* 24 4 */ int masq_index; /* 28 4 */ /* size: 32, cachelines: 1, members: 4 */ /* last cacheline: 32 bytes */ }; The hlist is needed to quickly check for possible tuple collisions when installing a new nat binding. Storing this in the extension area has two drawbacks: 1. We need ct backpointer to get the conntrack struct from the extension. 2. When reallocation of extension area occurs we need to fixup the bysource hash head via hlist_replace_rcu. We can avoid both by placing the hlist_head in nf_conn and place nf_conn in the bysource hash rather than the extenstion. We can also remove the ->move support; no other extension needs it. Moving the entire nat extension into nf_conn would be possible as well but then we have to add yet another callback for deletion from the bysource hash table rather than just using nat extension ->destroy hook for this. nf_conn size doesn't increase due to aligment, followup patch replaces hlist_node with single pointer. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +++ include/net/netfilter/nf_conntrack_extend.h | 3 --- include/net/netfilter/nf_nat.h | 2 -- net/netfilter/nf_conntrack_extend.c | 15 ++----------- net/netfilter/nf_nat_core.c | 33 ++++++----------------------- 5 files changed, 12 insertions(+), 44 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 2a5133e214c9..e5135d8728b4 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -117,6 +117,9 @@ struct nf_conn { /* Extensions */ struct nf_ct_ext *ext; +#if IS_ENABLED(CONFIG_NF_NAT) + struct hlist_node nat_bysource; +#endif /* Storage reserved for other modules, must be the last member */ union nf_conntrack_proto proto; }; diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index b925395fa5ed..1c3035dda31f 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -99,9 +99,6 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, struct nf_ct_ext_type { /* Destroys relationships (can be NULL). */ void (*destroy)(struct nf_conn *ct); - /* Called when realloacted (can be NULL). - Contents has already been moved. */ - void (*move)(void *new, void *old); enum nf_ct_ext_id id; diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 344b1ab19220..02515f7ed4cc 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -29,8 +29,6 @@ struct nf_conn; /* The structure embedded in the conntrack structure. */ struct nf_conn_nat { - struct hlist_node bysource; - struct nf_conn *ct; union nf_conntrack_nat_help help; #if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV4) || \ IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 1a9545965c0d..02bcf00c2492 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -73,7 +73,7 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, size_t var_alloc_len, gfp_t gfp) { struct nf_ct_ext *old, *new; - int i, newlen, newoff; + int newlen, newoff; struct nf_ct_ext_type *t; /* Conntrack must not be confirmed to avoid races on reallocation. */ @@ -99,19 +99,8 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, return NULL; if (new != old) { - for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!__nf_ct_ext_exist(old, i)) - continue; - - rcu_read_lock(); - t = rcu_dereference(nf_ct_ext_types[i]); - if (t && t->move) - t->move((void *)new + new->offset[i], - (void *)old + old->offset[i]); - rcu_read_unlock(); - } kfree_rcu(old, rcu); - ct->ext = new; + rcu_assign_pointer(ct->ext, new); } new->offset[id] = newoff; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6877a396f8fc..692534701426 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -198,11 +198,9 @@ find_appropriate_src(struct net *net, const struct nf_nat_range *range) { unsigned int h = hash_by_src(net, tuple); - const struct nf_conn_nat *nat; const struct nf_conn *ct; - hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { - ct = nat->ct; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { @@ -435,8 +433,7 @@ nf_nat_setup_info(struct nf_conn *ct, spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate extension aera */ nat = nfct_nat(ct); - nat->ct = ct; - hlist_add_head_rcu(&nat->bysource, + hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -543,7 +540,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (nf_nat_proto_remove(ct, data)) return 1; - if (!nat || !nat->ct) + if (!nat) return 0; /* This netns is being destroyed, and conntrack has nat null binding. @@ -556,9 +553,8 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) return 1; spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); + hlist_del_rcu(&ct->nat_bysource); ct->status &= ~IPS_NAT_DONE_MASK; - nat->ct = NULL; spin_unlock_bh(&nf_nat_lock); add_timer(&ct->timeout); @@ -688,27 +684,13 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); - if (nat == NULL || nat->ct == NULL) + if (!nat) return; - NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); - - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&nat->bysource); - spin_unlock_bh(&nf_nat_lock); -} - -static void nf_nat_move_storage(void *new, void *old) -{ - struct nf_conn_nat *new_nat = new; - struct nf_conn_nat *old_nat = old; - struct nf_conn *ct = old_nat->ct; - - if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) - return; + NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE); spin_lock_bh(&nf_nat_lock); - hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); + hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_lock); } @@ -716,7 +698,6 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .len = sizeof(struct nf_conn_nat), .align = __alignof__(struct nf_conn_nat), .destroy = nf_nat_cleanup_conntrack, - .move = nf_nat_move_storage, .id = NF_CT_EXT_NAT, .flags = NF_CT_EXT_F_PREALLOC, }; -- cgit v1.2.3 From 870190a9ec9075205c0fa795a09fa931694a3ff1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Jul 2016 12:07:24 +0200 Subject: netfilter: nat: convert nat bysrc hash to rhashtable It did use a fixed-size bucket list plus single lock to protect add/del. Unlike the main conntrack table we only need to add and remove keys. Convert it to rhashtable to get table autosizing and per-bucket locking. The maximum number of entries is -- as before -- tied to the number of conntracks so we do not need another upperlimit. The change does not handle rhashtable_remove_fast error, only possible "error" is -ENOENT, and that is something that can happen legitimetely, e.g. because nat module was inserted at a later time and no src manip took place yet. Tested with http-client-benchmark + httpterm with DNAT and SNAT rules in place. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +- include/net/netfilter/nf_nat.h | 1 + net/netfilter/nf_nat_core.c | 126 +++++++++++++++++++---------------- 3 files changed, 71 insertions(+), 59 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index e5135d8728b4..a08825b7e955 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -118,7 +119,7 @@ struct nf_conn { struct nf_ct_ext *ext; #if IS_ENABLED(CONFIG_NF_NAT) - struct hlist_node nat_bysource; + struct rhash_head nat_bysource; #endif /* Storage reserved for other modules, must be the last member */ union nf_conntrack_proto proto; diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 02515f7ed4cc..c327a431a6f3 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -1,5 +1,6 @@ #ifndef _NF_NAT_H #define _NF_NAT_H +#include #include #include #include diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 692534701426..de31818417b8 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,17 +30,19 @@ #include #include -static DEFINE_SPINLOCK(nf_nat_lock); - static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -static struct hlist_head *nf_nat_bysource __read_mostly; -static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +struct nf_nat_conn_key { + const struct net *net; + const struct nf_conntrack_tuple *tuple; + const struct nf_conntrack_zone *zone; +}; + +static struct rhashtable nf_nat_bysource_table; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -119,19 +121,17 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -/* We keep an extra hash for each conntrack, for fast searching. */ -static inline unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) { - unsigned int hash; - - get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + const struct nf_conntrack_tuple *t; + const struct nf_conn *ct = data; + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, nf_nat_htable_size); + seed ^= net_hash_mix(nf_ct_net(ct)); + return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), + t->dst.protonum ^ seed); } /* Is this tuple already taken? (not by us) */ @@ -187,6 +187,26 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } +static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct nf_nat_conn_key *key = arg->key; + const struct nf_conn *ct = obj; + + return same_src(ct, key->tuple) && + net_eq(nf_ct_net(ct), key->net) && + nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL); +} + +static struct rhashtable_params nf_nat_bysource_params = { + .head_offset = offsetof(struct nf_conn, nat_bysource), + .obj_hashfn = nf_nat_bysource_hash, + .obj_cmpfn = nf_nat_bysource_cmp, + .nelem_hint = 256, + .min_size = 1024, + .nulls_base = (1U << RHT_BASE_SHIFT), +}; + /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -197,23 +217,23 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; + struct nf_nat_conn_key key = { + .net = net, + .tuple = tuple, + .zone = zone + }; - hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { - if (same_src(ct, tuple) && - net_eq(net, nf_ct_net(ct)) && - nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { - /* Copy source part from reply tuple. */ - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; - } - } - return 0; + ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key, + nf_nat_bysource_params); + if (!ct) + return 0; + + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + return in_range(l3proto, l4proto, result, range); } /* For [FUTURE] fragmentation handling, we want the least-used @@ -385,7 +405,6 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { - struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; @@ -426,16 +445,13 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { - unsigned int srchash; - - srchash = hash_by_src(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); - /* nf_conntrack_alter_reply might re-allocate extension aera */ - nat = nfct_nat(ct); - hlist_add_head_rcu(&ct->nat_bysource, - &nf_nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); + int err; + + err = rhashtable_insert_fast(&nf_nat_bysource_table, + &ct->nat_bysource, + nf_nat_bysource_params); + if (err) + return NF_DROP; } /* It's done. */ @@ -552,10 +568,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) if (!del_timer(&ct->timeout)) return 1; - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); ct->status &= ~IPS_NAT_DONE_MASK; - spin_unlock_bh(&nf_nat_lock); + + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); add_timer(&ct->timeout); @@ -687,11 +703,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (!nat) return; - NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE); - - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -826,16 +839,13 @@ static int __init nf_nat_init(void) { int ret; - /* Leave them the same for the moment. */ - nf_nat_htable_size = nf_conntrack_htable_size; - - nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); - if (!nf_nat_bysource) - return -ENOMEM; + ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); + if (ret) + return ret; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + rhashtable_destroy(&nf_nat_bysource_table); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -859,7 +869,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + rhashtable_destroy(&nf_nat_bysource_table); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -877,8 +887,8 @@ static void __exit nf_nat_cleanup(void) #endif for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - synchronize_net(); - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + + rhashtable_destroy(&nf_nat_bysource_table); } MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d51ed8367bcbbb06f4f4986d1ef7dc2480bed1ad Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 8 Jul 2016 13:08:50 +0200 Subject: netfilter: constify arg to is_dying/confirmed Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a08825b7e955..1e04911b78ea 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -270,12 +270,12 @@ static inline int nf_ct_is_template(const struct nf_conn *ct) } /* It's confirmed if it is, or has been in the hash table. */ -static inline int nf_ct_is_confirmed(struct nf_conn *ct) +static inline int nf_ct_is_confirmed(const struct nf_conn *ct) { return test_bit(IPS_CONFIRMED_BIT, &ct->status); } -static inline int nf_ct_is_dying(struct nf_conn *ct) +static inline int nf_ct_is_dying(const struct nf_conn *ct) { return test_bit(IPS_DYING_BIT, &ct->status); } -- cgit v1.2.3 From 42a55769132fdf4f44bac1471b371d7f80bcde35 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 8 Jul 2016 14:41:49 +0200 Subject: netfilter: nf_tables: get rid of possible_net_t from set and basechain We can pass the netns pointer as parameter to the functions that need to gain access to it. From basechains, I didn't find any client for this field anymore so let's remove this too. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 21 +++++++++++---------- net/netfilter/nf_tables_api.c | 10 ++++------ net/netfilter/nft_hash.c | 20 ++++++++++---------- net/netfilter/nft_lookup.c | 2 +- net/netfilter/nft_rbtree.c | 26 ++++++++++++++------------ 5 files changed, 40 insertions(+), 39 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 30c1d9489ae2..f2f13399ce44 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -236,7 +236,8 @@ struct nft_expr; * @features: features supported by the implementation */ struct nft_set_ops { - bool (*lookup)(const struct nft_set *set, + bool (*lookup)(const struct net *net, + const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, @@ -248,11 +249,14 @@ struct nft_set_ops { struct nft_regs *regs, const struct nft_set_ext **ext); - int (*insert)(const struct nft_set *set, + int (*insert)(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem); - void (*activate)(const struct nft_set *set, + void (*activate)(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem); - void * (*deactivate)(const struct nft_set *set, + void * (*deactivate)(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem); void (*remove)(const struct nft_set *set, const struct nft_set_elem *elem); @@ -295,7 +299,6 @@ void nft_unregister_set(struct nft_set_ops *ops); * @udlen: user data length * @udata: user data * @ops: set ops - * @pnet: network namespace * @flags: set flags * @genmask: generation mask * @klen: key length @@ -318,7 +321,6 @@ struct nft_set { unsigned char *udata; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - possible_net_t pnet; u16 flags:14, genmask:2; u8 klen; @@ -804,7 +806,6 @@ struct nft_stats { * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops - * @pnet: net namespace that this chain belongs to * @type: chain type * @policy: default policy * @stats: per-cpu chain stats @@ -813,7 +814,6 @@ struct nft_stats { */ struct nft_base_chain { struct nf_hook_ops ops[NFT_HOOK_OPS_MAX]; - possible_net_t pnet; const struct nf_chain_type *type; u8 policy; u8 flags; @@ -1009,10 +1009,11 @@ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, return !(ext->genmask & genmask); } -static inline void nft_set_elem_change_active(const struct nft_set *set, +static inline void nft_set_elem_change_active(const struct net *net, + const struct nft_set *set, struct nft_set_ext *ext) { - ext->genmask ^= nft_genmask_next(read_pnet(&set->pnet)); + ext->genmask ^= nft_genmask_next(net); } /* diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 18b7f8578ee0..0211eaec9060 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1405,7 +1405,6 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, rcu_assign_pointer(basechain->stats, stats); } - write_pnet(&basechain->pnet, net); basechain->type = type; chain = &basechain->chain; @@ -2841,7 +2840,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); - write_pnet(&set->pnet, net); set->ops = ops; set->ktype = ktype; set->klen = desc.klen; @@ -3520,7 +3518,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err4; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(set, &elem); + err = set->ops->insert(ctx->net, set, &elem); if (err < 0) goto err5; @@ -3644,7 +3642,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, goto err3; } - priv = set->ops->deactivate(set, &elem); + priv = set->ops->deactivate(ctx->net, set, &elem); if (priv == NULL) { err = -ENOENT; goto err4; @@ -4018,7 +4016,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(te->set, &te->elem); + te->set->ops->activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 0); @@ -4143,7 +4141,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(te->set, &te->elem); + te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; nft_trans_destroy(trans); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index ea924816b7b8..564fa7929ed5 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -71,13 +71,13 @@ static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, return 0; } -static bool nft_hash_lookup(const struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext) +static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); const struct nft_hash_elem *he; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_cur(read_pnet(&set->pnet)), + .genmask = nft_genmask_cur(net), .set = set, .key = key, }; @@ -125,13 +125,13 @@ err1: return false; } -static int nft_hash_insert(const struct nft_set *set, +static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he = elem->priv; struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_next(read_pnet(&set->pnet)), + .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, }; @@ -140,20 +140,20 @@ static int nft_hash_insert(const struct nft_set *set, nft_hash_params); } -static void nft_hash_activate(const struct nft_set *set, +static void nft_hash_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash_elem *he = elem->priv; - nft_set_elem_change_active(set, &he->ext); + nft_set_elem_change_active(net, set, &he->ext); nft_set_elem_clear_busy(&he->ext); } -static void *nft_hash_deactivate(const struct nft_set *set, +static void *nft_hash_deactivate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->pnet); struct nft_hash_elem *he; struct nft_hash_cmp_arg arg = { .genmask = nft_genmask_next(net), @@ -166,7 +166,7 @@ static void *nft_hash_deactivate(const struct nft_set *set, if (he != NULL) { if (!nft_set_elem_mark_busy(&he->ext) || !nft_is_active(net, &he->ext)) - nft_set_elem_change_active(set, &he->ext); + nft_set_elem_change_active(net, set, &he->ext); else he = NULL; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index b8d18f598569..e164325d1bc0 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -35,7 +35,7 @@ static void nft_lookup_eval(const struct nft_expr *expr, const struct nft_set_ext *ext; bool found; - found = set->ops->lookup(set, ®s->data[priv->sreg], &ext) ^ + found = set->ops->lookup(pkt->net, set, ®s->data[priv->sreg], &ext) ^ priv->invert; if (!found) { diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index c0f638745adc..6473936d05c6 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -41,13 +41,13 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this, return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; } -static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext) +static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { const struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; + u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); const void *this; int d; @@ -93,13 +93,13 @@ out: return false; } -static int __nft_rbtree_insert(const struct nft_set *set, +static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new) { struct nft_rbtree *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; - u8 genmask = nft_genmask_next(read_pnet(&set->pnet)); int d; parent = NULL; @@ -132,14 +132,14 @@ static int __nft_rbtree_insert(const struct nft_set *set, return 0; } -static int nft_rbtree_insert(const struct nft_set *set, +static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe = elem->priv; int err; spin_lock_bh(&nft_rbtree_lock); - err = __nft_rbtree_insert(set, rbe); + err = __nft_rbtree_insert(net, set, rbe); spin_unlock_bh(&nft_rbtree_lock); return err; @@ -156,21 +156,23 @@ static void nft_rbtree_remove(const struct nft_set *set, spin_unlock_bh(&nft_rbtree_lock); } -static void nft_rbtree_activate(const struct nft_set *set, +static void nft_rbtree_activate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe = elem->priv; - nft_set_elem_change_active(set, &rbe->ext); + nft_set_elem_change_active(net, set, &rbe->ext); } -static void *nft_rbtree_deactivate(const struct nft_set *set, +static void *nft_rbtree_deactivate(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; struct nft_rbtree_elem *rbe, *this = elem->priv; - u8 genmask = nft_genmask_next(read_pnet(&set->pnet)); + u8 genmask = nft_genmask_next(net); int d; while (parent != NULL) { @@ -196,7 +198,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_right; continue; } - nft_set_elem_change_active(set, &rbe->ext); + nft_set_elem_change_active(net, set, &rbe->ext); return rbe; } } -- cgit v1.2.3 From 28aa4c26fce2202db8d42ae76b639ca1d9a23d25 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:40 +0800 Subject: sctp: add SCTP_PR_SUPPORTED on sctp sockopt According to section 4.5 of rfc7496, prsctp_enable should be per asoc. We will add prsctp_enable to both asoc and ep, and replace the places where it used net.sctp->prsctp_enable with asoc->prsctp_enable. ep->prsctp_enable will be initialized with net.sctp->prsctp_enable, and asoc->prsctp_enable will be initialized with ep->prsctp_enable. We can also modify it's value through sockopt SCTP_PR_SUPPORTED. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 6 ++-- include/uapi/linux/sctp.h | 1 + net/sctp/associola.c | 1 + net/sctp/endpointola.c | 1 + net/sctp/sm_make_chunk.c | 12 +++---- net/sctp/socket.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 93 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 83c5ec58b93a..07115ca9de4d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1256,7 +1256,8 @@ struct sctp_endpoint { /* SCTP-AUTH: endpoint shared keys */ struct list_head endpoint_shared_keys; __u16 active_key_id; - __u8 auth_enable; + __u8 auth_enable:1, + prsctp_enable:1; }; /* Recover the outter endpoint structure. */ @@ -1848,7 +1849,8 @@ struct sctp_association { __u16 active_key_id; __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ - temp:1; /* Is it a temporary association? */ + temp:1, /* Is it a temporary association? */ + prsctp_enable:1; struct sctp_priv_assoc_stats stats; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index ce70fe6b45df..aa08906f292d 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -112,6 +112,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_SOCKOPT_CONNECTX 110 /* CONNECTX requests. */ #define SCTP_SOCKOPT_CONNECTX3 111 /* CONNECTX requests (updated) */ #define SCTP_GET_ASSOC_STATS 112 /* Read only */ +#define SCTP_PR_SUPPORTED 113 /* These are bit fields for msghdr->msg_flags. See section 5.1. */ /* On user space Linux, these live in as an enum. */ diff --git a/net/sctp/associola.c b/net/sctp/associola.c index e1849f3714ad..1c23060c41a6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -268,6 +268,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a goto fail_init; asoc->active_key_id = ep->active_key_id; + asoc->prsctp_enable = ep->prsctp_enable; /* Save the hmacs and chunks list into this association */ if (ep->auth_hmacs_list) diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 9d494e35e7f9..1f03065686fe 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -163,6 +163,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, */ ep->auth_hmacs_list = auth_hmacs; ep->auth_chunk_list = auth_chunks; + ep->prsctp_enable = net->sctp.prsctp_enable; return ep; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 56f364d8f932..0e3045ef57fa 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); - if (net->sctp.prsctp_enable) + if (asoc->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: @@ -355,7 +355,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sctp_addto_param(retval, num_ext, extensions); } - if (net->sctp.prsctp_enable) + if (asoc->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { @@ -2024,8 +2024,8 @@ static void sctp_process_ext_param(struct sctp_association *asoc, for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_FWD_TSN: - if (net->sctp.prsctp_enable && !asoc->peer.prsctp_capable) - asoc->peer.prsctp_capable = 1; + if (asoc->prsctp_enable && !asoc->peer.prsctp_capable) + asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he @@ -2169,7 +2169,7 @@ static sctp_ierror_t sctp_verify_param(struct net *net, break; case SCTP_PARAM_FWD_TSN_SUPPORT: - if (net->sctp.prsctp_enable) + if (ep->prsctp_enable) break; goto fallthrough; @@ -2653,7 +2653,7 @@ do_addr_param: break; case SCTP_PARAM_FWD_TSN_SUPPORT: - if (net->sctp.prsctp_enable) { + if (asoc->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index cdabbd8219b1..7460ddebd9ce 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3661,6 +3661,39 @@ static int sctp_setsockopt_recvnxtinfo(struct sock *sk, return 0; } +static int sctp_setsockopt_pr_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + asoc->prsctp_enable = !!params.assoc_value; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + sp->ep->prsctp_enable = !!params.assoc_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3821,6 +3854,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_RECVNXTINFO: retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); break; + case SCTP_PR_SUPPORTED: + retval = sctp_setsockopt_pr_supported(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6166,6 +6202,47 @@ static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_pr_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->prsctp_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->ep->prsctp_enable; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6319,6 +6396,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_RECVNXTINFO: retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); break; + case SCTP_PR_SUPPORTED: + retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From 826d253d57b11f69add81c8086d2e7f1dce5ec77 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:42 +0800 Subject: sctp: add SCTP_PR_ASSOC_STATUS on sctp sockopt This patch adds SCTP_PR_ASSOC_STATUS to sctp sockopt, which is used to dump the prsctp statistics info from the asoc. The prsctp statistics includes abandoned_sent/unsent from the asoc. abandoned_sent is the count of the packets we drop packets from retransmit/transmited queue, and abandoned_unsent is the count of the packets we drop from out_queue according to the policy. Note: another option for prsctp statistics dump described in rfc is SCTP_PR_STREAM_STATUS, which is used to dump the prsctp statistics info from each stream. But by now, linux doesn't yet have per stream statistics info, it needs rfc6525 to be implemented. As the prsctp statistics for each stream has to be based on per stream statistics, we will delay it until rfc6525 is done in linux. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 +++ include/uapi/linux/sctp.h | 12 +++++++++ net/sctp/socket.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 07115ca9de4d..d8e464aacb20 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1853,6 +1853,9 @@ struct sctp_association { prsctp_enable:1; struct sctp_priv_assoc_stats stats; + + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; + __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 984cf2e9a61d..d304f4c9792c 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -114,6 +114,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_GET_ASSOC_STATS 112 /* Read only */ #define SCTP_PR_SUPPORTED 113 #define SCTP_DEFAULT_PRINFO 114 +#define SCTP_PR_ASSOC_STATUS 115 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -926,6 +927,17 @@ struct sctp_paddrthlds { __u16 spt_pathpfthld; }; +/* + * Socket Option for Getting the Association/Stream-Specific PR-SCTP Status + */ +struct sctp_prstatus { + sctp_assoc_t sprstat_assoc_id; + __u16 sprstat_sid; + __u16 sprstat_policy; + __u64 sprstat_abandoned_unsent; + __u64 sprstat_abandoned_sent; +}; + struct sctp_default_prinfo { sctp_assoc_t pr_assoc_id; __u32 pr_value; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c03fe1b76706..c3167c43a9c1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6330,6 +6330,64 @@ out: return retval; } +static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_prstatus params; + struct sctp_association *asoc; + int policy; + int retval = -EINVAL; + + if (len < sizeof(params)) + goto out; + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) { + retval = -EFAULT; + goto out; + } + + policy = params.sprstat_policy; + if (policy & ~SCTP_PR_SCTP_MASK) + goto out; + + asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); + if (!asoc) + goto out; + + if (policy == SCTP_PR_SCTP_NONE) { + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { + params.sprstat_abandoned_unsent += + asoc->abandoned_unsent[policy]; + params.sprstat_abandoned_sent += + asoc->abandoned_sent[policy]; + } + } else { + params.sprstat_abandoned_unsent = + asoc->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + params.sprstat_abandoned_sent = + asoc->abandoned_sent[__SCTP_PR_INDEX(policy)]; + } + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ¶ms, len)) { + retval = -EFAULT; + goto out; + } + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6490,6 +6548,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_default_prinfo(sk, len, optval, optlen); break; + case SCTP_PR_ASSOC_STATUS: + retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; -- cgit v1.2.3 From a6c2f792873aff332a4689717c3cd6104f46684c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:43 +0800 Subject: sctp: implement prsctp TTL policy prsctp TTL policy is a policy to abandon chunks when they expire at the specific time in local stack. It's similar with expires_at in struct sctp_datamsg. This patch uses sinfo->sinfo_timetolive to set the specific time for TTL policy. sinfo->sinfo_timetolive is also used for msg->expires_at. So if prsctp_enable or TTL policy is not enabled, msg->expires_at still works as before. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 10 ++++++++++ net/sctp/chunk.c | 20 +++++++++++++++++--- net/sctp/output.c | 2 ++ net/sctp/sm_make_chunk.c | 12 ++++++++++++ net/sctp/socket.c | 4 ++-- 5 files changed, 43 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d8e464aacb20..6bcda715008e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -602,6 +602,16 @@ struct sctp_chunk { /* This needs to be recoverable for SCTP_SEND_FAILED events. */ struct sctp_sndrcvinfo sinfo; + /* We use this field to record param for prsctp policies, + * for TTL policy, it is the time_to_drop of this chunk, + * for RTX policy, it is the max_sent_count of this chunk, + * for PRIO policy, it is the priority of this chunk. + */ + unsigned long prsctp_param; + + /* How many times this chunk have been sent, for prsctp RTX policy */ + int sent_count; + /* Which association does this belong to? */ struct sctp_association *asoc; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 1eb94bf18ef4..2698d122e201 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -335,13 +335,27 @@ errout: /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { - struct sctp_datamsg *msg = chunk->msg; + if (!chunk->asoc->prsctp_enable || + !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) { + struct sctp_datamsg *msg = chunk->msg; + + if (!msg->can_abandon) + return 0; + + if (time_after(jiffies, msg->expires_at)) + return 1; - if (!msg->can_abandon) return 0; + } - if (time_after(jiffies, msg->expires_at)) + if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && + time_after(jiffies, chunk->prsctp_param)) { + if (chunk->sent_count) + chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + else + chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; return 1; + } return 0; } diff --git a/net/sctp/output.c b/net/sctp/output.c index 2e9223bb1b3a..7425f6c23888 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -316,6 +316,8 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; + /* Mainly used for prsctp RTX policy */ + chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0e3045ef57fa..2c431eef3a50 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -711,6 +711,17 @@ nodata: return retval; } +static void sctp_set_prsctp_policy(struct sctp_chunk *chunk, + const struct sctp_sndrcvinfo *sinfo) +{ + if (!chunk->asoc->prsctp_enable) + return; + + if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) + chunk->prsctp_param = + jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); +} + /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ @@ -744,6 +755,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); + sctp_set_prsctp_policy(retval, sinfo); nodata: return retval; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c3167c43a9c1..08614296628a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7099,7 +7099,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | + SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -7123,7 +7123,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | + SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; -- cgit v1.2.3 From 8dbdf1f5b09cb22560e7c7173b52fe3c631046bd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 9 Jul 2016 19:47:45 +0800 Subject: sctp: implement prsctp PRIO policy prsctp PRIO policy is a policy to abandon lower priority chunks when asoc doesn't have enough snd buffer, so that the current chunk with higher priority can be queued successfully. Similar to TTL/RTX policy, we will set the priority of the chunk to prsctp_param with sinfo->sinfo_timetolive in sctp_set_prsctp_policy(). So if PRIO policy is enabled, msg->expire_at won't work. asoc->sent_cnt_removable will record how many chunks can be checked to remove. If priority policy is enabled, when the chunk is queued into the out_queue, we will increase sent_cnt_removable. When the chunk is moved to abandon_queue or dequeue and free, we will decrease sent_cnt_removable. In sctp_sendmsg, we will check if there is enough snd buffer for current msg and if sent_cnt_removable is not 0. Then try to abandon chunks in sctp_prune_prsctp when sendmsg from the retransmit/transmited queue, and free chunks from out_queue in right order until the abandon+free size > msg_len - sctp_wfree. For the abandon size, we have to wait until it sends FORWARD TSN, receives the sack and the chunks are really freed. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++ net/sctp/chunk.c | 1 + net/sctp/outqueue.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++ net/sctp/sm_make_chunk.c | 3 +- net/sctp/socket.c | 3 ++ 5 files changed, 109 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6bcda715008e..8626bdd3249a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1084,6 +1084,8 @@ void sctp_retransmit(struct sctp_outq *, struct sctp_transport *, sctp_retransmit_reason_t); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); +void sctp_prsctp_prune(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, int msg_len); /* Uncork and flush an outqueue. */ static inline void sctp_outq_cork(struct sctp_outq *q) { @@ -1864,6 +1866,8 @@ struct sctp_association { struct sctp_priv_assoc_stats stats; + int sent_cnt_removable; + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; }; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index b3692b55a8d2..a55e54738b81 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -360,6 +360,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } + /* PRIO policy is processed by sendmsg, not here */ return 0; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 084718f9b3da..72e54a416af6 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -326,6 +326,9 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); + if (chunk->asoc->prsctp_enable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else @@ -372,6 +375,96 @@ static void sctp_insert_list(struct list_head *head, struct list_head *new) list_add_tail(new, head); } +static int sctp_prsctp_prune_sent(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct list_head *queue, int msg_len) +{ + struct sctp_chunk *chk, *temp; + + list_for_each_entry_safe(chk, temp, queue, transmitted_list) { + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->prsctp_param <= sinfo->sinfo_timetolive) + continue; + + list_del_init(&chk->transmitted_list); + sctp_insert_list(&asoc->outqueue.abandoned, + &chk->transmitted_list); + + asoc->sent_cnt_removable--; + asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + + if (!chk->tsn_gap_acked) { + if (chk->transport) + chk->transport->flight_size -= + sctp_data_size(chk); + asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); + } + + msg_len -= SCTP_DATA_SNDSIZE(chk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + if (msg_len <= 0) + break; + } + + return msg_len; +} + +static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct list_head *queue, int msg_len) +{ + struct sctp_chunk *chk, *temp; + + list_for_each_entry_safe(chk, temp, queue, list) { + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || + chk->prsctp_param <= sinfo->sinfo_timetolive) + continue; + + list_del_init(&chk->list); + asoc->sent_cnt_removable--; + asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + + msg_len -= SCTP_DATA_SNDSIZE(chk) + + sizeof(struct sk_buff) + + sizeof(struct sctp_chunk); + sctp_chunk_free(chk); + if (msg_len <= 0) + break; + } + + return msg_len; +} + +/* Abandon the chunks according their priorities */ +void sctp_prsctp_prune(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, int msg_len) +{ + struct sctp_transport *transport; + + if (!asoc->prsctp_enable || !asoc->sent_cnt_removable) + return; + + msg_len = sctp_prsctp_prune_sent(asoc, sinfo, + &asoc->outqueue.retransmit, + msg_len); + if (msg_len <= 0) + return; + + list_for_each_entry(transport, &asoc->peer.transport_addr_list, + transports) { + msg_len = sctp_prsctp_prune_sent(asoc, sinfo, + &transport->transmitted, + msg_len); + if (msg_len <= 0) + return; + } + + sctp_prsctp_prune_unsent(asoc, sinfo, + &asoc->outqueue.out_chunk_list, + msg_len); +} + /* Mark all the eligible packets on a transport for retransmission. */ void sctp_retransmit_mark(struct sctp_outq *q, struct sctp_transport *transport, @@ -962,6 +1055,9 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) /* Mark as failed send. */ sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); + if (asoc->prsctp_enable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; sctp_chunk_free(chunk); continue; } @@ -1251,6 +1347,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); + if (asoc->prsctp_enable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index cfde934af5c5..1c96f4740e67 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -720,7 +720,8 @@ static void sctp_set_prsctp_policy(struct sctp_chunk *chunk, if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) chunk->prsctp_param = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); - else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags)) + else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) || + SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags)) chunk->prsctp_param = sinfo->sinfo_timetolive; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 08614296628a..71c7dc5ea62e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1914,6 +1914,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + if (sctp_wspace(asoc) < msg_len) + sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); -- cgit v1.2.3 From 160b925163c0aabc2c2fbb7d58a75e38b7cd6a17 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Tue, 12 Jul 2016 02:12:16 +0200 Subject: Bluetooth: Add Authentication Failed reason to Disconnected Mgmt event If link is disconnected due to Authentication Failure (PIN or Key Missing status) userspace will be notified about this with proper error code. Many LE profiles define "PIN or Key Missing" status as indication of remote lost bond so this allows userspace to take action on this. @ Device Connected: 88:63:DF:88:0E:83 (1) flags 0x0000 02 01 1a 05 03 0a 18 0d 18 0b 09 48 65 61 72 74 ...........Heart 20 52 61 74 65 Rate > HCI Event: Command Status (0x0f) plen 4 LE Read Remote Used Features (0x08|0x0016) ncmd 1 Status: Success (0x00) > ACL Data RX: Handle 3585 flags 0x02 dlen 11 ATT: Read By Group Type Request (0x10) len 6 Handle range: 0x0001-0xffff Attribute group type: Primary Service (0x2800) > HCI Event: LE Meta Event (0x3e) plen 12 LE Read Remote Used Features (0x04) Status: Success (0x00) Handle: 3585 Features: 0x01 0x00 0x00 0x00 0x00 0x00 0x00 0x00 LE Encryption < HCI Command: LE Start Encryption (0x08|0x0019) plen 28 Handle: 3585 Random number: 0x0000000000000000 Encrypted diversifier: 0x0000 Long term key: 26201cd479a0921b6f949f0b1fa8dc82 > HCI Event: Command Status (0x0f) plen 4 LE Start Encryption (0x08|0x0019) ncmd 1 Status: Success (0x00) > HCI Event: Encryption Change (0x08) plen 4 Status: PIN or Key Missing (0x06) Handle: 3585 Encryption: Disabled (0x00) < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 3585 Reason: Authentication Failure (0x05) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 1 Status: Success (0x00) > HCI Event: Disconnect Complete (0x05) plen 4 Status: Success (0x00) Handle: 3585 Reason: Connection Terminated By Local Host (0x16) @ Device Disconnected: 88:63:DF:88:0E:83 (1) reason 4 @ Device Connected: C4:43:8F:A3:4D:83 (0) flags 0x0000 08 09 4e 65 78 75 73 20 35 ..Nexus 5 > HCI Event: Command Status (0x0f) plen 4 Authentication Requested (0x01|0x0011) ncmd 1 Status: Success (0x00) > HCI Event: Link Key Request (0x17) plen 6 Address: C4:43:8F:A3:4D:83 (LG Electronics) < HCI Command: Link Key Request Reply (0x01|0x000b) plen 22 Address: C4:43:8F:A3:4D:83 (LG Electronics) Link key: 080812e4aa97a863d11826f71f65a933 > HCI Event: Command Complete (0x0e) plen 10 Link Key Request Reply (0x01|0x000b) ncmd 1 Status: Success (0x00) Address: C4:43:8F:A3:4D:83 (LG Electronics) > HCI Event: Auth Complete (0x06) plen 3 Status: PIN or Key Missing (0x06) Handle: 75 @ Authentication Failed: C4:43:8F:A3:4D:83 (0) status 0x05 < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 75 Reason: Remote User Terminated Connection (0x13) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 1 Status: Success (0x00) > HCI Event: Disconnect Complete (0x05) plen 4 Status: Success (0x00) Handle: 75 Reason: Connection Terminated By Local Host (0x16) @ Device Disconnected: C4:43:8F:A3:4D:83 (0) reason 4 Signed-off-by: Szymon Janc Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 1 + net/bluetooth/hci_event.c | 16 +++++++++++++++- 4 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index a3f86de6f100..003b25283407 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -445,6 +445,7 @@ enum { /* ---- HCI Error Codes ---- */ #define HCI_ERROR_UNKNOWN_CONN_ID 0x02 #define HCI_ERROR_AUTH_FAILURE 0x05 +#define HCI_ERROR_PIN_OR_KEY_MISSING 0x06 #define HCI_ERROR_MEMORY_EXCEEDED 0x07 #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index dc71473462ac..77d7fe115a0d 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -654,6 +654,7 @@ enum { HCI_CONN_PARAM_REMOVAL_PEND, HCI_CONN_NEW_LINK_KEY, HCI_CONN_SCANNING, + HCI_CONN_AUTH_FAILURE, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index ea73e0826aa7..7647964b1efa 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -645,6 +645,7 @@ struct mgmt_ev_device_connected { #define MGMT_DEV_DISCONN_TIMEOUT 0x01 #define MGMT_DEV_DISCONN_LOCAL_HOST 0x02 #define MGMT_DEV_DISCONN_REMOTE 0x03 +#define MGMT_DEV_DISCONN_AUTH_FAILURE 0x04 #define MGMT_EV_DEVICE_DISCONNECTED 0x000C struct mgmt_ev_device_disconnected { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3fb95c47243c..e17aacbc5630 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2332,7 +2332,7 @@ static u8 hci_to_mgmt_reason(u8 err) static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_disconn_complete *ev = (void *) skb->data; - u8 reason = hci_to_mgmt_reason(ev->reason); + u8 reason; struct hci_conn_params *params; struct hci_conn *conn; bool mgmt_connected; @@ -2355,6 +2355,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->state = BT_CLOSED; mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); + + if (test_bit(HCI_CONN_AUTH_FAILURE, &conn->flags)) + reason = MGMT_DEV_DISCONN_AUTH_FAILURE; + else + reason = hci_to_mgmt_reason(ev->reason); + mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type, reason, mgmt_connected); @@ -2421,6 +2427,8 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; if (!ev->status) { + clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + if (!hci_conn_ssp_enabled(conn) && test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) { BT_INFO("re-auth of legacy device is not possible."); @@ -2429,6 +2437,9 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->sec_level = conn->pending_sec_level; } } else { + if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) + set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + mgmt_auth_failed(conn, ev->status); } @@ -2613,6 +2624,9 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); if (ev->status && conn->state == BT_CONNECTED) { + if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING) + set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags); + hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE); hci_conn_drop(conn); goto unlock; -- cgit v1.2.3 From 9e238323799fb8c2add2b1de9a22edd4d4e51e30 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:55 -0300 Subject: sctp: allow others to use sctp_input_cb We process input path in other files too and having access to it is nice, so move it to a header where it's shared. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 15 +++++++++++++++ net/sctp/input.c | 11 ----------- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8626bdd3249a..966c3a40039c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -59,6 +59,7 @@ #include /* We need tq_struct. */ #include /* We need sctp* header structs. */ #include /* We need auth specific structs */ +#include /* For inet_skb_parm */ /* A convenience structure for handling sockaddr structures. * We should wean ourselves off this. @@ -1092,6 +1093,20 @@ static inline void sctp_outq_cork(struct sctp_outq *q) q->cork = 1; } +/* SCTP skb control block. + * sctp_input_cb is currently used on rx and sock rx queue + */ +struct sctp_input_cb { + union { + struct inet_skb_parm h4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; + struct sctp_chunk *chunk; +}; +#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) + /* These bind address data fields common between endpoints and associations */ struct sctp_bind_addr { diff --git a/net/sctp/input.c b/net/sctp/input.c index 6f8e676d285e..7a327ff71f08 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -90,17 +90,6 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) return 0; } -struct sctp_input_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - struct sctp_chunk *chunk; -}; -#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) - /* * This is the routine which IP calls when receiving an SCTP packet. */ -- cgit v1.2.3 From f5d258e60722142e88cb6f0f337d78bca67cf973 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:56 -0300 Subject: sctp: reorder sctp_ulpevent and shrink msg_flags The next patch needs 8 bytes in there. sctp_ulpevent has a hole due to bad alignment; msg_flags is using 4 bytes while it actually uses only 2, so we shrink it, and iif member (4 bytes) which can be easily fetched from another place once the next patch is there, so we remove it and thus creating space for 8 bytes. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/ulpevent.h | 10 +++++----- net/sctp/ulpevent.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index cccdcfd14973..aa342645dbce 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -48,15 +48,15 @@ */ struct sctp_ulpevent { struct sctp_association *asoc; - __u16 stream; - __u16 ssn; - __u16 flags; + unsigned int rmem_len; __u32 ppid; __u32 tsn; __u32 cumtsn; - int msg_flags; int iif; - unsigned int rmem_len; + __u16 stream; + __u16 ssn; + __u16 flags; + __u16 msg_flags; }; /* Retrieve the skb this event sits inside of. */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d1e38308f615..706f5bc9f0c3 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -51,7 +51,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); /* Initialize an ULP event from an given skb. */ static void sctp_ulpevent_init(struct sctp_ulpevent *event, - int msg_flags, + __u16 msg_flags, unsigned int len) { memset(event, 0, sizeof(struct sctp_ulpevent)); @@ -60,7 +60,7 @@ static void sctp_ulpevent_init(struct sctp_ulpevent *event, } /* Create a new sctp_ulpevent. */ -static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, +static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags, gfp_t gfp) { struct sctp_ulpevent *event; -- cgit v1.2.3 From 1f45f78f8e511203f03138f2ccde3d2cf90d2cbf Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:57 -0300 Subject: sctp: allow GSO frags to access the chunk too SCTP will try to access original IP headers on sctp_recvmsg in order to copy the addresses used. There are also other places that do similar access to IP or even SCTP headers. But after 90017accff61 ("sctp: Add GSO support") they aren't always there because they are only present in the header skb. SCTP handles the queueing of incoming data by cloning the incoming skb and limiting to only the relevant payload. This clone has its cb updated to something different and it's then queued on socket rx queue. Thus we need to fix this in two moments. For rx path, not related to socket queue yet, this patch uses a partially copied sctp_input_cb to such GSO frags. This restores the ability to access the headers for this part of the code. Regarding the socket rx queue, it removes iif member from sctp_event and also add a chunk pointer on it. With these changes we're always able to reach the headers again. The biggest change here is that now the sctp_chunk struct and the original skb are only freed after the application consumed the buffer. Note however that the original payload was already like this due to the skb cloning. For iif, SCTP's IPv4 code doesn't use it, so no change is necessary. IPv6 now can fetch it directly from original's IPv6 CB as the original skb is still accessible. In the future we probably can simplify sctp_v*_skb_iif() stuff, as sctp_v4_skb_iif() was called but it's return value not used, and now it's not even called, but such cleanup is out of scope for this change. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 7 +++++++ include/net/sctp/ulpevent.h | 2 +- net/sctp/inqueue.c | 7 +++++++ net/sctp/ipv6.c | 9 ++++----- net/sctp/protocol.c | 1 + net/sctp/sm_statefuns.c | 3 ++- net/sctp/socket.c | 10 +++++++--- net/sctp/ulpevent.c | 10 +++++++++- 8 files changed, 38 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 966c3a40039c..f6f201de6fa4 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1107,6 +1107,13 @@ struct sctp_input_cb { }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) +static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb) +{ + const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + + return chunk->head_skb ? : skb; +} + /* These bind address data fields common between endpoints and associations */ struct sctp_bind_addr { diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index aa342645dbce..2c098cd7e7e2 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -48,11 +48,11 @@ */ struct sctp_ulpevent { struct sctp_association *asoc; + struct sctp_chunk *chunk; unsigned int rmem_len; __u32 ppid; __u32 tsn; __u32 cumtsn; - int iif; __u16 stream; __u16 ssn; __u16 flags; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index edabbbdfca54..147d975b0455 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -218,6 +218,13 @@ new_skb: chunk->has_asconf = 0; chunk->end_of_packet = 0; chunk->ecn_ce_done = 0; + if (chunk->head_skb) { + struct sctp_input_cb + *cb = SCTP_INPUT_CB(chunk->skb), + *head_cb = SCTP_INPUT_CB(chunk->head_skb); + + cb->chunk = head_cb->chunk; + } } chunk->chunk_hdr = ch; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0657d18a85bf..ae6f1a2178ba 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -420,6 +420,7 @@ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; + /* Always called on head skb, so this is safe */ sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; @@ -710,8 +711,7 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { - struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; - return opt->iif; + return IP6CB(skb)->iif; } /* Was this packet marked by Explicit Congestion Notification? */ @@ -780,15 +780,14 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = sh->source; - addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; + addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { - struct sctp_ulpevent *ev = sctp_skb2event(skb); - addr->v6.sin6_scope_id = ev->iif; + addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); } } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 3b56ae55aba3..1adb9270e317 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -240,6 +240,7 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, port = &addr->v4.sin_port; addr->v4.sin_family = AF_INET; + /* Always called on head skb, so this is safe */ sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f1f08c8f277b..5aabf42065e2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6125,7 +6125,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, af = sctp_get_af_specific( ipver2af(ip_hdr(chunk->skb)->version)); - if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) { + if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) && + asoc->peer.ecn_capable) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 71c7dc5ea62e..52fdd540a9ef 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2066,7 +2066,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, { struct sctp_ulpevent *event = NULL; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *head_skb; int copied; int err = 0; int skb_len; @@ -2102,12 +2102,16 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (err) goto out_free; - sock_recv_ts_and_drops(msg, sk, skb); + if (event->chunk && event->chunk->head_skb) + head_skb = event->chunk->head_skb; + else + head_skb = skb; + sock_recv_ts_and_drops(msg, sk, head_skb); if (sctp_ulpevent_is_notification(event)) { msg->msg_flags |= MSG_NOTIFICATION; sp->pf->event_msgname(event, msg->msg_name, addr_len); } else { - sp->pf->skb_msgname(skb, msg->msg_name, addr_len); + sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len); } /* Check if we allow SCTP_NXTINFO. */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 706f5bc9f0c3..f6219b164b42 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -701,6 +701,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, sctp_ulpevent_receive_data(event, asoc); + /* And hold the chunk as we need it for getting the IP headers + * later in recvmsg + */ + sctp_chunk_hold(chunk); + event->chunk = chunk; + event->stream = ntohs(chunk->subh.data_hdr->stream); event->ssn = ntohs(chunk->subh.data_hdr->ssn); event->ppid = chunk->subh.data_hdr->ppid; @@ -710,11 +716,11 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, } event->tsn = ntohl(chunk->subh.data_hdr->tsn); event->msg_flags |= chunk->chunk_hdr->flags; - event->iif = sctp_chunk_iif(chunk); return event; fail_mark: + sctp_chunk_put(chunk); kfree_skb(skb); fail: return NULL; @@ -1007,6 +1013,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) done: sctp_assoc_rwnd_increase(event->asoc, len); + sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } @@ -1029,6 +1036,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) } done: + sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } -- cgit v1.2.3 From e7487c86dc5c4a528a7dbd9dc14f453a0de61a84 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:58 -0300 Subject: sctp: avoid identifying address family many times for a chunk Identifying address family operations during rx path is not something expensive but it's ugly to the eye to have it done multiple times, specially when we already validated it during initial rx processing. This patch takes advantage of the now shared sctp_input_cb and make the pointer to the operations readily available. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/input.c | 1 + net/sctp/inqueue.c | 1 + net/sctp/sm_make_chunk.c | 20 ++++---------------- net/sctp/sm_statefuns.c | 7 ++----- 5 files changed, 9 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f6f201de6fa4..ce93c4b10d26 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1104,6 +1104,7 @@ struct sctp_input_cb { #endif } header; struct sctp_chunk *chunk; + struct sctp_af *af; }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) diff --git a/net/sctp/input.c b/net/sctp/input.c index 7a327ff71f08..30d72f7707b6 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -140,6 +140,7 @@ int sctp_rcv(struct sk_buff *skb) af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; + SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 147d975b0455..8fc773f9b59a 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -224,6 +224,7 @@ new_skb: *head_cb = SCTP_INPUT_CB(chunk->head_skb); cb->chunk = head_cb->chunk; + cb->af = head_cb->af; } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 1c96f4740e67..8c77b87a8565 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -108,14 +108,9 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { - struct sctp_af *af; - int iif = 0; - - af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version)); - if (af) - iif = af->skb_iif(chunk->skb); + struct sk_buff *skb = chunk->skb; - return iif; + return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) @@ -1600,7 +1595,6 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_association *asoc; struct sk_buff *skb; sctp_scope_t scope; - struct sctp_af *af; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); @@ -1610,16 +1604,10 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ - af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version)); - if (unlikely(!af)) - goto fail; - af->from_skb(&asoc->c.peer_addr, skb, 1); + SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); + nodata: return asoc; - -fail: - sctp_association_free(asoc); - return NULL; } /* Build a cookie representing asoc. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5aabf42065e2..b7c1f7f3c838 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6119,13 +6119,10 @@ static int sctp_eat_data(const struct sctp_association *asoc, */ if (!chunk->ecn_ce_done) { - struct sctp_af *af; + struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af; chunk->ecn_ce_done = 1; - af = sctp_get_af_specific( - ipver2af(ip_hdr(chunk->skb)->version)); - - if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) && + if (af->is_ce(sctp_gso_headskb(chunk->skb)) && asoc->peer.ecn_capable) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, -- cgit v1.2.3 From 8438884d4ab423161b974854ebb90c08219dd678 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 14 Jul 2016 10:32:43 +0300 Subject: net/switchdev: Export the same parent ID service function This helper serves to know if two switchdev port netdevices belong to the same HW ASIC, e.g to figure out if forwarding offload is possible between them. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/switchdev.h | 8 ++++++++ net/switchdev/switchdev.c | 5 +++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 985619a59323..9023e3e3be0b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -227,6 +227,8 @@ void switchdev_port_fwd_mark_set(struct net_device *dev, struct net_device *group_dev, bool joining); +bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b); #else static inline void switchdev_deferred_process(void) @@ -351,6 +353,12 @@ static inline void switchdev_port_fwd_mark_set(struct net_device *dev, { } +static inline bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) +{ + return false; +} + #endif #endif /* _LINUX_SWITCHDEV_H_ */ diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 59658b2e9cdf..a5fc9dd24aa9 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1286,8 +1286,8 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi) } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); -static bool switchdev_port_same_parent_id(struct net_device *a, - struct net_device *b) +bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) { struct switchdev_attr a_attr = { .orig_dev = a, @@ -1323,6 +1323,7 @@ static u32 switchdev_port_fwd_mark_get(struct net_device *dev, return dev->ifindex; } +EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, u32 old_mark, u32 *reset_mark) -- cgit v1.2.3 From f962fe32f2f85769cd835ddcecbff8c1d34cf561 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 17 Jul 2016 19:55:15 +0200 Subject: Bluetooth: Move hci_recv_frame and hci_recv_diag prototypes The protoypes for hci_recv_frame and hci_recv_diag are in the wrong location in the header file. Move them close to all the other hci_dev related exported functions. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 77d7fe115a0d..84d0273d826a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1022,6 +1022,8 @@ void hci_unregister_dev(struct hci_dev *hdev); int hci_suspend_dev(struct hci_dev *hdev); int hci_resume_dev(struct hci_dev *hdev); int hci_reset_dev(struct hci_dev *hdev); +int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); +int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); @@ -1098,9 +1100,6 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); -int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); -int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); - void hci_init_sysfs(struct hci_dev *hdev); void hci_conn_init_sysfs(struct hci_conn *conn); void hci_conn_add_sysfs(struct hci_conn *conn); -- cgit v1.2.3 From 5177a83827cd0b8cf6ce0391b00dd4417352d2f1 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 17 Jul 2016 19:55:16 +0200 Subject: Bluetooth: Add debugfs fields for hardware and firmware info Some Bluetooth controllers allow for reading hardware and firmware related vendor specific infos. If they are available, then they can be exposed via debugfs now. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 4 ++++ net/bluetooth/hci_core.c | 24 ++++++++++++++++++++++++ net/bluetooth/hci_debugfs.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 84d0273d826a..ee7fc47680a1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -372,6 +372,8 @@ struct hci_dev { atomic_t promisc; + const char *hw_info; + const char *fw_info; struct dentry *debugfs; struct device dev; @@ -1024,6 +1026,8 @@ int hci_resume_dev(struct hci_dev *hdev); int hci_reset_dev(struct hci_dev *hdev); int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb); +void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...); +void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...); int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 98f6c3770736..ddf8432fe8fb 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3163,6 +3163,8 @@ void hci_unregister_dev(struct hci_dev *hdev) device_del(&hdev->dev); debugfs_remove_recursive(hdev->debugfs); + kfree_const(hdev->hw_info); + kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); @@ -3266,6 +3268,28 @@ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL(hci_recv_diag); +void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) +{ + va_list vargs; + + va_start(vargs, fmt); + kfree_const(hdev->hw_info); + hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); + va_end(vargs); +} +EXPORT_SYMBOL(hci_set_hw_info); + +void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) +{ + va_list vargs; + + va_start(vargs, fmt); + kfree_const(hdev->fw_info); + hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); + va_end(vargs); +} +EXPORT_SYMBOL(hci_set_fw_info); + /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 7db4220941cc..63df63ebfb24 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -76,6 +76,30 @@ static const struct file_operations __name ## _fops = { \ .llseek = default_llseek, \ } \ +#define DEFINE_INFO_ATTRIBUTE(__name, __field) \ +static int __name ## _show(struct seq_file *f, void *ptr) \ +{ \ + struct hci_dev *hdev = f->private; \ + \ + hci_dev_lock(hdev); \ + seq_printf(f, "%s\n", hdev->__field ? : ""); \ + hci_dev_unlock(hdev); \ + \ + return 0; \ +} \ + \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .open = __name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} \ + static int features_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -349,6 +373,9 @@ static const struct file_operations sc_only_mode_fops = { .llseek = default_llseek, }; +DEFINE_INFO_ATTRIBUTE(hardware_info, hw_info); +DEFINE_INFO_ATTRIBUTE(firmware_info, fw_info); + void hci_debugfs_create_common(struct hci_dev *hdev) { debugfs_create_file("features", 0444, hdev->debugfs, hdev, @@ -382,6 +409,14 @@ void hci_debugfs_create_common(struct hci_dev *hdev) if (lmp_sc_capable(hdev) || lmp_le_capable(hdev)) debugfs_create_file("sc_only_mode", 0444, hdev->debugfs, hdev, &sc_only_mode_fops); + + if (hdev->hw_info) + debugfs_create_file("hardware_info", 0444, hdev->debugfs, + hdev, &hardware_info_fops); + + if (hdev->fw_info) + debugfs_create_file("firmware_info", 0444, hdev->debugfs, + hdev, &firmware_info_fops); } static int inquiry_cache_show(struct seq_file *f, void *p) -- cgit v1.2.3 From 359ebda25aa06fe3a1d028f7e338a849165e661b Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 18 Jul 2016 14:49:33 +0300 Subject: net/ipv4: Introduce IPSKB_FRAG_SEGS bit to inet_skb_parm.flags This flag indicates whether fragmentation of segments is allowed. Formerly this policy was hardcoded according to IPSKB_FORWARDED (set by either ip_forward or ipmr_forward). Cc: Hannes Frederic Sowa Cc: Florian Westphal Signed-off-by: Shmulik Ladkani Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip.h | 1 + net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 6 ++++-- net/ipv4/ipmr.c | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 08f36cd2b874..9742b92dc933 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -47,6 +47,7 @@ struct inet_skb_parm { #define IPSKB_REROUTED BIT(4) #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) +#define IPSKB_FRAG_SEGS BIT(7) u16 frag_max_size; }; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 9f0a7b96646f..8b4ffd216839 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - IPCB(skb)->flags |= IPSKB_FORWARDED; + IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e23f141c9ba5..dde37fb340bf 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -223,8 +223,10 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *segs; int ret = 0; - /* common case: locally created skb or seglen is <= mtu */ - if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + /* common case: fragmentation of segments is not allowed, + * or seglen is <= mtu + */ + if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) || skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e0d76f5f0113..eec234161b89 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1749,7 +1749,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->dev->stats.tx_bytes += skb->len; } - IPCB(skb)->flags |= IPSKB_FORWARDED; + IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output -- cgit v1.2.3 From 34a79f63bbe49c888f95e75dd759685a238556b6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 18 Jul 2016 20:45:38 -0400 Subject: net: dsa: support switchdev ageing time attr Add a new function for DSA drivers to handle the switchdev SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME attribute. The ageing time is passed as milliseconds. Also because we can have multiple logical bridges on top of a physical switch and ageing time are switch-wide, call the driver function with the fastest ageing time in use on the chip instead of the requested one. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ net/dsa/slave.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 52ab18bc2b0d..2217a3f817f8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -141,6 +141,7 @@ struct dsa_switch_tree { struct dsa_port { struct net_device *netdev; struct device_node *dn; + unsigned int ageing_time; }; struct dsa_switch { @@ -329,6 +330,7 @@ struct dsa_switch_driver { /* * Bridge integration */ + int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct net_device *bridge); void (*port_bridge_leave)(struct dsa_switch *ds, int port); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7236eb26dc97..fc9196745225 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -333,6 +333,44 @@ static int dsa_slave_vlan_filtering(struct net_device *dev, return 0; } +static int dsa_fastest_ageing_time(struct dsa_switch *ds, + unsigned int ageing_time) +{ + int i; + + for (i = 0; i < DSA_MAX_PORTS; ++i) { + struct dsa_port *dp = &ds->ports[i]; + + if (dp && dp->ageing_time && dp->ageing_time < ageing_time) + ageing_time = dp->ageing_time; + } + + return ageing_time; +} + +static int dsa_slave_ageing_time(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); + unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); + + /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ + if (switchdev_trans_ph_prepare(trans)) + return 0; + + /* Keep the fastest ageing time in case of multiple bridges */ + ds->ports[p->port].ageing_time = ageing_time; + ageing_time = dsa_fastest_ageing_time(ds, ageing_time); + + if (ds->drv->set_ageing_time) + return ds->drv->set_ageing_time(ds, ageing_time); + + return 0; +} + static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -346,6 +384,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev, case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: ret = dsa_slave_vlan_filtering(dev, attr, trans); break; + case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: + ret = dsa_slave_ageing_time(dev, attr, trans); + break; default: ret = -EOPNOTSUPP; break; -- cgit v1.2.3 From 2d283bdd079c0ad4da020bbc9e9c2a4280823098 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2016 11:54:16 +1000 Subject: net/ncsi: Resource management NCSI spec (DSP0222) defines several objects: package, channel, mode, filter, version and statistics etc. This introduces the data structs to represent those objects and implement functions to manage them. Also, this introduces CONFIG_NET_NCSI for the newly implemented NCSI stack. * The user (e.g. netdev driver) dereference NCSI device by "struct ncsi_dev", which is embedded to "struct ncsi_dev_priv". The later one is used by NCSI stack internally. * Every NCSI device can have multiple packages simultaneously, up to 8 packages. It's represented by "struct ncsi_package" and identified by 3-bits ID. * Every NCSI package can have multiple channels, up to 32. It's represented by "struct ncsi_channel" and identified by 5-bits ID. * Every NCSI channel has version, statistics, various modes and filters. They are represented by "struct ncsi_channel_version", "struct ncsi_channel_stats", "struct ncsi_channel_mode" and "struct ncsi_channel_filter" separately. * Apart from AEN (Asynchronous Event Notification), the NCSI stack works in terms of command and response. This introduces "struct ncsi_req" to represent a complete NCSI transaction made of NCSI request and response. link: https://www.dmtf.org/sites/default/files/standards/documents/DSP0222_1.1.0.pdf Signed-off-by: Gavin Shan Acked-by: Joel Stanley Signed-off-by: David S. Miller --- include/net/ncsi.h | 46 ++++++ net/Kconfig | 1 + net/Makefile | 1 + net/ncsi/Kconfig | 12 ++ net/ncsi/Makefile | 4 + net/ncsi/internal.h | 256 +++++++++++++++++++++++++++++ net/ncsi/ncsi-manage.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 756 insertions(+) create mode 100644 include/net/ncsi.h create mode 100644 net/ncsi/Kconfig create mode 100644 net/ncsi/Makefile create mode 100644 net/ncsi/internal.h create mode 100644 net/ncsi/ncsi-manage.c (limited to 'include/net') diff --git a/include/net/ncsi.h b/include/net/ncsi.h new file mode 100644 index 000000000000..70d14ee1ef84 --- /dev/null +++ b/include/net/ncsi.h @@ -0,0 +1,46 @@ +#ifndef __NET_NCSI_H +#define __NET_NCSI_H + +/* + * The NCSI device states seen from external. More NCSI device states are + * only visible internally (in net/ncsi/internal.h). When the NCSI device + * is registered, it's in ncsi_dev_state_registered state. The state + * ncsi_dev_state_start is used to drive to choose active package and + * channel. After that, its state is changed to ncsi_dev_state_functional. + * + * The state ncsi_dev_state_stop helps to shut down the currently active + * package and channel while ncsi_dev_state_config helps to reconfigure + * them. + */ +enum { + ncsi_dev_state_registered = 0x0000, + ncsi_dev_state_functional = 0x0100, + ncsi_dev_state_probe = 0x0200, + ncsi_dev_state_config = 0x0300, + ncsi_dev_state_suspend = 0x0400, +}; + +struct ncsi_dev { + int state; + int link_up; + struct net_device *dev; + void (*handler)(struct ncsi_dev *ndev); +}; + +#ifdef CONFIG_NET_NCSI +struct ncsi_dev *ncsi_register_dev(struct net_device *dev, + void (*notifier)(struct ncsi_dev *nd)); +void ncsi_unregister_dev(struct ncsi_dev *nd); +#else /* !CONFIG_NET_NCSI */ +static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev, + void (*notifier)(struct ncsi_dev *nd)) +{ + return NULL; +} + +static inline void ncsi_unregister_dev(struct ncsi_dev *nd) +{ +} +#endif /* CONFIG_NET_NCSI */ + +#endif /* __NET_NCSI_H */ diff --git a/net/Kconfig b/net/Kconfig index ff40562a782c..c2cdbce629bd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -237,6 +237,7 @@ source "net/hsr/Kconfig" source "net/switchdev/Kconfig" source "net/l3mdev/Kconfig" source "net/qrtr/Kconfig" +source "net/ncsi/Kconfig" config RPS bool diff --git a/net/Makefile b/net/Makefile index bdd14553a774..9bd20bb86cc6 100644 --- a/net/Makefile +++ b/net/Makefile @@ -79,3 +79,4 @@ ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ +obj-$(CONFIG_NET_NCSI) += ncsi/ diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig new file mode 100644 index 000000000000..08a8a6031fd7 --- /dev/null +++ b/net/ncsi/Kconfig @@ -0,0 +1,12 @@ +# +# Configuration for NCSI support +# + +config NET_NCSI + bool "NCSI interface support" + depends on INET + ---help--- + This module provides NCSI (Network Controller Sideband Interface) + support. Enable this only if your system connects to a network + device via NCSI and the ethernet driver you're using supports + the protocol explicitly. diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile new file mode 100644 index 000000000000..07b5625155d7 --- /dev/null +++ b/net/ncsi/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for NCSI API +# +obj-$(CONFIG_NET_NCSI) += ncsi-manage.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h new file mode 100644 index 000000000000..89028e1f83cd --- /dev/null +++ b/net/ncsi/internal.h @@ -0,0 +1,256 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NCSI_INTERNAL_H__ +#define __NCSI_INTERNAL_H__ + +enum { + NCSI_CAP_BASE = 0, + NCSI_CAP_GENERIC = 0, + NCSI_CAP_BC, + NCSI_CAP_MC, + NCSI_CAP_BUFFER, + NCSI_CAP_AEN, + NCSI_CAP_VLAN, + NCSI_CAP_MAX +}; + +enum { + NCSI_CAP_GENERIC_HWA = 0x01, /* HW arbitration */ + NCSI_CAP_GENERIC_HDS = 0x02, /* HNC driver status change */ + NCSI_CAP_GENERIC_FC = 0x04, /* HNC to MC flow control */ + NCSI_CAP_GENERIC_FC1 = 0x08, /* MC to HNC flow control */ + NCSI_CAP_GENERIC_MC = 0x10, /* Global MC filtering */ + NCSI_CAP_GENERIC_HWA_UNKNOWN = 0x00, /* Unknown HW arbitration */ + NCSI_CAP_GENERIC_HWA_SUPPORT = 0x20, /* Supported HW arbitration */ + NCSI_CAP_GENERIC_HWA_NOT_SUPPORT = 0x40, /* No HW arbitration */ + NCSI_CAP_GENERIC_HWA_RESERVED = 0x60, /* Reserved HW arbitration */ + NCSI_CAP_GENERIC_HWA_MASK = 0x60, /* Mask for HW arbitration */ + NCSI_CAP_GENERIC_MASK = 0x7f, + NCSI_CAP_BC_ARP = 0x01, /* ARP packet filtering */ + NCSI_CAP_BC_DHCPC = 0x02, /* DHCP client filtering */ + NCSI_CAP_BC_DHCPS = 0x04, /* DHCP server filtering */ + NCSI_CAP_BC_NETBIOS = 0x08, /* NetBIOS packet filtering */ + NCSI_CAP_BC_MASK = 0x0f, + NCSI_CAP_MC_IPV6_NEIGHBOR = 0x01, /* IPv6 neighbor filtering */ + NCSI_CAP_MC_IPV6_ROUTER = 0x02, /* IPv6 router filering */ + NCSI_CAP_MC_DHCPV6_RELAY = 0x04, /* DHCPv6 relay / server MC */ + NCSI_CAP_MC_DHCPV6_WELL_KNOWN = 0x08, /* DHCPv6 well-known MC */ + NCSI_CAP_MC_IPV6_MLD = 0x10, /* IPv6 MLD filtering */ + NCSI_CAP_MC_IPV6_NEIGHBOR_S = 0x20, /* IPv6 neighbour filtering */ + NCSI_CAP_MC_MASK = 0x3f, + NCSI_CAP_AEN_LSC = 0x01, /* Link status change */ + NCSI_CAP_AEN_CR = 0x02, /* Configuration required */ + NCSI_CAP_AEN_HDS = 0x04, /* HNC driver status */ + NCSI_CAP_AEN_MASK = 0x07, + NCSI_CAP_VLAN_ONLY = 0x01, /* Filter VLAN packet only */ + NCSI_CAP_VLAN_NO = 0x02, /* Filter VLAN and non-VLAN */ + NCSI_CAP_VLAN_ANY = 0x04, /* Filter Any-and-non-VLAN */ + NCSI_CAP_VLAN_MASK = 0x07 +}; + +enum { + NCSI_MODE_BASE = 0, + NCSI_MODE_ENABLE = 0, + NCSI_MODE_TX_ENABLE, + NCSI_MODE_LINK, + NCSI_MODE_VLAN, + NCSI_MODE_BC, + NCSI_MODE_MC, + NCSI_MODE_AEN, + NCSI_MODE_FC, + NCSI_MODE_MAX +}; + +enum { + NCSI_FILTER_BASE = 0, + NCSI_FILTER_VLAN = 0, + NCSI_FILTER_UC, + NCSI_FILTER_MC, + NCSI_FILTER_MIXED, + NCSI_FILTER_MAX +}; + +struct ncsi_channel_version { + u32 version; /* Supported BCD encoded NCSI version */ + u32 alpha2; /* Supported BCD encoded NCSI version */ + u8 fw_name[12]; /* Firware name string */ + u32 fw_version; /* Firmware version */ + u16 pci_ids[4]; /* PCI identification */ + u32 mf_id; /* Manufacture ID */ +}; + +struct ncsi_channel_cap { + u32 index; /* Index of channel capabilities */ + u32 cap; /* NCSI channel capability */ +}; + +struct ncsi_channel_mode { + u32 index; /* Index of channel modes */ + u32 enable; /* Enabled or disabled */ + u32 size; /* Valid entries in ncm_data[] */ + u32 data[8]; /* Data entries */ +}; + +struct ncsi_channel_filter { + u32 index; /* Index of channel filters */ + u32 total; /* Total entries in the filter table */ + u64 bitmap; /* Bitmap of valid entries */ + u32 data[]; /* Data for the valid entries */ +}; + +struct ncsi_channel_stats { + u32 hnc_cnt_hi; /* Counter cleared */ + u32 hnc_cnt_lo; /* Counter cleared */ + u32 hnc_rx_bytes; /* Rx bytes */ + u32 hnc_tx_bytes; /* Tx bytes */ + u32 hnc_rx_uc_pkts; /* Rx UC packets */ + u32 hnc_rx_mc_pkts; /* Rx MC packets */ + u32 hnc_rx_bc_pkts; /* Rx BC packets */ + u32 hnc_tx_uc_pkts; /* Tx UC packets */ + u32 hnc_tx_mc_pkts; /* Tx MC packets */ + u32 hnc_tx_bc_pkts; /* Tx BC packets */ + u32 hnc_fcs_err; /* FCS errors */ + u32 hnc_align_err; /* Alignment errors */ + u32 hnc_false_carrier; /* False carrier detection */ + u32 hnc_runt_pkts; /* Rx runt packets */ + u32 hnc_jabber_pkts; /* Rx jabber packets */ + u32 hnc_rx_pause_xon; /* Rx pause XON frames */ + u32 hnc_rx_pause_xoff; /* Rx XOFF frames */ + u32 hnc_tx_pause_xon; /* Tx XON frames */ + u32 hnc_tx_pause_xoff; /* Tx XOFF frames */ + u32 hnc_tx_s_collision; /* Single collision frames */ + u32 hnc_tx_m_collision; /* Multiple collision frames */ + u32 hnc_l_collision; /* Late collision frames */ + u32 hnc_e_collision; /* Excessive collision frames */ + u32 hnc_rx_ctl_frames; /* Rx control frames */ + u32 hnc_rx_64_frames; /* Rx 64-bytes frames */ + u32 hnc_rx_127_frames; /* Rx 65-127 bytes frames */ + u32 hnc_rx_255_frames; /* Rx 128-255 bytes frames */ + u32 hnc_rx_511_frames; /* Rx 256-511 bytes frames */ + u32 hnc_rx_1023_frames; /* Rx 512-1023 bytes frames */ + u32 hnc_rx_1522_frames; /* Rx 1024-1522 bytes frames */ + u32 hnc_rx_9022_frames; /* Rx 1523-9022 bytes frames */ + u32 hnc_tx_64_frames; /* Tx 64-bytes frames */ + u32 hnc_tx_127_frames; /* Tx 65-127 bytes frames */ + u32 hnc_tx_255_frames; /* Tx 128-255 bytes frames */ + u32 hnc_tx_511_frames; /* Tx 256-511 bytes frames */ + u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ + u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ + u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ + u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u32 hnc_rx_runt_pkts; /* Rx error runt packets */ + u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ + u32 ncsi_rx_cmds; /* Rx NCSI commands */ + u32 ncsi_dropped_cmds; /* Dropped commands */ + u32 ncsi_cmd_type_errs; /* Command type errors */ + u32 ncsi_cmd_csum_errs; /* Command checksum errors */ + u32 ncsi_rx_pkts; /* Rx NCSI packets */ + u32 ncsi_tx_pkts; /* Tx NCSI packets */ + u32 ncsi_tx_aen_pkts; /* Tx AEN packets */ + u32 pt_tx_pkts; /* Tx packets */ + u32 pt_tx_dropped; /* Tx dropped packets */ + u32 pt_tx_channel_err; /* Tx channel errors */ + u32 pt_tx_us_err; /* Tx undersize errors */ + u32 pt_rx_pkts; /* Rx packets */ + u32 pt_rx_dropped; /* Rx dropped packets */ + u32 pt_rx_channel_err; /* Rx channel errors */ + u32 pt_rx_us_err; /* Rx undersize errors */ + u32 pt_rx_os_err; /* Rx oversize errors */ +}; + +struct ncsi_dev_priv; +struct ncsi_package; + +#define NCSI_PACKAGE_SHIFT 5 +#define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7) +#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) +#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) + +struct ncsi_channel { + unsigned char id; + int state; +#define NCSI_CHANNEL_INACTIVE 1 +#define NCSI_CHANNEL_ACTIVE 2 + spinlock_t lock; /* Protect filters etc */ + struct ncsi_package *package; + struct ncsi_channel_version version; + struct ncsi_channel_cap caps[NCSI_CAP_MAX]; + struct ncsi_channel_mode modes[NCSI_MODE_MAX]; + struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; + struct ncsi_channel_stats stats; + struct list_head node; +}; + +struct ncsi_package { + unsigned char id; /* NCSI 3-bits package ID */ + unsigned char uuid[16]; /* UUID */ + struct ncsi_dev_priv *ndp; /* NCSI device */ + spinlock_t lock; /* Protect the package */ + unsigned int channel_num; /* Number of channels */ + struct list_head channels; /* List of chanels */ + struct list_head node; /* Form list of packages */ +}; + +struct ncsi_request { + unsigned char id; /* Request ID - 0 to 255 */ + bool used; /* Request that has been assigned */ + bool driven; /* Drive state machine */ + struct ncsi_dev_priv *ndp; /* Associated NCSI device */ + struct sk_buff *cmd; /* Associated NCSI command packet */ + struct sk_buff *rsp; /* Associated NCSI response packet */ + struct timer_list timer; /* Timer on waiting for response */ + bool enabled; /* Time has been enabled or not */ +}; + +struct ncsi_dev_priv { + struct ncsi_dev ndev; /* Associated NCSI device */ + unsigned int flags; /* NCSI device flags */ + spinlock_t lock; /* Protect the NCSI device */ + unsigned int package_num; /* Number of packages */ + struct list_head packages; /* List of packages */ + struct ncsi_request requests[256]; /* Request table */ + unsigned int request_id; /* Last used request ID */ + struct list_head node; /* Form NCSI device list */ +}; + +extern struct list_head ncsi_dev_list; +extern spinlock_t ncsi_dev_lock; + +#define TO_NCSI_DEV_PRIV(nd) \ + container_of(nd, struct ncsi_dev_priv, ndev) +#define NCSI_FOR_EACH_DEV(ndp) \ + list_for_each_entry_rcu(ndp, &ncsi_dev_list, node) +#define NCSI_FOR_EACH_PACKAGE(ndp, np) \ + list_for_each_entry_rcu(np, &ndp->packages, node) +#define NCSI_FOR_EACH_CHANNEL(np, nc) \ + list_for_each_entry_rcu(nc, &np->channels, node) + +/* Resources */ +int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); +int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); +int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); +struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, + unsigned char id); +struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, + unsigned char id); +struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp, + unsigned char id); +struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, + unsigned char id); +void ncsi_remove_package(struct ncsi_package *np); +void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, + unsigned char id, + struct ncsi_package **np, + struct ncsi_channel **nc); +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); +void ncsi_free_request(struct ncsi_request *nr); +struct ncsi_dev *ncsi_find_dev(struct net_device *dev); + +#endif /* __NCSI_INTERNAL_H__ */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c new file mode 100644 index 000000000000..0e28ed8f2703 --- /dev/null +++ b/net/ncsi/ncsi-manage.c @@ -0,0 +1,436 @@ +/* + * Copyright Gavin Shan, IBM Corporation 2016. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +LIST_HEAD(ncsi_dev_list); +DEFINE_SPINLOCK(ncsi_dev_lock); + +static inline int ncsi_filter_size(int table) +{ + int sizes[] = { 2, 6, 6, 6 }; + + BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX); + if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX) + return -EINVAL; + + return sizes[table]; +} + +int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data) +{ + struct ncsi_channel_filter *ncf; + void *bitmap; + int index, size; + unsigned long flags; + + ncf = nc->filters[table]; + if (!ncf) + return -ENXIO; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + index = -1; + while ((index = find_next_bit(bitmap, ncf->total, index + 1)) + < ncf->total) { + if (!memcmp(ncf->data + size * index, data, size)) { + spin_unlock_irqrestore(&nc->lock, flags); + return index; + } + } + spin_unlock_irqrestore(&nc->lock, flags); + + return -ENOENT; +} + +int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data) +{ + struct ncsi_channel_filter *ncf; + int index, size; + void *bitmap; + unsigned long flags; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + index = ncsi_find_filter(nc, table, data); + if (index >= 0) + return index; + + ncf = nc->filters[table]; + if (!ncf) + return -ENODEV; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + do { + index = find_next_zero_bit(bitmap, ncf->total, 0); + if (index >= ncf->total) { + spin_unlock_irqrestore(&nc->lock, flags); + return -ENOSPC; + } + } while (test_and_set_bit(index, bitmap)); + + memcpy(ncf->data + size * index, data, size); + spin_unlock_irqrestore(&nc->lock, flags); + + return index; +} + +int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index) +{ + struct ncsi_channel_filter *ncf; + int size; + void *bitmap; + unsigned long flags; + + size = ncsi_filter_size(table); + if (size < 0) + return size; + + ncf = nc->filters[table]; + if (!ncf || index >= ncf->total) + return -ENODEV; + + spin_lock_irqsave(&nc->lock, flags); + bitmap = (void *)&ncf->bitmap; + if (test_and_clear_bit(index, bitmap)) + memset(ncf->data + size * index, 0, size); + spin_unlock_irqrestore(&nc->lock, flags); + + return 0; +} + +struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, + unsigned char id) +{ + struct ncsi_channel *nc; + + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (nc->id == id) + return nc; + } + + return NULL; +} + +struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) +{ + struct ncsi_channel *nc, *tmp; + int index; + unsigned long flags; + + nc = kzalloc(sizeof(*nc), GFP_ATOMIC); + if (!nc) + return NULL; + + nc->id = id; + nc->package = np; + nc->state = NCSI_CHANNEL_INACTIVE; + spin_lock_init(&nc->lock); + for (index = 0; index < NCSI_CAP_MAX; index++) + nc->caps[index].index = index; + for (index = 0; index < NCSI_MODE_MAX; index++) + nc->modes[index].index = index; + + spin_lock_irqsave(&np->lock, flags); + tmp = ncsi_find_channel(np, id); + if (tmp) { + spin_unlock_irqrestore(&np->lock, flags); + kfree(nc); + return tmp; + } + + list_add_tail_rcu(&nc->node, &np->channels); + np->channel_num++; + spin_unlock_irqrestore(&np->lock, flags); + + return nc; +} + +static void ncsi_remove_channel(struct ncsi_channel *nc) +{ + struct ncsi_package *np = nc->package; + struct ncsi_channel_filter *ncf; + unsigned long flags; + int i; + + /* Release filters */ + spin_lock_irqsave(&nc->lock, flags); + for (i = 0; i < NCSI_FILTER_MAX; i++) { + ncf = nc->filters[i]; + if (!ncf) + continue; + + nc->filters[i] = NULL; + kfree(ncf); + } + + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + /* Remove and free channel */ + spin_lock_irqsave(&np->lock, flags); + list_del_rcu(&nc->node); + np->channel_num--; + spin_unlock_irqrestore(&np->lock, flags); + + kfree(nc); +} + +struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp, + unsigned char id) +{ + struct ncsi_package *np; + + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (np->id == id) + return np; + } + + return NULL; +} + +struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, + unsigned char id) +{ + struct ncsi_package *np, *tmp; + unsigned long flags; + + np = kzalloc(sizeof(*np), GFP_ATOMIC); + if (!np) + return NULL; + + np->id = id; + np->ndp = ndp; + spin_lock_init(&np->lock); + INIT_LIST_HEAD(&np->channels); + + spin_lock_irqsave(&ndp->lock, flags); + tmp = ncsi_find_package(ndp, id); + if (tmp) { + spin_unlock_irqrestore(&ndp->lock, flags); + kfree(np); + return tmp; + } + + list_add_tail_rcu(&np->node, &ndp->packages); + ndp->package_num++; + spin_unlock_irqrestore(&ndp->lock, flags); + + return np; +} + +void ncsi_remove_package(struct ncsi_package *np) +{ + struct ncsi_dev_priv *ndp = np->ndp; + struct ncsi_channel *nc, *tmp; + unsigned long flags; + + /* Release all child channels */ + list_for_each_entry_safe(nc, tmp, &np->channels, node) + ncsi_remove_channel(nc); + + /* Remove and free package */ + spin_lock_irqsave(&ndp->lock, flags); + list_del_rcu(&np->node); + ndp->package_num--; + spin_unlock_irqrestore(&ndp->lock, flags); + + kfree(np); +} + +void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, + unsigned char id, + struct ncsi_package **np, + struct ncsi_channel **nc) +{ + struct ncsi_package *p; + struct ncsi_channel *c; + + p = ncsi_find_package(ndp, NCSI_PACKAGE_INDEX(id)); + c = p ? ncsi_find_channel(p, NCSI_CHANNEL_INDEX(id)) : NULL; + + if (np) + *np = p; + if (nc) + *nc = c; +} + +/* For two consecutive NCSI commands, the packet IDs shouldn't + * be same. Otherwise, the bogus response might be replied. So + * the available IDs are allocated in round-robin fashion. + */ +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) +{ + struct ncsi_request *nr = NULL; + int i, limit = ARRAY_SIZE(ndp->requests); + unsigned long flags; + + /* Check if there is one available request until the ceiling */ + spin_lock_irqsave(&ndp->lock, flags); + for (i = ndp->request_id; !nr && i < limit; i++) { + if (ndp->requests[i].used) + continue; + + nr = &ndp->requests[i]; + nr->used = true; + nr->driven = driven; + if (++ndp->request_id >= limit) + ndp->request_id = 0; + } + + /* Fail back to check from the starting cursor */ + for (i = 0; !nr && i < ndp->request_id; i++) { + if (ndp->requests[i].used) + continue; + + nr = &ndp->requests[i]; + nr->used = true; + nr->driven = driven; + if (++ndp->request_id >= limit) + ndp->request_id = 0; + } + spin_unlock_irqrestore(&ndp->lock, flags); + + return nr; +} + +void ncsi_free_request(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct sk_buff *cmd, *rsp; + unsigned long flags; + + if (nr->enabled) { + nr->enabled = false; + del_timer_sync(&nr->timer); + } + + spin_lock_irqsave(&ndp->lock, flags); + cmd = nr->cmd; + rsp = nr->rsp; + nr->cmd = NULL; + nr->rsp = NULL; + nr->used = false; + spin_unlock_irqrestore(&ndp->lock, flags); + + /* Release command and response */ + consume_skb(cmd); + consume_skb(rsp); +} + +struct ncsi_dev *ncsi_find_dev(struct net_device *dev) +{ + struct ncsi_dev_priv *ndp; + + NCSI_FOR_EACH_DEV(ndp) { + if (ndp->ndev.dev == dev) + return &ndp->ndev; + } + + return NULL; +} + +static void ncsi_request_timeout(unsigned long data) +{ + struct ncsi_request *nr = (struct ncsi_request *)data; + struct ncsi_dev_priv *ndp = nr->ndp; + unsigned long flags; + + /* If the request already had associated response, + * let the response handler to release it. + */ + spin_lock_irqsave(&ndp->lock, flags); + nr->enabled = false; + if (nr->rsp || !nr->cmd) { + spin_unlock_irqrestore(&ndp->lock, flags); + return; + } + spin_unlock_irqrestore(&ndp->lock, flags); + + /* Release the request */ + ncsi_free_request(nr); +} + +struct ncsi_dev *ncsi_register_dev(struct net_device *dev, + void (*handler)(struct ncsi_dev *ndev)) +{ + struct ncsi_dev_priv *ndp; + struct ncsi_dev *nd; + unsigned long flags; + int i; + + /* Check if the device has been registered or not */ + nd = ncsi_find_dev(dev); + if (nd) + return nd; + + /* Create NCSI device */ + ndp = kzalloc(sizeof(*ndp), GFP_ATOMIC); + if (!ndp) + return NULL; + + nd = &ndp->ndev; + nd->state = ncsi_dev_state_registered; + nd->dev = dev; + nd->handler = handler; + + /* Initialize private NCSI device */ + spin_lock_init(&ndp->lock); + INIT_LIST_HEAD(&ndp->packages); + ndp->request_id = 0; + for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { + ndp->requests[i].id = i; + ndp->requests[i].ndp = ndp; + setup_timer(&ndp->requests[i].timer, + ncsi_request_timeout, + (unsigned long)&ndp->requests[i]); + } + + spin_lock_irqsave(&ncsi_dev_lock, flags); + list_add_tail_rcu(&ndp->node, &ncsi_dev_list); + spin_unlock_irqrestore(&ncsi_dev_lock, flags); + + return nd; +} +EXPORT_SYMBOL_GPL(ncsi_register_dev); + +void ncsi_unregister_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np, *tmp; + unsigned long flags; + + list_for_each_entry_safe(np, tmp, &ndp->packages, node) + ncsi_remove_package(np); + + spin_lock_irqsave(&ncsi_dev_lock, flags); + list_del_rcu(&ndp->node); + spin_unlock_irqrestore(&ncsi_dev_lock, flags); + + kfree(ndp); +} +EXPORT_SYMBOL_GPL(ncsi_unregister_dev); -- cgit v1.2.3 From e6f44ed6d04d3185dcd8e8e98af8742d87bdffcc Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 19 Jul 2016 11:54:19 +1000 Subject: net/ncsi: Package and channel management This manages NCSI packages and channels: * The available packages and channels are enumerated in the first time of calling ncsi_start_dev(). The channels' capabilities are probed in the meanwhile. The NCSI network topology won't change until the NCSI device is destroyed. * There in a queue in every NCSI device. The element in the queue, channel, is waiting for configuration (bringup) or suspending (teardown). The channel's state (inactive/active) indicates the futher action (configuration or suspending) will be applied on the channel. Another channel's state (invisible) means the requested action is being applied. * The hardware arbitration will be enabled if all available packages and channels support it. All available channels try to provide service when hardware arbitration is enabled. Otherwise, one channel is selected as the active one at once. * When channel is in active state, meaning it's providing service, a timer started to retrieve the channe's link status. If the channel's link status fails to be updated in the determined period, the channel is going to be reconfigured. It's the error handling implementation as defined in NCSI spec. Signed-off-by: Gavin Shan Acked-by: Joel Stanley Signed-off-by: David S. Miller --- include/net/ncsi.h | 6 + net/ncsi/internal.h | 50 ++++ net/ncsi/ncsi-manage.c | 763 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ncsi/ncsi-rsp.c | 15 + 4 files changed, 834 insertions(+) (limited to 'include/net') diff --git a/include/net/ncsi.h b/include/net/ncsi.h index 70d14ee1ef84..1dbf42f79750 100644 --- a/include/net/ncsi.h +++ b/include/net/ncsi.h @@ -30,6 +30,7 @@ struct ncsi_dev { #ifdef CONFIG_NET_NCSI struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*notifier)(struct ncsi_dev *nd)); +int ncsi_start_dev(struct ncsi_dev *nd); void ncsi_unregister_dev(struct ncsi_dev *nd); #else /* !CONFIG_NET_NCSI */ static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev, @@ -38,6 +39,11 @@ static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev, return NULL; } +static inline int ncsi_start_dev(struct ncsi_dev *nd) +{ + return -ENOTTY; +} + static inline void ncsi_unregister_dev(struct ncsi_dev *nd) { } diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index bd000c9c8249..38fc95a26f8f 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -178,6 +178,7 @@ struct ncsi_channel { int state; #define NCSI_CHANNEL_INACTIVE 1 #define NCSI_CHANNEL_ACTIVE 2 +#define NCSI_CHANNEL_INVISIBLE 3 spinlock_t lock; /* Protect filters etc */ struct ncsi_package *package; struct ncsi_channel_version version; @@ -185,7 +186,11 @@ struct ncsi_channel { struct ncsi_channel_mode modes[NCSI_MODE_MAX]; struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; struct ncsi_channel_stats stats; + struct timer_list timer; /* Link monitor timer */ + bool enabled; /* Timer is enabled */ + unsigned int timeout; /* Times of timeout */ struct list_head node; + struct list_head link; }; struct ncsi_package { @@ -209,14 +214,56 @@ struct ncsi_request { bool enabled; /* Time has been enabled or not */ }; +enum { + ncsi_dev_state_major = 0xff00, + ncsi_dev_state_minor = 0x00ff, + ncsi_dev_state_probe_deselect = 0x0201, + ncsi_dev_state_probe_package, + ncsi_dev_state_probe_channel, + ncsi_dev_state_probe_cis, + ncsi_dev_state_probe_gvi, + ncsi_dev_state_probe_gc, + ncsi_dev_state_probe_gls, + ncsi_dev_state_probe_dp, + ncsi_dev_state_config_sp = 0x0301, + ncsi_dev_state_config_cis, + ncsi_dev_state_config_sma, + ncsi_dev_state_config_ebf, +#if IS_ENABLED(CONFIG_IPV6) + ncsi_dev_state_config_egmf, +#endif + ncsi_dev_state_config_ecnt, + ncsi_dev_state_config_ec, + ncsi_dev_state_config_ae, + ncsi_dev_state_config_gls, + ncsi_dev_state_config_done, + ncsi_dev_state_suspend_select = 0x0401, + ncsi_dev_state_suspend_dcnt, + ncsi_dev_state_suspend_dc, + ncsi_dev_state_suspend_deselect, + ncsi_dev_state_suspend_done +}; + struct ncsi_dev_priv { struct ncsi_dev ndev; /* Associated NCSI device */ unsigned int flags; /* NCSI device flags */ +#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */ +#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */ +#define NCSI_DEV_RESHUFFLE 4 spinlock_t lock; /* Protect the NCSI device */ +#if IS_ENABLED(CONFIG_IPV6) + unsigned int inet6_addr_num; /* Number of IPv6 addresses */ +#endif unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ + unsigned int pending_req_num; /* Number of pending requests */ + struct ncsi_package *active_package; /* Currently handled package */ + struct ncsi_channel *active_channel; /* Currently handled channel */ + struct list_head channel_queue; /* Config queue of channels */ + struct work_struct work; /* For channel management */ + struct packet_type ptype; /* NCSI packet Rx handler */ struct list_head node; /* Form NCSI device list */ }; @@ -251,6 +298,8 @@ extern spinlock_t ncsi_dev_lock; int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); +void ncsi_start_channel_monitor(struct ncsi_channel *nc); +void ncsi_stop_channel_monitor(struct ncsi_channel *nc); struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, unsigned char id); struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, @@ -267,6 +316,7 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); +int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); /* Packet handlers */ u32 ncsi_calculate_checksum(unsigned char *data, int len); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 0e28ed8f2703..d627a39ddcd0 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -17,8 +17,12 @@ #include #include #include +#include +#include +#include #include "internal.h" +#include "ncsi-pkt.h" LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); @@ -123,6 +127,120 @@ int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index) return 0; } +static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np; + struct ncsi_channel *nc; + + nd->state = ncsi_dev_state_functional; + if (force_down) { + nd->link_up = 0; + goto report; + } + + nd->link_up = 0; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_ACTIVE) + continue; + + if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + nd->link_up = 1; + goto report; + } + } + } + +report: + nd->handler(nd); +} + +static void ncsi_channel_monitor(unsigned long data) +{ + struct ncsi_channel *nc = (struct ncsi_channel *)data; + struct ncsi_package *np = nc->package; + struct ncsi_dev_priv *ndp = np->ndp; + struct ncsi_cmd_arg nca; + bool enabled; + unsigned int timeout; + unsigned long flags; + int ret; + + spin_lock_irqsave(&nc->lock, flags); + timeout = nc->timeout; + enabled = nc->enabled; + spin_unlock_irqrestore(&nc->lock, flags); + + if (!enabled || !list_empty(&nc->link)) + return; + if (nc->state != NCSI_CHANNEL_INACTIVE && + nc->state != NCSI_CHANNEL_ACTIVE) + return; + + if (!(timeout % 2)) { + nca.ndp = ndp; + nca.package = np->id; + nca.channel = nc->id; + nca.type = NCSI_PKT_CMD_GLS; + nca.driven = false; + ret = ncsi_xmit_cmd(&nca); + if (ret) { + netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", + ret); + return; + } + } + + if (timeout + 1 >= 3) { + if (!(ndp->flags & NCSI_DEV_HWA) && + nc->state == NCSI_CHANNEL_ACTIVE) + ncsi_report_link(ndp, true); + + spin_lock_irqsave(&ndp->lock, flags); + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + ncsi_process_next_channel(ndp); + return; + } + + spin_lock_irqsave(&nc->lock, flags); + nc->timeout = timeout + 1; + nc->enabled = true; + spin_unlock_irqrestore(&nc->lock, flags); + mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); +} + +void ncsi_start_channel_monitor(struct ncsi_channel *nc) +{ + unsigned long flags; + + spin_lock_irqsave(&nc->lock, flags); + WARN_ON_ONCE(nc->enabled); + nc->timeout = 0; + nc->enabled = true; + spin_unlock_irqrestore(&nc->lock, flags); + + mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); +} + +void ncsi_stop_channel_monitor(struct ncsi_channel *nc) +{ + unsigned long flags; + + spin_lock_irqsave(&nc->lock, flags); + if (!nc->enabled) { + spin_unlock_irqrestore(&nc->lock, flags); + return; + } + nc->enabled = false; + spin_unlock_irqrestore(&nc->lock, flags); + + del_timer_sync(&nc->timer); +} + struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, unsigned char id) { @@ -149,7 +267,10 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) nc->id = id; nc->package = np; nc->state = NCSI_CHANNEL_INACTIVE; + nc->enabled = false; + setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc); spin_lock_init(&nc->lock); + INIT_LIST_HEAD(&nc->link); for (index = 0; index < NCSI_CAP_MAX; index++) nc->caps[index].index = index; for (index = 0; index < NCSI_MODE_MAX; index++) @@ -190,6 +311,7 @@ static void ncsi_remove_channel(struct ncsi_channel *nc) nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); + ncsi_stop_channel_monitor(nc); /* Remove and free channel */ spin_lock_irqsave(&np->lock, flags); @@ -323,6 +445,7 @@ void ncsi_free_request(struct ncsi_request *nr) struct ncsi_dev_priv *ndp = nr->ndp; struct sk_buff *cmd, *rsp; unsigned long flags; + bool driven; if (nr->enabled) { nr->enabled = false; @@ -335,8 +458,12 @@ void ncsi_free_request(struct ncsi_request *nr) nr->cmd = NULL; nr->rsp = NULL; nr->used = false; + driven = nr->driven; spin_unlock_irqrestore(&ndp->lock, flags); + if (driven && cmd && --ndp->pending_req_num == 0) + schedule_work(&ndp->work); + /* Release command and response */ consume_skb(cmd); consume_skb(rsp); @@ -375,6 +502,587 @@ static void ncsi_request_timeout(unsigned long data) ncsi_free_request(nr); } +static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np = ndp->active_package; + struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_cmd_arg nca; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_suspend: + nd->state = ncsi_dev_state_suspend_select; + /* Fall through */ + case ncsi_dev_state_suspend_select: + case ncsi_dev_state_suspend_dcnt: + case ncsi_dev_state_suspend_dc: + case ncsi_dev_state_suspend_deselect: + ndp->pending_req_num = 1; + + np = ndp->active_package; + nc = ndp->active_channel; + nca.package = np->id; + if (nd->state == ncsi_dev_state_suspend_select) { + nca.type = NCSI_PKT_CMD_SP; + nca.channel = 0x1f; + if (ndp->flags & NCSI_DEV_HWA) + nca.bytes[0] = 0; + else + nca.bytes[0] = 1; + nd->state = ncsi_dev_state_suspend_dcnt; + } else if (nd->state == ncsi_dev_state_suspend_dcnt) { + nca.type = NCSI_PKT_CMD_DCNT; + nca.channel = nc->id; + nd->state = ncsi_dev_state_suspend_dc; + } else if (nd->state == ncsi_dev_state_suspend_dc) { + nca.type = NCSI_PKT_CMD_DC; + nca.channel = nc->id; + nca.bytes[0] = 1; + nd->state = ncsi_dev_state_suspend_deselect; + } else if (nd->state == ncsi_dev_state_suspend_deselect) { + nca.type = NCSI_PKT_CMD_DP; + nca.channel = 0x1f; + nd->state = ncsi_dev_state_suspend_done; + } + + ret = ncsi_xmit_cmd(&nca); + if (ret) { + nd->state = ncsi_dev_state_functional; + return; + } + + break; + case ncsi_dev_state_suspend_done: + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + ncsi_process_next_channel(ndp); + + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n", + nd->state); + } +} + +static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct net_device *dev = nd->dev; + struct ncsi_package *np = ndp->active_package; + struct ncsi_channel *nc = ndp->active_channel; + struct ncsi_cmd_arg nca; + unsigned char index; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_config: + case ncsi_dev_state_config_sp: + ndp->pending_req_num = 1; + + /* Select the specific package */ + nca.type = NCSI_PKT_CMD_SP; + if (ndp->flags & NCSI_DEV_HWA) + nca.bytes[0] = 0; + else + nca.bytes[0] = 1; + nca.package = np->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_config_cis; + break; + case ncsi_dev_state_config_cis: + ndp->pending_req_num = 1; + + /* Clear initial state */ + nca.type = NCSI_PKT_CMD_CIS; + nca.package = np->id; + nca.channel = nc->id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_config_sma; + break; + case ncsi_dev_state_config_sma: + case ncsi_dev_state_config_ebf: +#if IS_ENABLED(CONFIG_IPV6) + case ncsi_dev_state_config_egmf: +#endif + case ncsi_dev_state_config_ecnt: + case ncsi_dev_state_config_ec: + case ncsi_dev_state_config_ae: + case ncsi_dev_state_config_gls: + ndp->pending_req_num = 1; + + nca.package = np->id; + nca.channel = nc->id; + + /* Use first entry in unicast filter table. Note that + * the MAC filter table starts from entry 1 instead of + * 0. + */ + if (nd->state == ncsi_dev_state_config_sma) { + nca.type = NCSI_PKT_CMD_SMA; + for (index = 0; index < 6; index++) + nca.bytes[index] = dev->dev_addr[index]; + nca.bytes[6] = 0x1; + nca.bytes[7] = 0x1; + nd->state = ncsi_dev_state_config_ebf; + } else if (nd->state == ncsi_dev_state_config_ebf) { + nca.type = NCSI_PKT_CMD_EBF; + nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; + nd->state = ncsi_dev_state_config_ecnt; +#if IS_ENABLED(CONFIG_IPV6) + if (ndp->inet6_addr_num > 0 && + (nc->caps[NCSI_CAP_GENERIC].cap & + NCSI_CAP_GENERIC_MC)) + nd->state = ncsi_dev_state_config_egmf; + else + nd->state = ncsi_dev_state_config_ecnt; + } else if (nd->state == ncsi_dev_state_config_egmf) { + nca.type = NCSI_PKT_CMD_EGMF; + nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + nd->state = ncsi_dev_state_config_ecnt; +#endif /* CONFIG_IPV6 */ + } else if (nd->state == ncsi_dev_state_config_ecnt) { + nca.type = NCSI_PKT_CMD_ECNT; + nd->state = ncsi_dev_state_config_ec; + } else if (nd->state == ncsi_dev_state_config_ec) { + /* Enable AEN if it's supported */ + nca.type = NCSI_PKT_CMD_EC; + nd->state = ncsi_dev_state_config_ae; + if (!(nc->caps[NCSI_CAP_AEN].cap & NCSI_CAP_AEN_MASK)) + nd->state = ncsi_dev_state_config_gls; + } else if (nd->state == ncsi_dev_state_config_ae) { + nca.type = NCSI_PKT_CMD_AE; + nca.bytes[0] = 0; + nca.dwords[1] = nc->caps[NCSI_CAP_AEN].cap; + nd->state = ncsi_dev_state_config_gls; + } else if (nd->state == ncsi_dev_state_config_gls) { + nca.type = NCSI_PKT_CMD_GLS; + nd->state = ncsi_dev_state_config_done; + } + + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + break; + case ncsi_dev_state_config_done: + if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) + xchg(&nc->state, NCSI_CHANNEL_ACTIVE); + else + xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + + ncsi_start_channel_monitor(nc); + ncsi_process_next_channel(ndp); + break; + default: + netdev_warn(dev, "Wrong NCSI state 0x%x in config\n", + nd->state); + } + + return; + +error: + ncsi_report_link(ndp, true); +} + +static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc, *found; + struct ncsi_channel_mode *ncm; + unsigned long flags; + + /* The search is done once an inactive channel with up + * link is found. + */ + found = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + if (!list_empty(&nc->link) || + nc->state != NCSI_CHANNEL_INACTIVE) + continue; + + if (!found) + found = nc; + + ncm = &nc->modes[NCSI_MODE_LINK]; + if (ncm->data[2] & 0x1) { + found = nc; + goto out; + } + } + } + + if (!found) { + ncsi_report_link(ndp, true); + return -ENODEV; + } + +out: + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&found->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + return ncsi_process_next_channel(ndp); +} + +static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned int cap; + + /* The hardware arbitration is disabled if any one channel + * doesn't support explicitly. + */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + cap = nc->caps[NCSI_CAP_GENERIC].cap; + if (!(cap & NCSI_CAP_GENERIC_HWA) || + (cap & NCSI_CAP_GENERIC_HWA_MASK) != + NCSI_CAP_GENERIC_HWA_SUPPORT) { + ndp->flags &= ~NCSI_DEV_HWA; + return false; + } + } + } + + ndp->flags |= NCSI_DEV_HWA; + return true; +} + +static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp) +{ + struct ncsi_package *np; + struct ncsi_channel *nc; + unsigned long flags; + + /* Move all available channels to processing queue */ + spin_lock_irqsave(&ndp->lock, flags); + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE || + !list_empty(&nc->link)); + ncsi_stop_channel_monitor(nc); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + } + } + spin_unlock_irqrestore(&ndp->lock, flags); + + /* We can have no channels in extremely case */ + if (list_empty(&ndp->channel_queue)) { + ncsi_report_link(ndp, false); + return -ENOENT; + } + + return ncsi_process_next_channel(ndp); +} + +static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_package *np; + struct ncsi_channel *nc; + struct ncsi_cmd_arg nca; + unsigned char index; + int ret; + + nca.ndp = ndp; + nca.driven = true; + switch (nd->state) { + case ncsi_dev_state_probe: + nd->state = ncsi_dev_state_probe_deselect; + /* Fall through */ + case ncsi_dev_state_probe_deselect: + ndp->pending_req_num = 8; + + /* Deselect all possible packages */ + nca.type = NCSI_PKT_CMD_DP; + nca.channel = 0x1f; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_package; + break; + case ncsi_dev_state_probe_package: + ndp->pending_req_num = 16; + + /* Select all possible packages */ + nca.type = NCSI_PKT_CMD_SP; + nca.bytes[0] = 1; + nca.channel = 0x1f; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + /* Disable all possible packages */ + nca.type = NCSI_PKT_CMD_DP; + for (index = 0; index < 8; index++) { + nca.package = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_channel; + break; + case ncsi_dev_state_probe_channel: + if (!ndp->active_package) + ndp->active_package = list_first_or_null_rcu( + &ndp->packages, struct ncsi_package, node); + else if (list_is_last(&ndp->active_package->node, + &ndp->packages)) + ndp->active_package = NULL; + else + ndp->active_package = list_next_entry( + ndp->active_package, node); + + /* All available packages and channels are enumerated. The + * enumeration happens for once when the NCSI interface is + * started. So we need continue to start the interface after + * the enumeration. + * + * We have to choose an active channel before configuring it. + * Note that we possibly don't have active channel in extreme + * situation. + */ + if (!ndp->active_package) { + ndp->flags |= NCSI_DEV_PROBED; + if (ncsi_check_hwa(ndp)) + ncsi_enable_hwa(ndp); + else + ncsi_choose_active_channel(ndp); + return; + } + + /* Select the active package */ + ndp->pending_req_num = 1; + nca.type = NCSI_PKT_CMD_SP; + nca.bytes[0] = 1; + nca.package = ndp->active_package->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + nd->state = ncsi_dev_state_probe_cis; + break; + case ncsi_dev_state_probe_cis: + ndp->pending_req_num = 32; + + /* Clear initial state */ + nca.type = NCSI_PKT_CMD_CIS; + nca.package = ndp->active_package->id; + for (index = 0; index < 0x20; index++) { + nca.channel = index; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + nd->state = ncsi_dev_state_probe_gvi; + break; + case ncsi_dev_state_probe_gvi: + case ncsi_dev_state_probe_gc: + case ncsi_dev_state_probe_gls: + np = ndp->active_package; + ndp->pending_req_num = np->channel_num; + + /* Retrieve version, capability or link status */ + if (nd->state == ncsi_dev_state_probe_gvi) + nca.type = NCSI_PKT_CMD_GVI; + else if (nd->state == ncsi_dev_state_probe_gc) + nca.type = NCSI_PKT_CMD_GC; + else + nca.type = NCSI_PKT_CMD_GLS; + + nca.package = np->id; + NCSI_FOR_EACH_CHANNEL(np, nc) { + nca.channel = nc->id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + } + + if (nd->state == ncsi_dev_state_probe_gvi) + nd->state = ncsi_dev_state_probe_gc; + else if (nd->state == ncsi_dev_state_probe_gc) + nd->state = ncsi_dev_state_probe_gls; + else + nd->state = ncsi_dev_state_probe_dp; + break; + case ncsi_dev_state_probe_dp: + ndp->pending_req_num = 1; + + /* Deselect the active package */ + nca.type = NCSI_PKT_CMD_DP; + nca.package = ndp->active_package->id; + nca.channel = 0x1f; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + /* Scan channels in next package */ + nd->state = ncsi_dev_state_probe_channel; + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n", + nd->state); + } + + return; +error: + ncsi_report_link(ndp, true); +} + +static void ncsi_dev_work(struct work_struct *work) +{ + struct ncsi_dev_priv *ndp = container_of(work, + struct ncsi_dev_priv, work); + struct ncsi_dev *nd = &ndp->ndev; + + switch (nd->state & ncsi_dev_state_major) { + case ncsi_dev_state_probe: + ncsi_probe_channel(ndp); + break; + case ncsi_dev_state_suspend: + ncsi_suspend_channel(ndp); + break; + case ncsi_dev_state_config: + ncsi_configure_channel(ndp); + break; + default: + netdev_warn(nd->dev, "Wrong NCSI state 0x%x in workqueue\n", + nd->state); + } +} + +int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) +{ + struct ncsi_channel *nc; + int old_state; + unsigned long flags; + + spin_lock_irqsave(&ndp->lock, flags); + nc = list_first_or_null_rcu(&ndp->channel_queue, + struct ncsi_channel, link); + if (nc) { + old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE); + list_del_init(&nc->link); + } + spin_unlock_irqrestore(&ndp->lock, flags); + + ndp->active_channel = nc; + ndp->active_package = nc ? nc->package : NULL; + if (!nc) { + if (ndp->flags & NCSI_DEV_RESHUFFLE) { + ndp->flags &= ~NCSI_DEV_RESHUFFLE; + return ncsi_choose_active_channel(ndp); + } + + ncsi_report_link(ndp, false); + return -ENODEV; + } + + switch (old_state) { + case NCSI_CHANNEL_INACTIVE: + ndp->ndev.state = ncsi_dev_state_config; + ncsi_configure_channel(ndp); + break; + case NCSI_CHANNEL_ACTIVE: + ndp->ndev.state = ncsi_dev_state_suspend; + ncsi_suspend_channel(ndp); + break; + default: + netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n", + nc->state, nc->package->id, nc->id); + ncsi_report_link(ndp, false); + return -EINVAL; + } + + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int ncsi_inet6addr_event(struct notifier_block *this, + unsigned long event, void *data) +{ + struct inet6_ifaddr *ifa = data; + struct net_device *dev = ifa->idev->dev; + struct ncsi_dev *nd = ncsi_find_dev(dev); + struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; + struct ncsi_package *np; + struct ncsi_channel *nc; + struct ncsi_cmd_arg nca; + bool action; + int ret; + + if (!ndp || (ipv6_addr_type(&ifa->addr) & + (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK))) + return NOTIFY_OK; + + switch (event) { + case NETDEV_UP: + action = (++ndp->inet6_addr_num) == 1; + nca.type = NCSI_PKT_CMD_EGMF; + break; + case NETDEV_DOWN: + action = (--ndp->inet6_addr_num == 0); + nca.type = NCSI_PKT_CMD_DGMF; + break; + default: + return NOTIFY_OK; + } + + /* We might not have active channel or packages. The IPv6 + * required multicast will be enabled when active channel + * or packages are chosen. + */ + np = ndp->active_package; + nc = ndp->active_channel; + if (!action || !np || !nc) + return NOTIFY_OK; + + /* We needn't enable or disable it if the function isn't supported */ + if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC)) + return NOTIFY_OK; + + nca.ndp = ndp; + nca.driven = false; + nca.package = np->id; + nca.channel = nc->id; + nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; + ret = ncsi_xmit_cmd(&nca); + if (ret) { + netdev_warn(dev, "Fail to %s global multicast filter (%d)\n", + (event == NETDEV_UP) ? "enable" : "disable", ret); + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static struct notifier_block ncsi_inet6addr_notifier = { + .notifier_call = ncsi_inet6addr_event, +}; +#endif /* CONFIG_IPV6 */ + struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*handler)(struct ncsi_dev *ndev)) { @@ -397,6 +1105,9 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, nd->state = ncsi_dev_state_registered; nd->dev = dev; nd->handler = handler; + ndp->pending_req_num = 0; + INIT_LIST_HEAD(&ndp->channel_queue); + INIT_WORK(&ndp->work, ncsi_dev_work); /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); @@ -411,24 +1122,76 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, } spin_lock_irqsave(&ncsi_dev_lock, flags); +#if IS_ENABLED(CONFIG_IPV6) + ndp->inet6_addr_num = 0; + if (list_empty(&ncsi_dev_list)) + register_inet6addr_notifier(&ncsi_inet6addr_notifier); +#endif list_add_tail_rcu(&ndp->node, &ncsi_dev_list); spin_unlock_irqrestore(&ncsi_dev_lock, flags); + /* Register NCSI packet Rx handler */ + ndp->ptype.type = cpu_to_be16(ETH_P_NCSI); + ndp->ptype.func = ncsi_rcv_rsp; + ndp->ptype.dev = dev; + dev_add_pack(&ndp->ptype); + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); +int ncsi_start_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np; + struct ncsi_channel *nc; + int old_state, ret; + + if (nd->state != ncsi_dev_state_registered && + nd->state != ncsi_dev_state_functional) + return -ENOTTY; + + if (!(ndp->flags & NCSI_DEV_PROBED)) { + nd->state = ncsi_dev_state_probe; + schedule_work(&ndp->work); + return 0; + } + + /* Reset channel's state and start over */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + WARN_ON_ONCE(!list_empty(&nc->link) || + old_state == NCSI_CHANNEL_INVISIBLE); + } + } + + if (ndp->flags & NCSI_DEV_HWA) + ret = ncsi_enable_hwa(ndp); + else + ret = ncsi_choose_active_channel(ndp); + + return ret; +} +EXPORT_SYMBOL_GPL(ncsi_start_dev); + void ncsi_unregister_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); struct ncsi_package *np, *tmp; unsigned long flags; + dev_remove_pack(&ndp->ptype); + list_for_each_entry_safe(np, tmp, &ndp->packages, node) ncsi_remove_package(np); spin_lock_irqsave(&ncsi_dev_lock, flags); list_del_rcu(&ndp->node); +#if IS_ENABLED(CONFIG_IPV6) + if (list_empty(&ncsi_dev_list)) + unregister_inet6addr_notifier(&ncsi_inet6addr_notifier); +#endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); kfree(ndp); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 6ec25cb2608c..a21af88330aa 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -69,6 +69,9 @@ static int ncsi_rsp_handler_cis(struct ncsi_request *nr) rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, &nc); if (!nc) { + if (ndp->flags & NCSI_DEV_PROBED) + return -ENXIO; + id = NCSI_CHANNEL_INDEX(rsp->rsp.common.channel); nc = ncsi_add_channel(np, id); } @@ -90,6 +93,9 @@ static int ncsi_rsp_handler_sp(struct ncsi_request *nr) ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, &np, NULL); if (!np) { + if (ndp->flags & NCSI_DEV_PROBED) + return -ENXIO; + id = NCSI_PACKAGE_INDEX(rsp->rsp.common.channel); np = ncsi_add_package(ndp, id); if (!np) @@ -297,6 +303,7 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr) struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_channel *nc; struct ncsi_channel_mode *ncm; + unsigned long flags; /* Find the package and channel */ rsp = (struct ncsi_rsp_gls_pkt *)skb_network_header(nr->rsp); @@ -310,6 +317,14 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr) ncm->data[3] = ntohl(rsp->other); ncm->data[4] = ntohl(rsp->oem_status); + if (nr->driven) + return 0; + + /* Reset the channel monitor if it has been enabled */ + spin_lock_irqsave(&nc->lock, flags); + nc->timeout = 0; + spin_unlock_irqrestore(&nc->lock, flags); + return 0; } -- cgit v1.2.3 From 82de0be6862cdca2e6802267bda57cfc8844d3a7 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 18 Jul 2016 11:39:23 +0800 Subject: netfilter: Add helper array register/unregister functions Add nf_ct_helper_init(), nf_conntrack_helpers_register() and nf_conntrack_helpers_unregister() functions to avoid repetitive opencoded initialization in helpers. This patch keeps an id parameter for nf_ct_helper_init() not to break helper matching by name that has been inconsistently exposed to userspace through ports, eg. ftp-2121, and through an incremental id, eg. tftp-1. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 15 ++++++ net/netfilter/nf_conntrack_ftp.c | 58 +++++++--------------- net/netfilter/nf_conntrack_helper.c | 57 ++++++++++++++++++++++ net/netfilter/nf_conntrack_irc.c | 36 +++++--------- net/netfilter/nf_conntrack_sane.c | 57 ++++++++-------------- net/netfilter/nf_conntrack_sip.c | 75 +++++++++++------------------ net/netfilter/nf_conntrack_tftp.c | 48 ++++++------------ 7 files changed, 165 insertions(+), 181 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 6cf614bc0029..1eaac1f4cd6a 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -58,10 +58,25 @@ struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name, struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); +void nf_ct_helper_init(struct nf_conntrack_helper *helper, + u16 l3num, u16 protonum, const char *name, + u16 default_port, u16 spec_port, u32 id, + const struct nf_conntrack_expect_policy *exp_pol, + u32 expect_class_max, u32 data_len, + int (*help)(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo), + int (*from_nlattr)(struct nlattr *attr, + struct nf_conn *ct), + struct module *module); int nf_conntrack_helper_register(struct nf_conntrack_helper *); void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); +int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int); +void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *, + unsigned int); + struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, struct nf_conntrack_helper *helper, gfp_t gfp); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 19efeba02abb..43147005bea3 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -572,7 +572,7 @@ static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) return 0; } -static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper ftp[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy ftp_exp_policy = { .max_expected = 1, @@ -582,24 +582,13 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = { /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_ftp_fini(void) { - int i, j; - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) { - if (ftp[i][j].me == NULL) - continue; - - pr_debug("unregistering helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - nf_conntrack_helper_unregister(&ftp[i][j]); - } - } - + nf_conntrack_helpers_unregister(ftp, ports_c * 2); kfree(ftp_buffer); } static int __init nf_conntrack_ftp_init(void) { - int i, j = -1, ret = 0; + int i, ret = 0; ftp_buffer = kmalloc(65536, GFP_KERNEL); if (!ftp_buffer) @@ -611,32 +600,21 @@ static int __init nf_conntrack_ftp_init(void) /* FIXME should be configurable whether IPv4 and IPv6 FTP connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - ftp[i][0].tuple.src.l3num = PF_INET; - ftp[i][1].tuple.src.l3num = PF_INET6; - for (j = 0; j < 2; j++) { - ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master); - ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); - ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; - ftp[i][j].expect_policy = &ftp_exp_policy; - ftp[i][j].me = THIS_MODULE; - ftp[i][j].help = help; - ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr; - if (ports[i] == FTP_PORT) - sprintf(ftp[i][j].name, "ftp"); - else - sprintf(ftp[i][j].name, "ftp-%d", ports[i]); - - pr_debug("registering helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - ret = nf_conntrack_helper_register(&ftp[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_ftp_fini(); - return ret; - } - } + nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp", + FTP_PORT, ports[i], ports[i], &ftp_exp_policy, + 0, sizeof(struct nf_ct_ftp_master), help, + nf_ct_ftp_from_nlattr, THIS_MODULE); + nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp", + FTP_PORT, ports[i], ports[i], &ftp_exp_policy, + 0, sizeof(struct nf_ct_ftp_master), help, + nf_ct_ftp_from_nlattr, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(ftp, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + kfree(ftp_buffer); + return ret; } return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a4294e949cdc..b989b81ac156 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -465,6 +465,63 @@ restart: } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); +void nf_ct_helper_init(struct nf_conntrack_helper *helper, + u16 l3num, u16 protonum, const char *name, + u16 default_port, u16 spec_port, u32 id, + const struct nf_conntrack_expect_policy *exp_pol, + u32 expect_class_max, u32 data_len, + int (*help)(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo), + int (*from_nlattr)(struct nlattr *attr, + struct nf_conn *ct), + struct module *module) +{ + helper->tuple.src.l3num = l3num; + helper->tuple.dst.protonum = protonum; + helper->tuple.src.u.all = htons(spec_port); + helper->expect_policy = exp_pol; + helper->expect_class_max = expect_class_max; + helper->data_len = data_len; + helper->help = help; + helper->from_nlattr = from_nlattr; + helper->me = module; + + if (spec_port == default_port) + snprintf(helper->name, sizeof(helper->name), "%s", name); + else + snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_init); + +int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper, + unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_conntrack_helper_register(&helper[i]); + if (err < 0) + goto err; + } + + return err; +err: + if (i > 0) + nf_conntrack_helpers_unregister(helper, i); + return err; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register); + +void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper, + unsigned int n) +{ + while (n-- > 0) + nf_conntrack_helper_unregister(&helper[n]); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); + static struct nf_ct_ext_type helper_extend __read_mostly = { .len = sizeof(struct nf_conn_help), .align = __alignof__(struct nf_conn_help), diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index f97ac61d2536..1972a149f958 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -255,27 +255,18 @@ static int __init nf_conntrack_irc_init(void) ports[ports_c++] = IRC_PORT; for (i = 0; i < ports_c; i++) { - irc[i].tuple.src.l3num = AF_INET; - irc[i].tuple.src.u.tcp.port = htons(ports[i]); - irc[i].tuple.dst.protonum = IPPROTO_TCP; - irc[i].expect_policy = &irc_exp_policy; - irc[i].me = THIS_MODULE; - irc[i].help = help; - - if (ports[i] == IRC_PORT) - sprintf(irc[i].name, "irc"); - else - sprintf(irc[i].name, "irc-%u", i); - - ret = nf_conntrack_helper_register(&irc[i]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - irc[i].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_irc_fini(); - return ret; - } + nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc", + IRC_PORT, ports[i], i, &irc_exp_policy, + 0, 0, help, NULL, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(&irc[0], ports_c); + if (ret) { + pr_err("failed to register helpers\n"); + kfree(irc_buffer); + return ret; } + return 0; } @@ -283,10 +274,7 @@ static int __init nf_conntrack_irc_init(void) * it is needed by the init function */ static void nf_conntrack_irc_fini(void) { - int i; - - for (i = 0; i < ports_c; i++) - nf_conntrack_helper_unregister(&irc[i]); + nf_conntrack_helpers_unregister(irc, ports_c); kfree(irc_buffer); } diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 3fcbaab83b3d..9dcb9ee9b97d 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -166,7 +166,7 @@ out: return ret; } -static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper sane[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy sane_exp_policy = { .max_expected = 1, @@ -176,22 +176,13 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = { /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_sane_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) { - pr_debug("unregistering helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - nf_conntrack_helper_unregister(&sane[i][j]); - } - } - + nf_conntrack_helpers_unregister(sane, ports_c * 2); kfree(sane_buffer); } static int __init nf_conntrack_sane_init(void) { - int i, j = -1, ret = 0; + int i, ret = 0; sane_buffer = kmalloc(65536, GFP_KERNEL); if (!sane_buffer) @@ -203,31 +194,23 @@ static int __init nf_conntrack_sane_init(void) /* FIXME should be configurable whether IPv4 and IPv6 connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - sane[i][0].tuple.src.l3num = PF_INET; - sane[i][1].tuple.src.l3num = PF_INET6; - for (j = 0; j < 2; j++) { - sane[i][j].data_len = sizeof(struct nf_ct_sane_master); - sane[i][j].tuple.src.u.tcp.port = htons(ports[i]); - sane[i][j].tuple.dst.protonum = IPPROTO_TCP; - sane[i][j].expect_policy = &sane_exp_policy; - sane[i][j].me = THIS_MODULE; - sane[i][j].help = help; - if (ports[i] == SANE_PORT) - sprintf(sane[i][j].name, "sane"); - else - sprintf(sane[i][j].name, "sane-%d", ports[i]); - - pr_debug("registering helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - ret = nf_conntrack_helper_register(&sane[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_sane_fini(); - return ret; - } - } + nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane", + SANE_PORT, ports[i], ports[i], + &sane_exp_policy, 0, + sizeof(struct nf_ct_sane_master), help, NULL, + THIS_MODULE); + nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane", + SANE_PORT, ports[i], ports[i], + &sane_exp_policy, 0, + sizeof(struct nf_ct_sane_master), help, NULL, + THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(sane, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + kfree(sane_buffer); + return ret; } return 0; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index f72ba5587588..8d9db9d4702b 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1589,7 +1589,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen); } -static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; +static struct nf_conntrack_helper sip[MAX_PORTS * 4] __read_mostly; static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { [SIP_EXPECT_SIGNALLING] = { @@ -1616,20 +1616,12 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1 static void nf_conntrack_sip_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { - if (sip[i][j].me == NULL) - continue; - nf_conntrack_helper_unregister(&sip[i][j]); - } - } + nf_conntrack_helpers_unregister(sip, ports_c * 4); } static int __init nf_conntrack_sip_init(void) { - int i, j, ret; + int i, ret; if (ports_c == 0) ports[ports_c++] = SIP_PORT; @@ -1637,43 +1629,32 @@ static int __init nf_conntrack_sip_init(void) for (i = 0; i < ports_c; i++) { memset(&sip[i], 0, sizeof(sip[i])); - sip[i][0].tuple.src.l3num = AF_INET; - sip[i][0].tuple.dst.protonum = IPPROTO_UDP; - sip[i][0].help = sip_help_udp; - sip[i][1].tuple.src.l3num = AF_INET; - sip[i][1].tuple.dst.protonum = IPPROTO_TCP; - sip[i][1].help = sip_help_tcp; - - sip[i][2].tuple.src.l3num = AF_INET6; - sip[i][2].tuple.dst.protonum = IPPROTO_UDP; - sip[i][2].help = sip_help_udp; - sip[i][3].tuple.src.l3num = AF_INET6; - sip[i][3].tuple.dst.protonum = IPPROTO_TCP; - sip[i][3].help = sip_help_tcp; - - for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { - sip[i][j].data_len = sizeof(struct nf_ct_sip_master); - sip[i][j].tuple.src.u.udp.port = htons(ports[i]); - sip[i][j].expect_policy = sip_exp_policy; - sip[i][j].expect_class_max = SIP_EXPECT_MAX; - sip[i][j].me = THIS_MODULE; - - if (ports[i] == SIP_PORT) - sprintf(sip[i][j].name, "sip"); - else - sprintf(sip[i][j].name, "sip-%u", i); - - pr_debug("port #%u: %u\n", i, ports[i]); + nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_udp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_tcp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_udp, + NULL, THIS_MODULE); + nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip", + SIP_PORT, ports[i], i, sip_exp_policy, + SIP_EXPECT_MAX, + sizeof(struct nf_ct_sip_master), sip_help_tcp, + NULL, THIS_MODULE); + } - ret = nf_conntrack_helper_register(&sip[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - sip[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_sip_fini(); - return ret; - } - } + ret = nf_conntrack_helpers_register(sip, ports_c * 4); + if (ret < 0) { + pr_err("failed to register helpers\n"); + return ret; } return 0; } diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 2e65b5430fba..b1227dc6f75e 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -97,7 +97,7 @@ static int tftp_help(struct sk_buff *skb, return ret; } -static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; +static struct nf_conntrack_helper tftp[MAX_PORTS * 2] __read_mostly; static const struct nf_conntrack_expect_policy tftp_exp_policy = { .max_expected = 1, @@ -106,47 +106,29 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = { static void nf_conntrack_tftp_fini(void) { - int i, j; - - for (i = 0; i < ports_c; i++) { - for (j = 0; j < 2; j++) - nf_conntrack_helper_unregister(&tftp[i][j]); - } + nf_conntrack_helpers_unregister(tftp, ports_c * 2); } static int __init nf_conntrack_tftp_init(void) { - int i, j, ret; + int i, ret; if (ports_c == 0) ports[ports_c++] = TFTP_PORT; for (i = 0; i < ports_c; i++) { - memset(&tftp[i], 0, sizeof(tftp[i])); - - tftp[i][0].tuple.src.l3num = AF_INET; - tftp[i][1].tuple.src.l3num = AF_INET6; - for (j = 0; j < 2; j++) { - tftp[i][j].tuple.dst.protonum = IPPROTO_UDP; - tftp[i][j].tuple.src.u.udp.port = htons(ports[i]); - tftp[i][j].expect_policy = &tftp_exp_policy; - tftp[i][j].me = THIS_MODULE; - tftp[i][j].help = tftp_help; - - if (ports[i] == TFTP_PORT) - sprintf(tftp[i][j].name, "tftp"); - else - sprintf(tftp[i][j].name, "tftp-%u", i); - - ret = nf_conntrack_helper_register(&tftp[i][j]); - if (ret) { - pr_err("failed to register helper for pf: %u port: %u\n", - tftp[i][j].tuple.src.l3num, ports[i]); - ports_c = i; - nf_conntrack_tftp_fini(); - return ret; - } - } + nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp", + TFTP_PORT, ports[i], i, &tftp_exp_policy, + 0, 0, tftp_help, NULL, THIS_MODULE); + nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp", + TFTP_PORT, ports[i], i, &tftp_exp_policy, + 0, 0, tftp_help, NULL, THIS_MODULE); + } + + ret = nf_conntrack_helpers_register(tftp, ports_c * 2); + if (ret < 0) { + pr_err("failed to register helpers\n"); + return ret; } return 0; } -- cgit v1.2.3 From 5f652bb2eb3eb38f97194ce5c41da1fa12e914b8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Jul 2016 18:11:31 +0200 Subject: gro_cells: gro_cells_receive now return error code so that the caller can update stats accordingly, if needed Signed-off-by: Paolo Abeni Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/gro_cells.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h index cf6c74550baa..d15214d673b2 100644 --- a/include/net/gro_cells.h +++ b/include/net/gro_cells.h @@ -14,27 +14,26 @@ struct gro_cells { struct gro_cell __percpu *cells; }; -static inline void gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) +static inline int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct gro_cell *cell; struct net_device *dev = skb->dev; - if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) { - netif_rx(skb); - return; - } + if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) + return netif_rx(skb); cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); - return; + return NET_RX_DROP; } __skb_queue_tail(&cell->napi_skbs, skb); if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); + return NET_RX_SUCCESS; } /* called under BH context */ -- cgit v1.2.3 From 23014011ba4209a086931ff402eac1c41abbe456 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Jul 2016 12:51:16 +0200 Subject: netfilter: conntrack: support a fixed size of 128 distinct labels The conntrack label extension is currently variable-sized, e.g. if only 2 labels are used by iptables rules then the labels->bits[] array will only contain one element. We track size of each label storage area in the 'words' member. But in nftables and openvswitch we always have to ask for worst-case since we don't know what bit will be used at configuration time. As most arches are 64bit we need to allocate 24 bytes in this case: struct nf_conn_labels { u8 words; /* 0 1 */ /* XXX 7 bytes hole, try to pack */ long unsigned bits[2]; /* 8 24 */ Make bits a fixed size and drop the words member, it simplifies the code and only increases memory requirements on x86 when less than 64bit labels are required. We still only allocate the extension if its needed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_labels.h | 16 ++++------------ net/netfilter/nf_conntrack_labels.c | 13 +++---------- net/netfilter/nf_conntrack_netlink.c | 10 +++++----- net/netfilter/nft_ct.c | 13 +++---------- net/netfilter/xt_connlabel.c | 2 +- net/openvswitch/conntrack.c | 4 ++-- 6 files changed, 18 insertions(+), 40 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index c5f8fc736b3d..0fd4989de836 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -10,8 +10,7 @@ #define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE) struct nf_conn_labels { - u8 words; - unsigned long bits[]; + unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)]; }; static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct) @@ -26,20 +25,13 @@ static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct) static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_LABELS - struct nf_conn_labels *cl_ext; struct net *net = nf_ct_net(ct); - u8 words; - words = ACCESS_ONCE(net->ct.label_words); - if (words == 0) + if (net->ct.labels_used == 0) return NULL; - cl_ext = nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS, - words * sizeof(long), GFP_ATOMIC); - if (cl_ext != NULL) - cl_ext->words = words; - - return cl_ext; + return nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS, + sizeof(struct nf_conn_labels), GFP_ATOMIC); #else return NULL; #endif diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 252e6a7cd2f1..7686200f9ace 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -20,7 +20,7 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - if (!labels || BIT_WORD(bit) >= labels->words) + if (!labels) return -ENOSPC; if (test_bit(bit, labels->bits)) @@ -60,7 +60,7 @@ int nf_connlabels_replace(struct nf_conn *ct, if (!labels) return -ENOSPC; - size = labels->words * sizeof(long); + size = sizeof(labels->bits); if (size < (words32 * sizeof(u32))) words32 = size / sizeof(u32); @@ -80,16 +80,11 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { - size_t words; - - words = BIT_WORD(bits) + 1; - if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long)) + if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; spin_lock(&nf_connlabels_lock); net->ct.labels_used++; - if (words > net->ct.label_words) - net->ct.label_words = words; spin_unlock(&nf_connlabels_lock); return 0; @@ -100,8 +95,6 @@ void nf_connlabels_put(struct net *net) { spin_lock(&nf_connlabels_lock); net->ct.labels_used--; - if (net->ct.labels_used == 0) - net->ct.label_words = 0; spin_unlock(&nf_connlabels_lock); } EXPORT_SYMBOL_GPL(nf_connlabels_put); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index a18d1ceabad5..050bb3420a6b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -346,25 +346,25 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct) if (!labels) return 0; - return nla_total_size(labels->words * sizeof(long)); + return nla_total_size(sizeof(labels->bits)); } static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - unsigned int len, i; + unsigned int i; if (!labels) return 0; - len = labels->words * sizeof(long); i = 0; do { if (labels->bits[i] != 0) - return nla_put(skb, CTA_LABELS, len, labels->bits); + return nla_put(skb, CTA_LABELS, sizeof(labels->bits), + labels->bits); i++; - } while (i < labels->words); + } while (i < ARRAY_SIZE(labels->bits)); return 0; } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index d9e44ca34055..2f47d5d3ae3b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -113,18 +113,11 @@ static void nft_ct_get_eval(const struct nft_expr *expr, #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - unsigned int size; - if (!labels) { + if (labels) + memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE); + else memset(dest, 0, NF_CT_LABELS_MAX_SIZE); - return; - } - - size = labels->words * sizeof(long); - memcpy(dest, labels->bits, size); - if (size < NF_CT_LABELS_MAX_SIZE) - memset(((char *) dest) + size, 0, - NF_CT_LABELS_MAX_SIZE - size); return; } #endif diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index a79af255561a..c9fba8ade3d5 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -25,7 +25,7 @@ static bool connlabel_match(const struct nf_conn *ct, u16 bit) if (!labels) return false; - return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits); + return test_bit(bit, labels->bits); } static bool diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index b4069a90e375..c644c78ed485 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -135,7 +135,7 @@ static void ovs_ct_get_labels(const struct nf_conn *ct, struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; if (cl) { - size_t len = cl->words * sizeof(long); + size_t len = sizeof(cl->bits); if (len > OVS_CT_LABELS_LEN) len = OVS_CT_LABELS_LEN; @@ -274,7 +274,7 @@ static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } - if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) + if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN) return -ENOSPC; err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, -- cgit v1.2.3 From 857ed310c013fe0d0059f955048dab589fa7a57a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Jul 2016 12:51:17 +0200 Subject: netfilter: connlabels: move set helper to xt_connlabel xt_connlabel is the only user so move it. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_labels.h | 2 -- net/netfilter/nf_conntrack_labels.c | 17 ----------------- net/netfilter/xt_connlabel.c | 29 ++++++++++++++++------------- 3 files changed, 16 insertions(+), 32 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index 0fd4989de836..498814626e28 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -37,8 +37,6 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct) #endif } -int nf_connlabel_set(struct nf_conn *ct, u16 bit); - int nf_connlabels_replace(struct nf_conn *ct, const u32 *data, const u32 *mask, unsigned int words); diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 7686200f9ace..bcab8bde7312 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -16,23 +16,6 @@ static spinlock_t nf_connlabels_lock; -int nf_connlabel_set(struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels) - return -ENOSPC; - - if (test_bit(bit, labels->bits)) - return 0; - - if (!test_and_set_bit(bit, labels->bits)) - nf_conntrack_event_cache(IPCT_LABEL, ct); - - return 0; -} -EXPORT_SYMBOL_GPL(nf_connlabel_set); - static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index c9fba8ade3d5..03d66f1c5e69 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -18,21 +19,12 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels"); MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); -static bool connlabel_match(const struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels) - return false; - - return test_bit(bit, labels->bits); -} - static bool connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connlabel_mtinfo *info = par->matchinfo; enum ip_conntrack_info ctinfo; + struct nf_conn_labels *labels; struct nf_conn *ct; bool invert = info->options & XT_CONNLABEL_OP_INVERT; @@ -40,10 +32,21 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL || nf_ct_is_untracked(ct)) return invert; - if (info->options & XT_CONNLABEL_OP_SET) - return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; + labels = nf_ct_labels_find(ct); + if (!labels) + return invert; + + if (test_bit(info->bit, labels->bits)) + return !invert; + + if (info->options & XT_CONNLABEL_OP_SET) { + if (!test_and_set_bit(info->bit, labels->bits)) + nf_conntrack_event_cache(IPCT_LABEL, ct); + + return !invert; + } - return connlabel_match(ct, info->bit) ^ invert; + return invert; } static int connlabel_mt_check(const struct xt_mtchk_param *par) -- cgit v1.2.3 From b87f7936a93246804cf70e7e2e0568799c948bb1 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Thu, 21 Jul 2016 12:03:12 +0200 Subject: net/sched: Add match-all classifier hw offloading. Following the work that have been done on offloading classifiers like u32 and flower, now the match-all classifier hw offloading is possible. if the interface supports tc offloading. To control the offloading, two tc flags have been introduced: skip_sw and skip_hw. Typical usage: tc filter add dev eth25 parent ffff: \ matchall skip_sw \ action mirred egress mirror \ dev eth27 Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ include/net/pkt_cls.h | 11 +++++++ include/uapi/linux/pkt_cls.h | 1 + net/sched/cls_matchall.c | 76 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 87 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 43c749b1b619..076df5360ba5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -787,6 +787,7 @@ enum { TC_SETUP_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, + TC_SETUP_MATCHALL, }; struct tc_cls_u32_offload; @@ -797,6 +798,7 @@ struct tc_to_netdev { u8 tc; struct tc_cls_u32_offload *cls_u32; struct tc_cls_flower_offload *cls_flower; + struct tc_cls_matchall_offload *cls_mall; }; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 3722dda0199d..6f8d65342d3a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -442,4 +442,15 @@ struct tc_cls_flower_offload { struct tcf_exts *exts; }; +enum tc_matchall_command { + TC_CLSMATCHALL_REPLACE, + TC_CLSMATCHALL_DESTROY, +}; + +struct tc_cls_matchall_offload { + enum tc_matchall_command command; + struct tcf_exts *exts; + unsigned long cookie; +}; + #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index a32494887e01..d1c1ccaba787 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -439,6 +439,7 @@ enum { TCA_MATCHALL_UNSPEC, TCA_MATCHALL_CLASSID, TCA_MATCHALL_ACT, + TCA_MATCHALL_FLAGS, __TCA_MATCHALL_MAX, }; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 8a6b4de7a99a..25927b6c4436 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_filter { struct tcf_result res; u32 handle; struct rcu_head rcu; + u32 flags; }; struct cls_mall_head { @@ -34,6 +35,9 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_mall_head *head = rcu_dereference_bh(tp->root); struct cls_mall_filter *f = head->filter; + if (tc_skip_sw(f->flags)) + return -1; + return tcf_exts_exec(skb, &f->exts, res); } @@ -55,18 +59,61 @@ static void mall_destroy_filter(struct rcu_head *head) struct cls_mall_filter *f = container_of(head, struct cls_mall_filter, rcu); tcf_exts_destroy(&f->exts); + kfree(f); } +static int mall_replace_hw_filter(struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_to_netdev offload; + struct tc_cls_matchall_offload mall_offload = {0}; + + offload.type = TC_SETUP_MATCHALL; + offload.cls_mall = &mall_offload; + offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; + offload.cls_mall->exts = &f->exts; + offload.cls_mall->cookie = cookie; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &offload); +} + +static void mall_destroy_hw_filter(struct tcf_proto *tp, + struct cls_mall_filter *f, + unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_to_netdev offload; + struct tc_cls_matchall_offload mall_offload = {0}; + + offload.type = TC_SETUP_MATCHALL; + offload.cls_mall = &mall_offload; + offload.cls_mall->command = TC_CLSMATCHALL_DESTROY; + offload.cls_mall->exts = NULL; + offload.cls_mall->cookie = cookie; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &offload); +} + static bool mall_destroy(struct tcf_proto *tp, bool force) { struct cls_mall_head *head = rtnl_dereference(tp->root); + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_mall_filter *f = head->filter; - if (!force && head->filter) + if (!force && f) return false; - if (head->filter) - call_rcu(&head->filter->rcu, mall_destroy_filter); + if (f) { + if (tc_should_offload(dev, tp, f->flags)) + mall_destroy_hw_filter(tp, f, (unsigned long) f); + + call_rcu(&f->rcu, mall_destroy_filter); + } RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; @@ -117,8 +164,10 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, { struct cls_mall_head *head = rtnl_dereference(tp->root); struct cls_mall_filter *fold = (struct cls_mall_filter *) *arg; + struct net_device *dev = tp->q->dev_queue->dev; struct cls_mall_filter *f; struct nlattr *tb[TCA_MATCHALL_MAX + 1]; + u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -135,6 +184,12 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; + if (tb[TCA_MATCHALL_FLAGS]) { + flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); + if (!tc_flags_valid(flags)) + return -EINVAL; + } + f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) return -ENOBUFS; @@ -144,11 +199,22 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!handle) handle = 1; f->handle = handle; + f->flags = flags; err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); if (err) goto errout; + if (tc_should_offload(dev, tp, flags)) { + err = mall_replace_hw_filter(tp, f, (unsigned long) f); + if (err) { + if (tc_skip_sw(flags)) + goto errout; + else + err = 0; + } + } + *arg = (unsigned long) f; rcu_assign_pointer(head->filter, f); @@ -163,6 +229,10 @@ static int mall_delete(struct tcf_proto *tp, unsigned long arg) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct cls_mall_filter *f = (struct cls_mall_filter *) arg; + struct net_device *dev = tp->q->dev_queue->dev; + + if (tc_should_offload(dev, tp, f->flags)) + mall_destroy_hw_filter(tp, f, (unsigned long) f); RCU_INIT_POINTER(head->filter, NULL); tcf_unbind_filter(tp, &f->res); -- cgit v1.2.3 From 56a20680f70393d199fc5e8ecc7859549ca5a0c0 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Thu, 21 Jul 2016 12:03:16 +0200 Subject: net/sched: act_mirred: Add helper inlines to access tcf_mirred info. The helper function is_tcf_mirred_mirror helps finding whether an action struct is of type mirred and is configured to be of type mirror. Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index e891835eb74e..6a13a7c74e0c 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -24,6 +24,15 @@ static inline bool is_tcf_mirred_redirect(const struct tc_action *a) return false; } +static inline bool is_tcf_mirred_mirror(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->type == TCA_ACT_MIRRED) + return to_mirred(a)->tcfm_eaction == TCA_EGRESS_MIRROR; +#endif + return false; +} + static inline int tcf_mirred_ifindex(const struct tc_action *a) { return to_mirred(a)->tcfm_ifindex; -- cgit v1.2.3 From 9b8ac4f9dd60ac7375fb0e221dbf596db4c4e622 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 24 Jul 2016 19:24:09 +0100 Subject: gtp: #define #define _GTP_H_ and not #define _GTP_H Fix clang build warning: ./include/net/gtp.h:1:9: warning: '_GTP_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard] fix by defining _GTP_H_ and not _GTP_H Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- include/net/gtp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/gtp.h b/include/net/gtp.h index 894a37b87d63..6398891b99ba 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -1,5 +1,5 @@ #ifndef _GTP_H_ -#define _GTP_H +#define _GTP_H_ /* General GTP protocol related definitions. */ -- cgit v1.2.3 From 86cb13e4ec5060d94069a8418fd4f3ccb38edee2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 25 Jul 2016 13:12:33 +0300 Subject: mlxsw: spectrum: Fix compilation error when CLS_ACT isn't set When CONFIG_NET_CLS_ACT isn't set 'struct tcf_exts' has no member named 'actions' and we therefore must not access it. Otherwise compilation fails. Fix this by introducing a new macro similar to tc_no_actions(), which always returns 'false' if CONFIG_NET_CLS_ACT isn't set. Fixes: 763b4b70afcd ("mlxsw: spectrum: Add support in matchall mirror TC offloading") Reported-by: kbuild test robot Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 11 +++++------ include/net/act_api.h | 4 ++++ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 552636b73416..c3e61500819d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1148,23 +1148,22 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, struct tc_cls_matchall_offload *cls, bool ingress) { - struct tcf_exts *exts = cls->exts; const struct tc_action *a; int err; - if (!list_is_singular(&exts->actions)) { + if (!tc_single_action(cls->exts)) { netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n"); return -ENOTSUPP; } - a = list_first_entry(&exts->actions, struct tc_action, list); - if (is_tcf_mirred_mirror(a) && protocol == htons(ETH_P_ALL)) { + tc_for_each_action(a, cls->exts) { + if (!is_tcf_mirred_mirror(a) || protocol != htons(ETH_P_ALL)) + return -ENOTSUPP; + err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port, cls, a, ingress); if (err) return err; - } else { - return -ENOTSUPP; } return 0; diff --git a/include/net/act_api.h b/include/net/act_api.h index fb82b5b5d9e7..0bb210635e5f 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -192,6 +192,9 @@ int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); #define tc_for_each_action(_a, _exts) \ list_for_each_entry(a, &(_exts)->actions, list) +#define tc_single_action(_exts) \ + (list_is_singular(&(_exts)->actions)) + static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 lastuse) { @@ -205,6 +208,7 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, #define tc_no_actions(_exts) true #define tc_for_each_action(_a, _exts) while ((void)(_a), 0) +#define tc_single_action(_exts) false #define tcf_action_stats_update(a, bytes, packets, lastuse) #endif /* CONFIG_NET_CLS_ACT */ -- cgit v1.2.3 From a85a970af265f156740977168b542234511b28a8 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Jul 2016 16:09:41 -0700 Subject: net_sched: move tc_action into tcf_common struct tc_action is confusing, currently we use it for two purposes: 1) Pass in arguments and carry out results from helper functions 2) A generic representation for tc actions The first one is error-prone, since we need to make sure we don't miss anything. This patch aims to get rid of this use, by moving tc_action into tcf_common, so that they are allocated together in hashtable and can be cast'ed easily. And together with the following patch, we could really make tc_action a generic representation for all tc actions and each type of action can inherit from it. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 52 ++++++++------ include/net/tc_act/tc_bpf.h | 3 +- include/net/tc_act/tc_connmark.h | 3 +- include/net/tc_act/tc_csum.h | 3 +- include/net/tc_act/tc_defact.h | 3 +- include/net/tc_act/tc_gact.h | 5 +- include/net/tc_act/tc_ife.h | 3 +- include/net/tc_act/tc_ipt.h | 3 +- include/net/tc_act/tc_mirred.h | 3 +- include/net/tc_act/tc_nat.h | 5 +- include/net/tc_act/tc_pedit.h | 3 +- include/net/tc_act/tc_skbedit.h | 3 +- include/net/tc_act/tc_vlan.h | 3 +- net/sched/act_api.c | 149 ++++++++++++++------------------------- net/sched/act_bpf.c | 26 +++---- net/sched/act_connmark.c | 24 ++++--- net/sched/act_csum.c | 22 +++--- net/sched/act_gact.c | 24 ++++--- net/sched/act_ife.c | 38 +++++----- net/sched/act_ipt.c | 48 +++++++------ net/sched/act_mirred.c | 26 +++---- net/sched/act_nat.c | 22 +++--- net/sched/act_pedit.c | 28 ++++---- net/sched/act_police.c | 45 ++++++------ net/sched/act_simple.c | 29 ++++---- net/sched/act_skbedit.c | 26 +++---- net/sched/act_vlan.c | 28 ++++---- 27 files changed, 298 insertions(+), 329 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 0bb210635e5f..8b199095ea51 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -10,7 +10,26 @@ #include #include + +struct tcf_hashinfo { + struct hlist_head *htab; + unsigned int hmask; + spinlock_t lock; + u32 index; +}; + +struct tc_action_ops; + +struct tc_action { + const struct tc_action_ops *ops; + __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ + __u32 order; + struct list_head list; + struct tcf_hashinfo *hinfo; +}; + struct tcf_common { + struct tc_action tcfc_act; struct hlist_node tcfc_head; u32 tcfc_index; int tcfc_refcnt; @@ -26,6 +45,7 @@ struct tcf_common { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; }; +#define tcf_act common.tcfc_act #define tcf_head common.tcfc_head #define tcf_index common.tcfc_index #define tcf_refcnt common.tcfc_refcnt @@ -39,13 +59,6 @@ struct tcf_common { #define tcf_lock common.tcfc_lock #define tcf_rcu common.tcfc_rcu -struct tcf_hashinfo { - struct hlist_head *htab; - unsigned int hmask; - spinlock_t lock; - u32 index; -}; - static inline unsigned int tcf_hash(u32 index, unsigned int hmask) { return index & hmask; @@ -88,15 +101,6 @@ static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) dtm->expires = jiffies_to_clock_t(stm->expires); } -struct tc_action { - void *priv; - const struct tc_action_ops *ops; - __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ - __u32 order; - struct list_head list; - struct tcf_hashinfo *hinfo; -}; - #ifdef CONFIG_NET_CLS_ACT #define ACT_P_CREATED 1 @@ -106,17 +110,18 @@ struct tc_action_ops { struct list_head head; char kind[IFNAMSIZ]; __u32 type; /* TBD to match kind */ + size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *, int bind); - int (*lookup)(struct net *, struct tc_action *, u32); + int (*lookup)(struct net *, struct tc_action **, u32); int (*init)(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *act, int ovr, + struct nlattr *est, struct tc_action **act, int ovr, int bind); int (*walk)(struct net *, struct sk_buff *, - struct netlink_callback *, int, struct tc_action *); + struct netlink_callback *, int, const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); }; @@ -152,13 +157,14 @@ static inline void tc_action_net_exit(struct tc_action_net *tn) int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a); -int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index); + const struct tc_action_ops *ops); +int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index); u32 tcf_hash_new_index(struct tc_action_net *tn); -bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind); int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - struct tc_action *a, int size, int bind, bool cpustats); + struct tc_action **a, const struct tc_action_ops *ops, int bind, + bool cpustats); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a); diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h index 958d69cfb19c..80a4d6f49773 100644 --- a/include/net/tc_act/tc_bpf.h +++ b/include/net/tc_act/tc_bpf.h @@ -23,7 +23,6 @@ struct tcf_bpf { struct sock_filter *bpf_ops; const char *bpf_name; }; -#define to_bpf(a) \ - container_of(a->priv, struct tcf_bpf, common) +#define to_bpf(a) ((struct tcf_bpf *)a) #endif /* __NET_TC_BPF_H */ diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h index 02caa406611b..8a661135f4ac 100644 --- a/include/net/tc_act/tc_connmark.h +++ b/include/net/tc_act/tc_connmark.h @@ -9,7 +9,6 @@ struct tcf_connmark_info { u16 zone; }; -#define to_connmark(a) \ - container_of(a->priv, struct tcf_connmark_info, common) +#define to_connmark(a) ((struct tcf_connmark_info *)a) #endif /* __NET_TC_CONNMARK_H */ diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index fa8f5fac65e9..1a9ef15d573b 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -9,7 +9,6 @@ struct tcf_csum { u32 update_flags; }; -#define to_tcf_csum(a) \ - container_of(a->priv,struct tcf_csum,common) +#define to_tcf_csum(a) ((struct tcf_csum *)a) #endif /* __NET_TC_CSUM_H */ diff --git a/include/net/tc_act/tc_defact.h b/include/net/tc_act/tc_defact.h index ab9b5d6be67b..e25b4eb4fc66 100644 --- a/include/net/tc_act/tc_defact.h +++ b/include/net/tc_act/tc_defact.h @@ -8,7 +8,6 @@ struct tcf_defact { u32 tcfd_datalen; void *tcfd_defdata; }; -#define to_defact(a) \ - container_of(a->priv, struct tcf_defact, common) +#define to_defact(a) ((struct tcf_defact *)a) #endif /* __NET_TC_DEF_H */ diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 93c520b83d10..119cdb418c23 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -13,8 +13,7 @@ struct tcf_gact { atomic_t packets; #endif }; -#define to_gact(a) \ - container_of(a->priv, struct tcf_gact, common) +#define to_gact(a) ((struct tcf_gact *)a) static inline bool is_tcf_gact_shot(const struct tc_action *a) { @@ -24,7 +23,7 @@ static inline bool is_tcf_gact_shot(const struct tc_action *a) if (a->ops && a->ops->type != TCA_ACT_GACT) return false; - gact = a->priv; + gact = to_gact(a); if (gact->tcf_action == TC_ACT_SHOT) return true; diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index c55facd17b7e..7921abe42adc 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -16,8 +16,7 @@ struct tcf_ife_info { /* list of metaids allowed */ struct list_head metalist; }; -#define to_ife(a) \ - container_of(a->priv, struct tcf_ife_info, common) +#define to_ife(a) ((struct tcf_ife_info *)a) struct tcf_meta_info { const struct tcf_meta_ops *ops; diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h index c0f4193f432c..c22ae7ab66ed 100644 --- a/include/net/tc_act/tc_ipt.h +++ b/include/net/tc_act/tc_ipt.h @@ -11,7 +11,6 @@ struct tcf_ipt { char *tcfi_tname; struct xt_entry_target *tcfi_t; }; -#define to_ipt(a) \ - container_of(a->priv, struct tcf_ipt, common) +#define to_ipt(a) ((struct tcf_ipt *)a) #endif /* __NET_TC_IPT_H */ diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 6a13a7c74e0c..89aebd22cd79 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -12,8 +12,7 @@ struct tcf_mirred { struct net_device __rcu *tcfm_dev; struct list_head tcfm_list; }; -#define to_mirred(a) \ - container_of(a->priv, struct tcf_mirred, common) +#define to_mirred(a) ((struct tcf_mirred *)a) static inline bool is_tcf_mirred_redirect(const struct tc_action *a) { diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h index 63d8e9ca9d99..a91ad3ad565e 100644 --- a/include/net/tc_act/tc_nat.h +++ b/include/net/tc_act/tc_nat.h @@ -13,9 +13,6 @@ struct tcf_nat { u32 flags; }; -static inline struct tcf_nat *to_tcf_nat(struct tc_action *a) -{ - return container_of(a->priv, struct tcf_nat, common); -} +#define to_tcf_nat(a) ((struct tcf_nat *)a) #endif /* __NET_TC_NAT_H */ diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 5b80998879c7..2cccfbaae800 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -9,7 +9,6 @@ struct tcf_pedit { unsigned char tcfp_flags; struct tc_pedit_key *tcfp_keys; }; -#define to_pedit(a) \ - container_of(a->priv, struct tcf_pedit, common) +#define to_pedit(a) ((struct tcf_pedit *)a) #endif /* __NET_TC_PED_H */ diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index d01a5d40cfb5..9e0548998327 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -30,8 +30,7 @@ struct tcf_skbedit { u16 queue_mapping; u16 ptype; }; -#define to_skbedit(a) \ - container_of(a->priv, struct tcf_skbedit, common) +#define to_skbedit(a) ((struct tcf_skbedit *)a) /* Return true iff action is mark */ static inline bool is_tcf_skbedit_mark(const struct tc_action *a) diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 93b70ade1ff3..584b80788d52 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -21,7 +21,6 @@ struct tcf_vlan { u16 tcfv_push_vid; __be16 tcfv_push_proto; }; -#define to_vlan(a) \ - container_of(a->priv, struct tcf_vlan, common) +#define to_vlan(a) ((struct tcf_vlan *)a) #endif /* __NET_TC_VLAN_H */ diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 47ec2305f920..d97419f35e7e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -38,7 +38,7 @@ static void free_tcf(struct rcu_head *head) static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) { - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; spin_lock_bh(&hinfo->lock); hlist_del(&p->tcfc_head); @@ -54,7 +54,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) { - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; int ret = 0; if (p) { @@ -67,6 +67,7 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { if (a->ops->cleanup) a->ops->cleanup(a, bind); + list_del(&a->list); tcf_hash_destroy(a->hinfo, a); ret = ACT_P_DELETED; } @@ -77,10 +78,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) EXPORT_SYMBOL(__tcf_hash_release); static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, - struct netlink_callback *cb, struct tc_action *a) + struct netlink_callback *cb) { - struct hlist_head *head; - struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; @@ -89,19 +88,20 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, s_i = cb->args[0]; for (i = 0; i < (hinfo->hmask + 1); i++) { + struct hlist_head *head; + struct tcf_common *p; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; - a->priv = p; - a->order = n_i; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, a, 0, 0); + err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); @@ -125,27 +125,27 @@ nla_put_failure: } static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, - struct tc_action *a) + const struct tc_action_ops *ops) { - struct hlist_head *head; - struct hlist_node *n; - struct tcf_common *p; struct nlattr *nest; int i = 0, n_i = 0; int ret = -EINVAL; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, 0); if (nest == NULL) goto nla_put_failure; - if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; for (i = 0; i < (hinfo->hmask + 1); i++) { + struct hlist_head *head; + struct hlist_node *n; + struct tcf_common *p; + head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; hlist_for_each_entry_safe(p, n, head, tcfc_head) { - a->priv = p; - ret = __tcf_hash_release(a, false, true); + ret = __tcf_hash_release((struct tc_action *)p, false, true); if (ret == ACT_P_DELETED) { - module_put(a->ops->owner); + module_put(p->tcfc_act.ops->owner); n_i++; } else if (ret < 0) goto nla_put_failure; @@ -163,16 +163,14 @@ nla_put_failure: int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tcf_hashinfo *hinfo = tn->hinfo; - a->hinfo = hinfo; - if (type == RTM_DELACTION) { - return tcf_del_walker(hinfo, skb, a); + return tcf_del_walker(hinfo, skb, ops); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(hinfo, skb, cb, a); + return tcf_dump_walker(hinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; @@ -210,21 +208,20 @@ u32 tcf_hash_new_index(struct tc_action_net *tn) } EXPORT_SYMBOL(tcf_hash_new_index); -int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) +int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = tcf_hash_lookup(index, hinfo); if (p) { - a->priv = p; - a->hinfo = hinfo; + *a = &p->tcfc_act; return 1; } return 0; } EXPORT_SYMBOL(tcf_hash_search); -bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; @@ -233,8 +230,7 @@ bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, if (bind) p->tcfc_bindcnt++; p->tcfc_refcnt++; - a->priv = p; - a->hinfo = hinfo; + *a = &p->tcfc_act; return true; } return false; @@ -243,7 +239,7 @@ EXPORT_SYMBOL(tcf_hash_check); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { - struct tcf_common *pc = a->priv; + struct tcf_common *pc = (struct tcf_common *)a; if (est) gen_kill_estimator(&pc->tcfc_bstats, &pc->tcfc_rate_est); @@ -252,9 +248,10 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) EXPORT_SYMBOL(tcf_hash_cleanup); int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - struct tc_action *a, int size, int bind, bool cpustats) + struct tc_action **a, const struct tc_action_ops *ops, + int bind, bool cpustats) { - struct tcf_common *p = kzalloc(size, GFP_KERNEL); + struct tcf_common *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; @@ -294,15 +291,17 @@ err2: } } - a->priv = (void *) p; - a->hinfo = hinfo; + p->tcfc_act.hinfo = hinfo; + p->tcfc_act.ops = ops; + INIT_LIST_HEAD(&p->tcfc_act.list); + *a = &p->tcfc_act; return 0; } EXPORT_SYMBOL(tcf_hash_create); void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; struct tcf_hashinfo *hinfo = tn->hinfo; unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); @@ -315,10 +314,6 @@ EXPORT_SYMBOL(tcf_hash_insert); void tcf_hashinfo_destroy(const struct tc_action_ops *ops, struct tcf_hashinfo *hinfo) { - struct tc_action a = { - .ops = ops, - .hinfo = hinfo, - }; int i; for (i = 0; i < hinfo->hmask + 1; i++) { @@ -328,8 +323,7 @@ void tcf_hashinfo_destroy(const struct tc_action_ops *ops, hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { int ret; - a.priv = p; - ret = __tcf_hash_release(&a, false, true); + ret = __tcf_hash_release((struct tc_action *)p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) @@ -466,8 +460,6 @@ int tcf_action_destroy(struct list_head *actions, int bind) module_put(a->ops->owner); else if (ret < 0) return ret; - list_del(&a->list); - kfree(a); } return ret; } @@ -581,20 +573,13 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, goto err_out; } - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_KERNEL); - if (a == NULL) - goto err_mod; - - a->ops = a_o; - INIT_LIST_HEAD(&a->list); /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind); else - err = a_o->init(net, nla, est, a, ovr, bind); + err = a_o->init(net, nla, est, &a, ovr, bind); if (err < 0) - goto err_free; + goto err_mod; /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then @@ -605,8 +590,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, return a; -err_free: - kfree(a); err_mod: module_put(a_o->owner); err_out: @@ -647,7 +630,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, { int err = 0; struct gnet_dump d; - struct tcf_common *p = a->priv; + struct tcf_common *p = (struct tcf_common *)a; if (p == NULL) goto errout; @@ -740,24 +723,11 @@ act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, return rtnl_unicast(skb, net, portid); } -static struct tc_action *create_a(int i) -{ - struct tc_action *act; - - act = kzalloc(sizeof(*act), GFP_KERNEL); - if (act == NULL) { - pr_debug("create_a: failed to alloc!\n"); - return NULL; - } - act->order = i; - INIT_LIST_HEAD(&act->list); - return act; -} - static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid) { struct nlattr *tb[TCA_ACT_MAX + 1]; + const struct tc_action_ops *ops; struct tc_action *a; int index; int err; @@ -772,26 +742,19 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, goto err_out; index = nla_get_u32(tb[TCA_ACT_INDEX]); - err = -ENOMEM; - a = create_a(0); - if (a == NULL) - goto err_out; - err = -EINVAL; - a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); - if (a->ops == NULL) /* could happen in batch of actions */ - goto err_free; + ops = tc_lookup_action(tb[TCA_ACT_KIND]); + if (!ops) /* could happen in batch of actions */ + goto err_out; err = -ENOENT; - if (a->ops->lookup(net, a, index) == 0) + if (ops->lookup(net, &a, index) == 0) goto err_mod; - module_put(a->ops->owner); + module_put(ops->owner); return a; err_mod: - module_put(a->ops->owner); -err_free: - kfree(a); + module_put(ops->owner); err_out: return ERR_PTR(err); } @@ -816,8 +779,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, struct netlink_callback dcb; struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; + const struct tc_action_ops *ops; struct nlattr *kind; - struct tc_action a; int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); @@ -834,10 +797,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, err = -EINVAL; kind = tb[TCA_ACT_KIND]; - memset(&a, 0, sizeof(struct tc_action)); - INIT_LIST_HEAD(&a.list); - a.ops = tc_lookup_action(kind); - if (a.ops == NULL) /*some idjot trying to flush unknown action */ + ops = tc_lookup_action(kind); + if (!ops) /*some idjot trying to flush unknown action */ goto err_out; nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, @@ -853,7 +814,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, if (nest == NULL) goto out_module_put; - err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a); + err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); if (err < 0) goto out_module_put; if (err == 0) @@ -863,7 +824,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; - module_put(a.ops->owner); + module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) @@ -872,7 +833,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, return err; out_module_put: - module_put(a.ops->owner); + module_put(ops->owner); err_out: noflush_out: kfree_skb(skb); @@ -1084,7 +1045,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_action_ops *a_o; - struct tc_action a; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *kind = find_dump_kind(cb->nlh); @@ -1098,9 +1058,6 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o == NULL) return 0; - memset(&a, 0, sizeof(struct tc_action)); - a.ops = a_o; - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) @@ -1114,7 +1071,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index ef74bffa6101..bfa870731e74 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -34,11 +34,12 @@ struct tcf_bpf_cfg { }; static int bpf_net_id; +static struct tc_action_ops act_bpf_ops; static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { - struct tcf_bpf *prog = act->priv; + struct tcf_bpf *prog = to_bpf(act); struct bpf_prog *filter; int action, filter_res; bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; @@ -134,7 +135,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); - struct tcf_bpf *prog = act->priv; + struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, .refcnt = prog->tcf_refcnt - ref, @@ -270,7 +271,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, } static int tcf_bpf_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *act, + struct nlattr *est, struct tc_action **act, int replace, int bind) { struct tc_action_net *tn = net_generic(net, bpf_net_id); @@ -295,7 +296,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, act, bind)) { ret = tcf_hash_create(tn, parm->index, est, act, - sizeof(*prog), bind, true); + &act_bpf_ops, bind, true); if (ret < 0) return ret; @@ -305,7 +306,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - tcf_hash_release(act, bind); + tcf_hash_release(*act, bind); if (!replace) return -EEXIST; } @@ -325,7 +326,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (ret < 0) goto out; - prog = to_bpf(act); + prog = to_bpf(*act); ASSERT_RTNL(); if (res != ACT_P_CREATED) @@ -343,7 +344,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(prog->filter, cfg.filter); if (res == ACT_P_CREATED) { - tcf_hash_insert(tn, act); + tcf_hash_insert(tn, *act); } else { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); @@ -353,7 +354,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: if (res == ACT_P_CREATED) - tcf_hash_cleanup(act, est); + tcf_hash_cleanup(*act, est); return ret; } @@ -362,20 +363,20 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind) { struct tcf_bpf_cfg tmp; - tcf_bpf_prog_fill_cfg(act->priv, &tmp); + tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp); tcf_bpf_cfg_cleanup(&tmp); } static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, bpf_net_id); @@ -392,6 +393,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .size = sizeof(struct tcf_bpf), }; static __net_init int bpf_init_net(struct net *net) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 35a5270f289d..eae07a2e774d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -31,6 +31,7 @@ #define CONNMARK_TAB_MASK 3 static int connmark_net_id; +static struct tc_action_ops act_connmark_ops; static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -38,7 +39,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; - struct tcf_connmark_info *ca = a->priv; + struct tcf_connmark_info *ca = to_connmark(a); struct nf_conntrack_zone zone; struct nf_conn *c; int proto; @@ -96,7 +97,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { }; static int tcf_connmark_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -116,22 +117,22 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*ci), bind, false); + &act_connmark_ops, bind, false); if (ret) return ret; - ci = to_connmark(a); + ci = to_connmark(*a); ci->tcf_action = parm->action; ci->net = net; ci->zone = parm->zone; - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); ret = ACT_P_CREATED; } else { - ci = to_connmark(a); + ci = to_connmark(*a); if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; /* replacing action and zone */ @@ -146,7 +147,7 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_connmark_info *ci = a->priv; + struct tcf_connmark_info *ci = to_connmark(a); struct tc_connmark opt = { .index = ci->tcf_index, @@ -173,14 +174,14 @@ nla_put_failure: static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -196,6 +197,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .size = sizeof(struct tcf_connmark_info), }; static __net_init int connmark_init_net(struct net *net) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index dcd9ababd351..b5dbf633a863 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -43,9 +43,10 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { }; static int csum_net_id; +static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, csum_net_id); @@ -67,26 +68,26 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_csum_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - p = to_tcf_csum(a); + p = to_tcf_csum(*a); spin_lock_bh(&p->tcf_lock); p->tcf_action = parm->action; p->update_flags = parm->update_flags; spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -496,7 +497,7 @@ fail: static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_csum *p = a->priv; + struct tcf_csum *p = to_tcf_csum(a); int action; u32 update_flags; @@ -534,7 +535,7 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_csum *p = a->priv; + struct tcf_csum *p = to_tcf_csum(a); struct tc_csum opt = { .update_flags = p->update_flags, .index = p->tcf_index, @@ -560,14 +561,14 @@ nla_put_failure: static int tcf_csum_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, csum_net_id); @@ -583,6 +584,7 @@ static struct tc_action_ops act_csum_ops = { .init = tcf_csum_init, .walk = tcf_csum_walker, .lookup = tcf_csum_search, + .size = sizeof(struct tcf_csum), }; static __net_init int csum_init_net(struct net *net) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 19058a7f3e5c..e24a4093d6f6 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -26,6 +26,7 @@ #define GACT_TAB_MASK 15 static int gact_net_id; +static struct tc_action_ops act_gact_ops; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) @@ -56,7 +57,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { }; static int tcf_gact_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, gact_net_id); @@ -93,19 +94,19 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*gact), bind, true); + &act_gact_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - gact = to_gact(a); + gact = to_gact(*a); ASSERT_RTNL(); gact->tcf_action = parm->action; @@ -121,14 +122,14 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); #ifdef CONFIG_GACT_PROB @@ -151,7 +152,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, u64 lastuse) { - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; @@ -166,7 +167,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_gact *gact = a->priv; + struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, .refcnt = gact->tcf_refcnt - ref, @@ -201,14 +202,14 @@ nla_put_failure: static int tcf_gact_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, gact_net_id); @@ -225,6 +226,7 @@ static struct tc_action_ops act_gact_ops = { .init = tcf_gact_init, .walk = tcf_gact_walker, .lookup = tcf_gact_search, + .size = sizeof(struct tcf_gact), }; static __net_init int gact_init_net(struct net *net) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 845ab5119c05..141a06eeb1e5 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -37,6 +37,7 @@ static int ife_net_id; static int max_metacnt = IFE_META_MAX + 1; +static struct tc_action_ops act_ife_ops; static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, @@ -364,7 +365,7 @@ out_nlmsg_trim: /* under ife->tcf_lock */ static void _tcf_ife_cleanup(struct tc_action *a, int bind) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { @@ -382,7 +383,7 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); @@ -417,7 +418,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, } static int tcf_ife_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, ife_net_id); @@ -451,25 +452,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, **/ if (!tb[TCA_IFE_TYPE]) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); pr_info("You MUST pass etherype for encoding\n"); return -EINVAL; } } if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife), + ret = tcf_hash_create(tn, parm->index, est, a, &act_ife_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - ife = to_ife(a); + ife = to_ife(*a); ife->flags = parm->flags; if (parm->flags & IFE_ENCODE) { @@ -507,9 +508,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (err) { metadata_parse_err: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -529,7 +530,7 @@ metadata_parse_err: err = use_all_metadata(ife); if (err) { if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(a, bind); + _tcf_ife_cleanup(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -541,7 +542,7 @@ metadata_parse_err: spin_unlock_bh(&ife->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -550,7 +551,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, @@ -623,7 +624,7 @@ struct meta_tlvhdr { static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data; u16 ifehdrln = ifehdr->metalen; @@ -695,7 +696,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ struct ethhdr *iethh; /* inner eth header */ @@ -799,7 +800,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_ife_info *ife = a->priv; + struct tcf_ife_info *ife = to_ife(a); if (ife->flags & IFE_ENCODE) return tcf_ife_encode(skb, a, res); @@ -819,14 +820,14 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ife_net_id); @@ -843,6 +844,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .size = sizeof(struct tcf_ife_info), }; static __net_init int ife_init_net(struct net *net) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index b8c50600697a..378c1c976058 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -31,8 +31,10 @@ #define IPT_TAB_MASK 15 static int ipt_net_id; +static struct tc_action_ops act_ipt_ops; static int xt_net_id; +static struct tc_action_ops act_xt_ops; static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) @@ -90,8 +92,8 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { }; static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, - int bind) + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int ovr, int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; @@ -118,19 +120,19 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (!exists) { - ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind, + ret = tcf_hash_create(tn, index, est, a, ops, bind, false); if (ret) return ret; @@ -138,13 +140,11 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - ipt = to_ipt(a); - hook = nla_get_u32(tb[TCA_IPT_HOOK]); err = -ENOMEM; @@ -163,6 +163,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (err < 0) goto err3; + ipt = to_ipt(*a); + spin_lock_bh(&ipt->tcf_lock); if (ret != ACT_P_CREATED) { ipt_destroy_target(ipt->tcfi_t); @@ -174,7 +176,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; err3: @@ -183,33 +185,33 @@ err2: kfree(tname); err1: if (ret == ACT_P_CREATED) - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return err; } static int tcf_ipt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return __tcf_ipt_init(tn, nla, est, a, ovr, bind); + return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind); } static int tcf_xt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return __tcf_ipt_init(tn, nla, est, a, ovr, bind); + return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind); } static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; - struct tcf_ipt *ipt = a->priv; + struct tcf_ipt *ipt = to_ipt(a); struct xt_action_param par; if (skb_unclone(skb, GFP_ATOMIC)) @@ -259,7 +261,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ipt *ipt = a->priv; + struct tcf_ipt *ipt = to_ipt(a); struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; @@ -299,14 +301,14 @@ nla_put_failure: static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ipt_net_id); @@ -323,6 +325,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .size = sizeof(struct tcf_ipt), }; static __net_init int ipt_init_net(struct net *net) @@ -348,14 +351,14 @@ static struct pernet_operations ipt_net_ops = { static int tcf_xt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, xt_net_id); @@ -372,6 +375,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .size = sizeof(struct tcf_ipt), }; static __net_init int xt_init_net(struct net *net) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 70cfbbf96af2..6038c85d92f5 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -52,9 +52,10 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { }; static int mirred_net_id; +static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, int ovr, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, mirred_net_id); @@ -84,14 +85,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (parm->ifindex) { dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -ENODEV; } switch (dev->type) { @@ -115,16 +116,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) return -EINVAL; ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*m), bind, true); + &act_mirred_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - m = to_mirred(a); + m = to_mirred(*a); ASSERT_RTNL(); m->tcf_action = parm->action; @@ -142,7 +143,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock_bh(&mirred_list_lock); - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); } return ret; @@ -151,7 +152,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_mirred *m = a->priv; + struct tcf_mirred *m = to_mirred(a); struct net_device *dev; struct sk_buff *skb2; int retval, err; @@ -206,7 +207,7 @@ out: static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_mirred *m = a->priv; + struct tcf_mirred *m = to_mirred(a); struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, @@ -232,14 +233,14 @@ nla_put_failure: static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, mirred_net_id); @@ -284,6 +285,7 @@ static struct tc_action_ops act_mirred_ops = { .init = tcf_mirred_init, .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, + .size = sizeof(struct tcf_mirred), }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 06ccb03f25da..8e8b0cc30704 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -32,13 +32,14 @@ #define NAT_TAB_MASK 15 static int nat_net_id; +static struct tc_action_ops act_nat_ops; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) + struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -59,18 +60,18 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (!tcf_hash_check(tn, parm->index, a, bind)) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_nat_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - p = to_tcf_nat(a); + p = to_tcf_nat(*a); spin_lock_bh(&p->tcf_lock); p->old_addr = parm->old_addr; @@ -82,7 +83,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -90,7 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_nat *p = a->priv; + struct tcf_nat *p = to_tcf_nat(a); struct iphdr *iph; __be32 old_addr; __be32 new_addr; @@ -248,7 +249,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_nat *p = a->priv; + struct tcf_nat *p = to_tcf_nat(a); struct tc_nat opt = { .old_addr = p->old_addr, .new_addr = p->new_addr, @@ -278,14 +279,14 @@ nla_put_failure: static int tcf_nat_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, nat_net_id); @@ -301,6 +302,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .size = sizeof(struct tcf_nat), }; static __net_init int nat_init_net(struct net *net) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 82d3c1479029..b54d56d4959b 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -26,13 +26,14 @@ #define PEDIT_TAB_MASK 15 static int pedit_net_id; +static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; static int tcf_pedit_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, pedit_net_id); @@ -61,23 +62,23 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!parm->nkeys) return -EINVAL; ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*p), bind, false); + &act_pedit_ops, bind, false); if (ret) return ret; - p = to_pedit(a); + p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return -ENOMEM; } ret = ACT_P_CREATED; } else { if (bind) return 0; - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; - p = to_pedit(a); + p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -96,13 +97,13 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, memcpy(p->tcfp_keys, parm->keys, ksize); spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } static void tcf_pedit_cleanup(struct tc_action *a, int bind) { - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; kfree(keys); } @@ -110,7 +111,7 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind) static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); int i; unsigned int off; @@ -177,7 +178,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_pedit *p = a->priv; + struct tcf_pedit *p = to_pedit(a); struct tc_pedit *opt; struct tcf_t t; int s; @@ -216,14 +217,14 @@ nla_put_failure: static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, pedit_net_id); @@ -240,6 +241,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .size = sizeof(struct tcf_pedit), }; static __net_init int pedit_init_net(struct net *net) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 1e8ede3955f4..123794af55c3 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -37,8 +37,8 @@ struct tcf_police { struct psched_ratecfg peak; bool peak_present; }; -#define to_police(pc) \ - container_of(pc->priv, struct tcf_police, common) + +#define to_police(pc) ((struct tcf_police *)pc) #define POL_TAB_MASK 15 @@ -56,15 +56,14 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ static int police_net_id; +static struct tc_action_ops act_police_ops; static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, police_net_id); struct tcf_hashinfo *hinfo = tn->hinfo; - struct hlist_head *head; - struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; @@ -73,21 +72,22 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, s_i = cb->args[0]; for (i = 0; i < (POL_TAB_MASK + 1); i++) { + struct hlist_head *head; + struct tcf_common *p; + head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; hlist_for_each_entry_rcu(p, head, tcfc_head) { index++; if (index < s_i) continue; - a->priv = p; - a->order = index; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start(skb, index); if (nest == NULL) goto nla_put_failure; if (type == RTM_DELACTION) - err = tcf_action_dump_1(skb, a, 0, 1); + err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 1); else - err = tcf_action_dump_1(skb, a, 0, 0); + err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); if (err < 0) { index--; nla_nest_cancel(skb, nest); @@ -116,7 +116,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { }; static int tcf_act_police_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { int ret = 0, err; @@ -142,13 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { - if (tcf_hash_search(tn, a, parm->index)) { - police = to_police(a); - if (bind) { - police->tcf_bindcnt += 1; - police->tcf_refcnt += 1; - return 0; - } + if (tcf_hash_check(tn, parm->index, a, bind)) { if (ovr) goto override; /* not replacing */ @@ -156,14 +150,14 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, } } else { ret = tcf_hash_create(tn, parm->index, NULL, a, - sizeof(*police), bind, false); + &act_police_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } - police = to_police(a); override: + police = to_police(*a); if (parm->rate.rate) { err = -ENOMEM; R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); @@ -235,7 +229,7 @@ override: return ret; police->tcfp_t_c = ktime_get_ns(); - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; @@ -245,14 +239,14 @@ failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return err; } static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_police *police = a->priv; + struct tcf_police *police = to_police(a); s64 now; s64 toks; s64 ptoks = 0; @@ -311,7 +305,7 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_police *police = a->priv; + struct tcf_police *police = to_police(a); struct tc_police opt = { .index = police->tcf_index, .action = police->tcf_action, @@ -349,7 +343,7 @@ nla_put_failure: return -1; } -static int tcf_police_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, police_net_id); @@ -369,6 +363,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .size = sizeof(struct tcf_police), }; static __net_init int police_init_net(struct net *net) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 318328d34d12..289af6f9bb3b 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -27,12 +27,13 @@ #define SIMP_TAB_MASK 7 static int simp_net_id; +static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_defact *d = a->priv; + struct tcf_defact *d = to_defact(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); @@ -79,7 +80,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { }; static int tcf_simp_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, simp_net_id); @@ -100,7 +101,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; - parm = nla_data(tb[TCA_DEF_PARMS]); exists = tcf_hash_check(tn, parm->index, a, bind); if (exists && bind) @@ -108,7 +108,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_DATA] == NULL) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } @@ -116,22 +116,22 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*d), bind, false); + &act_simp_ops, bind, false); if (ret) return ret; - d = to_defact(a); + d = to_defact(*a); ret = alloc_defdata(d, defdata); if (ret < 0) { - tcf_hash_cleanup(a, est); + tcf_hash_cleanup(*a, est); return ret; } d->tcf_action = parm->action; ret = ACT_P_CREATED; } else { - d = to_defact(a); + d = to_defact(*a); - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; @@ -139,7 +139,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -147,7 +147,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_defact *d = a->priv; + struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, .refcnt = d->tcf_refcnt - ref, @@ -172,14 +172,14 @@ nla_put_failure: static int tcf_simp_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, simp_net_id); @@ -196,6 +196,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .size = sizeof(struct tcf_defact), }; static __net_init int simp_init_net(struct net *net) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 8e573c0f8742..a133dcb82132 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -30,11 +30,12 @@ #define SKBEDIT_TAB_MASK 15 static int skbedit_net_id; +static struct tc_action_ops act_skbedit_ops; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_skbedit *d = a->priv; + struct tcf_skbedit *d = to_skbedit(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); @@ -63,7 +64,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -114,21 +115,21 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, return 0; if (!flags) { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*d), bind, false); + &act_skbedit_ops, bind, false); if (ret) return ret; - d = to_skbedit(a); + d = to_skbedit(*a); ret = ACT_P_CREATED; } else { - d = to_skbedit(a); - tcf_hash_release(a, bind); + d = to_skbedit(*a); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } @@ -150,7 +151,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -158,7 +159,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbedit *d = a->priv; + struct tcf_skbedit *d = to_skbedit(a); struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = d->tcf_refcnt - ref, @@ -194,14 +195,14 @@ nla_put_failure: static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); @@ -217,6 +218,7 @@ static struct tc_action_ops act_skbedit_ops = { .init = tcf_skbedit_init, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .size = sizeof(struct tcf_skbedit), }; static __net_init int skbedit_init_net(struct net *net) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index db9b7ed570ba..691409de3e1a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -22,11 +22,12 @@ #define VLAN_TAB_MASK 15 static int vlan_net_id; +static struct tc_action_ops act_vlan_ops; static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - struct tcf_vlan *v = a->priv; + struct tcf_vlan *v = to_vlan(a); int action; int err; @@ -67,7 +68,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, + struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, vlan_net_id); @@ -100,13 +101,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_PUSH: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -ERANGE; } @@ -125,25 +126,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); return -EINVAL; } action = parm->v_action; if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, - sizeof(*v), bind, false); + &act_vlan_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(a, bind); + tcf_hash_release(*a, bind); if (!ovr) return -EEXIST; } - v = to_vlan(a); + v = to_vlan(*a); spin_lock_bh(&v->tcf_lock); @@ -156,7 +157,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&v->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, a); + tcf_hash_insert(tn, *a); return ret; } @@ -164,7 +165,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_vlan *v = a->priv; + struct tcf_vlan *v = to_vlan(a); struct tc_vlan opt = { .index = v->tcf_index, .refcnt = v->tcf_refcnt - ref, @@ -195,14 +196,14 @@ nla_put_failure: static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - struct tc_action *a) + const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tcf_generic_walker(tn, skb, cb, type, a); + return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index) +static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, vlan_net_id); @@ -218,6 +219,7 @@ static struct tc_action_ops act_vlan_ops = { .init = tcf_vlan_init, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .size = sizeof(struct tcf_vlan), }; static __net_init int vlan_init_net(struct net *net) -- cgit v1.2.3 From ec0595cc4495be579309b4bfd5e997af0f2ae6f9 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 25 Jul 2016 16:09:42 -0700 Subject: net_sched: get rid of struct tcf_common After the previous patch, struct tc_action should be enough to represent the generic tc action, tcf_common is not necessary any more. This patch gets rid of it to make tc action code more readable. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 63 +++++++++--------- include/net/tc_act/tc_bpf.h | 2 +- include/net/tc_act/tc_connmark.h | 2 +- include/net/tc_act/tc_csum.h | 2 +- include/net/tc_act/tc_defact.h | 2 +- include/net/tc_act/tc_gact.h | 2 +- include/net/tc_act/tc_ife.h | 2 +- include/net/tc_act/tc_ipt.h | 2 +- include/net/tc_act/tc_mirred.h | 2 +- include/net/tc_act/tc_nat.h | 2 +- include/net/tc_act/tc_pedit.h | 2 +- include/net/tc_act/tc_skbedit.h | 2 +- include/net/tc_act/tc_vlan.h | 2 +- net/sched/act_api.c | 139 +++++++++++++++++++-------------------- net/sched/act_police.c | 10 +-- 15 files changed, 114 insertions(+), 122 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8b199095ea51..41e6a24a44b9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -22,42 +22,39 @@ struct tc_action_ops; struct tc_action { const struct tc_action_ops *ops; - __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ - __u32 order; - struct list_head list; - struct tcf_hashinfo *hinfo; -}; - -struct tcf_common { - struct tc_action tcfc_act; - struct hlist_node tcfc_head; - u32 tcfc_index; - int tcfc_refcnt; - int tcfc_bindcnt; - u32 tcfc_capab; - int tcfc_action; - struct tcf_t tcfc_tm; - struct gnet_stats_basic_packed tcfc_bstats; - struct gnet_stats_queue tcfc_qstats; - struct gnet_stats_rate_est64 tcfc_rate_est; - spinlock_t tcfc_lock; - struct rcu_head tcfc_rcu; + __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ + __u32 order; + struct list_head list; + struct tcf_hashinfo *hinfo; + + struct hlist_node tcfa_head; + u32 tcfa_index; + int tcfa_refcnt; + int tcfa_bindcnt; + u32 tcfa_capab; + int tcfa_action; + struct tcf_t tcfa_tm; + struct gnet_stats_basic_packed tcfa_bstats; + struct gnet_stats_queue tcfa_qstats; + struct gnet_stats_rate_est64 tcfa_rate_est; + spinlock_t tcfa_lock; + struct rcu_head tcfa_rcu; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; }; -#define tcf_act common.tcfc_act -#define tcf_head common.tcfc_head -#define tcf_index common.tcfc_index -#define tcf_refcnt common.tcfc_refcnt -#define tcf_bindcnt common.tcfc_bindcnt -#define tcf_capab common.tcfc_capab -#define tcf_action common.tcfc_action -#define tcf_tm common.tcfc_tm -#define tcf_bstats common.tcfc_bstats -#define tcf_qstats common.tcfc_qstats -#define tcf_rate_est common.tcfc_rate_est -#define tcf_lock common.tcfc_lock -#define tcf_rcu common.tcfc_rcu +#define tcf_act common.tcfa_act +#define tcf_head common.tcfa_head +#define tcf_index common.tcfa_index +#define tcf_refcnt common.tcfa_refcnt +#define tcf_bindcnt common.tcfa_bindcnt +#define tcf_capab common.tcfa_capab +#define tcf_action common.tcfa_action +#define tcf_tm common.tcfa_tm +#define tcf_bstats common.tcfa_bstats +#define tcf_qstats common.tcfa_qstats +#define tcf_rate_est common.tcfa_rate_est +#define tcf_lock common.tcfa_lock +#define tcf_rcu common.tcfa_rcu static inline unsigned int tcf_hash(u32 index, unsigned int hmask) { diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h index 80a4d6f49773..2b94673a3dbc 100644 --- a/include/net/tc_act/tc_bpf.h +++ b/include/net/tc_act/tc_bpf.h @@ -14,7 +14,7 @@ #include struct tcf_bpf { - struct tcf_common common; + struct tc_action common; struct bpf_prog __rcu *filter; union { u32 bpf_fd; diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h index 8a661135f4ac..59b515d32bb4 100644 --- a/include/net/tc_act/tc_connmark.h +++ b/include/net/tc_act/tc_connmark.h @@ -4,7 +4,7 @@ #include struct tcf_connmark_info { - struct tcf_common common; + struct tc_action common; struct net *net; u16 zone; }; diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 1a9ef15d573b..f31fb6331a53 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -5,7 +5,7 @@ #include struct tcf_csum { - struct tcf_common common; + struct tc_action common; u32 update_flags; }; diff --git a/include/net/tc_act/tc_defact.h b/include/net/tc_act/tc_defact.h index e25b4eb4fc66..d47f040a3bdf 100644 --- a/include/net/tc_act/tc_defact.h +++ b/include/net/tc_act/tc_defact.h @@ -4,7 +4,7 @@ #include struct tcf_defact { - struct tcf_common common; + struct tc_action common; u32 tcfd_datalen; void *tcfd_defdata; }; diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 119cdb418c23..b6f173910226 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -5,7 +5,7 @@ #include struct tcf_gact { - struct tcf_common common; + struct tc_action common; #ifdef CONFIG_GACT_PROB u16 tcfg_ptype; u16 tcfg_pval; diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 7921abe42adc..5164bd7a38fb 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -8,7 +8,7 @@ #define IFE_METAHDRLEN 2 struct tcf_ife_info { - struct tcf_common common; + struct tc_action common; u8 eth_dst[ETH_ALEN]; u8 eth_src[ETH_ALEN]; u16 eth_type; diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h index c22ae7ab66ed..31309766e379 100644 --- a/include/net/tc_act/tc_ipt.h +++ b/include/net/tc_act/tc_ipt.h @@ -6,7 +6,7 @@ struct xt_entry_target; struct tcf_ipt { - struct tcf_common common; + struct tc_action common; u32 tcfi_hook; char *tcfi_tname; struct xt_entry_target *tcfi_t; diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 89aebd22cd79..62770add15bd 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -5,7 +5,7 @@ #include struct tcf_mirred { - struct tcf_common common; + struct tc_action common; int tcfm_eaction; int tcfm_ifindex; int tcfm_ok_push; diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h index a91ad3ad565e..56681a320612 100644 --- a/include/net/tc_act/tc_nat.h +++ b/include/net/tc_act/tc_nat.h @@ -5,7 +5,7 @@ #include struct tcf_nat { - struct tcf_common common; + struct tc_action common; __be32 old_addr; __be32 new_addr; diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 2cccfbaae800..29e38d6823df 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -4,7 +4,7 @@ #include struct tcf_pedit { - struct tcf_common common; + struct tc_action common; unsigned char tcfp_nkeys; unsigned char tcfp_flags; struct tc_pedit_key *tcfp_keys; diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 9e0548998327..5767e9dbcf92 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -23,7 +23,7 @@ #include struct tcf_skbedit { - struct tcf_common common; + struct tc_action common; u32 flags; u32 priority; u32 mark; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 584b80788d52..e29f52e8bdf1 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -16,7 +16,7 @@ #define VLAN_F_PUSH 0x2 struct tcf_vlan { - struct tcf_common common; + struct tc_action common; int tcfv_action; u16 tcfv_push_vid; __be16 tcfv_push_proto; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d97419f35e7e..e4a5f2607ffa 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -29,46 +29,43 @@ static void free_tcf(struct rcu_head *head) { - struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu); + struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); kfree(p); } -static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a) +static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p) { - struct tcf_common *p = (struct tcf_common *)a; - spin_lock_bh(&hinfo->lock); - hlist_del(&p->tcfc_head); + hlist_del(&p->tcfa_head); spin_unlock_bh(&hinfo->lock); - gen_kill_estimator(&p->tcfc_bstats, - &p->tcfc_rate_est); + gen_kill_estimator(&p->tcfa_bstats, + &p->tcfa_rate_est); /* - * gen_estimator est_timer() might access p->tcfc_lock + * gen_estimator est_timer() might access p->tcfa_lock * or bstats, wait a RCU grace period before freeing p */ - call_rcu(&p->tcfc_rcu, free_tcf); + call_rcu(&p->tcfa_rcu, free_tcf); } -int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) +int __tcf_hash_release(struct tc_action *p, bool bind, bool strict) { - struct tcf_common *p = (struct tcf_common *)a; int ret = 0; if (p) { if (bind) - p->tcfc_bindcnt--; - else if (strict && p->tcfc_bindcnt > 0) + p->tcfa_bindcnt--; + else if (strict && p->tcfa_bindcnt > 0) return -EPERM; - p->tcfc_refcnt--; - if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { - if (a->ops->cleanup) - a->ops->cleanup(a, bind); - list_del(&a->list); - tcf_hash_destroy(a->hinfo, a); + p->tcfa_refcnt--; + if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { + if (p->ops->cleanup) + p->ops->cleanup(p, bind); + list_del(&p->list); + tcf_hash_destroy(p->hinfo, p); ret = ACT_P_DELETED; } } @@ -89,11 +86,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, for (i = 0; i < (hinfo->hmask + 1); i++) { struct hlist_head *head; - struct tcf_common *p; + struct tc_action *p; head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) { + hlist_for_each_entry_rcu(p, head, tcfa_head) { index++; if (index < s_i) continue; @@ -101,7 +98,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); + err = tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); @@ -139,13 +136,13 @@ static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, for (i = 0; i < (hinfo->hmask + 1); i++) { struct hlist_head *head; struct hlist_node *n; - struct tcf_common *p; + struct tc_action *p; head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - hlist_for_each_entry_safe(p, n, head, tcfc_head) { - ret = __tcf_hash_release((struct tc_action *)p, false, true); + hlist_for_each_entry_safe(p, n, head, tcfa_head) { + ret = __tcf_hash_release(p, false, true); if (ret == ACT_P_DELETED) { - module_put(p->tcfc_act.ops->owner); + module_put(p->ops->owner); n_i++; } else if (ret < 0) goto nla_put_failure; @@ -178,15 +175,15 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tc_action *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) { - struct tcf_common *p = NULL; + struct tc_action *p = NULL; struct hlist_head *head; spin_lock_bh(&hinfo->lock); head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) - if (p->tcfc_index == index) + hlist_for_each_entry_rcu(p, head, tcfa_head) + if (p->tcfa_index == index) break; spin_unlock_bh(&hinfo->lock); @@ -211,10 +208,10 @@ EXPORT_SYMBOL(tcf_hash_new_index); int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_hashinfo *hinfo = tn->hinfo; - struct tcf_common *p = tcf_hash_lookup(index, hinfo); + struct tc_action *p = tcf_hash_lookup(index, hinfo); if (p) { - *a = &p->tcfc_act; + *a = p; return 1; } return 0; @@ -225,12 +222,13 @@ bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; - struct tcf_common *p = NULL; + struct tc_action *p = NULL; + if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { if (bind) - p->tcfc_bindcnt++; - p->tcfc_refcnt++; - *a = &p->tcfc_act; + p->tcfa_bindcnt++; + p->tcfa_refcnt++; + *a = p; return true; } return false; @@ -239,11 +237,10 @@ EXPORT_SYMBOL(tcf_hash_check); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) { - struct tcf_common *pc = (struct tcf_common *)a; if (est) - gen_kill_estimator(&pc->tcfc_bstats, - &pc->tcfc_rate_est); - call_rcu(&pc->tcfc_rcu, free_tcf); + gen_kill_estimator(&a->tcfa_bstats, + &a->tcfa_rate_est); + call_rcu(&a->tcfa_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); @@ -251,15 +248,15 @@ int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) { - struct tcf_common *p = kzalloc(ops->size, GFP_KERNEL); + struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_hashinfo *hinfo = tn->hinfo; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; - p->tcfc_refcnt = 1; + p->tcfa_refcnt = 1; if (bind) - p->tcfc_bindcnt = 1; + p->tcfa_bindcnt = 1; if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -275,38 +272,37 @@ err2: goto err1; } } - spin_lock_init(&p->tcfc_lock); - INIT_HLIST_NODE(&p->tcfc_head); - p->tcfc_index = index ? index : tcf_hash_new_index(tn); - p->tcfc_tm.install = jiffies; - p->tcfc_tm.lastuse = jiffies; - p->tcfc_tm.firstuse = 0; + spin_lock_init(&p->tcfa_lock); + INIT_HLIST_NODE(&p->tcfa_head); + p->tcfa_index = index ? index : tcf_hash_new_index(tn); + p->tcfa_tm.install = jiffies; + p->tcfa_tm.lastuse = jiffies; + p->tcfa_tm.firstuse = 0; if (est) { - err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, - &p->tcfc_rate_est, - &p->tcfc_lock, NULL, est); + err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, + &p->tcfa_rate_est, + &p->tcfa_lock, NULL, est); if (err) { free_percpu(p->cpu_qstats); goto err2; } } - p->tcfc_act.hinfo = hinfo; - p->tcfc_act.ops = ops; - INIT_LIST_HEAD(&p->tcfc_act.list); - *a = &p->tcfc_act; + p->hinfo = hinfo; + p->ops = ops; + INIT_LIST_HEAD(&p->list); + *a = p; return 0; } EXPORT_SYMBOL(tcf_hash_create); void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) { - struct tcf_common *p = (struct tcf_common *)a; struct tcf_hashinfo *hinfo = tn->hinfo; - unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); + unsigned int h = tcf_hash(a->tcfa_index, hinfo->hmask); spin_lock_bh(&hinfo->lock); - hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); + hlist_add_head(&a->tcfa_head, &hinfo->htab[h]); spin_unlock_bh(&hinfo->lock); } EXPORT_SYMBOL(tcf_hash_insert); @@ -317,13 +313,13 @@ void tcf_hashinfo_destroy(const struct tc_action_ops *ops, int i; for (i = 0; i < hinfo->hmask + 1; i++) { - struct tcf_common *p; + struct tc_action *p; struct hlist_node *n; - hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) { + hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfa_head) { int ret; - ret = __tcf_hash_release((struct tc_action *)p, false, true); + ret = __tcf_hash_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) @@ -625,12 +621,11 @@ err: return err; } -int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, +int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { int err = 0; struct gnet_dump d; - struct tcf_common *p = (struct tcf_common *)a; if (p == NULL) goto errout; @@ -639,27 +634,27 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, * to add additional backward compatibility statistic TLVs. */ if (compat_mode) { - if (a->type == TCA_OLD_COMPAT) + if (p->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, TCA_STATS, TCA_XSTATS, - &p->tcfc_lock, &d, + &p->tcfa_lock, &d, TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - &p->tcfc_lock, &d, TCA_ACT_PAD); + &p->tcfa_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; - if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 || - gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, - &p->tcfc_rate_est) < 0 || + if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfa_bstats, + &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfc_qstats, - p->tcfc_qstats.qlen) < 0) + &p->tcfa_qstats, + p->tcfa_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 123794af55c3..b3c7e975fc9e 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -23,7 +23,7 @@ #include struct tcf_police { - struct tcf_common common; + struct tc_action common; int tcfp_result; u32 tcfp_ewma_rate; s64 tcfp_burst; @@ -73,11 +73,11 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, for (i = 0; i < (POL_TAB_MASK + 1); i++) { struct hlist_head *head; - struct tcf_common *p; + struct tc_action *p; head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; - hlist_for_each_entry_rcu(p, head, tcfc_head) { + hlist_for_each_entry_rcu(p, head, tcfa_head) { index++; if (index < s_i) continue; @@ -85,9 +85,9 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; if (type == RTM_DELACTION) - err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 1); + err = tcf_action_dump_1(skb, p, 0, 1); else - err = tcf_action_dump_1(skb, (struct tc_action *)p, 0, 0); + err = tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nla_nest_cancel(skb, nest); -- cgit v1.2.3 From df7e88f6cad8cc4e02c9275018a572fab59562c0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 30 Jul 2016 20:00:45 +0800 Subject: sctp: change to use TCP_CLOSE_WAIT as SCTP_SS_CLOSING Prior to this patch, sctp defined TCP_CLOSING as SCTP_SS_CLOSING. TCP_CLOSING is such a special sk state in TCP that inet common codes even exclude it. For instance, inet_accept thinks the accept sk's state never be TCP_CLOSING, or it will give a WARN_ON. TCP works well with that while SCTP may trigger the call trace, as CLOSING state in SCTP has different meaning from TCP. This fix is to change to use TCP_CLOSE_WAIT as SCTP_SS_CLOSING, instead of TCP_CLOSING. Some side-effects could be expected, regardless of not being used before. inet_accept will accept it now. I did all the func_tests in lksctp-tools and ran sctp codnomicon fuzzer tests against this patch, no regression or failure found. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 8c337cd0e1e4..5b847e49f7e9 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -214,7 +214,7 @@ typedef enum { SCTP_SS_LISTENING = TCP_LISTEN, SCTP_SS_ESTABLISHING = TCP_SYN_SENT, SCTP_SS_ESTABLISHED = TCP_ESTABLISHED, - SCTP_SS_CLOSING = TCP_CLOSING, + SCTP_SS_CLOSING = TCP_CLOSE_WAIT, } sctp_sock_state_t; /* These functions map various type to printable names. */ -- cgit v1.2.3 From 0b01aeb3d2fbf16787f0c9629f4ca52ae792f732 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 28 Jul 2016 15:36:30 +0100 Subject: VSOCK: transport-specific vsock_transport functions struct vsock_transport contains function pointers called by AF_VSOCK core code. The transport may want its own transport-specific function pointers and they can be added after struct vsock_transport. Allow the transport to fetch vsock_transport. It can downcast it to access transport-specific function pointers. The virtio transport will use this. Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin --- include/net/af_vsock.h | 3 +++ net/vmw_vsock/af_vsock.c | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index e9eb2d6791b3..23f55259b60d 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -165,6 +165,9 @@ static inline int vsock_core_init(const struct vsock_transport *t) } void vsock_core_exit(void); +/* The transport may downcast this to access transport-specific functions */ +const struct vsock_transport *vsock_core_get_transport(void); + /**** UTILS ****/ void vsock_release_pending(struct sock *pending); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index b96ac918e0ba..e34d96f8bde2 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1995,6 +1995,15 @@ void vsock_core_exit(void) } EXPORT_SYMBOL_GPL(vsock_core_exit); +const struct vsock_transport *vsock_core_get_transport(void) +{ + /* vsock_register_mutex not taken since only the transport uses this + * function and only while registered. + */ + return transport; +} +EXPORT_SYMBOL_GPL(vsock_core_get_transport); + MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); MODULE_VERSION("1.0.1.0-k"); -- cgit v1.2.3 From 6773b7dc39f165bd9d824b50ac52cbb3f87d53c8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 28 Jul 2016 15:36:31 +0100 Subject: VSOCK: defer sock removal to transports The virtio transport will implement graceful shutdown and the related SO_LINGER socket option. This requires orphaning the sock but keeping it in the table of connections after .release(). This patch adds the vsock_remove_sock() function and leaves it up to the transport when to remove the sock. Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin --- include/net/af_vsock.h | 1 + net/vmw_vsock/af_vsock.c | 16 ++++++++++------ net/vmw_vsock/vmci_transport.c | 2 ++ 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 23f55259b60d..3af0b224f754 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -180,6 +180,7 @@ void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); +void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); #endif /* __AF_VSOCK_H__ */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index e34d96f8bde2..17dbbe64cd73 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -344,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk) return ret; } +void vsock_remove_sock(struct vsock_sock *vsk) +{ + if (vsock_in_bound_table(vsk)) + vsock_remove_bound(vsk); + + if (vsock_in_connected_table(vsk)) + vsock_remove_connected(vsk); +} +EXPORT_SYMBOL_GPL(vsock_remove_sock); + void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) { int i; @@ -660,12 +670,6 @@ static void __vsock_release(struct sock *sk) vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ - if (vsock_in_bound_table(vsk)) - vsock_remove_bound(vsk); - - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); - transport->release(vsk); lock_sock(sk); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 4120b7a538be..4be4fbbc0b50 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1644,6 +1644,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk) static void vmci_transport_release(struct vsock_sock *vsk) { + vsock_remove_sock(vsk); + if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) { vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle); vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE; -- cgit v1.2.3 From 06a8fc78367d070720af960dcecec917d3ae5f3b Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 28 Jul 2016 15:36:32 +0100 Subject: VSOCK: Introduce virtio_vsock_common.ko This module contains the common code and header files for the following virtio_transporto and vhost_vsock kernel modules. Signed-off-by: Asias He Signed-off-by: Claudio Imbrenda Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 10 + include/linux/virtio_vsock.h | 154 ++++ include/net/af_vsock.h | 2 + .../trace/events/vsock_virtio_transport_common.h | 144 +++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_vsock.h | 94 ++ net/vmw_vsock/virtio_transport_common.c | 992 +++++++++++++++++++++ 8 files changed, 1398 insertions(+) create mode 100644 include/linux/virtio_vsock.h create mode 100644 include/trace/events/vsock_virtio_transport_common.h create mode 100644 include/uapi/linux/virtio_vsock.h create mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'include/net') diff --git a/MAINTAINERS b/MAINTAINERS index 8c20323d1277..b49ffb86ebf0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12138,6 +12138,16 @@ S: Maintained F: drivers/media/v4l2-core/videobuf2-* F: include/media/videobuf2-* +VIRTIO AND VHOST VSOCK DRIVER +M: Stefan Hajnoczi +L: kvm@vger.kernel.org +L: virtualization@lists.linux-foundation.org +L: netdev@vger.kernel.org +S: Maintained +F: include/linux/virtio_vsock.h +F: include/uapi/linux/virtio_vsock.h +F: net/vmw_vsock/virtio_transport_common.c + VIRTUAL SERIO DEVICE DRIVER M: Stephen Chandler Paul S: Maintained diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..9638bfeb0d1f --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,154 @@ +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +enum { + VSOCK_VQ_RX = 0, /* for host to guest data */ + VSOCK_VQ_TX = 1, /* for guest to host data */ + VSOCK_VQ_EVENT = 2, + VSOCK_VQ_MAX = 3, +}; + +/* Per-socket state (accessed via vsk->trans) */ +struct virtio_vsock_sock { + struct vsock_sock *vsk; + + /* Protected by lock_sock(sk_vsock(trans->vsk)) */ + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + /* Protected by tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + + /* Protected by rx_lock */ + u32 fwd_cnt; + u32 rx_bytes; + struct list_head rx_queue; +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; + bool reply; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + bool reply; +}; + +struct virtio_transport { + /* This must be the first field */ + struct vsock_transport transport; + + /* Takes ownership of the packet */ + int (*send_pkt)(struct virtio_vsock_pkt *pkt); +}; + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); + +#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 3af0b224f754..f2758964ce6f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -63,6 +63,8 @@ struct vsock_sock { struct list_head accept_queue; bool rejected; struct delayed_work dwork; + struct delayed_work close_work; + bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; diff --git a/include/trace/events/vsock_virtio_transport_common.h b/include/trace/events/vsock_virtio_transport_common.h new file mode 100644 index 000000000000..b7f1d6278280 --- /dev/null +++ b/include/trace/events/vsock_virtio_transport_common.h @@ -0,0 +1,144 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsock + +#if !defined(_TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H) || \ + defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H + +#include + +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM); + +#define show_type(val) \ + __print_symbolic(val, { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }) + +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RESPONSE); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RST); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_SHUTDOWN); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RW); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_UPDATE); +TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_REQUEST); + +#define show_op(val) \ + __print_symbolic(val, \ + { VIRTIO_VSOCK_OP_INVALID, "INVALID" }, \ + { VIRTIO_VSOCK_OP_REQUEST, "REQUEST" }, \ + { VIRTIO_VSOCK_OP_RESPONSE, "RESPONSE" }, \ + { VIRTIO_VSOCK_OP_RST, "RST" }, \ + { VIRTIO_VSOCK_OP_SHUTDOWN, "SHUTDOWN" }, \ + { VIRTIO_VSOCK_OP_RW, "RW" }, \ + { VIRTIO_VSOCK_OP_CREDIT_UPDATE, "CREDIT_UPDATE" }, \ + { VIRTIO_VSOCK_OP_CREDIT_REQUEST, "CREDIT_REQUEST" }) + +TRACE_EVENT(virtio_transport_alloc_pkt, + TP_PROTO( + __u32 src_cid, __u32 src_port, + __u32 dst_cid, __u32 dst_port, + __u32 len, + __u16 type, + __u16 op, + __u32 flags + ), + TP_ARGS( + src_cid, src_port, + dst_cid, dst_port, + len, + type, + op, + flags + ), + TP_STRUCT__entry( + __field(__u32, src_cid) + __field(__u32, src_port) + __field(__u32, dst_cid) + __field(__u32, dst_port) + __field(__u32, len) + __field(__u16, type) + __field(__u16, op) + __field(__u32, flags) + ), + TP_fast_assign( + __entry->src_cid = src_cid; + __entry->src_port = src_port; + __entry->dst_cid = dst_cid; + __entry->dst_port = dst_port; + __entry->len = len; + __entry->type = type; + __entry->op = op; + __entry->flags = flags; + ), + TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x", + __entry->src_cid, __entry->src_port, + __entry->dst_cid, __entry->dst_port, + __entry->len, + show_type(__entry->type), + show_op(__entry->op), + __entry->flags) +); + +TRACE_EVENT(virtio_transport_recv_pkt, + TP_PROTO( + __u32 src_cid, __u32 src_port, + __u32 dst_cid, __u32 dst_port, + __u32 len, + __u16 type, + __u16 op, + __u32 flags, + __u32 buf_alloc, + __u32 fwd_cnt + ), + TP_ARGS( + src_cid, src_port, + dst_cid, dst_port, + len, + type, + op, + flags, + buf_alloc, + fwd_cnt + ), + TP_STRUCT__entry( + __field(__u32, src_cid) + __field(__u32, src_port) + __field(__u32, dst_cid) + __field(__u32, dst_port) + __field(__u32, len) + __field(__u16, type) + __field(__u16, op) + __field(__u32, flags) + __field(__u32, buf_alloc) + __field(__u32, fwd_cnt) + ), + TP_fast_assign( + __entry->src_cid = src_cid; + __entry->src_port = src_port; + __entry->dst_cid = dst_cid; + __entry->dst_port = dst_port; + __entry->len = len; + __entry->type = type; + __entry->op = op; + __entry->flags = flags; + __entry->buf_alloc = buf_alloc; + __entry->fwd_cnt = fwd_cnt; + ), + TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x " + "buf_alloc=%u fwd_cnt=%u", + __entry->src_cid, __entry->src_port, + __entry->dst_cid, __entry->dst_port, + __entry->len, + show_type(__entry->type), + show_op(__entry->op), + __entry->flags, + __entry->buf_alloc, + __entry->fwd_cnt) +); + +#endif /* _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H */ + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE vsock_virtio_transport_common + +/* This part must be outside protection */ +#include diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ec10cfef166a..3cf0116d9c2b 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -453,6 +453,7 @@ header-y += virtio_ring.h header-y += virtio_rng.h header-y += virtio_scsi.h header-y += virtio_types.h +header-y += virtio_vsock.h header-y += vm_sockets.h header-y += vt.h header-y += wait.h diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 77925f587b15..3228d582234a 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -41,5 +41,6 @@ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ +#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h new file mode 100644 index 000000000000..6b011c19b50f --- /dev/null +++ b/include/uapi/linux/virtio_vsock.h @@ -0,0 +1,94 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H +#define _UAPI_LINUX_VIRTIO_VOSCK_H + +#include +#include +#include + +struct virtio_vsock_config { + __le64 guest_cid; +} __attribute__((packed)); + +enum virtio_vsock_event_id { + VIRTIO_VSOCK_EVENT_TRANSPORT_RESET = 0, +}; + +struct virtio_vsock_event { + __le32 id; +} __attribute__((packed)); + +struct virtio_vsock_hdr { + __le64 src_cid; + __le64 dst_cid; + __le32 src_port; + __le32 dst_port; + __le32 len; + __le16 type; /* enum virtio_vsock_type */ + __le16 op; /* enum virtio_vsock_op */ + __le32 flags; + __le32 buf_alloc; + __le32 fwd_cnt; +} __attribute__((packed)); + +enum virtio_vsock_type { + VIRTIO_VSOCK_TYPE_STREAM = 1, +}; + +enum virtio_vsock_op { + VIRTIO_VSOCK_OP_INVALID = 0, + + /* Connect operations */ + VIRTIO_VSOCK_OP_REQUEST = 1, + VIRTIO_VSOCK_OP_RESPONSE = 2, + VIRTIO_VSOCK_OP_RST = 3, + VIRTIO_VSOCK_OP_SHUTDOWN = 4, + + /* To send payload */ + VIRTIO_VSOCK_OP_RW = 5, + + /* Tell the peer our credit info */ + VIRTIO_VSOCK_OP_CREDIT_UPDATE = 6, + /* Request the peer to send the credit info to us */ + VIRTIO_VSOCK_OP_CREDIT_REQUEST = 7, +}; + +/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ +enum virtio_vsock_shutdown { + VIRTIO_VSOCK_SHUTDOWN_RCV = 1, + VIRTIO_VSOCK_SHUTDOWN_SEND = 2, +}; + +#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c new file mode 100644 index 000000000000..a53b3a16b4f1 --- /dev/null +++ b/net/vmw_vsock/virtio_transport_common.c @@ -0,0 +1,992 @@ +/* + * common code for virtio vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* How long to wait for graceful shutdown of a connection */ +#define VSOCK_CLOSE_TIMEOUT (8 * HZ) + +static const struct virtio_transport *virtio_transport_get_ops(void) +{ + const struct vsock_transport *t = vsock_core_get_transport(); + + return container_of(t, struct virtio_transport, transport); +} + +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_vsock_pkt *pkt; + int err; + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + pkt->hdr.type = cpu_to_le16(info->type); + pkt->hdr.op = cpu_to_le16(info->op); + pkt->hdr.src_cid = cpu_to_le64(src_cid); + pkt->hdr.dst_cid = cpu_to_le64(dst_cid); + pkt->hdr.src_port = cpu_to_le32(src_port); + pkt->hdr.dst_port = cpu_to_le32(dst_port); + pkt->hdr.flags = cpu_to_le32(info->flags); + pkt->len = len; + pkt->hdr.len = cpu_to_le32(len); + pkt->reply = info->reply; + + if (info->msg && len > 0) { + pkt->buf = kmalloc(len, GFP_KERNEL); + if (!pkt->buf) + goto out_pkt; + err = memcpy_from_msg(pkt->buf, info->msg, len); + if (err) + goto out; + } + + trace_virtio_transport_alloc_pkt(src_cid, src_port, + dst_cid, dst_port, + len, + info->type, + info->op, + info->flags); + + return pkt; + +out: + kfree(pkt->buf); +out_pkt: + kfree(pkt); + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); + +static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info) +{ + u32 src_cid, src_port, dst_cid, dst_port; + struct virtio_vsock_sock *vvs; + struct virtio_vsock_pkt *pkt; + u32 pkt_len = info->pkt_len; + + src_cid = vm_sockets_get_local_cid(); + src_port = vsk->local_addr.svm_port; + if (!info->remote_cid) { + dst_cid = vsk->remote_addr.svm_cid; + dst_port = vsk->remote_addr.svm_port; + } else { + dst_cid = info->remote_cid; + dst_port = info->remote_port; + } + + vvs = vsk->trans; + + /* we can send less than pkt_len bytes */ + if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) + pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + + /* virtio_transport_get_credit might return less than pkt_len credit */ + pkt_len = virtio_transport_get_credit(vvs, pkt_len); + + /* Do not send zero length OP_RW pkt */ + if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) + return pkt_len; + + pkt = virtio_transport_alloc_pkt(info, pkt_len, + src_cid, src_port, + dst_cid, dst_port); + if (!pkt) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + + virtio_transport_inc_tx_pkt(vvs, pkt); + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes += pkt->len; +} + +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes -= pkt->len; + vvs->fwd_cnt += pkt->len; +} + +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) +{ + spin_lock_bh(&vvs->tx_lock); + pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); + +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + u32 ret; + + spin_lock_bh(&vvs->tx_lock); + ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (ret > credit) + ret = credit; + vvs->tx_cnt += ret; + spin_unlock_bh(&vvs->tx_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_credit); + +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + spin_lock_bh(&vvs->tx_lock); + vvs->tx_cnt -= credit; + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_put_credit); + +static int virtio_transport_send_credit_update(struct vsock_sock *vsk, + int type, + struct virtio_vsock_hdr *hdr) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, + .type = type, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +static ssize_t +virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + while (total < len && !list_empty(&vvs->rx_queue)) { + pkt = list_first_entry(&vvs->rx_queue, + struct virtio_vsock_pkt, list); + + bytes = len - total; + if (bytes > pkt->len - pkt->off) + bytes = pkt->len - pkt->off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + pkt->off += bytes; + if (pkt->off == pkt->len) { + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + } + spin_unlock_bh(&vvs->rx_lock); + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); + + return total; + +out: + if (total) + err = total; + return err; +} + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_stream_do_dequeue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); + +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->rx_lock); + bytes = vvs->rx_bytes; + spin_unlock_bh(&vvs->rx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); + +static s64 virtio_transport_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (bytes < 0) + bytes = 0; + + return bytes; +} + +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->tx_lock); + bytes = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + struct virtio_vsock_sock *vvs; + + vvs = kzalloc(sizeof(*vvs), GFP_KERNEL); + if (!vvs) + return -ENOMEM; + + vsk->trans = vvs; + vvs->vsk = vsk; + if (psk) { + struct virtio_vsock_sock *ptrans = psk->trans; + + vvs->buf_size = ptrans->buf_size; + vvs->buf_size_min = ptrans->buf_size_min; + vvs->buf_size_max = ptrans->buf_size_max; + vvs->peer_buf_alloc = ptrans->peer_buf_alloc; + } else { + vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; + vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; + vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; + } + + vvs->buf_alloc = vvs->buf_size; + + spin_lock_init(&vvs->rx_lock); + spin_lock_init(&vvs->tx_lock); + INIT_LIST_HEAD(&vvs->rx_queue); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); + +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); + +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_min; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); + +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_max; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size_min) + vvs->buf_size_min = val; + if (val > vvs->buf_size_max) + vvs->buf_size_max = val; + vvs->buf_size = val; + vvs->buf_alloc = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); + +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val > vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_min = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); + +void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_max = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); + +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + if (vsock_stream_has_data(vsk)) + *data_ready_now = true; + else + *data_ready_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); + +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_avail_now) +{ + s64 free_space; + + free_space = vsock_stream_has_space(vsk); + if (free_space > 0) + *space_avail_now = true; + else if (free_space == 0) + *space_avail_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); + +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); + +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); + +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); + +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); + +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); + +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); + +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); + +bool virtio_transport_stream_is_active(struct vsock_sock *vsk) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); + +bool virtio_transport_stream_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); + +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); + +bool virtio_transport_dgram_allow(u32 cid, u32 port) +{ + return false; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); + +int virtio_transport_connect(struct vsock_sock *vsk) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_REQUEST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_connect); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_SHUTDOWN, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = (mode & RCV_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | + (mode & SEND_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_shutdown); + +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t dgram_len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .msg = msg, + .pkt_len = len, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); + +void virtio_transport_destruct(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + kfree(vvs); +} +EXPORT_SYMBOL_GPL(virtio_transport_destruct); + +static int virtio_transport_reset(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .reply = !!pkt, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Normally packets are associated with a socket. There may be no socket if an + * attempt was made to connect to a socket that does not exist. + */ +static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = le16_to_cpu(pkt->hdr.type), + .reply = true, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + pkt = virtio_transport_alloc_pkt(&info, 0, + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port), + le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + if (!pkt) + return -ENOMEM; + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, + sock_flag(sk, SOCK_DONE))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); + } +} + +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + sk->sk_state_change(sk); + + if (vsk->close_work_scheduled && + (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { + vsk->close_work_scheduled = false; + + vsock_remove_sock(vsk); + + /* Release refcnt obtained when we scheduled the timeout */ + sock_put(sk); + } +} + +static void virtio_transport_close_timeout(struct work_struct *work) +{ + struct vsock_sock *vsk = + container_of(work, struct vsock_sock, close_work.work); + struct sock *sk = sk_vsock(vsk); + + sock_hold(sk); + lock_sock(sk); + + if (!sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + + virtio_transport_do_close(vsk, false); + } + + vsk->close_work_scheduled = false; + + release_sock(sk); + sock_put(sk); +} + +/* User context, vsk->sk is locked */ +static bool virtio_transport_close(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + + if (!(sk->sk_state == SS_CONNECTED || + sk->sk_state == SS_DISCONNECTING)) + return true; + + /* Already received SHUTDOWN from peer, reply with RST */ + if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) { + (void)virtio_transport_reset(vsk, NULL); + return true; + } + + if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) + (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); + + if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) + virtio_transport_wait_close(sk, sk->sk_lingertime); + + if (sock_flag(sk, SOCK_DONE)) { + return true; + } + + sock_hold(sk); + INIT_DELAYED_WORK(&vsk->close_work, + virtio_transport_close_timeout); + vsk->close_work_scheduled = true; + schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT); + return false; +} + +void virtio_transport_release(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + bool remove_sock = true; + + lock_sock(sk); + if (sk->sk_type == SOCK_STREAM) + remove_sock = virtio_transport_close(vsk); + release_sock(sk); + + if (remove_sock) + vsock_remove_sock(vsk); +} +EXPORT_SYMBOL_GPL(virtio_transport_release); + +static int +virtio_transport_recv_connecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + int err; + int skerr; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_INVALID: + break; + case VIRTIO_VSOCK_OP_RST: + skerr = ECONNRESET; + err = 0; + goto destroy; + default: + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + return 0; + +destroy: + virtio_transport_reset(vsk, pkt); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int +virtio_transport_recv_connected(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + int err = 0; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RW: + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + + spin_lock_bh(&vvs->rx_lock); + virtio_transport_inc_rx_pkt(vvs, pkt); + list_add_tail(&pkt->list, &vvs->rx_queue); + spin_unlock_bh(&vvs->rx_lock); + + sk->sk_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (vsk->peer_shutdown == SHUTDOWN_MASK && + vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: + virtio_transport_do_close(vsk, true); + break; + default: + err = -EINVAL; + break; + } + + virtio_transport_free_pkt(pkt); + return err; +} + +static void +virtio_transport_recv_disconnecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + virtio_transport_do_close(vsk, true); +} + +static int +virtio_transport_send_response(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .remote_cid = le32_to_cpu(pkt->hdr.src_cid), + .remote_port = le32_to_cpu(pkt->hdr.src_port), + .reply = true, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Handle server socket */ +static int +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vchild; + struct sock *child; + + if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset(vsk, pkt); + return -EINVAL; + } + + if (sk_acceptq_is_full(sk)) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type, 0); + if (!child) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + sk->sk_ack_backlog++; + + lock_sock_nested(child, SINGLE_DEPTH_NESTING); + + child->sk_state = SS_CONNECTED; + + vchild = vsock_sk(child); + vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + + vsock_insert_connected(vchild); + vsock_enqueue_accept(sk, child); + virtio_transport_send_response(vchild, pkt); + + release_sock(child); + + sk->sk_data_ready(sk); + return 0; +} + +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + +/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex + * lock. + */ +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +{ + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + bool space_available; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + + trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, + dst.svm_cid, dst.svm_port, + le32_to_cpu(pkt->hdr.len), + le16_to_cpu(pkt->hdr.type), + le16_to_cpu(pkt->hdr.op), + le32_to_cpu(pkt->hdr.flags), + le32_to_cpu(pkt->hdr.buf_alloc), + le32_to_cpu(pkt->hdr.fwd_cnt)); + + if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + + /* The socket must be in connected or bound table + * otherwise send reset back + */ + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + } + + vsk = vsock_sk(sk); + + space_available = virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + + /* Update CID in case it has changed after a transport reset event */ + vsk->local_addr.svm_cid = dst.svm_cid; + + if (space_available) + sk->sk_write_space(sk); + + switch (sk->sk_state) { + case VSOCK_SS_LISTEN: + virtio_transport_recv_listen(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTING: + virtio_transport_recv_connecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTED: + virtio_transport_recv_connected(sk, pkt); + break; + case SS_DISCONNECTING: + virtio_transport_recv_disconnecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of the + * bound or connected list. + */ + sock_put(sk); + return; + +free_pkt: + virtio_transport_free_pkt(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +{ + kfree(pkt->buf); + kfree(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("common code for virtio vsock"); -- cgit v1.2.3 From bd721ea73e1f965569b40620538c942001f76294 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 2 Aug 2016 14:03:33 -0700 Subject: treewide: replace obsolete _refok by __ref There was only one use of __initdata_refok and __exit_refok __init_refok was used 46 times against 82 for __ref. Those definitions are obsolete since commit 312b1485fb50 ("Introduce new section reference annotations tags: __ref, __refdata, __refconst") This patch removes the following compatibility definitions and replaces them treewide. /* compatibility defines */ #define __init_refok __ref #define __initdata_refok __refdata #define __exit_refok __ref I can also provide separate patches if necessary. (One patch per tree and check in 1 month or 2 to remove old definitions) [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1466796271-3043-1-git-send-email-fabf@skynet.be Signed-off-by: Fabian Frederick Cc: Ingo Molnar Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/machvec_impl.h | 2 +- arch/arc/mm/init.c | 2 +- arch/arm/mach-integrator/impd1.c | 4 ++-- arch/arm/mach-mv78xx0/common.c | 2 +- arch/blackfin/mm/init.c | 2 +- arch/hexagon/mm/init.c | 2 +- arch/ia64/kernel/mca.c | 2 +- arch/microblaze/mm/init.c | 4 ++-- arch/microblaze/mm/pgtable.c | 2 +- arch/mips/mm/init.c | 2 +- arch/mips/txx9/generic/pci.c | 2 +- arch/nios2/mm/init.c | 2 +- arch/openrisc/mm/ioremap.c | 4 ++-- arch/powerpc/lib/alloc.c | 2 +- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/platforms/powermac/setup.c | 4 ++-- arch/powerpc/platforms/ps3/device-init.c | 2 +- arch/powerpc/sysdev/msi_bitmap.c | 2 +- arch/score/mm/init.c | 2 +- arch/sh/drivers/pci/pci.c | 4 ++-- arch/sh/mm/ioremap.c | 2 +- arch/x86/mm/init.c | 4 ++-- arch/x86/platform/efi/early_printk.c | 4 ++-- drivers/acpi/osl.c | 5 ++--- drivers/base/node.c | 2 +- drivers/clk/clkdev.c | 4 ++-- drivers/pci/xen-pcifront.c | 2 +- drivers/video/logo/logo.c | 4 ++-- include/acpi/acpi_io.h | 2 +- include/linux/init.h | 6 ------ include/net/net_namespace.h | 2 +- init/main.c | 2 +- mm/page_alloc.c | 4 ++-- mm/slab.c | 2 +- mm/sparse-vmemmap.c | 2 +- mm/sparse.c | 2 +- 36 files changed, 46 insertions(+), 53 deletions(-) (limited to 'include/net') diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h index f54bdf658cd0..d3398f6ab74c 100644 --- a/arch/alpha/kernel/machvec_impl.h +++ b/arch/alpha/kernel/machvec_impl.h @@ -137,7 +137,7 @@ #define __initmv __initdata #define ALIAS_MV(x) #else -#define __initmv __initdata_refok +#define __initmv __refdata /* GCC actually has a syntax for defining aliases, but is under some delusion that you shouldn't be able to declare it extern somewhere diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 8be930394750..399e2f223d25 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -220,7 +220,7 @@ void __init mem_init(void) /* * free_initmem: Free all the __init memory. */ -void __init_refok free_initmem(void) +void __ref free_initmem(void) { free_initmem_default(-1); } diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c index 38b0da300dd5..ed9a01484030 100644 --- a/arch/arm/mach-integrator/impd1.c +++ b/arch/arm/mach-integrator/impd1.c @@ -320,11 +320,11 @@ static struct impd1_device impd1_devs[] = { #define IMPD1_VALID_IRQS 0x00000bffU /* - * As this module is bool, it is OK to have this as __init_refok() - no + * As this module is bool, it is OK to have this as __ref() - no * probe calls will be done after the initial system bootup, as devices * are discovered as part of the machine startup. */ -static int __init_refok impd1_probe(struct lm_device *dev) +static int __ref impd1_probe(struct lm_device *dev) { struct impd1_module *impd1; int irq_base; diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index 45a05207b418..6af5430d0d97 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -343,7 +343,7 @@ void __init mv78xx0_init_early(void) DDR_WINDOW_CPU1_BASE, DDR_WINDOW_CPU_SZ); } -void __init_refok mv78xx0_timer_init(void) +void __ref mv78xx0_timer_init(void) { orion_time_init(BRIDGE_VIRT_BASE, BRIDGE_INT_TIMER1_CLR, IRQ_MV78XX0_TIMER_1, get_tclk()); diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index 166842de3dc7..b59cd7c3261a 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -112,7 +112,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init_refok free_initmem(void) +void __ref free_initmem(void) { #if defined CONFIG_RAMKERNEL && !defined CONFIG_MPU free_initmem_default(-1); diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 88977e42af0a..192584d5ac2f 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -93,7 +93,7 @@ void __init mem_init(void) * Todo: free pages between __init_begin and __init_end; possibly * some devtree related stuff as well. */ -void __init_refok free_initmem(void) +void __ref free_initmem(void) { } diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 07a4e32ae96a..eb9220cde76c 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, } /* Caller prevents this from being called after init */ -static void * __init_refok mca_bootmem(void) +static void * __ref mca_bootmem(void) { return __alloc_bootmem(sizeof(struct ia64_mca_cpu), KERNEL_STACK_SIZE, 0); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 77bc7c7e6522..434639f9a3a6 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -414,7 +414,7 @@ void __init *early_get_page(void) #endif /* CONFIG_MMU */ -void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask) +void * __ref alloc_maybe_bootmem(size_t size, gfp_t mask) { if (mem_init_done) return kmalloc(size, mask); @@ -422,7 +422,7 @@ void * __init_refok alloc_maybe_bootmem(size_t size, gfp_t mask) return alloc_bootmem(size); } -void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask) +void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index eb99fcc76088..cc732fe357ad 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -234,7 +234,7 @@ unsigned long iopa(unsigned long addr) return pa; } -__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 9b58eb5fd0d5..a5509e7dcad2 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -504,7 +504,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void (*free_init_pages_eva)(void *begin, void *end) = NULL; -void __init_refok free_initmem(void) +void __ref free_initmem(void) { prom_free_prom_memory(); /* diff --git a/arch/mips/txx9/generic/pci.c b/arch/mips/txx9/generic/pci.c index a77698ff2b6f..1f6bc9a3036c 100644 --- a/arch/mips/txx9/generic/pci.c +++ b/arch/mips/txx9/generic/pci.c @@ -268,7 +268,7 @@ static int txx9_i8259_irq_setup(int irq) return err; } -static void __init_refok quirk_slc90e66_bridge(struct pci_dev *dev) +static void __ref quirk_slc90e66_bridge(struct pci_dev *dev) { int irq; /* PCI/ISA Bridge interrupt */ u8 reg_64; diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index e75c75d249d6..c92fe4234009 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -89,7 +89,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init_refok free_initmem(void) +void __ref free_initmem(void) { free_initmem_default(-1); } diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index 5b2a95116e8f..fa60b81aee3e 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -38,7 +38,7 @@ static unsigned int fixmaps_used __initdata; * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -void __iomem *__init_refok +void __iomem *__ref __ioremap(phys_addr_t addr, unsigned long size, pgprot_t prot) { phys_addr_t p; @@ -116,7 +116,7 @@ void iounmap(void *addr) * the memblock infrastructure. */ -pte_t __init_refok *pte_alloc_one_kernel(struct mm_struct *mm, +pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c index 60b0b3fc8fc1..a58abe4afbd1 100644 --- a/arch/powerpc/lib/alloc.c +++ b/arch/powerpc/lib/alloc.c @@ -6,7 +6,7 @@ #include -void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask) +void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 7f922f557936..0ae0572bc239 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -79,7 +79,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 3de4a7c85140..6b4e9d181126 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -353,12 +353,12 @@ static int pmac_late_init(void) machine_late_initcall(powermac, pmac_late_init); /* - * This is __init_refok because we check for "initializing" before + * This is __ref because we check for "initializing" before * touching any of the __init sensitive things and "initializing" * will be false after __init time. This can't be __init because it * can be called whenever a disk is first accessed. */ -void __init_refok note_bootable_part(dev_t dev, int part, int goodness) +void __ref note_bootable_part(dev_t dev, int part, int goodness) { char *p; diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index 3f175e8aedb4..57caaf11a83f 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -189,7 +189,7 @@ fail_malloc: return result; } -static int __init_refok ps3_setup_uhc_device( +static int __ref ps3_setup_uhc_device( const struct ps3_repository_device *repo, enum ps3_match_id match_id, enum ps3_interrupt_type interrupt_type, enum ps3_reg_type reg_type) { diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index ed5234ed8d3f..5ebd3f018295 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -112,7 +112,7 @@ int msi_bitmap_reserve_dt_hwirqs(struct msi_bitmap *bmp) return 0; } -int __init_refok msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, +int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, struct device_node *of_node) { int size; diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index 9fbce49ad3bd..444c26c0f750 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -91,7 +91,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init_refok free_initmem(void) +void __ref free_initmem(void) { free_initmem_default(POISON_FREE_INITMEM); } diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index d5462b7bc514..84563e39a5b8 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -221,7 +221,7 @@ pcibios_bus_report_status_early(struct pci_channel *hose, * We can't use pci_find_device() here since we are * called from interrupt context. */ -static void __init_refok +static void __ref pcibios_bus_report_status(struct pci_bus *bus, unsigned int status_mask, int warn) { @@ -256,7 +256,7 @@ pcibios_bus_report_status(struct pci_bus *bus, unsigned int status_mask, pcibios_bus_report_status(dev->subordinate, status_mask, warn); } -void __init_refok pcibios_report_status(unsigned int status_mask, int warn) +void __ref pcibios_report_status(unsigned int status_mask, int warn) { struct pci_channel *hose; diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index 0c99ec2e7ed8..d09ddfe58fd8 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -34,7 +34,7 @@ * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -void __iomem * __init_refok +void __iomem * __ref __ioremap_caller(phys_addr_t phys_addr, unsigned long size, pgprot_t pgprot, void *caller) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fb4c1b42fc7e..620928903be3 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -208,7 +208,7 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, * adjust the page_size_mask for small range to go with * big page size instead small one if nearby are ram too. */ -static void __init_refok adjust_range_page_size_mask(struct map_range *mr, +static void __ref adjust_range_page_size_mask(struct map_range *mr, int nr_range) { int i; @@ -396,7 +396,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ -unsigned long __init_refok init_memory_mapping(unsigned long start, +unsigned long __ref init_memory_mapping(unsigned long start, unsigned long end) { struct map_range mr[NR_RANGE_MR]; diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 524142117296..5fdacb322ceb 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -44,7 +44,7 @@ early_initcall(early_efi_map_fb); * In case earlyprintk=efi,keep we have the whole framebuffer mapped already * so just return the offset efi_fb + start. */ -static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +static __ref void *early_efi_map(unsigned long start, unsigned long len) { unsigned long base; @@ -56,7 +56,7 @@ static __init_refok void *early_efi_map(unsigned long start, unsigned long len) return early_ioremap(base + start, len); } -static __init_refok void early_efi_unmap(void *addr, unsigned long len) +static __ref void early_efi_unmap(void *addr, unsigned long len) { if (!efi_fb) early_iounmap(addr, len); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index b108f1358a32..4305ee9db4b2 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -309,7 +309,7 @@ static void acpi_unmap(acpi_physical_address pg_off, void __iomem *vaddr) * During early init (when acpi_gbl_permanent_mmap has not been set yet) this * routine simply calls __acpi_map_table() to get the job done. */ -void __iomem *__init_refok +void __iomem *__ref acpi_os_map_iomem(acpi_physical_address phys, acpi_size size) { struct acpi_ioremap *map; @@ -362,8 +362,7 @@ out: } EXPORT_SYMBOL_GPL(acpi_os_map_iomem); -void *__init_refok -acpi_os_map_memory(acpi_physical_address phys, acpi_size size) +void *__ref acpi_os_map_memory(acpi_physical_address phys, acpi_size size) { return (void *)acpi_os_map_iomem(phys, size); } diff --git a/drivers/base/node.c b/drivers/base/node.c index 29cd96661b30..5548f9686016 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -370,7 +370,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #define page_initialized(page) (page->lru.next) -static int __init_refok get_nid_for_pfn(unsigned long pfn) +static int __ref get_nid_for_pfn(unsigned long pfn) { struct page *page; diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index 89cc700fbc37..97ae60fa1584 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -250,7 +250,7 @@ struct clk_lookup_alloc { char con_id[MAX_CON_ID]; }; -static struct clk_lookup * __init_refok +static struct clk_lookup * __ref vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt, va_list ap) { @@ -287,7 +287,7 @@ vclkdev_create(struct clk_hw *hw, const char *con_id, const char *dev_fmt, return cl; } -struct clk_lookup * __init_refok +struct clk_lookup * __ref clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...) { struct clk_lookup *cl; diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 5f70fee59a94..d6ff5e82377d 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -1086,7 +1086,7 @@ out: return err; } -static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev, +static void __ref pcifront_backend_changed(struct xenbus_device *xdev, enum xenbus_state be_state) { struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev); diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index 10fbfd8ab963..b6bc4a0bda2a 100644 --- a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -36,11 +36,11 @@ static int __init fb_logo_late_init(void) late_initcall(fb_logo_late_init); -/* logo's are marked __initdata. Use __init_refok to tell +/* logo's are marked __initdata. Use __ref to tell * modpost that it is intended that this function uses data * marked __initdata. */ -const struct linux_logo * __init_refok fb_find_logo(int depth) +const struct linux_logo * __ref fb_find_logo(int depth) { const struct linux_logo *logo = NULL; diff --git a/include/acpi/acpi_io.h b/include/acpi/acpi_io.h index dd86c5fc102d..d7d0f495a34e 100644 --- a/include/acpi/acpi_io.h +++ b/include/acpi/acpi_io.h @@ -13,7 +13,7 @@ static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, } #endif -void __iomem *__init_refok +void __iomem *__ref acpi_os_map_iomem(acpi_physical_address phys, acpi_size size); void __ref acpi_os_unmap_iomem(void __iomem *virt, acpi_size size); void __iomem *acpi_os_get_iomem(acpi_physical_address phys, unsigned int size); diff --git a/include/linux/init.h b/include/linux/init.h index aedb254abc37..6935d02474aa 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -77,12 +77,6 @@ #define __refdata __section(.ref.data) #define __refconst __constsection(.ref.rodata) -/* compatibility defines */ -#define __init_refok __ref -#define __initdata_refok __refdata -#define __exit_refok __ref - - #ifdef MODULE #define __exitused #else diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 4089abc6e9c0..0933c7455a30 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -275,7 +275,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #define __net_initconst #else #define __net_init __init -#define __net_exit __exit_refok +#define __net_exit __ref #define __net_initdata __initdata #define __net_initconst __initconst #endif diff --git a/init/main.c b/init/main.c index eae02aa03c9e..e7345dcaaf05 100644 --- a/init/main.c +++ b/init/main.c @@ -380,7 +380,7 @@ static void __init setup_command_line(char *command_line) static __initdata DECLARE_COMPLETION(kthreadd_done); -static noinline void __init_refok rest_init(void) +static noinline void __ref rest_init(void) { int pid; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea759b935360..39a372a2a1d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5276,7 +5276,7 @@ void __init setup_per_cpu_pageset(void) setup_zone_pageset(zone); } -static noinline __init_refok +static noinline __ref int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -5903,7 +5903,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } -static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; diff --git a/mm/slab.c b/mm/slab.c index ca135bd47c35..261147ba156f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1877,7 +1877,7 @@ static struct array_cache __percpu *alloc_kmem_cache_cpus( return cpu_cache; } -static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) +static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { if (slab_state >= FULL) return enable_cpucache(cachep, gfp); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 68885dcbaf40..574c67b663fe 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -36,7 +36,7 @@ * Uses the main allocators if they are available, else bootmem. */ -static void * __init_refok __earlyonly_bootmem_alloc(int node, +static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long size, unsigned long align, unsigned long goal) diff --git a/mm/sparse.c b/mm/sparse.c index 36d7bbb80e49..1e168bf2779a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -59,7 +59,7 @@ static inline void set_section_nid(unsigned long section_nr, int nid) #endif #ifdef CONFIG_SPARSEMEM_EXTREME -static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) +static noinline struct mem_section __ref *sparse_index_alloc(int nid) { struct mem_section *section = NULL; unsigned long array_size = SECTIONS_PER_ROOT * -- cgit v1.2.3 From 2439ca0402091badb24415e1b073ba12b34ba423 Mon Sep 17 00:00:00 2001 From: Maxim Altshul Date: Thu, 4 Aug 2016 15:43:04 +0300 Subject: mac80211: Add ieee80211_hw pointer to get_expected_throughput The variable is added to allow the driver an easy access to it's own hw->priv when the op is invoked. This fixes a crash in wlcore because it was relying on a station pointer that wasn't initialized yet. It's the wrong way to fix the crash, but it solves the problem for now and it does make sense to have the hw pointer here. Signed-off-by: Maxim Altshul [rewrite commit message, fix indentation] Signed-off-by: Johannes Berg --- drivers/net/wireless/ti/wlcore/main.c | 5 +++-- include/net/mac80211.h | 3 ++- net/mac80211/driver-ops.h | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 1d689169da76..9e1f2d9c9865 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -5700,10 +5700,11 @@ out: mutex_unlock(&wl->mutex); } -static u32 wlcore_op_get_expected_throughput(struct ieee80211_sta *sta) +static u32 wlcore_op_get_expected_throughput(struct ieee80211_hw *hw, + struct ieee80211_sta *sta) { struct wl1271_station *wl_sta = (struct wl1271_station *)sta->drv_priv; - struct wl1271 *wl = wl_sta->wl; + struct wl1271 *wl = hw->priv; u8 hlid = wl_sta->hlid; /* return in units of Kbps */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b4faadbb4e01..cca510a585c3 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3620,7 +3620,8 @@ struct ieee80211_ops { int (*join_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void (*leave_ibss)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - u32 (*get_expected_throughput)(struct ieee80211_sta *sta); + u32 (*get_expected_throughput)(struct ieee80211_hw *hw, + struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int *dbm); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 184473c257eb..ba5fc1f01e53 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1094,7 +1094,7 @@ static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, trace_drv_get_expected_throughput(sta); if (local->ops->get_expected_throughput) - ret = local->ops->get_expected_throughput(sta); + ret = local->ops->get_expected_throughput(&local->hw, sta); trace_drv_return_u32(local, ret); return ret; -- cgit v1.2.3 From 372ee16386bbf6dc5eeb0387e1ede963debba82a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Aug 2016 14:11:40 +0100 Subject: rxrpc: Fix races between skb free, ACK generation and replying Inside the kafs filesystem it is possible to occasionally have a call processed and terminated before we've had a chance to check whether we need to clean up the rx queue for that call because afs_send_simple_reply() ends the call when it is done, but this is done in a workqueue item that might happen to run to completion before afs_deliver_to_call() completes. Further, it is possible for rxrpc_kernel_send_data() to be called to send a reply before the last request-phase data skb is released. The rxrpc skb destructor is where the ACK processing is done and the call state is advanced upon release of the last skb. ACK generation is also deferred to a work item because it's possible that the skb destructor is not called in a context where kernel_sendmsg() can be invoked. To this end, the following changes are made: (1) kernel_rxrpc_data_consumed() is added. This should be called whenever an skb is emptied so as to crank the ACK and call states. This does not release the skb, however. kernel_rxrpc_free_skb() must now be called to achieve that. These together replace rxrpc_kernel_data_delivered(). (2) kernel_rxrpc_data_consumed() is wrapped by afs_data_consumed(). This makes afs_deliver_to_call() easier to work as the skb can simply be discarded unconditionally here without trying to work out what the return value of the ->deliver() function means. The ->deliver() functions can, via afs_data_complete(), afs_transfer_reply() and afs_extract_data() mark that an skb has been consumed (thereby cranking the state) without the need to conditionally free the skb to make sure the state is correct on an incoming call for when the call processor tries to send the reply. (3) rxrpc_recvmsg() now has to call kernel_rxrpc_data_consumed() when it has finished with a packet and MSG_PEEK isn't set. (4) rxrpc_packet_destructor() no longer calls rxrpc_hard_ACK_data(). Because of this, we no longer need to clear the destructor and put the call before we free the skb in cases where we don't want the ACK/call state to be cranked. (5) The ->deliver() call-type callbacks are made to return -EAGAIN rather than 0 if they expect more data (afs_extract_data() returns -EAGAIN to the delivery function already), and the caller is now responsible for producing an abort if that was the last packet. (6) There are many bits of unmarshalling code where: ret = afs_extract_data(call, skb, last, ...); switch (ret) { case 0: break; case -EAGAIN: return 0; default: return ret; } is to be found. As -EAGAIN can now be passed back to the caller, we now just return if ret < 0: ret = afs_extract_data(call, skb, last, ...); if (ret < 0) return ret; (7) Checks for trailing data and empty final data packets has been consolidated as afs_data_complete(). So: if (skb->len > 0) return -EBADMSG; if (!last) return 0; becomes: ret = afs_data_complete(call, skb, last); if (ret < 0) return ret; (8) afs_transfer_reply() now checks the amount of data it has against the amount of data desired and the amount of data in the skb and returns an error to induce an abort if we don't get exactly what we want. Without these changes, the following oops can occasionally be observed, particularly if some printks are inserted into the delivery path: general protection fault: 0000 [#1] SMP Modules linked in: kafs(E) af_rxrpc(E) [last unloaded: af_rxrpc] CPU: 0 PID: 1305 Comm: kworker/u8:3 Tainted: G E 4.7.0-fsdevel+ #1303 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Workqueue: kafsd afs_async_workfn [kafs] task: ffff88040be041c0 ti: ffff88040c070000 task.ti: ffff88040c070000 RIP: 0010:[] [] __lock_acquire+0xcf/0x15a1 RSP: 0018:ffff88040c073bc0 EFLAGS: 00010002 RAX: 6b6b6b6b6b6b6b6b RBX: 0000000000000000 RCX: ffff88040d29a710 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88040d29a710 RBP: ffff88040c073c70 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff88040be041c0 R15: ffffffff814c928f FS: 0000000000000000(0000) GS:ffff88041fa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa4595f4750 CR3: 0000000001c14000 CR4: 00000000001406f0 Stack: 0000000000000006 000000000be04930 0000000000000000 ffff880400000000 ffff880400000000 ffffffff8108f847 ffff88040be041c0 ffffffff81050446 ffff8803fc08a920 ffff8803fc08a958 ffff88040be041c0 ffff88040c073c38 Call Trace: [] ? mark_held_locks+0x5e/0x74 [] ? __local_bh_enable_ip+0x9b/0xa1 [] ? trace_hardirqs_on_caller+0x16d/0x189 [] lock_acquire+0x122/0x1b6 [] ? lock_acquire+0x122/0x1b6 [] ? skb_dequeue+0x18/0x61 [] _raw_spin_lock_irqsave+0x35/0x49 [] ? skb_dequeue+0x18/0x61 [] skb_dequeue+0x18/0x61 [] afs_deliver_to_call+0x344/0x39d [kafs] [] afs_process_async_call+0x4c/0xd5 [kafs] [] afs_async_workfn+0xe/0x10 [kafs] [] process_one_work+0x29d/0x57c [] worker_thread+0x24a/0x385 [] ? rescuer_thread+0x2d0/0x2d0 [] kthread+0xf3/0xfb [] ret_from_fork+0x1f/0x40 [] ? kthread_create_on_node+0x1cf/0x1cf Signed-off-by: David Howells Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.txt | 21 ++-- fs/afs/cmservice.c | 78 ++++++------- fs/afs/fsclient.c | 221 ++++++++++++------------------------- fs/afs/internal.h | 14 ++- fs/afs/rxrpc.c | 73 +++++++----- fs/afs/vlclient.c | 11 +- include/net/af_rxrpc.h | 2 +- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_accept.c | 1 + net/rxrpc/call_event.c | 3 + net/rxrpc/call_object.c | 8 +- net/rxrpc/input.c | 12 +- net/rxrpc/recvmsg.c | 25 +---- net/rxrpc/skbuff.c | 41 +++++-- 14 files changed, 225 insertions(+), 286 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 16a924c486bf..70c926ae212d 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -790,13 +790,12 @@ The kernel interface functions are as follows: Data messages can have their contents extracted with the usual bunch of socket buffer manipulation functions. A data message can be determined to be the last one in a sequence with rxrpc_kernel_is_data_last(). When a - data message has been used up, rxrpc_kernel_data_delivered() should be - called on it.. + data message has been used up, rxrpc_kernel_data_consumed() should be + called on it. - Non-data messages should be handled to rxrpc_kernel_free_skb() to dispose - of. It is possible to get extra refs on all types of message for later - freeing, but this may pin the state of a call until the message is finally - freed. + Messages should be handled to rxrpc_kernel_free_skb() to dispose of. It + is possible to get extra refs on all types of message for later freeing, + but this may pin the state of a call until the message is finally freed. (*) Accept an incoming call. @@ -821,12 +820,14 @@ The kernel interface functions are as follows: Other errors may be returned if the call had been aborted (-ECONNABORTED) or had timed out (-ETIME). - (*) Record the delivery of a data message and free it. + (*) Record the delivery of a data message. - void rxrpc_kernel_data_delivered(struct sk_buff *skb); + void rxrpc_kernel_data_consumed(struct rxrpc_call *call, + struct sk_buff *skb); - This is used to record a data message as having been delivered and to - update the ACK state for the call. The socket buffer will be freed. + This is used to record a data message as having been consumed and to + update the ACK state for the call. The message must still be passed to + rxrpc_kernel_free_skb() for disposal by the caller. (*) Free a message. diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 4b0eff6da674..85737e96ab8b 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -189,11 +189,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 1: _debug("extract FID count"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); @@ -210,11 +207,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("extract FID array"); ret = afs_extract_data(call, skb, last, call->buffer, call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; _debug("unmarshall FID array"); call->request = kcalloc(call->count, @@ -239,11 +233,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 3: _debug("extract CB count"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; tmp = ntohl(call->tmp); _debug("CB count: %u", tmp); @@ -258,11 +249,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("extract CB array"); ret = afs_extract_data(call, skb, last, call->request, call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; _debug("unmarshall CB array"); cb = call->request; @@ -278,9 +266,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, call->unmarshall++; case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* Record that the message was unmarshalled successfully so * that the call destructor can know do the callback breaking @@ -294,8 +282,6 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, break; } - if (!last) - return 0; call->state = AFS_CALL_REPLYING; @@ -335,13 +321,13 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, { struct afs_server *server; struct in_addr addr; + int ret; _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -371,8 +357,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, _enter(",{%u},%d", skb->len, last); + /* There are some arguments that we ignore */ + afs_data_consumed(call, skb); if (!last) - return 0; + return -EAGAIN; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -408,12 +396,13 @@ static void SRXAFSCB_Probe(struct work_struct *work) static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, bool last) { + int ret; + _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; @@ -460,10 +449,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; switch (call->unmarshall) { case 0: @@ -509,8 +497,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, break; } - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; call->state = AFS_CALL_REPLYING; @@ -588,12 +577,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, struct sk_buff *skb, bool last) { + int ret; + _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index c2e930ec2888..9312b92e54be 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -240,15 +240,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -335,11 +333,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, case 1: _debug("extract data length (MSW)"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("DATA length MSW: %u", call->count); @@ -353,11 +348,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, case 2: _debug("extract data length"); ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("DATA length: %u", call->count); @@ -375,11 +367,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, ret = afs_extract_data(call, skb, last, buffer, call->count); kunmap_atomic(buffer); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } call->offset = 0; @@ -389,11 +378,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, case 4: ret = afs_extract_data(call, skb, last, call->buffer, (21 + 3 + 6) * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; bp = call->buffer; xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); @@ -405,15 +391,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, call->unmarshall++; case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; break; } - if (!last) - return 0; - if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; @@ -537,9 +520,8 @@ static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, { _enter(",{%u},%d", skb->len, last); - if (skb->len > 0) - return -EBADMSG; /* shouldn't be any reply data */ - return 0; + /* shouldn't be any reply data */ + return afs_data_complete(call, skb, last); } /* @@ -622,15 +604,13 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -721,15 +701,13 @@ static int afs_deliver_fs_remove(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -804,15 +782,13 @@ static int afs_deliver_fs_link(struct afs_call *call, { struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -892,15 +868,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -999,15 +973,13 @@ static int afs_deliver_fs_rename(struct afs_call *call, { struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -1105,20 +1077,13 @@ static int afs_deliver_fs_store_data(struct afs_call *call, { struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; @@ -1292,20 +1257,13 @@ static int afs_deliver_fs_store_status(struct afs_call *call, afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply; const __be32 *bp; + int ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ store_version = NULL; @@ -1504,11 +1462,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, _debug("extract status"); ret = afs_extract_data(call, skb, last, call->buffer, 12 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; bp = call->buffer; xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); @@ -1518,11 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the volume name length */ case 2: ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); @@ -1537,11 +1489,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, if (call->count > 0) { ret = afs_extract_data(call, skb, last, call->reply3, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } p = call->reply3; @@ -1561,11 +1510,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 4: ret = afs_extract_data(call, skb, last, call->buffer, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; @@ -1574,11 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the offline message length */ case 5: ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); @@ -1593,11 +1536,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, if (call->count > 0) { ret = afs_extract_data(call, skb, last, call->reply3, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } p = call->reply3; @@ -1617,11 +1557,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 7: ret = afs_extract_data(call, skb, last, call->buffer, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; @@ -1630,11 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the message of the day length */ case 8: ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); @@ -1649,11 +1583,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, if (call->count > 0) { ret = afs_extract_data(call, skb, last, call->reply3, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; } p = call->reply3; @@ -1673,26 +1604,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 10: ret = afs_extract_data(call, skb, last, call->buffer, call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (ret < 0) + return ret; call->offset = 0; call->unmarshall++; no_motd_padding: case 11: - _debug("trailer %d", skb->len); - if (skb->len != 0) - return -EBADMSG; + ret = afs_data_complete(call, skb, last); + if (ret < 0) + return ret; break; } - if (!last) - return 0; - _leave(" = 0 [done]"); return 0; } @@ -1764,15 +1689,13 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call, struct sk_buff *skb, bool last) { const __be32 *bp; + int ret; _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 71d5982312f3..df976b2a7f40 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -609,17 +609,29 @@ extern void afs_proc_cell_remove(struct afs_cell *); */ extern int afs_open_socket(void); extern void afs_close_socket(void); +extern void afs_data_consumed(struct afs_call *, struct sk_buff *); extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, const struct afs_wait_mode *); extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); +extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, size_t); +static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + if (skb->len > 0) + return -EBADMSG; + afs_data_consumed(call, skb); + if (!last) + return -EAGAIN; + return 0; +} + /* * security.c */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 4832de84d52c..14d04c848465 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -150,10 +150,9 @@ void afs_close_socket(void) } /* - * note that the data in a socket buffer is now delivered and that the buffer - * should be freed + * Note that the data in a socket buffer is now consumed. */ -static void afs_data_delivered(struct sk_buff *skb) +void afs_data_consumed(struct afs_call *call, struct sk_buff *skb) { if (!skb) { _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); @@ -161,9 +160,7 @@ static void afs_data_delivered(struct sk_buff *skb) } else { _debug("DLVR %p{%u} [%d]", skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_data_delivered(skb); + rxrpc_kernel_data_consumed(call->rxcall, skb); } } @@ -489,9 +486,15 @@ static void afs_deliver_to_call(struct afs_call *call) last = rxrpc_kernel_is_data_last(skb); ret = call->type->deliver(call, skb, last); switch (ret) { + case -EAGAIN: + if (last) { + _debug("short data"); + goto unmarshal_error; + } + break; case 0: - if (last && - call->state == AFS_CALL_AWAIT_REPLY) + ASSERT(last); + if (call->state == AFS_CALL_AWAIT_REPLY) call->state = AFS_CALL_COMPLETE; break; case -ENOTCONN: @@ -501,6 +504,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code = RX_INVALID_OPERATION; goto do_abort; default: + unmarshal_error: abort_code = RXGEN_CC_UNMARSHAL; if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; @@ -511,9 +515,7 @@ static void afs_deliver_to_call(struct afs_call *call) call->state = AFS_CALL_ERROR; break; } - afs_data_delivered(skb); - skb = NULL; - continue; + break; case RXRPC_SKB_MARK_FINAL_ACK: _debug("Rcv ACK"); call->state = AFS_CALL_COMPLETE; @@ -685,15 +687,35 @@ static void afs_process_async_call(struct afs_call *call) } /* - * empty a socket buffer into a flat reply buffer + * Empty a socket buffer into a flat reply buffer. */ -void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) +int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last) { size_t len = skb->len; - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) - BUG(); - call->reply_size += len; + if (len > call->reply_max - call->reply_size) { + _leave(" = -EBADMSG [%zu > %u]", + len, call->reply_max - call->reply_size); + return -EBADMSG; + } + + if (len > 0) { + if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, + len) < 0) + BUG(); + call->reply_size += len; + } + + afs_data_consumed(call, skb); + if (!last) + return -EAGAIN; + + if (call->reply_size != call->reply_max) { + _leave(" = -EBADMSG [%u != %u]", + call->reply_size, call->reply_max); + return -EBADMSG; + } + return 0; } /* @@ -745,7 +767,8 @@ static void afs_collect_incoming_call(struct work_struct *work) } /* - * grab the operation ID from an incoming cache manager call + * Grab the operation ID from an incoming cache manager call. The socket + * buffer is discarded on error or if we don't yet have sufficient data. */ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, bool last) @@ -766,12 +789,9 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, call->offset += len; if (call->offset < 4) { - if (last) { - _leave(" = -EBADMSG [op ID short]"); - return -EBADMSG; - } - _leave(" = 0 [incomplete]"); - return 0; + afs_data_consumed(call, skb); + _leave(" = -EAGAIN"); + return -EAGAIN; } call->state = AFS_CALL_AWAIT_REQUEST; @@ -855,7 +875,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) } /* - * extract a piece of data from the received data socket buffers + * Extract a piece of data from the received data socket buffers. */ int afs_extract_data(struct afs_call *call, struct sk_buff *skb, bool last, void *buf, size_t count) @@ -873,10 +893,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb, call->offset += len; if (call->offset < count) { - if (last) { - _leave(" = -EBADMSG [%d < %zu]", call->offset, count); - return -EBADMSG; - } + afs_data_consumed(call, skb); _leave(" = -EAGAIN"); return -EAGAIN; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 340afd0cd182..f94d1abdc3eb 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -64,16 +64,13 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, struct afs_cache_vlocation *entry; __be32 *bp; u32 tmp; - int loop; + int loop, ret; _enter(",,%u", last); - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call, skb, last); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ entry = call->reply; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ac1bc3c49fbd..7b0f88699b25 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -40,12 +40,12 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, unsigned long, gfp_t); int rxrpc_kernel_send_data(struct rxrpc_call *, struct msghdr *, size_t); +void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_kernel_abort_call(struct rxrpc_call *, u32); void rxrpc_kernel_end_call(struct rxrpc_call *); bool rxrpc_kernel_is_data_last(struct sk_buff *); u32 rxrpc_kernel_get_abort_code(struct sk_buff *); int rxrpc_kernel_get_error_number(struct sk_buff *); -void rxrpc_kernel_data_delivered(struct sk_buff *); void rxrpc_kernel_free_skb(struct sk_buff *); struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *, unsigned long); int rxrpc_kernel_reject_call(struct socket *); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1bb9e7ac9e14..ff83fb1ddd47 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -425,6 +425,7 @@ struct rxrpc_call { spinlock_t lock; rwlock_t state_lock; /* lock for state transition */ atomic_t usage; + atomic_t skb_count; /* Outstanding packets on this call */ atomic_t sequence; /* Tx data packet sequence counter */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 0b2832141bd0..9bae21e66d65 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -130,6 +130,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, call->state = RXRPC_CALL_SERVER_ACCEPTING; list_add_tail(&call->accept_link, &rx->acceptq); rxrpc_get_call(call); + atomic_inc(&call->skb_count); nsp = rxrpc_skb(notification); nsp->call = call; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fc32aa5764a2..f5e99163a09e 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -460,6 +460,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call, ASSERTCMP(sp->call, ==, NULL); sp->call = call; rxrpc_get_call(call); + atomic_inc(&call->skb_count); /* insert into the buffer in sequence order */ spin_lock_bh(&call->lock); @@ -734,6 +735,7 @@ all_acked: skb->mark = RXRPC_SKB_MARK_FINAL_ACK; sp->call = call; rxrpc_get_call(call); + atomic_inc(&call->skb_count); spin_lock_bh(&call->lock); if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) BUG(); @@ -793,6 +795,7 @@ static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, sp->error = error; sp->call = call; rxrpc_get_call(call); + atomic_inc(&call->skb_count); spin_lock_bh(&call->lock); ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 91287c9d01bb..c47f14fc5e88 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -491,13 +491,6 @@ void rxrpc_release_call(struct rxrpc_call *call) spin_lock_bh(&call->lock); while ((skb = skb_dequeue(&call->rx_queue)) || (skb = skb_dequeue(&call->rx_oos_queue))) { - sp = rxrpc_skb(skb); - if (sp->call) { - ASSERTCMP(sp->call, ==, call); - rxrpc_put_call(call); - sp->call = NULL; - } - skb->destructor = NULL; spin_unlock_bh(&call->lock); _debug("- zap %s %%%u #%u", @@ -605,6 +598,7 @@ void __rxrpc_put_call(struct rxrpc_call *call) if (atomic_dec_and_test(&call->usage)) { _debug("call %d dead", call->debug_id); + WARN_ON(atomic_read(&call->skb_count) != 0); ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); rxrpc_queue_work(&call->destroyer); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 991a20d25093..9e0f58edcd01 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -55,9 +55,6 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { _debug("already terminated"); ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); - skb->destructor = NULL; - sp->call = NULL; - rxrpc_put_call(call); rxrpc_free_skb(skb); return 0; } @@ -111,13 +108,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, ret = 0; out: - /* release the socket buffer */ - if (skb) { - skb->destructor = NULL; - sp->call = NULL; - rxrpc_put_call(call); - rxrpc_free_skb(skb); - } + rxrpc_free_skb(skb); _leave(" = %d", ret); return ret; @@ -200,6 +191,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, sp->call = call; rxrpc_get_call(call); + atomic_inc(&call->skb_count); terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a3fa2ed85d63..9ed66d533002 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -203,6 +203,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } /* we transferred the whole data packet */ + if (!(flags & MSG_PEEK)) + rxrpc_kernel_data_consumed(call, skb); + if (sp->hdr.flags & RXRPC_LAST_PACKET) { _debug("last"); if (rxrpc_conn_is_client(call->conn)) { @@ -359,28 +362,6 @@ wait_error: } -/** - * rxrpc_kernel_data_delivered - Record delivery of data message - * @skb: Message holding data - * - * Record the delivery of a data message. This permits RxRPC to keep its - * tracking correct. The socket buffer will be deleted. - */ -void rxrpc_kernel_data_delivered(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call = sp->call; - - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - call->rx_data_recv = sp->hdr.seq; - - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); - rxrpc_free_skb(skb); -} - -EXPORT_SYMBOL(rxrpc_kernel_data_delivered); - /** * rxrpc_kernel_is_data_last - Determine if data message is last one * @skb: Message holding data diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index eee0cfd9ac8c..06c51d4b622d 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -98,11 +98,39 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call, spin_unlock_bh(&call->lock); } +/** + * rxrpc_kernel_data_consumed - Record consumption of data message + * @call: The call to which the message pertains. + * @skb: Message holding data + * + * Record the consumption of a data message and generate an ACK if appropriate. + * The call state is shifted if this was the final packet. The caller must be + * in process context with no spinlocks held. + * + * TODO: Actually generate the ACK here rather than punting this to the + * workqueue. + */ +void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq); + + ASSERTCMP(sp->call, ==, call); + ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA); + + /* TODO: Fix the sequence number tracking */ + ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); + ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); + ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + + call->rx_data_recv = sp->hdr.seq; + rxrpc_hard_ACK_data(call, sp); +} +EXPORT_SYMBOL(rxrpc_kernel_data_consumed); + /* - * destroy a packet that has an RxRPC control buffer - * - advance the hard-ACK state of the parent call (done here in case something - * in the kernel bypasses recvmsg() and steals the packet directly off of the - * socket receive queue) + * Destroy a packet that has an RxRPC control buffer */ void rxrpc_packet_destructor(struct sk_buff *skb) { @@ -112,9 +140,8 @@ void rxrpc_packet_destructor(struct sk_buff *skb) _enter("%p{%p}", skb, call); if (call) { - /* send the final ACK on a client call */ - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) - rxrpc_hard_ACK_data(call, sp); + if (atomic_dec_return(&call->skb_count) < 0) + BUG(); rxrpc_put_call(call); sp->call = NULL; } -- cgit v1.2.3 From c15c0ab12fd62f2b19181d05c62d24bc9fa55a42 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Aug 2016 07:48:21 +0200 Subject: ipv6: suppress sparse warnings in IP6_ECN_set_ce() Pass the correct type __wsum to csum_sub() and csum_add(). This doesn't really change anything since __wsum really *is* __be32, but removes the address space warnings from sparse. Cc: Eric Dumazet Fixes: 34ae6a1aa054 ("ipv6: update skb->csum when CE mark is propagated") Signed-off-by: Johannes Berg Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_ecn.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 0dc0a51da38f..dce2d586d9ce 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -128,7 +128,8 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(csum_sub(skb->csum, from), to); + skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), + (__force __wsum)to); return 1; } -- cgit v1.2.3 From 3d7b33209201cbfa090d614db993571ca3c6b090 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 15 Aug 2016 13:06:24 +0200 Subject: gre: set inner_protocol on xmit Ensure that the inner_protocol is set on transmit so that GSO segmentation, which relies on that field, works correctly. This is achieved by setting the inner_protocol in gre_build_header rather than each caller of that function. It ensures that the inner_protocol is set when gre_fb_xmit() is used to transmit GRE which was not previously the case. I have observed this is not the case when OvS transmits GRE using lwtunnel metadata (which it always does). Fixes: 38720352412a ("gre: Use inner_proto to obtain inner header protocol") Cc: Pravin Shelar Acked-by: Alexander Duyck Signed-off-by: Simon Horman Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/gre.h | 1 + net/ipv4/ip_gre.c | 1 - net/ipv6/ip6_gre.c | 2 -- 3 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/gre.h b/include/net/gre.h index 7a54a31d1d4c..73ea256eb7d7 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -104,6 +104,7 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len, skb_push(skb, hdr_len); + skb_set_inner_protocol(skb, proto); skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = gre_tnl_flags_to_gre_flags(flags); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5b1481be0282..113cc43df789 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -370,7 +370,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->parms.o_flags, proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 776d145113e1..704274cbd495 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -519,8 +519,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); - skb_set_inner_protocol(skb, protocol); - return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } -- cgit v1.2.3 From 0c23c3e705691cfb99c94f2760df2b456fe45194 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 13 Aug 2016 22:34:58 -0700 Subject: net_sched: fix a typo in tc_for_each_action() It is harmless because all users pass 'a' to this macro. Fixes: 00175aec941e ("net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef") Cc: Amir Vadai Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 41e6a24a44b9..f53ee9d315c1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -193,7 +193,7 @@ int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); (list_empty(&(_exts)->actions)) #define tc_for_each_action(_a, _exts) \ - list_for_each_entry(a, &(_exts)->actions, list) + list_for_each_entry(_a, &(_exts)->actions, list) #define tc_single_action(_exts) \ (list_is_singular(&(_exts)->actions)) -- cgit v1.2.3 From 2734437ef3c2943090d0914bf91caa6b30451615 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 13 Aug 2016 22:34:59 -0700 Subject: net_sched: move tc offload macros to pkt_cls.h struct tcf_exts belongs to filters, should not be visible to plain tc actions. Cc: Ido Schimmel Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 19 +++---------------- include/net/pkt_cls.h | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index f53ee9d315c1..870332ff61eb 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -189,30 +189,17 @@ int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); -#define tc_no_actions(_exts) \ - (list_empty(&(_exts)->actions)) - -#define tc_for_each_action(_a, _exts) \ - list_for_each_entry(_a, &(_exts)->actions, list) - -#define tc_single_action(_exts) \ - (list_is_singular(&(_exts)->actions)) +#endif /* CONFIG_NET_CLS_ACT */ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 lastuse) { +#ifdef CONFIG_NET_CLS_ACT if (!a->ops->stats_update) return; a->ops->stats_update(a, bytes, packets, lastuse); +#endif } -#else /* CONFIG_NET_CLS_ACT */ - -#define tc_no_actions(_exts) true -#define tc_for_each_action(_a, _exts) while ((void)(_a), 0) -#define tc_single_action(_exts) false -#define tcf_action_stats_update(a, bytes, packets, lastuse) - -#endif /* CONFIG_NET_CLS_ACT */ #endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6f8d65342d3a..00dd5c4c1d0a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -130,6 +130,25 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, return 0; } +#ifdef CONFIG_NET_CLS_ACT + +#define tc_no_actions(_exts) \ + (list_empty(&(_exts)->actions)) + +#define tc_for_each_action(_a, _exts) \ + list_for_each_entry(_a, &(_exts)->actions, list) + +#define tc_single_action(_exts) \ + (list_is_singular(&(_exts)->actions)) + +#else /* CONFIG_NET_CLS_ACT */ + +#define tc_no_actions(_exts) true +#define tc_for_each_action(_a, _exts) while ((void)(_a), 0) +#define tc_single_action(_exts) false + +#endif /* CONFIG_NET_CLS_ACT */ + int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr); -- cgit v1.2.3 From 22dc13c837c33207548c8ee5116b64e2930a6e23 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 13 Aug 2016 22:35:00 -0700 Subject: net_sched: convert tcf_exts from list to pointer array As pointed out by Jamal, an action could be shared by multiple filters, so we can't use list to chain them any more after we get rid of the original tc_action. Instead, we could just save pointers to these actions in tcf_exts, since they are refcount'ed, so convert the list to an array of pointers. The "ugly" part is the action API still accepts list as a parameter, I just introduce a helper function to convert the array of pointers to a list, instead of relying on the C99 feature to iterate the array. Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common") Reported-by: Jamal Hadi Salim Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 ++++-- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 +- include/net/act_api.h | 4 +- include/net/pkt_cls.h | 40 ++++++++++++------- net/sched/act_api.c | 11 +++--- net/sched/cls_api.c | 51 +++++++++++++++++-------- 7 files changed, 85 insertions(+), 41 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index ee57a89252bb..b4f03748adc0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8396,12 +8396,14 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter, struct tcf_exts *exts, u64 *action, u8 *queue) { const struct tc_action *a; + LIST_HEAD(actions); int err; if (tc_no_actions(exts)) return -EINVAL; - tc_for_each_action(a, exts) { + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { /* Drop action */ if (is_tcf_gact_shot(a)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 0f19b01e3fff..dc8b1cb0fdc8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -318,6 +318,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, u32 *action, u32 *flow_tag) { const struct tc_action *a; + LIST_HEAD(actions); if (tc_no_actions(exts)) return -EINVAL; @@ -325,7 +326,8 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, *flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; *action = 0; - tc_for_each_action(a, exts) { + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { /* Only support a single action per rule */ if (*action) return -EINVAL; @@ -362,13 +364,15 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, u32 *action, u32 *dest_vport) { const struct tc_action *a; + LIST_HEAD(actions); if (tc_no_actions(exts)) return -EINVAL; *action = 0; - tc_for_each_action(a, exts) { + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { /* Only support a single action per rule */ if (*action) return -EINVAL; @@ -503,6 +507,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow; struct tc_action *a; struct mlx5_fc *counter; + LIST_HEAD(actions); u64 bytes; u64 packets; u64 lastuse; @@ -518,7 +523,8 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv, mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); - tc_for_each_action(a, f->exts) + tcf_exts_to_list(f->exts, &actions); + list_for_each_entry(a, &actions, list) tcf_action_stats_update(a, bytes, packets, lastuse); return 0; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 1fe9fbdc9102..1f8168906811 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1121,6 +1121,7 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) { const struct tc_action *a; + LIST_HEAD(actions); int err; if (!tc_single_action(cls->exts)) { @@ -1128,7 +1129,8 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, return -ENOTSUPP; } - tc_for_each_action(a, cls->exts) { + tcf_exts_to_list(cls->exts, &actions); + list_for_each_entry(a, &actions, list) { if (!is_tcf_mirred_mirror(a) || protocol != htons(ETH_P_ALL)) return -ENOTSUPP; diff --git a/include/net/act_api.h b/include/net/act_api.h index 870332ff61eb..82f3c912a5b1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -176,8 +176,8 @@ int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_action_destroy(struct list_head *actions, int bind); -int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, - struct tcf_result *res); +int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, + int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, char *n, int ovr, int bind, struct list_head *); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 00dd5c4c1d0a..c99508d426cc 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -59,7 +59,8 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ - struct list_head actions; + int nr_actions; + struct tc_action **actions; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. @@ -72,7 +73,10 @@ static inline void tcf_exts_init(struct tcf_exts *exts, int action, int police) { #ifdef CONFIG_NET_CLS_ACT exts->type = 0; - INIT_LIST_HEAD(&exts->actions); + exts->nr_actions = 0; + exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), + GFP_KERNEL); + WARN_ON(!exts->actions); /* TODO: propagate the error to callers */ #endif exts->action = action; exts->police = police; @@ -89,7 +93,7 @@ static inline int tcf_exts_is_predicative(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - return !list_empty(&exts->actions); + return exts->nr_actions; #else return 0; #endif @@ -108,6 +112,20 @@ tcf_exts_is_available(struct tcf_exts *exts) return tcf_exts_is_predicative(exts); } +static inline void tcf_exts_to_list(const struct tcf_exts *exts, + struct list_head *actions) +{ +#ifdef CONFIG_NET_CLS_ACT + int i; + + for (i = 0; i < exts->nr_actions; i++) { + struct tc_action *a = exts->actions[i]; + + list_add(&a->list, actions); + } +#endif +} + /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer @@ -124,27 +142,21 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT - if (!list_empty(&exts->actions)) - return tcf_action_exec(skb, &exts->actions, res); + if (exts->nr_actions) + return tcf_action_exec(skb, exts->actions, exts->nr_actions, + res); #endif return 0; } #ifdef CONFIG_NET_CLS_ACT -#define tc_no_actions(_exts) \ - (list_empty(&(_exts)->actions)) - -#define tc_for_each_action(_a, _exts) \ - list_for_each_entry(_a, &(_exts)->actions, list) - -#define tc_single_action(_exts) \ - (list_is_singular(&(_exts)->actions)) +#define tc_no_actions(_exts) ((_exts)->nr_actions == 0) +#define tc_single_action(_exts) ((_exts)->nr_actions == 1) #else /* CONFIG_NET_CLS_ACT */ #define tc_no_actions(_exts) true -#define tc_for_each_action(_a, _exts) while ((void)(_a), 0) #define tc_single_action(_exts) false #endif /* CONFIG_NET_CLS_ACT */ diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b4c7be38b632..d09d0687594b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -420,18 +420,19 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) return res; } -int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, - struct tcf_result *res) +int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, + int nr_actions, struct tcf_result *res) { - const struct tc_action *a; - int ret = -1; + int ret = -1, i; if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); ret = TC_ACT_OK; goto exec_done; } - list_for_each_entry(a, actions, list) { + for (i = 0; i < nr_actions; i++) { + const struct tc_action *a = actions[i]; + repeat: ret = a->ops->act(skb, a, res); if (ret == TC_ACT_REPEAT) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 843a716a4303..a7c5645373af 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -541,8 +541,12 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); - INIT_LIST_HEAD(&exts->actions); + LIST_HEAD(actions); + + tcf_exts_to_list(exts, &actions); + tcf_action_destroy(&actions, TCA_ACT_UNBIND); + kfree(exts->actions); + exts->nr_actions = 0; #endif } EXPORT_SYMBOL(tcf_exts_destroy); @@ -554,7 +558,6 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, { struct tc_action *act; - INIT_LIST_HEAD(&exts->actions); if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tb[exts->police], rate_tlv, "police", ovr, @@ -563,14 +566,20 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, return PTR_ERR(act); act->type = exts->type = TCA_OLD_COMPAT; - list_add(&act->list, &exts->actions); + exts->actions[0] = act; + exts->nr_actions = 1; } else if (exts->action && tb[exts->action]) { - int err; + LIST_HEAD(actions); + int err, i = 0; + err = tcf_action_init(net, tb[exts->action], rate_tlv, NULL, ovr, - TCA_ACT_BIND, &exts->actions); + TCA_ACT_BIND, &actions); if (err) return err; + list_for_each_entry(act, &actions, list) + exts->actions[i++] = act; + exts->nr_actions = i; } } #else @@ -587,37 +596,49 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT - LIST_HEAD(tmp); + struct tcf_exts old = *dst; + tcf_tree_lock(tp); - list_splice_init(&dst->actions, &tmp); - list_splice(&src->actions, &dst->actions); + dst->nr_actions = src->nr_actions; + dst->actions = src->actions; dst->type = src->type; tcf_tree_unlock(tp); - tcf_action_destroy(&tmp, TCA_ACT_UNBIND); + + tcf_exts_destroy(&old); #endif } EXPORT_SYMBOL(tcf_exts_change); -#define tcf_exts_first_act(ext) \ - list_first_entry_or_null(&(exts)->actions, \ - struct tc_action, list) +#ifdef CONFIG_NET_CLS_ACT +static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts) +{ + if (exts->nr_actions == 0) + return NULL; + else + return exts->actions[0]; +} +#endif int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct nlattr *nest; - if (exts->action && !list_empty(&exts->actions)) { + if (exts->action && exts->nr_actions) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { + LIST_HEAD(actions); + nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0) + + tcf_exts_to_list(exts, &actions); + if (tcf_action_dump(skb, &actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { -- cgit v1.2.3 From bb1fceca22492109be12640d49f5ea5a544c6bb4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Aug 2016 05:56:26 -0700 Subject: tcp: fix use after free in tcp_xmit_retransmit_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When tcp_sendmsg() allocates a fresh and empty skb, it puts it at the tail of the write queue using tcp_add_write_queue_tail() Then it attempts to copy user data into this fresh skb. If the copy fails, we undo the work and remove the fresh skb. Unfortunately, this undo lacks the change done to tp->highest_sack and we can leave a dangling pointer (to a freed skb) Later, tcp_xmit_retransmit_queue() can dereference this pointer and access freed memory. For regular kernels where memory is not unmapped, this might cause SACK bugs because tcp_highest_sack_seq() is buggy, returning garbage instead of tp->snd_nxt, but with various debug features like CONFIG_DEBUG_PAGEALLOC, this can crash the kernel. This bug was found by Marco Grassi thanks to syzkaller. Fixes: 6859d49475d4 ("[TCP]: Abstract tp->highest_sack accessing & point to next skb") Reported-by: Marco Grassi Signed-off-by: Eric Dumazet Cc: Ilpo Järvinen Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index c00e7d51bb18..7717302cab91 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1523,6 +1523,8 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (sk->sk_send_head == skb_unlinked) sk->sk_send_head = NULL; + if (tcp_sk(sk)->highest_sack == skb_unlinked) + tcp_sk(sk)->highest_sack = NULL; } static inline void tcp_init_send_head(struct sock *sk) -- cgit v1.2.3 From 89e1f6d2b956649fbe0704d543a90b8e0cf872b0 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 01:02:18 +0800 Subject: netfilter: nft_reject: restrict to INPUT/FORWARD/OUTPUT After I add the nft rule "nft add rule filter prerouting reject with tcp reset", kernel panic happened on my system: NULL pointer dereference at ... IP: [] nf_send_reset+0xaf/0x400 Call Trace: [] ? nf_reject_ip_tcphdr_get+0x160/0x160 [] nft_reject_ipv4_eval+0x61/0xb0 [nft_reject_ipv4] [] nft_do_chain+0x1fa/0x890 [nf_tables] [] ? __nft_trace_packet+0x170/0x170 [nf_tables] [] ? nf_ct_invert_tuple+0xb0/0xc0 [nf_conntrack] [] ? nf_nat_setup_info+0x5d4/0x650 [nf_nat] [...] Because in the PREROUTING chain, routing information is not exist, then we will dereference the NULL pointer and oops happen. So we restrict reject expression to INPUT, FORWARD and OUTPUT chain. This is consistent with iptables REJECT target. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_reject.h | 4 ++++ net/ipv4/netfilter/nft_reject_ipv4.c | 1 + net/ipv6/netfilter/nft_reject_ipv6.c | 1 + net/netfilter/nft_reject.c | 16 ++++++++++++++++ net/netfilter/nft_reject_inet.c | 7 ++++++- 5 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 60fa1530006b..02e28c529b29 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -8,6 +8,10 @@ struct nft_reject { extern const struct nla_policy nft_reject_policy[]; +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index c24f41c816b3..2c2553b9026c 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -46,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = { .eval = nft_reject_ipv4_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 533cd5719c59..92bda9908bb9 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -47,6 +47,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = { .eval = nft_reject_ipv6_eval, .init = nft_reject_init, .dump = nft_reject_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 0522fc9bfb0a..c64de3f7379d 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -26,11 +26,27 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { }; EXPORT_SYMBOL_GPL(nft_reject_policy); +int nft_reject_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT)); +} +EXPORT_SYMBOL_GPL(nft_reject_validate); + int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); + int err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 759ca5248a3d..e79d9ca2ffee 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -66,7 +66,11 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code; + int icmp_code, err; + + err = nft_reject_validate(ctx, expr, NULL); + if (err < 0) + return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; @@ -124,6 +128,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .eval = nft_reject_inet_eval, .init = nft_reject_inet_init, .dump = nft_reject_inet_dump, + .validate = nft_reject_validate, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { -- cgit v1.2.3 From 960fa72f67f1be6891d63a5518860d1ae4e14b88 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 22 Aug 2016 22:57:56 +0800 Subject: netfilter: nft_meta: improve the validity check of pkttype set expr "meta pkttype set" is only supported on prerouting chain with bridge family and ingress chain with netdev family. But the validate check is incomplete, and the user can add the nft rules on input chain with bridge family, for example: # nft add table bridge filter # nft add chain bridge filter input {type filter hook input \ priority 0 \;} # nft add chain bridge filter test # nft add rule bridge filter test meta pkttype set unicast # nft add rule bridge filter input jump test This patch fixes the problem. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 4 ++++ net/bridge/netfilter/nft_meta_bridge.c | 1 + net/netfilter/nft_meta.c | 17 +++++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index d27588c8dbd9..1139cde0fdc5 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -36,4 +36,8 @@ void nft_meta_set_eval(const struct nft_expr *expr, void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr); +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + #endif diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 4b901d9f2e7c..ad47a921b701 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -86,6 +86,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2863f3493038..8a6bc7630912 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -291,10 +291,16 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); -static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { + struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; + if (priv->key != NFT_META_PKTTYPE) + return 0; + switch (ctx->afi->family) { case NFPROTO_BRIDGE: hooks = 1 << NF_BR_PRE_ROUTING; @@ -308,6 +314,7 @@ static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); int nft_meta_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -327,15 +334,16 @@ int nft_meta_set_init(const struct nft_ctx *ctx, len = sizeof(u8); break; case NFT_META_PKTTYPE: - err = nft_meta_set_init_pkttype(ctx); - if (err) - return err; len = sizeof(u8); break; default: return -EOPNOTSUPP; } + err = nft_meta_set_validate(ctx, expr, NULL); + if (err < 0) + return err; + priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -407,6 +415,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .init = nft_meta_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, }; static const struct nft_expr_ops * -- cgit v1.2.3 From 61aaa0e8c1c15d9e045f0577f046be50f2f571ab Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Fri, 19 Aug 2016 22:02:48 +0200 Subject: cfg80211: Add stub for cfg80211_get_station() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows modules using this function (currently: batman-adv) to compile even if cfg80211 is not built at all, thus relaxing dependencies. Signed-off-by: Linus Lüssing Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9c23f4d33e06..beb7610d64e9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1102,6 +1102,7 @@ struct station_info { struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1]; }; +#if IS_ENABLED(CONFIG_CFG80211) /** * cfg80211_get_station - retrieve information about a given station * @dev: the device where the station is supposed to be connected to @@ -1114,6 +1115,14 @@ struct station_info { */ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo); +#else +static inline int cfg80211_get_station(struct net_device *dev, + const u8 *mac_addr, + struct station_info *sinfo) +{ + return -ENOENT; +} +#endif /** * enum monitor_flags - monitor flags -- cgit v1.2.3 From 6e1ce3c3451291142a57c4f3f6f999a29fb5b3bc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 1 Sep 2016 14:43:53 -0700 Subject: af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' Right now we use the 'readlock' both for protecting some of the af_unix IO path and for making the bind be single-threaded. The two are independent, but using the same lock makes for a nasty deadlock due to ordering with regards to filesystem locking. The bind locking would want to nest outside the VSF pathname locking, but the IO locking wants to nest inside some of those same locks. We tried to fix this earlier with commit c845acb324aa ("af_unix: Fix splice-bind deadlock") which moved the readlock inside the vfs locks, but that caused problems with overlayfs that will then call back into filesystem routines that take the lock in the wrong order anyway. Splitting the locks means that we can go back to having the bind lock be the outermost lock, and we don't have any deadlocks with lock ordering. Acked-by: Rainer Weikusat Acked-by: Al Viro Signed-off-by: Linus Torvalds Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 45 +++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 9b4c418bebd8..fd60eccb59a6 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -52,7 +52,7 @@ struct unix_sock { struct sock sk; struct unix_address *addr; struct path path; - struct mutex readlock; + struct mutex iolock, bindlock; struct sock *peer; struct list_head link; atomic_long_t inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 433ae1bbef97..8309687a56b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -661,11 +661,11 @@ static int unix_set_peek_off(struct sock *sk, int val) { struct unix_sock *u = unix_sk(sk); - if (mutex_lock_interruptible(&u->readlock)) + if (mutex_lock_interruptible(&u->iolock)) return -EINTR; sk->sk_peek_off = val; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); return 0; } @@ -779,7 +779,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) spin_lock_init(&u->lock); atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); - mutex_init(&u->readlock); /* single task reading lock */ + mutex_init(&u->iolock); /* single task reading lock */ + mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); @@ -848,7 +849,7 @@ static int unix_autobind(struct socket *sock) int err; unsigned int retries = 0; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) return err; @@ -895,7 +896,7 @@ retry: spin_unlock(&unix_table_lock); err = 0; -out: mutex_unlock(&u->readlock); +out: mutex_unlock(&u->bindlock); return err; } @@ -1009,7 +1010,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - err = mutex_lock_interruptible(&u->readlock); + err = mutex_lock_interruptible(&u->bindlock); if (err) goto out; @@ -1063,7 +1064,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out_unlock: spin_unlock(&unix_table_lock); out_up: - mutex_unlock(&u->readlock); + mutex_unlock(&u->bindlock); out: return err; } @@ -1955,17 +1956,17 @@ static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, if (false) { alloc_skb: unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, &err, 0); if (!newskb) goto err; } - /* we must acquire readlock as we modify already present + /* we must acquire iolock as we modify already present * skbs in the sk_receive_queue and mess with skb->len */ - err = mutex_lock_interruptible(&unix_sk(other)->readlock); + err = mutex_lock_interruptible(&unix_sk(other)->iolock); if (err) { err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; goto err; @@ -2032,7 +2033,7 @@ alloc_skb: } unix_state_unlock(other); - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); other->sk_data_ready(other); scm_destroy(&scm); @@ -2041,7 +2042,7 @@ alloc_skb: err_state_unlock: unix_state_unlock(other); err_unlock: - mutex_unlock(&unix_sk(other)->readlock); + mutex_unlock(&unix_sk(other)->iolock); err: kfree_skb(newskb); if (send_sigpipe && !(flags & MSG_NOSIGNAL)) @@ -2109,7 +2110,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err, @@ -2117,14 +2118,14 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (skb) break; - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, &err, &timeo, last)); - if (!skb) { /* implies readlock unlocked */ + if (!skb) { /* implies iolock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && @@ -2189,7 +2190,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, out_free: skb_free_datagram(sk, skb); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); out: return err; } @@ -2284,7 +2285,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); if (flags & MSG_PEEK) skip = sk_peek_offset(sk, flags); @@ -2326,7 +2327,7 @@ again: break; } - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); timeo = unix_stream_data_wait(sk, timeo, last, last_len); @@ -2337,7 +2338,7 @@ again: goto out; } - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); @@ -2440,7 +2441,7 @@ unlock: } } while (size); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); if (state->msg) scm_recv(sock, state->msg, &scm, flags); else @@ -2481,9 +2482,9 @@ static ssize_t skb_unix_socket_splice(struct sock *sk, int ret; struct unix_sock *u = unix_sk(sk); - mutex_unlock(&u->readlock); + mutex_unlock(&u->iolock); ret = splice_to_pipe(pipe, spd); - mutex_lock(&u->readlock); + mutex_lock(&u->iolock); return ret; } -- cgit v1.2.3 From 5a56a0b3a45dd0cc5b2f7bec6afd053a474ed9f5 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 5 Sep 2016 10:20:20 +1200 Subject: net: Don't delete routes in different VRFs When deleting an IP address from an interface, there is a clean-up of routes which refer to this local address. However, there was no check to see that the VRF matched. This meant that deletion wasn't confined to the VRF it should have been. To solve this, a new field has been added to fib_info to hold a table id. When removing fib entries corresponding to a local ip address, this table id is also used in the comparison. The table id is populated when the fib_info is created. This was already done in some places, but not in ip_rt_ioctl(). This has now been fixed. Fixes: 021dd3b8a142 ("net: Add routes to the table associated with the device") Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: Mark Tomlinson Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/fib_semantics.c | 8 ++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 4079fc18ffe4..7d4a72e75f33 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -111,6 +111,7 @@ struct fib_info { unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; + u32 fib_tb_id; u32 fib_priority; u32 *fib_metrics; #define fib_mtu fib_metrics[RTAX_MTU-1] @@ -319,7 +320,7 @@ void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); -int fib_sync_down_addr(struct net *net, __be32 local); +int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); extern u32 fib_multipath_secret __read_mostly; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ef2ebeb89d0f..1b25daf8c7f1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -509,6 +509,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; + cfg->fc_table = l3mdev_fib_table(dev); if (colon) { struct in_ifaddr *ifa; struct in_device *in_dev = __in_dev_get_rtnl(dev); @@ -1027,7 +1028,7 @@ no_promotions: * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 539fa264e67d..e9f56225e53f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; + fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; change_nexthops(fi) { @@ -1337,18 +1338,21 @@ nla_put_failure: * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ -int fib_sync_down_addr(struct net *net, __be32 local) +int fib_sync_down_addr(struct net_device *dev, __be32 local) { int ret = 0; unsigned int hash = fib_laddr_hashfn(local); struct hlist_head *head = &fib_info_laddrhash[hash]; + struct net *net = dev_net(dev); + int tb_id = l3mdev_fib_table(dev); struct fib_info *fi; if (!fib_info_laddrhash || local == 0) return 0; hlist_for_each_entry(fi, head, fib_lhash) { - if (!net_eq(fi->fib_net, net)) + if (!net_eq(fi->fib_net, net) || + fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; -- cgit v1.2.3 From 4440a2ab3b9f40dddbe006331ef0659c76859296 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 13 Sep 2016 08:49:18 +0800 Subject: netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions When memory is exhausted, nfct_seqadj_ext_add may fail to add the synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't check if get valid seqadj pointer by the nfct_seqadj. Now drop the packet directly when fail to add seqadj extension to avoid dereference NULL pointer in nf_ct_seqadj_init from init_conntrack(). Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 14 ++++++++++++++ net/netfilter/nf_conntrack_core.c | 6 +++--- net/netfilter/nf_nat_core.c | 3 ++- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 6793614e6502..e6937318546c 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -27,6 +27,20 @@ static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) #endif } +static inline bool nf_ct_add_synproxy(struct nf_conn *ct, + const struct nf_conn *tmpl) +{ + if (tmpl && nfct_synproxy(tmpl)) { + if (!nfct_seqadj_ext_add(ct)) + return false; + + if (!nfct_synproxy_ext_add(ct)) + return false; + } + + return true; +} + struct synproxy_stats { unsigned int syn_received; unsigned int cookie_invalid; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index dd2c43abf9e2..9934b0c93c1e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; - if (tmpl && nfct_synproxy(tmpl)) { - nfct_seqadj_ext_add(ct); - nfct_synproxy_ext_add(ct); + if (!nf_ct_add_synproxy(ct, tmpl)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 19c081e1b328..ecee105bbada 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; if (nfct_help(ct)) - nfct_seqadj_ext_add(ct); + if (!nfct_seqadj_ext_add(ct)) + return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { -- cgit v1.2.3 From 20c64d5cd5a2bdcdc8982a06cb05e5e1bd851a3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Sep 2016 08:48:46 -0700 Subject: net: avoid sk_forward_alloc overflows A malicious TCP receiver, sending SACK, can force the sender to split skbs in write queue and increase its memory usage. Then, when socket is closed and its write queue purged, we might overflow sk_forward_alloc (It becomes negative) sk_mem_reclaim() does nothing in this case, and more than 2GB are leaked from TCP perspective (tcp_memory_allocated is not changed) Then warnings trigger from inet_sock_destruct() and sk_stream_kill_queues() seeing a not zero sk_forward_alloc All TCP stack can be stuck because TCP is under memory pressure. A simple fix is to preemptively reclaim from sk_mem_uncharge(). This makes sure a socket wont have more than 2 MB forward allocated, after burst and idle period. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index ff5be7e8ddea..8741988e6880 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1332,6 +1332,16 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) if (!sk_has_account(sk)) return; sk->sk_forward_alloc += size; + + /* Avoid a possible overflow. + * TCP send queues can make this happen, if sk_mem_reclaim() + * is not called and more than 2 GBytes are released at once. + * + * If we reach 2 MBytes, reclaim 1 MBytes right now, there is + * no need to hold that much forward allocation anyway. + */ + if (unlikely(sk->sk_forward_alloc >= 1 << 21)) + __sk_mem_reclaim(sk, 1 << 20); } static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From 4496195ddd75c4ad57b783739414e69b7d79843e Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 15 Sep 2016 15:02:38 -0300 Subject: sctp: fix SSN comparision This function actually operates on u32 yet its paramteres were declared as u16, causing integer truncation upon calling. Note in patch context that ADDIP_SERIAL_SIGN_BIT is already 32 bits. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index efc01743b9d6..bafe2a0ab908 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -382,7 +382,7 @@ enum { ADDIP_SERIAL_SIGN_BIT = (1<<31) }; -static inline int ADDIP_SERIAL_gte(__u16 s, __u16 t) +static inline int ADDIP_SERIAL_gte(__u32 s, __u32 t) { return ((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT); } -- cgit v1.2.3 From 63c43787d35e45562a6b5927e2edc8f4783d95b8 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 19 Sep 2016 16:17:57 +0200 Subject: vti6: fix input path Since commit 1625f4529957, vti6 is broken, all input packets are dropped (LINUX_MIB_XFRMINNOSTATES is incremented). XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 is set by vti6_rcv() before calling xfrm6_rcv()/xfrm6_rcv_spi(), thus we cannot set to NULL that value in xfrm6_rcv_spi(). A new function xfrm6_rcv_tnl() that enables to pass a value to xfrm6_rcv_spi() is added, so that xfrm6_rcv() is not touched (this function is used in several handlers). CC: Alexey Kodanev Fixes: 1625f4529957 ("net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key") Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 +++- net/ipv6/ip6_vti.c | 4 +--- net/ipv6/xfrm6_input.c | 16 +++++++++++----- net/ipv6/xfrm6_tunnel.c | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index adfebd6f243c..17934312eecb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1540,8 +1540,10 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi); +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t); int xfrm6_transport_finish(struct sk_buff *skb, int async); +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 52a2f735881f..5bd3afdcc771 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -321,11 +321,9 @@ static int vti6_rcv(struct sk_buff *skb) goto discard; } - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - rcu_read_unlock(); - return xfrm6_rcv(skb); + return xfrm6_rcv_tnl(skb, t); } rcu_read_unlock(); return -EINVAL; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00a2d40677d6..b5789562aded 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -21,9 +21,10 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, + struct ip6_tnl *t) { - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); @@ -49,13 +50,18 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return -1; } -int xfrm6_rcv(struct sk_buff *skb) +int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0); + 0, t); } -EXPORT_SYMBOL(xfrm6_rcv); +EXPORT_SYMBOL(xfrm6_rcv_tnl); +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_tnl(skb, NULL); +} +EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5743044cd660..e1c0bbe7996c 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -236,7 +236,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3 From 73dca124cdbad2d67d47d6196c08325f18447d07 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 29 Sep 2016 02:37:26 +0800 Subject: sctp: move sent_count to the memory hole in sctp_chunk Now pahole sctp_chunk, it has 2 memory holes: struct sctp_chunk { struct list_head list; atomic_t refcnt; /* XXX 4 bytes hole, try to pack */ ... long unsigned int prsctp_param; int sent_count; /* XXX 4 bytes hole, try to pack */ This patch is to move up sent_count to fill the 1st one and eliminate the 2nd one. It's not just another struct compaction, it also fixes the "netperf- Throughput_Mbps -37.2% regression" issue when overloading the CPU. Fixes: a6c2f792873a ("sctp: implement prsctp TTL policy") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ce93c4b10d26..4f097f538fff 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -554,6 +554,9 @@ struct sctp_chunk { atomic_t refcnt; + /* How many times this chunk have been sent, for prsctp RTX policy */ + int sent_count; + /* This is our link to the per-transport transmitted list. */ struct list_head transmitted_list; @@ -610,9 +613,6 @@ struct sctp_chunk { */ unsigned long prsctp_param; - /* How many times this chunk have been sent, for prsctp RTX policy */ - int sent_count; - /* Which association does this belong to? */ struct sctp_association *asoc; -- cgit v1.2.3 From 0605483f6ace1f6b63e397c819a115ddcd13af0d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 29 Sep 2016 02:37:27 +0800 Subject: sctp: remove prsctp_param from sctp_chunk Now sctp uses chunk->prsctp_param to save the prsctp param for all the prsctp polices, we didn't need to introduce prsctp_param to sctp_chunk. We can just use chunk->sinfo.sinfo_timetolive for RTX and BUF polices, and reuse msg->expires_at for TTL policy, as the prsctp polices and old expires policy are mutual exclusive. This patch is to remove prsctp_param from sctp_chunk, and reuse msg's expires_at for TTL and chunk's sinfo.sinfo_timetolive for RTX and BUF polices. Note that sctp can't use chunk's sinfo.sinfo_timetolive for TTL policy, as it needs a u64 variables to save the expires_at time. This one also fixes the "netperf-Throughput_Mbps -37.2% regression" issue. Fixes: a6c2f792873a ("sctp: implement prsctp TTL policy") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 7 ------- net/sctp/chunk.c | 9 +++++++-- net/sctp/outqueue.c | 4 ++-- net/sctp/sm_make_chunk.c | 15 --------------- 4 files changed, 9 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 4f097f538fff..ced0df374e60 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -606,13 +606,6 @@ struct sctp_chunk { /* This needs to be recoverable for SCTP_SEND_FAILED events. */ struct sctp_sndrcvinfo sinfo; - /* We use this field to record param for prsctp policies, - * for TTL policy, it is the time_to_drop of this chunk, - * for RTX policy, it is the max_sent_count of this chunk, - * for PRIO policy, it is the priority of this chunk. - */ - unsigned long prsctp_param; - /* Which association does this belong to? */ struct sctp_association *asoc; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index a55e54738b81..14990e21cf55 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -179,6 +179,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, msg, msg->expires_at, jiffies); } + if (asoc->prsctp_enable && + SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) + msg->expires_at = + jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); + /* This is the biggest possible DATA chunk that can fit into * the packet */ @@ -349,14 +354,14 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) } if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && - time_after(jiffies, chunk->prsctp_param)) { + time_after(jiffies, chunk->msg->expires_at)) { if (chunk->sent_count) chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; else chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && - chunk->sent_count > chunk->prsctp_param) { + chunk->sent_count > chunk->sinfo.sinfo_timetolive) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 72e54a416af6..e084f35d40d2 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -383,7 +383,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, queue, transmitted_list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->prsctp_param <= sinfo->sinfo_timetolive) + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; list_del_init(&chk->transmitted_list); @@ -418,7 +418,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, list_for_each_entry_safe(chk, temp, queue, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || - chk->prsctp_param <= sinfo->sinfo_timetolive) + chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; list_del_init(&chk->list); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8c77b87a8565..46ffecc57214 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -706,20 +706,6 @@ nodata: return retval; } -static void sctp_set_prsctp_policy(struct sctp_chunk *chunk, - const struct sctp_sndrcvinfo *sinfo) -{ - if (!chunk->asoc->prsctp_enable) - return; - - if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags)) - chunk->prsctp_param = - jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); - else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) || - SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags)) - chunk->prsctp_param = sinfo->sinfo_timetolive; -} - /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ @@ -753,7 +739,6 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); - sctp_set_prsctp_policy(retval, sinfo); nodata: return retval; -- cgit v1.2.3