From 6989310f5d4327e8595664954edd40a7f99ddd0d Mon Sep 17 00:00:00 2001 From: Pi-Hsun Shih Date: Wed, 4 Dec 2019 16:13:07 +0800 Subject: wireless: Use offsetof instead of custom macro. Use offsetof to calculate offset of a field to take advantage of compiler built-in version when possible, and avoid UBSAN warning when compiling with Clang: ================================================================== UBSAN: Undefined behaviour in net/wireless/wext-core.c:525:14 member access within null pointer of type 'struct iw_point' CPU: 3 PID: 165 Comm: kworker/u16:3 Tainted: G S W 4.19.23 #43 Workqueue: cfg80211 __cfg80211_scan_done [cfg80211] Call trace: dump_backtrace+0x0/0x194 show_stack+0x20/0x2c __dump_stack+0x20/0x28 dump_stack+0x70/0x94 ubsan_epilogue+0x14/0x44 ubsan_type_mismatch_common+0xf4/0xfc __ubsan_handle_type_mismatch_v1+0x34/0x54 wireless_send_event+0x3cc/0x470 ___cfg80211_scan_done+0x13c/0x220 [cfg80211] __cfg80211_scan_done+0x28/0x34 [cfg80211] process_one_work+0x170/0x35c worker_thread+0x254/0x380 kthread+0x13c/0x158 ret_from_fork+0x10/0x18 =================================================================== Signed-off-by: Pi-Hsun Shih Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20191204081307.138765-1-pihsun@chromium.org Signed-off-by: Johannes Berg --- include/uapi/linux/wireless.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/wireless.h b/include/uapi/linux/wireless.h index 86eca3208b6b..a2c006a364e0 100644 --- a/include/uapi/linux/wireless.h +++ b/include/uapi/linux/wireless.h @@ -74,6 +74,8 @@ #include /* for "struct sockaddr" et al */ #include /* for IFNAMSIZ and co... */ +#include /* for offsetof */ + /***************************** VERSION *****************************/ /* * This constant is used to know the availability of the wireless @@ -1090,8 +1092,7 @@ struct iw_event { /* iw_point events are special. First, the payload (extra data) come at * the end of the event, so they are bigger than IW_EV_POINT_LEN. Second, * we omit the pointer, so start at an offset. */ -#define IW_EV_POINT_OFF (((char *) &(((struct iw_point *) NULL)->length)) - \ - (char *) NULL) +#define IW_EV_POINT_OFF offsetof(struct iw_point, length) #define IW_EV_POINT_LEN (IW_EV_LCP_LEN + sizeof(struct iw_point) - \ IW_EV_POINT_OFF) -- cgit v1.2.3 From 5c5e52d1bb962510fecdc1ebecdde89694d1b223 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 17 Dec 2019 15:19:18 +0100 Subject: nl80211: add handling for BSS color This patch adds the attributes, policy and parsing code to allow userland to send the info about the BSS coloring settings to the kernel. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20191217141921.8114-1-john@phrozen.org [johannes: remove the strict policy parsing, that was a misunderstanding] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ include/uapi/linux/nl80211.h | 26 ++++++++++++++++++++++++++ net/wireless/nl80211.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 059524b87c4c..5c0b1b5ec77b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -259,6 +259,19 @@ struct ieee80211_he_obss_pd { u8 max_offset; }; +/** + * struct cfg80211_he_bss_color - AP settings for BSS coloring + * + * @color: the current color. + * @disabled: is the feature disabled. + * @partial: define the AID equation. + */ +struct cfg80211_he_bss_color { + u8 color; + bool disabled; + bool partial; +}; + /** * struct ieee80211_sta_ht_cap - STA's HT capabilities * @@ -990,6 +1003,7 @@ enum cfg80211_ap_settings_flags { * @twt_responder: Enable Target Wait Time * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings + * @he_bss_color: BSS Color settings */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1018,6 +1032,7 @@ struct cfg80211_ap_settings { bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; + struct cfg80211_he_bss_color he_bss_color; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5eab191607f8..809ef9165684 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2400,6 +2400,8 @@ enum nl80211_commands { * @NL80211_ATTR_VLAN_ID: VLAN ID (1..4094) for the station and VLAN group key * (u16). * + * @NL80211_ATTR_HE_BSS_COLOR: nested attribute for BSS Color Settings. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2864,6 +2866,8 @@ enum nl80211_attrs { NL80211_ATTR_VLAN_ID, + NL80211_ATTR_HE_BSS_COLOR, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -6587,5 +6591,27 @@ enum nl80211_obss_pd_attributes { NL80211_HE_OBSS_PD_ATTR_MAX = __NL80211_HE_OBSS_PD_ATTR_LAST - 1, }; +/** + * enum nl80211_bss_color_attributes - BSS Color attributes + * @__NL80211_HE_BSS_COLOR_ATTR_INVALID: Invalid + * + * @NL80211_HE_BSS_COLOR_ATTR_COLOR: the current BSS Color. + * @NL80211_HE_BSS_COLOR_ATTR_DISABLED: is BSS coloring disabled. + * @NL80211_HE_BSS_COLOR_ATTR_PARTIAL: the AID equation to be used.. + * + * @__NL80211_HE_BSS_COLOR_ATTR_LAST: Internal + * @NL80211_HE_BSS_COLOR_ATTR_MAX: highest BSS Color attribute. + */ +enum nl80211_bss_color_attributes { + __NL80211_HE_BSS_COLOR_ATTR_INVALID, + + NL80211_HE_BSS_COLOR_ATTR_COLOR, + NL80211_HE_BSS_COLOR_ATTR_DISABLED, + NL80211_HE_BSS_COLOR_ATTR_PARTIAL, + + /* keep last */ + __NL80211_HE_BSS_COLOR_ATTR_LAST, + NL80211_HE_BSS_COLOR_ATTR_MAX = __NL80211_HE_BSS_COLOR_ATTR_LAST - 1, +}; #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fa3526592c51..00f24d4c623e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -321,6 +321,13 @@ he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = { NLA_POLICY_RANGE(NLA_U8, 1, 20), }; +static const struct nla_policy +he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { + [NL80211_HE_BSS_COLOR_ATTR_COLOR] = NLA_POLICY_RANGE(NLA_U8, 1, 63), + [NL80211_HE_BSS_COLOR_ATTR_DISABLED] = { .type = NLA_FLAG }, + [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG }, +}; + const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -625,6 +632,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG }, [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), + [NL80211_ATTR_HE_BSS_COLOR] = NLA_POLICY_NESTED(he_bss_color_policy), }; /* policy for the key attributes */ @@ -4511,6 +4519,30 @@ static int nl80211_parse_he_obss_pd(struct nlattr *attrs, return 0; } +static int nl80211_parse_he_bss_color(struct nlattr *attrs, + struct cfg80211_he_bss_color *he_bss_color) +{ + struct nlattr *tb[NL80211_HE_BSS_COLOR_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NL80211_HE_BSS_COLOR_ATTR_MAX, attrs, + he_bss_color_policy, NULL); + if (err) + return err; + + if (!tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]) + return -EINVAL; + + he_bss_color->color = + nla_get_u8(tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]); + he_bss_color->disabled = + nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]); + he_bss_color->partial = + nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_PARTIAL]); + + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -4803,6 +4835,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return err; } + if (info->attrs[NL80211_ATTR_HE_BSS_COLOR]) { + err = nl80211_parse_he_bss_color( + info->attrs[NL80211_ATTR_HE_BSS_COLOR], + ¶ms.he_bss_color); + if (err) + return err; + } + nl80211_calculate_ap_params(¶ms); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3 From 1e61d82cca170465902003bfe445f0461e28208b Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Tue, 21 Jan 2020 10:12:13 +0200 Subject: cfg80211: add no HE indication to the channel flag The regulatory domain might forbid HE operation. Certain regulatory domains may restrict it for specific channels whereas others may do it for the whole regulatory domain. Add an option to indicate it in the channel flag. Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20200121081213.733757-1-luca@coelho.fi Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 3 +++ net/wireless/reg.c | 2 ++ 4 files changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fa027d0d031b..40f2a3a30e3d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -95,6 +95,7 @@ struct wiphy; * on this channel. * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted * on this channel. + * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel. * */ enum ieee80211_channel_flags { @@ -111,6 +112,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_IR_CONCURRENT = 1<<10, IEEE80211_CHAN_NO_20MHZ = 1<<11, IEEE80211_CHAN_NO_10MHZ = 1<<12, + IEEE80211_CHAN_NO_HE = 1<<13, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 809ef9165684..d996bac97e9d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3587,6 +3587,8 @@ enum nl80211_wmm_rule { * @NL80211_FREQUENCY_ATTR_WMM: this channel has wmm limitations. * This is a nested attribute that contains the wmm limitation per AC. * (see &enum nl80211_wmm_rule) + * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel + * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3616,6 +3618,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_20MHZ, NL80211_FREQUENCY_ATTR_NO_10MHZ, NL80211_FREQUENCY_ATTR_WMM, + NL80211_FREQUENCY_ATTR_NO_HE, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -3813,6 +3816,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_HT40PLUS: channels can't be used in HT40+ operation * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed * @NL80211_RRF_NO_160MHZ: 160MHz operation not allowed + * @NL80211_RRF_NO_HE: HE operation not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -3830,6 +3834,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_HT40PLUS = 1<<14, NL80211_RRF_NO_80MHZ = 1<<15, NL80211_RRF_NO_160MHZ = 1<<16, + NL80211_RRF_NO_HE = 1<<17, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 00f24d4c623e..d8cdbf07aeec 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -972,6 +972,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_10MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_10MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_HE) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 446c76d44e65..ea7bc5652a41 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1569,6 +1569,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_80MHZ; if (rd_flags & NL80211_RRF_NO_160MHZ) channel_flags |= IEEE80211_CHAN_NO_160MHZ; + if (rd_flags & NL80211_RRF_NO_HE) + channel_flags |= IEEE80211_CHAN_NO_HE; return channel_flags; } -- cgit v1.2.3 From d6039a3416f7af37c04f22c411f120ad46f51663 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 27 Jan 2020 02:00:32 +0530 Subject: cfg80211: Enhance the AKM advertizement to support per interface. Commit ab4dfa20534e ("cfg80211: Allow drivers to advertise supported AKM suites") introduces the support to advertize supported AKMs to userspace. This needs an enhancement to advertize the AKM support per interface type, specifically for the cfg80211-based drivers that implement SME and use different mechanisms to support the AKM's for each interface type (e.g., the support for SAE, OWE AKM's take different paths for such drivers on STA/AP mode). This commit aims the same and enhances the earlier mechanism of advertizing the AKMs per wiphy. Add new nl80211 attributes and data structure to provide supported AKMs per interface type to userspace. the AKMs advertized in akm_suites are default capabilities if not advertized for a specific interface type in iftype_akm_suites. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200126203032.21934-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 28 +++++++++++++++++++++++++++- include/uapi/linux/nl80211.h | 33 +++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 40f2a3a30e3d..c3a9b375d3ca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4400,6 +4400,21 @@ struct cfg80211_pmsr_capabilities { } ftm; }; +/** + * struct wiphy_iftype_akm_suites - This structure encapsulates supported akm + * suites for interface types defined in @iftypes_mask. Each type in the + * @iftypes_mask must be unique across all instances of iftype_akm_suites. + * + * @iftypes_mask: bitmask of interfaces types + * @akm_suites: points to an array of supported akm suites + * @n_akm_suites: number of supported AKM suites + */ +struct wiphy_iftype_akm_suites { + u16 iftypes_mask; + const u32 *akm_suites; + int n_akm_suites; +}; + /** * struct wiphy - wireless hardware description * @reg_notifier: the driver's regulatory notification callback, @@ -4412,8 +4427,16 @@ struct cfg80211_pmsr_capabilities { * @signal_type: signal type reported in &struct cfg80211_bss. * @cipher_suites: supported cipher suites * @n_cipher_suites: number of supported cipher suites - * @akm_suites: supported AKM suites + * @akm_suites: supported AKM suites. These are the default AKMs supported if + * the supported AKMs not advertized for a specific interface type in + * iftype_akm_suites. * @n_akm_suites: number of supported AKM suites + * @iftype_akm_suites: array of supported akm suites info per interface type. + * Note that the bits in @iftypes_mask inside this structure cannot + * overlap (i.e. only one occurrence of each type is allowed across all + * instances of iftype_akm_suites). + * @num_iftype_akm_suites: number of interface types for which supported akm + * suites are specified separately. * @retry_short: Retry limit for short frames (dot11ShortRetryLimit) * @retry_long: Retry limit for long frames (dot11LongRetryLimit) * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold); @@ -4620,6 +4643,9 @@ struct wiphy { int n_akm_suites; const u32 *akm_suites; + const struct wiphy_iftype_akm_suites *iftype_akm_suites; + unsigned int num_iftype_akm_suites; + u8 retry_short; u8 retry_long; u32 frag_threshold; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d996bac97e9d..350ab86fe20e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2402,6 +2402,13 @@ enum nl80211_commands { * * @NL80211_ATTR_HE_BSS_COLOR: nested attribute for BSS Color Settings. * + * @NL80211_ATTR_IFTYPE_AKM_SUITES: nested array attribute, with each entry + * using attributes from &enum nl80211_iftype_akm_attributes. This + * attribute is sent in a response to %NL80211_CMD_GET_WIPHY indicating + * supported AKM suites capability per interface. AKMs advertised in + * %NL80211_ATTR_AKM_SUITES are default capabilities if AKM suites not + * advertised for a specific interface type. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2868,6 +2875,8 @@ enum nl80211_attrs { NL80211_ATTR_HE_BSS_COLOR, + NL80211_ATTR_IFTYPE_AKM_SUITES, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -6619,4 +6628,28 @@ enum nl80211_bss_color_attributes { NL80211_HE_BSS_COLOR_ATTR_MAX = __NL80211_HE_BSS_COLOR_ATTR_LAST - 1, }; +/** + * enum nl80211_iftype_akm_attributes - interface type AKM attributes + * @__NL80211_IFTYPE_AKM_ATTR_INVALID: Invalid + * + * @NL80211_IFTYPE_AKM_ATTR_IFTYPES: nested attribute containing a flag + * attribute for each interface type that supports AKM suites specified in + * %NL80211_IFTYPE_AKM_ATTR_SUITES + * @NL80211_IFTYPE_AKM_ATTR_SUITES: an array of u32. Used to indicate supported + * AKM suites for the specified interface types. + * + * @__NL80211_IFTYPE_AKM_ATTR_LAST: Internal + * @NL80211_IFTYPE_AKM_ATTR_MAX: highest interface type AKM attribute. + */ +enum nl80211_iftype_akm_attributes { + __NL80211_IFTYPE_AKM_ATTR_INVALID, + + NL80211_IFTYPE_AKM_ATTR_IFTYPES, + NL80211_IFTYPE_AKM_ATTR_SUITES, + + /* keep last */ + __NL80211_IFTYPE_AKM_ATTR_LAST, + NL80211_IFTYPE_AKM_ATTR_MAX = __NL80211_IFTYPE_AKM_ATTR_LAST - 1, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d8cdbf07aeec..3ad937d0a256 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1896,6 +1896,46 @@ static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev, return 0; } +static int +nl80211_put_iftype_akm_suites(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) +{ + int i; + struct nlattr *nested, *nested_akms; + const struct wiphy_iftype_akm_suites *iftype_akms; + + if (!rdev->wiphy.num_iftype_akm_suites || + !rdev->wiphy.iftype_akm_suites) + return 0; + + nested = nla_nest_start(msg, NL80211_ATTR_IFTYPE_AKM_SUITES); + if (!nested) + return -ENOBUFS; + + for (i = 0; i < rdev->wiphy.num_iftype_akm_suites; i++) { + nested_akms = nla_nest_start(msg, i + 1); + if (!nested_akms) + return -ENOBUFS; + + iftype_akms = &rdev->wiphy.iftype_akm_suites[i]; + + if (nl80211_put_iftypes(msg, NL80211_IFTYPE_AKM_ATTR_IFTYPES, + iftype_akms->iftypes_mask)) + return -ENOBUFS; + + if (nla_put(msg, NL80211_IFTYPE_AKM_ATTR_SUITES, + sizeof(u32) * iftype_akms->n_akm_suites, + iftype_akms->akm_suites)) { + return -ENOBUFS; + } + nla_nest_end(msg, nested_akms); + } + + nla_nest_end(msg, nested); + + return 0; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -2454,6 +2494,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.akm_suites)) goto nla_put_failure; + if (nl80211_put_iftype_akm_suites(rdev, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From 8c3ed7aa2b9ef666195b789e9b02e28383243fa8 Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Wed, 15 Jan 2020 13:55:22 +0100 Subject: nl80211: add src and dst addr attributes for control port tx/rx When using control port over nl80211 in AP mode with pre-authentication, APs need to forward frames to other APs defined by their MAC address. Before this patch, pre-auth frames reaching user space over nl80211 control port have no longer any information about the dest attached, which can be used for forwarding to a controller or injecting the frame back to a ethernet interface over a AF_PACKET socket. Analog problems exist, when forwarding pre-auth frames from AP -> STA. This patch therefore adds the NL80211_ATTR_DST_MAC and NL80211_ATTR_SRC_MAC attributes to provide more context information when forwarding. The respective arguments are optional on tx and included on rx. Therefore unaware existing software is not affected. Software which wants to detect this feature, can do so by checking against: NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_MAC_ADDRS Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200115125522.3755-1-markus.theil@tu-ilmenau.de [split into separate cfg80211/mac80211 patches] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 ++- include/uapi/linux/nl80211.h | 16 +++++++++++++++- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/tx.c | 3 ++- net/wireless/nl80211.c | 18 +++++++++++++++--- net/wireless/rdev-ops.h | 8 ++++---- net/wireless/trace.h | 27 +++++++++++++++++---------- 7 files changed, 57 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c3a9b375d3ca..ec0de3c43c61 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3969,7 +3969,8 @@ struct cfg80211_ops { int (*tx_control_port)(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, const __be16 proto, + const u8 *dest, const u8 *src, + const __be16 proto, const bool noencrypt); int (*get_ftm_responder_stats)(struct wiphy *wiphy, diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 350ab86fe20e..158bccb4a47b 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1039,11 +1039,14 @@ * a control port frame and as a notification that a control port frame * has been received. %NL80211_ATTR_FRAME is used to specify the * frame contents. The frame is the raw EAPoL data, without ethernet or - * 802.11 headers. + * 802.11 headers. An optional %NL80211_ATTR_SRC_MAC can be used to send + * pre-auth frames to STAs on behalf of other APs. * When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added * indicating the protocol type of the received frame; whether the frame * was received unencrypted and the MAC address of the peer respectively. + * %NL80211_ATTR_DST_MAC can be used to forward pre-auth frames in + * userspace while using AP mode. * * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. * @@ -2409,6 +2412,9 @@ enum nl80211_commands { * %NL80211_ATTR_AKM_SUITES are default capabilities if AKM suites not * advertised for a specific interface type. * + * @NL80211_ATTR_SRC_MAC: MAC address used in control port over nl80211 transmit + * @NL80211_ATTR_DST_MAC: MAC address used in control port over nl80211 receive + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2877,6 +2883,9 @@ enum nl80211_attrs { NL80211_ATTR_IFTYPE_AKM_SUITES, + NL80211_ATTR_SRC_MAC, + NL80211_ATTR_DST_MAC, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5539,6 +5548,10 @@ enum nl80211_feature_flags { * feature, which prevents bufferbloat by using the expected transmission * time to limit the amount of data buffered in the hardware. * + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_MAC_ADDRS: The driver + * can use src and dst MAC addresses with control port over nl80211 rx + * and tx operations. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5586,6 +5599,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SAE_OFFLOAD, NL80211_EXT_FEATURE_VLAN_OFFLOAD, NL80211_EXT_FEATURE_AQL, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_MAC_ADDRS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8a49d78ad7c9..da9eaa9ee37e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1792,7 +1792,8 @@ void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted); + const u8 *dest, const u8 *src, __be16 proto, + bool unencrypted); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 4296d9d71311..059c66b6bb5c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5285,7 +5285,8 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted) + const u8 *dest, const u8 *src, __be16 proto, + bool unencrypted) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4c0ea54e0f59..33fe6ac1c242 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -633,6 +633,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), [NL80211_ATTR_HE_BSS_COLOR] = NLA_POLICY_NESTED(he_bss_color_policy), + [NL80211_ATTR_SRC_MAC] = NLA_POLICY_ETH_ADDR, + [NL80211_ATTR_DST_MAC] = NLA_POLICY_ETH_ADDR, }; /* policy for the key attributes */ @@ -13694,6 +13696,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) const u8 *buf; size_t len; u8 *dest; + u8 src[ETH_ALEN]; u16 proto; bool noencrypt; int err; @@ -13731,6 +13734,13 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) goto out; } + /* copy src address under wdev_lock, as we may copy wdev_address */ + if (info->attrs[NL80211_ATTR_SRC_MAC]) + ether_addr_copy(src, + nla_data(info->attrs[NL80211_ATTR_SRC_MAC])); + else + ether_addr_copy(src, wdev_address(wdev)); + wdev_unlock(wdev); buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); @@ -13741,7 +13751,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); return rdev_tx_control_port(rdev, dev, buf, len, - dest, cpu_to_be16(proto), noencrypt); + dest, src, cpu_to_be16(proto), noencrypt); out: wdev_unlock(wdev); @@ -15996,7 +16006,8 @@ static int __nl80211_rx_control_port(struct net_device *dev, struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ethhdr *ehdr = eth_hdr(skb); - const u8 *addr = ehdr->h_source; + const u8 *daddr = ehdr->h_dest; + const u8 *saddr = ehdr->h_source; u16 proto = be16_to_cpu(skb->protocol); struct sk_buff *msg; void *hdr; @@ -16021,7 +16032,8 @@ static int __nl80211_rx_control_port(struct net_device *dev, nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, saddr) || + nla_put(msg, NL80211_ATTR_DST_MAC, ETH_ALEN, daddr) || nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) || (unencrypted && nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index e853a4fe6f97..39e6c1db3092 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -730,14 +730,14 @@ static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, - const u8 *dest, __be16 proto, - const bool noencrypt) + const u8 *dest, const u8 *src, + __be16 proto, const bool noencrypt) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); + dest, src, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); + dest, src, proto, noencrypt); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index d98ad2b3143b..fefa255fd062 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1923,27 +1923,31 @@ TRACE_EVENT(rdev_mgmt_tx, TRACE_EVENT(rdev_tx_control_port, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - const u8 *buf, size_t len, const u8 *dest, __be16 proto, + const u8 *buf, size_t len, + const u8 *dest, const u8 *src, __be16 proto, bool unencrypted), - TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted), + TP_ARGS(wiphy, netdev, buf, len, dest, src, proto, unencrypted), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dest) - __field(__be16, proto) + MAC_ENTRY(src) + __field(u16, proto) __field(bool, unencrypted) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(dest, dest); - __entry->proto = proto; + MAC_ASSIGN(src, src); + __entry->proto = be16_to_cpu(proto); __entry->unencrypted = unencrypted; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT "," - " proto: 0x%x, unencrypted: %s", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest), - be16_to_cpu(__entry->proto), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", dest: " MAC_PR_FMT + ", src: " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, + MAC_PR_ARG(dest), MAC_PR_ARG(src), + __entry->proto, BOOL_TO_STR(__entry->unencrypted)) ); @@ -2835,6 +2839,7 @@ TRACE_EVENT(cfg80211_rx_control_port, TP_STRUCT__entry( NETDEV_ENTRY __field(int, len) + MAC_ENTRY(to) MAC_ENTRY(from) __field(u16, proto) __field(bool, unencrypted) @@ -2842,12 +2847,14 @@ TRACE_EVENT(cfg80211_rx_control_port, TP_fast_assign( NETDEV_ASSIGN; __entry->len = skb->len; + MAC_ASSIGN(to, eth_hdr(skb)->h_dest); MAC_ASSIGN(from, eth_hdr(skb)->h_source); __entry->proto = be16_to_cpu(skb->protocol); __entry->unencrypted = unencrypted; ), - TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", - NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from), + TP_printk(NETDEV_PR_FMT ", len=%d, dest: " MAC_PR_FMT + ", src: " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", + NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(to), MAC_PR_ARG(from), __entry->proto, BOOL_TO_STR(__entry->unencrypted)) ); -- cgit v1.2.3 From ca4b43c14cd88d28cfc6467d2fa075aad6818f1d Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Sat, 1 Feb 2020 14:13:44 +0800 Subject: usb: charger: assign specific number for enum value To work properly on every architectures and compilers, the enum value needs to be specific numbers. Suggested-by: Greg KH Signed-off-by: Peter Chen Link: https://lore.kernel.org/r/1580537624-10179-1-git-send-email-peter.chen@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/charger.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/charger.h b/include/uapi/linux/usb/charger.h index 5f72af35b3ed..ad22079125bf 100644 --- a/include/uapi/linux/usb/charger.h +++ b/include/uapi/linux/usb/charger.h @@ -14,18 +14,18 @@ * ACA (Accessory Charger Adapters) */ enum usb_charger_type { - UNKNOWN_TYPE, - SDP_TYPE, - DCP_TYPE, - CDP_TYPE, - ACA_TYPE, + UNKNOWN_TYPE = 0, + SDP_TYPE = 1, + DCP_TYPE = 2, + CDP_TYPE = 3, + ACA_TYPE = 4, }; /* USB charger state */ enum usb_charger_state { - USB_CHARGER_DEFAULT, - USB_CHARGER_PRESENT, - USB_CHARGER_ABSENT, + USB_CHARGER_DEFAULT = 0, + USB_CHARGER_PRESENT = 1, + USB_CHARGER_ABSENT = 2, }; #endif /* _UAPI__LINUX_USB_CHARGER_H */ -- cgit v1.2.3 From bbfd5e4fab63703375eafaf241a0c696024a59e1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 27 Jan 2020 08:53:54 -0800 Subject: perf/core: Add new branch sample type for HW index of raw branch records The low level index is the index in the underlying hardware buffer of the most recently captured taken branch which is always saved in branch_entries[0]. It is very useful for reconstructing the call stack. For example, in Intel LBR call stack mode, the depth of reconstructed LBR call stack limits to the number of LBR registers. With the low level index information, perf tool may stitch the stacks of two samples. The reconstructed LBR call stack can break the HW limitation. Add a new branch sample type to retrieve low level index of raw branch records. The low level index is between -1 (unknown) and max depth which can be retrieved in /sys/devices/cpu/caps/branches. Only when the new branch sample type is set, the low level index information is dumped into the PERF_SAMPLE_BRANCH_STACK output. Perf tool should check the attr.branch_sample_type, and apply the corresponding format for PERF_SAMPLE_BRANCH_STACK samples. Otherwise, some user case may be broken. For example, users may parse a perf.data, which include the new branch sample type, with an old version perf tool (without the check). Users probably get incorrect information without any warning. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200127165355.27495-2-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/intel/lbr.c | 3 +++ include/linux/perf_event.h | 12 ++++++++++++ include/uapi/linux/perf_event.h | 8 +++++++- kernel/events/core.c | 10 ++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 3086055bf681..3dcfecf858f3 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -518,6 +518,7 @@ static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events * } } cpuhw->bhrb_stack.nr = u_index; + cpuhw->bhrb_stack.hw_idx = -1ULL; return; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 534c76606049..7639e2097101 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -585,6 +585,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.hw_idx = -1ULL; } /* @@ -680,6 +681,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) out++; } cpuc->lbr_stack.nr = out; + cpuc->lbr_stack.hw_idx = -1ULL; } void intel_pmu_lbr_read(void) @@ -1120,6 +1122,7 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) int i; cpuc->lbr_stack.nr = x86_pmu.lbr_nr; + cpuc->lbr_stack.hw_idx = -1ULL; for (i = 0; i < x86_pmu.lbr_nr; i++) { u64 info = lbr->lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 547773f5894e..68e21e828893 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -93,14 +93,26 @@ struct perf_raw_record { /* * branch stack layout: * nr: number of taken branches stored in entries[] + * hw_idx: The low level index of raw branch records + * for the most recent branch. + * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. + * The entries[] is an abstraction of raw branch records, + * which may not be stored in age order in HW, e.g. Intel LBR. + * The hw_idx is to expose the low level index of raw + * branch record for the most recent branch aka entries[0]. + * The hw_idx index is between -1 (unknown) and max depth, + * which can be retrieved in /sys/devices/cpu/caps/branches. + * For the architectures whose raw branch records are + * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; + __u64 hw_idx; struct perf_branch_entry entries[0]; }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 377d794d3105..397cfd65b3fe 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -181,6 +181,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -208,6 +210,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -853,7 +857,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 nr; - * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER diff --git a/kernel/events/core.c b/kernel/events/core.c index e453589da97c..3f1f77de7247 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6555,6 +6555,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6643,6 +6648,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6836,6 +6843,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } -- cgit v1.2.3 From 51c1064e82e77b39a49889287ca50709303e2f26 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 22 Nov 2019 15:19:21 +0100 Subject: gpiolib: add new ioctl() for monitoring changes in line info Currently there is no way for user-space to be informed about changes in status of GPIO lines e.g. when someone else requests the line or its config changes. We can only periodically re-read the line-info. This is fine for simple one-off user-space tools, but any daemon that provides a centralized access to GPIO chips would benefit hugely from an event driven line info synchronization. This patch adds a new ioctl() that allows user-space processes to reuse the file descriptor associated with the character device for watching any changes in line properties. Every such event contains the updated line information. Currently the events are generated on three types of status changes: when a line is requested, when it's released and when its config is changed. The first two are self-explanatory. For the third one: this will only happen when another user-space process calls the new SET_CONFIG ioctl() as any changes that can happen from within the kernel (i.e. set_transitory() or set_debounce()) are of no interest to user-space. Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij --- drivers/gpio/gpiolib.c | 186 ++++++++++++++++++++++++++++++++++++++++++++-- drivers/gpio/gpiolib.h | 1 + include/uapi/linux/gpio.h | 30 ++++++++ 3 files changed, 209 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 443321f9cf63..f73077f26eff 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -546,6 +546,9 @@ static long linehandle_set_config(struct linehandle_state *lh, if (ret) return ret; } + + atomic_notifier_call_chain(&desc->gdev->notifier, + GPIOLINE_CHANGED_CONFIG, desc); } return 0; } @@ -1201,14 +1204,25 @@ static void gpio_desc_to_lineinfo(struct gpio_desc *desc, spin_unlock_irqrestore(&gpio_lock, flags); } +struct gpio_chardev_data { + struct gpio_device *gdev; + wait_queue_head_t wait; + DECLARE_KFIFO(events, struct gpioline_info_changed, 32); + struct notifier_block lineinfo_changed_nb; + unsigned long *watched_lines; +}; + /* * gpio_ioctl() - ioctl handler for the GPIO chardev */ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct gpio_device *gdev = filp->private_data; + struct gpio_chardev_data *priv = filp->private_data; + struct gpio_device *gdev = priv->gdev; struct gpio_chip *chip = gdev->chip; void __user *ip = (void __user *)arg; + struct gpio_desc *desc; + __u32 offset; /* We fail any subsequent ioctl():s when the chip is gone */ if (!chip) @@ -1230,9 +1244,9 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (copy_to_user(ip, &chipinfo, sizeof(chipinfo))) return -EFAULT; return 0; - } else if (cmd == GPIO_GET_LINEINFO_IOCTL) { + } else if (cmd == GPIO_GET_LINEINFO_IOCTL || + cmd == GPIO_GET_LINEINFO_WATCH_IOCTL) { struct gpioline_info lineinfo; - struct gpio_desc *desc; if (copy_from_user(&lineinfo, ip, sizeof(lineinfo))) return -EFAULT; @@ -1245,11 +1259,25 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (copy_to_user(ip, &lineinfo, sizeof(lineinfo))) return -EFAULT; + + if (cmd == GPIO_GET_LINEINFO_WATCH_IOCTL) + set_bit(desc_to_gpio(desc), priv->watched_lines); + return 0; } else if (cmd == GPIO_GET_LINEHANDLE_IOCTL) { return linehandle_create(gdev, ip); } else if (cmd == GPIO_GET_LINEEVENT_IOCTL) { return lineevent_create(gdev, ip); + } else if (cmd == GPIO_GET_LINEINFO_UNWATCH_IOCTL) { + if (copy_from_user(&offset, ip, sizeof(offset))) + return -EFAULT; + + desc = gpiochip_get_desc(chip, offset); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + clear_bit(desc_to_gpio(desc), &desc->flags); + return 0; } return -EINVAL; } @@ -1262,6 +1290,101 @@ static long gpio_ioctl_compat(struct file *filp, unsigned int cmd, } #endif +static struct gpio_chardev_data * +to_gpio_chardev_data(struct notifier_block *nb) +{ + return container_of(nb, struct gpio_chardev_data, lineinfo_changed_nb); +} + +static int lineinfo_changed_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct gpio_chardev_data *priv = to_gpio_chardev_data(nb); + struct gpioline_info_changed chg; + struct gpio_desc *desc = data; + int ret; + + if (!test_bit(desc_to_gpio(desc), priv->watched_lines)) + return NOTIFY_DONE; + + memset(&chg, 0, sizeof(chg)); + chg.info.line_offset = gpio_chip_hwgpio(desc); + chg.event_type = action; + chg.timestamp = ktime_get_ns(); + gpio_desc_to_lineinfo(desc, &chg.info); + + ret = kfifo_in_spinlocked(&priv->events, &chg, 1, &priv->wait.lock); + if (ret) + wake_up_poll(&priv->wait, EPOLLIN); + else + pr_debug_ratelimited("lineinfo event FIFO is full - event dropped\n"); + + return NOTIFY_OK; +} + +static __poll_t lineinfo_watch_poll(struct file *filep, + struct poll_table_struct *pollt) +{ + struct gpio_chardev_data *priv = filep->private_data; + __poll_t events = 0; + + poll_wait(filep, &priv->wait, pollt); + + if (!kfifo_is_empty_spinlocked_noirqsave(&priv->events, + &priv->wait.lock)) + events = EPOLLIN | EPOLLRDNORM; + + return events; +} + +static ssize_t lineinfo_watch_read(struct file *filep, char __user *buf, + size_t count, loff_t *off) +{ + struct gpio_chardev_data *priv = filep->private_data; + struct gpioline_info_changed event; + ssize_t bytes_read = 0; + int ret; + + if (count < sizeof(event)) + return -EINVAL; + + do { + spin_lock(&priv->wait.lock); + if (kfifo_is_empty(&priv->events)) { + if (bytes_read) { + spin_unlock(&priv->wait.lock); + return bytes_read; + } + + if (filep->f_flags & O_NONBLOCK) { + spin_unlock(&priv->wait.lock); + return -EAGAIN; + } + + ret = wait_event_interruptible_locked(priv->wait, + !kfifo_is_empty(&priv->events)); + if (ret) { + spin_unlock(&priv->wait.lock); + return ret; + } + } + + ret = kfifo_out(&priv->events, &event, 1); + spin_unlock(&priv->wait.lock); + if (ret != 1) { + ret = -EIO; + break; + /* We should never get here. See lineevent_read(). */ + } + + if (copy_to_user(buf + bytes_read, &event, sizeof(event))) + return -EFAULT; + bytes_read += sizeof(event); + } while (count >= bytes_read + sizeof(event)); + + return bytes_read; +} + /** * gpio_chrdev_open() - open the chardev for ioctl operations * @inode: inode for this chardev @@ -1272,14 +1395,48 @@ static int gpio_chrdev_open(struct inode *inode, struct file *filp) { struct gpio_device *gdev = container_of(inode->i_cdev, struct gpio_device, chrdev); + struct gpio_chardev_data *priv; + int ret = -ENOMEM; /* Fail on open if the backing gpiochip is gone */ if (!gdev->chip) return -ENODEV; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->watched_lines = bitmap_zalloc(gdev->chip->ngpio, GFP_KERNEL); + if (!priv->watched_lines) + goto out_free_priv; + + init_waitqueue_head(&priv->wait); + INIT_KFIFO(priv->events); + priv->gdev = gdev; + + priv->lineinfo_changed_nb.notifier_call = lineinfo_changed_notify; + ret = atomic_notifier_chain_register(&gdev->notifier, + &priv->lineinfo_changed_nb); + if (ret) + goto out_free_bitmap; + get_device(&gdev->dev); - filp->private_data = gdev; + filp->private_data = priv; - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret) + goto out_unregister_notifier; + + return ret; + +out_unregister_notifier: + atomic_notifier_chain_unregister(&gdev->notifier, + &priv->lineinfo_changed_nb); +out_free_bitmap: + bitmap_free(priv->watched_lines); +out_free_priv: + kfree(priv); + return ret; } /** @@ -1290,17 +1447,23 @@ static int gpio_chrdev_open(struct inode *inode, struct file *filp) */ static int gpio_chrdev_release(struct inode *inode, struct file *filp) { - struct gpio_device *gdev = container_of(inode->i_cdev, - struct gpio_device, chrdev); + struct gpio_chardev_data *priv = filp->private_data; + struct gpio_device *gdev = priv->gdev; + bitmap_free(priv->watched_lines); + atomic_notifier_chain_unregister(&gdev->notifier, + &priv->lineinfo_changed_nb); put_device(&gdev->dev); + kfree(priv); + return 0; } - static const struct file_operations gpio_fileops = { .release = gpio_chrdev_release, .open = gpio_chrdev_open, + .poll = lineinfo_watch_poll, + .read = lineinfo_watch_read, .owner = THIS_MODULE, .llseek = no_llseek, .unlocked_ioctl = gpio_ioctl, @@ -1511,6 +1674,8 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, spin_unlock_irqrestore(&gpio_lock, flags); + ATOMIC_INIT_NOTIFIER_HEAD(&gdev->notifier); + #ifdef CONFIG_PINCTRL INIT_LIST_HEAD(&gdev->pin_ranges); #endif @@ -2843,6 +3008,8 @@ static int gpiod_request_commit(struct gpio_desc *desc, const char *label) } done: spin_unlock_irqrestore(&gpio_lock, flags); + atomic_notifier_call_chain(&desc->gdev->notifier, + GPIOLINE_CHANGED_REQUESTED, desc); return ret; } @@ -2940,6 +3107,9 @@ static bool gpiod_free_commit(struct gpio_desc *desc) } spin_unlock_irqrestore(&gpio_lock, flags); + atomic_notifier_call_chain(&desc->gdev->notifier, + GPIOLINE_CHANGED_RELEASED, desc); + return ret; } diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 3e0aab2945d8..5ab90746b519 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -56,6 +56,7 @@ struct gpio_device { const char *label; void *data; struct list_head list; + struct atomic_notifier_head notifier; #ifdef CONFIG_PINCTRL /* diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 799cf823d493..dca320764e4d 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -59,6 +59,34 @@ struct gpioline_info { /* Maximum number of requested handles */ #define GPIOHANDLES_MAX 64 +/* Possible line status change events */ +enum { + GPIOLINE_CHANGED_REQUESTED = 1, + GPIOLINE_CHANGED_RELEASED, + GPIOLINE_CHANGED_CONFIG, +}; + +/** + * struct gpioline_info_changed - Information about a change in status + * of a GPIO line + * @info: updated line information + * @timestamp: estimate of time of status change occurrence, in nanoseconds + * and GPIOLINE_CHANGED_CONFIG + * @event_type: one of GPIOLINE_CHANGED_REQUESTED, GPIOLINE_CHANGED_RELEASED + * + * Note: struct gpioline_info embedded here has 32-bit alignment on its own, + * but it works fine with 64-bit alignment too. With its 72 byte size, we can + * guarantee there are no implicit holes between it and subsequent members. + * The 20-byte padding at the end makes sure we don't add any implicit padding + * at the end of the structure on 64-bit architectures. + */ +struct gpioline_info_changed { + struct gpioline_info info; + __u64 timestamp; + __u32 event_type; + __u32 padding[5]; /* for future use */ +}; + /* Linerequest flags */ #define GPIOHANDLE_REQUEST_INPUT (1UL << 0) #define GPIOHANDLE_REQUEST_OUTPUT (1UL << 1) @@ -176,6 +204,8 @@ struct gpioevent_data { #define GPIO_GET_CHIPINFO_IOCTL _IOR(0xB4, 0x01, struct gpiochip_info) #define GPIO_GET_LINEINFO_IOCTL _IOWR(0xB4, 0x02, struct gpioline_info) +#define GPIO_GET_LINEINFO_WATCH_IOCTL _IOWR(0xB4, 0x0b, struct gpioline_info) +#define GPIO_GET_LINEINFO_UNWATCH_IOCTL _IOWR(0xB4, 0x0c, __u32) #define GPIO_GET_LINEHANDLE_IOCTL _IOWR(0xB4, 0x03, struct gpiohandle_request) #define GPIO_GET_LINEEVENT_IOCTL _IOWR(0xB4, 0x04, struct gpioevent_request) -- cgit v1.2.3 From ef2c41cf38a7559bbf91af42d5b6a4429db8fc68 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:22 +0100 Subject: clone3: allow spawning processes into cgroups This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_GROUP does not need to grab the write side of the cgroup cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Johannes Weiner Cc: Li Zefan Cc: Peter Zijlstra Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 5 +- include/linux/cgroup.h | 20 +++-- include/linux/sched/task.h | 4 + include/uapi/linux/sched.h | 5 ++ kernel/cgroup/cgroup.c | 191 ++++++++++++++++++++++++++++++++++++++------ kernel/cgroup/pids.c | 15 +++- kernel/fork.c | 13 ++- 7 files changed, 214 insertions(+), 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 63097cb243cb..68c391f451d1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -628,8 +628,9 @@ struct cgroup_subsys { void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); - int (*can_fork)(struct task_struct *task); - void (*cancel_fork)(struct task_struct *task); + int (*can_fork)(struct task_struct *task, + struct css_set *cset); + void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f1219b927817..4598e4da6b1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,8 @@ #include +struct kernel_clone_args; + #ifdef CONFIG_CGROUPS /* @@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); -extern int cgroup_can_fork(struct task_struct *p); -extern void cgroup_cancel_fork(struct task_struct *p); -extern void cgroup_post_fork(struct task_struct *p); +extern int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); @@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} -static inline int cgroup_can_fork(struct task_struct *p) { return 0; } -static inline void cgroup_cancel_fork(struct task_struct *p) {} -static inline void cgroup_post_fork(struct task_struct *p) {} +static inline int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs) { return 0; } +static inline void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} +static inline void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index f1879884238e..38359071236a 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -13,6 +13,7 @@ struct task_struct; struct rusage; union thread_union; +struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL @@ -29,6 +30,9 @@ struct kernel_clone_args { pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; + int cgroup; + struct cgroup *cgrp; + struct css_set *cset; }; /* diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 2e3bc22c6f20..3bac0a8ceab2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -35,6 +35,7 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 @@ -81,6 +82,8 @@ * @set_tid_size: This defines the size of the array referenced * in @set_tid. This cannot be larger than the * kernel's limit of nested PID namespaces. + * @cgroup: If CLONE_INTO_CGROUP is specified set this to + * a file descriptor for the cgroup. * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and @@ -97,11 +100,13 @@ struct clone_args { __aligned_u64 tls; __aligned_u64 set_tid; __aligned_u64 set_tid_size; + __aligned_u64 cgroup; }; #endif #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ /* * Scheduling policies diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6d8bdddd8c28..9a8a5ded3c48 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5881,8 +5881,7 @@ out: * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() - * attaches it to the parent's css_set. Empty cg_list indicates that - * @child isn't holding reference to its css_set. + * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { @@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f) return cgrp; } +/** + * cgroup_css_set_fork - find or create a css_set for a child process + * @kargs: the arguments passed to create the child process + * + * This functions finds or creates a new css_set which the child + * process will be attached to in cgroup_post_fork(). By default, + * the child process will be given the same css_set as its parent. + * + * If CLONE_INTO_CGROUP is specified this function will try to find an + * existing css_set which includes the requested cgroup and if not create + * a new css_set that the child will be attached to later. If this function + * succeeds it will hold cgroup_threadgroup_rwsem on return. If + * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex + * before grabbing cgroup_threadgroup_rwsem and will hold a reference + * to the target cgroup. + */ +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) +{ + int ret; + struct cgroup *dst_cgrp = NULL; + struct css_set *cset; + struct super_block *sb; + struct file *f; + + if (kargs->flags & CLONE_INTO_CGROUP) + mutex_lock(&cgroup_mutex); + + cgroup_threadgroup_change_begin(current); + + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + if (!(kargs->flags & CLONE_INTO_CGROUP)) { + kargs->cset = cset; + return 0; + } + + f = fget_raw(kargs->cgroup); + if (!f) { + ret = -EBADF; + goto err; + } + sb = f->f_path.dentry->d_sb; + + dst_cgrp = cgroup_get_from_file(f); + if (IS_ERR(dst_cgrp)) { + ret = PTR_ERR(dst_cgrp); + dst_cgrp = NULL; + goto err; + } + + if (cgroup_is_dead(dst_cgrp)) { + ret = -ENODEV; + goto err; + } + + /* + * Verify that we the target cgroup is writable for us. This is + * usually done by the vfs layer but since we're not going through + * the vfs layer here we need to do it "manually". + */ + ret = cgroup_may_write(dst_cgrp, sb); + if (ret) + goto err; + + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, + !(kargs->flags & CLONE_THREAD)); + if (ret) + goto err; + + kargs->cset = find_css_set(cset, dst_cgrp); + if (!kargs->cset) { + ret = -ENOMEM; + goto err; + } + + put_css_set(cset); + fput(f); + kargs->cgrp = dst_cgrp; + return ret; + +err: + cgroup_threadgroup_change_end(current); + mutex_unlock(&cgroup_mutex); + if (f) + fput(f); + if (dst_cgrp) + cgroup_put(dst_cgrp); + put_css_set(cset); + if (kargs->cset) + put_css_set(kargs->cset); + return ret; +} + +/** + * cgroup_css_set_put_fork - drop references we took during fork + * @kargs: the arguments passed to create the child process + * + * Drop references to the prepared css_set and target cgroup if + * CLONE_INTO_CGROUP was requested. + */ +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) +{ + cgroup_threadgroup_change_end(current); + + if (kargs->flags & CLONE_INTO_CGROUP) { + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; + + mutex_unlock(&cgroup_mutex); + + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + + if (cgrp) { + cgroup_put(cgrp); + kargs->cgrp = NULL; + } + } +} + /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process * + * This prepares a new css_set for the child process which the child will + * be attached to in cgroup_post_fork(). * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() * callback returns an error, the fork aborts with that error code. This * allows for a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child) - __acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem) +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; - cgroup_threadgroup_change_begin(current); + ret = cgroup_css_set_fork(kargs); + if (ret) + return ret; do_each_subsys_mask(ss, i, have_canfork_callback) { - ret = ss->can_fork(child); + ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); @@ -5937,32 +6066,34 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); } - cgroup_threadgroup_change_end(current); + cgroup_css_set_put_fork(kargs); return ret; } /** - * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() - * @child: the child process - * - * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded. - */ -void cgroup_cancel_fork(struct task_struct *child) - __releases(&cgroup_threadgroup_rwsem) + * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() + * @child: the child process + * @kargs: the arguments passed to create the child process + * + * This calls the cancel_fork() callbacks if a fork failed *after* + * cgroup_can_fork() succeded and cleans up references we took to + * prepare a new css_set for the child process in cgroup_can_fork(). + */ +void cgroup_cancel_fork(struct task_struct *child, + struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); - cgroup_threadgroup_change_end(current); + cgroup_css_set_put_fork(kargs); } /** @@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child) * Attach the child process to its css_set calling the subsystem fork() * callbacks. */ -void cgroup_post_fork(struct task_struct *child) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_post_fork(struct task_struct *child, + struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { struct cgroup_subsys *ss; struct css_set *cset; int i; + cset = kargs->cset; + kargs->cset = NULL; + spin_lock_irq(&css_set_lock); /* init tasks are special, only link regular threads */ if (likely(child->pid)) { WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); + } else { + put_css_set(cset); + cset = NULL; } /* @@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child) ss->fork(child); } while_each_subsys_mask(); - cgroup_threadgroup_change_end(current); + /* Make the new cset the root_cset of the new cgroup namespace. */ + if (kargs->flags & CLONE_NEWCGROUP) { + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; + + get_css_set(cset); + child->nsproxy->cgroup_ns->root_cset = cset; + put_css_set(rcset); + } + + cgroup_css_set_put_fork(kargs); } /** diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 138059eb730d..511af87f685e 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -33,6 +33,7 @@ #include #include #include +#include #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" @@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task) +static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; int err; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); err = pids_try_charge(pids, 1); if (err) { @@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task) return err; } -static void pids_cancel_fork(struct task_struct *task) +static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - css = task_css_check(current, pids_cgrp_id, true); + if (cset) + css = cset->subsys[pids_cgrp_id]; + else + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); pids_uncharge(pids, 1); } diff --git a/kernel/fork.c b/kernel/fork.c index 9245b6e53f55..635d6369dfb9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process( * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p); + retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd; @@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); + cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process( bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - cgroup_cancel_fork(p); + cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); @@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, !valid_signal(args.exit_signal))) return -EINVAL; + if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0) + return -EINVAL; + *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), @@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, + .cgroup = args.cgroup, }; if (args.set_tid && @@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ - if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) + if (kargs->flags & + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* -- cgit v1.2.3 From c8856c051454909e5059df4e81c77b9c366c5515 Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Fri, 14 Feb 2020 15:30:49 -0800 Subject: tcp-zerocopy: Return inq along with tcp receive zerocopy. This patchset is intended to reduce the number of extra system calls imposed by TCP receive zerocopy. For ping-pong RPC style workloads, this patchset has demonstrated a system call reduction of about 30% when coupled with userspace changes. For applications using edge-triggered epoll, returning inq along with the result of tcp receive zerocopy could remove the need to call recvmsg()=-EAGAIN after a successful zerocopy. Generally speaking, since normally we would need to perform a recvmsg() call for every successful small RPC read via TCP receive zerocopy, returning inq can reduce the number of system calls performed by approximately half. Signed-off-by: Arjun Roy Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index fd9eb8f6bcae..548f480b9c66 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -345,5 +345,6 @@ struct tcp_zerocopy_receive { __u64 address; /* in: address of mapping */ __u32 length; /* in/out: number of bytes to map/mapped */ __u32 recv_skip_hint; /* out: amount of bytes to skip */ + __u32 inq; /* out: amount of bytes in read queue */ }; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eb2d80519f8e..a697f1455f8d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3667,13 +3667,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (get_user(len, optlen)) return -EFAULT; - if (len != sizeof(zc)) + if (len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; + if (len > sizeof(zc)) + len = sizeof(zc); if (copy_from_user(&zc, optval, len)) return -EFAULT; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc); release_sock(sk); + switch (len) { + case sizeof(zc): + case offsetofend(struct tcp_zerocopy_receive, inq): + goto zerocopy_rcv_inq; + case offsetofend(struct tcp_zerocopy_receive, length): + default: + goto zerocopy_rcv_out; + } +zerocopy_rcv_inq: + zc.inq = tcp_inq_hint(sk); +zerocopy_rcv_out: if (!err && copy_to_user(optval, &zc, len)) err = -EFAULT; return err; -- cgit v1.2.3 From 33946518d493cdf10aedb4a483f1aa41948a3dab Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Fri, 14 Feb 2020 15:30:50 -0800 Subject: tcp-zerocopy: Return sk_err (if set) along with tcp receive zerocopy. This patchset is intended to reduce the number of extra system calls imposed by TCP receive zerocopy. For ping-pong RPC style workloads, this patchset has demonstrated a system call reduction of about 30% when coupled with userspace changes. For applications using epoll, returning sk_err along with the result of tcp receive zerocopy could remove the need to call recvmsg()=-EAGAIN after a spurious wakeup. Consider a multi-threaded application using epoll. A thread may awaken with EPOLLIN but another thread may already be reading. The spuriously-awoken thread does not necessarily know that another thread 'won'; rather, it may be possible that it was woken up due to the presence of an error if there is no data. A zerocopy read receiving 0 bytes thus would need to be followed up by recvmsg to be sure. Instead, we return sk_err directly with zerocopy, so the application can avoid this extra system call. Signed-off-by: Arjun Roy Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 548f480b9c66..1a7fc856e237 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -346,5 +346,6 @@ struct tcp_zerocopy_receive { __u32 length; /* in/out: number of bytes to map/mapped */ __u32 recv_skip_hint; /* out: amount of bytes to skip */ __u32 inq; /* out: amount of bytes in read queue */ + __s32 err; /* out: socket error */ }; #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a697f1455f8d..1b685485a5b5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3676,14 +3676,20 @@ static int do_tcp_getsockopt(struct sock *sk, int level, lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc); release_sock(sk); + if (len == sizeof(zc)) + goto zerocopy_rcv_sk_err; switch (len) { - case sizeof(zc): + case offsetofend(struct tcp_zerocopy_receive, err): + goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): goto zerocopy_rcv_inq; case offsetofend(struct tcp_zerocopy_receive, length): default: goto zerocopy_rcv_out; } +zerocopy_rcv_sk_err: + if (!err) + zc.err = sock_error(sk); zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: -- cgit v1.2.3 From 744676e777207f4992ba4cc728a8a71352963c5b Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Sat, 15 Feb 2020 14:20:56 +0100 Subject: openvswitch: add TTL decrement action New action to decrement TTL instead of setting it to a fixed value. This action will decrement the TTL and, in case of expired TTL, drop it or execute an action passed via a nested attribute. The default TTL expired action is to drop the packet. Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively. Tested with a corresponding change in the userspace: # ovs-dpctl dump-flows in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},1 in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl{ttl<=1 action:(drop)},2 in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2 in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1 # ping -c1 192.168.0.2 -t 42 IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84) 192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64 # ping -c1 192.168.0.2 -t 120 IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84) 192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64 # ping -c1 192.168.0.2 -t 1 # Co-developed-by: Bindiya Kurle Signed-off-by: Bindiya Kurle Signed-off-by: Matteo Croce Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 7 ++++ net/openvswitch/actions.c | 67 ++++++++++++++++++++++++++++++++++++++ net/openvswitch/flow_netlink.c | 70 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ae2bff14e7e1..9b14519e74d9 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -958,6 +958,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ + OVS_ACTION_ATTR_DEC_TTL, /* Nested OVS_DEC_TTL_ATTR_*. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ @@ -1050,4 +1051,10 @@ struct ovs_zone_limit { __u32 count; }; +enum ovs_dec_ttl_attr { + OVS_DEC_TTL_ATTR_UNSPEC, + OVS_DEC_TTL_ATTR_ACTION, /* Nested struct nlattr */ + __OVS_DEC_TTL_ATTR_MAX +}; + #endif /* _LINUX_OPENVSWITCH_H */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 7fbfe2adfffa..fc0efd8833c8 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -964,6 +964,25 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); } +static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, bool last) +{ + /* The first action is always 'OVS_DEC_TTL_ATTR_ARG'. */ + struct nlattr *dec_ttl_arg = nla_data(attr); + int rem = nla_len(attr); + + if (nla_len(dec_ttl_arg)) { + struct nlattr *actions = nla_next(dec_ttl_arg, &rem); + + if (actions) + return clone_execute(dp, skb, key, 0, actions, rem, + last, false); + } + consume_skb(skb); + return 0; +} + /* When 'last' is true, sample() should always consume the 'skb'. * Otherwise, sample() should keep 'skb' intact regardless what * actions are executed within sample(). @@ -1180,6 +1199,45 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, nla_len(actions), last, clone_flow_key); } +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + if (skb->protocol == htons(ETH_P_IPV6)) { + struct ipv6hdr *nh; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + + if (nh->hop_limit <= 1) + return -EHOSTUNREACH; + + key->ip.ttl = --nh->hop_limit; + } else { + struct iphdr *nh; + u8 old_ttl; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ip_hdr(skb); + if (nh->ttl <= 1) + return -EHOSTUNREACH; + + old_ttl = nh->ttl--; + csum_replace2(&nh->check, htons(old_ttl << 8), + htons(nh->ttl << 8)); + key->ip.ttl = nh->ttl; + } + return 0; +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -1365,6 +1423,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; } + + case OVS_ACTION_ATTR_DEC_TTL: + err = execute_dec_ttl(skb, key); + if (err == -EHOSTUNREACH) { + err = dec_ttl_exception_handler(dp, skb, key, + a, true); + return err; + } + break; } if (unlikely(err)) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 7da4230627f5..1ccd5e092bca 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -80,6 +80,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_METER: case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: default: return true; } @@ -2495,6 +2496,39 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return 0; } +static int validate_and_copy_dec_ttl(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log) +{ + int start, err; + u32 nested = true; + + if (!nla_len(attr)) + return ovs_nla_add_action(sfa, OVS_ACTION_ATTR_DEC_TTL, + NULL, 0, log); + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log); + if (start < 0) + return start; + + err = ovs_nla_add_action(sfa, OVS_DEC_TTL_ATTR_ACTION, &nested, + sizeof(nested), log); + + if (err) + return err; + + err = __ovs_nla_copy_actions(net, attr, key, sfa, eth_type, + vlan_tci, mpls_label_count, log); + if (err) + return err; + + add_nested_action_end(*sfa, start); + return 0; +} + static int validate_and_copy_clone(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, @@ -3007,6 +3041,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_CLONE] = (u32)-1, [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), + [OVS_ACTION_ATTR_DEC_TTL] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3267,6 +3302,15 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, break; } + case OVS_ACTION_ATTR_DEC_TTL: + err = validate_and_copy_dec_ttl(net, a, key, sfa, + eth_type, vlan_tci, + mpls_label_count, log); + if (err) + return err; + skip_copy = true; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3438,6 +3482,26 @@ out: return err; } +static int dec_ttl_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + int err = 0, rem = nla_len(attr); + struct nlattr *start; + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL); + + if (!start) + return -EMSGSIZE; + + err = ovs_nla_put_actions(nla_data(attr), rem, skb); + if (err) + nla_nest_cancel(skb, start); + else + nla_nest_end(skb, start); + + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3538,6 +3602,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_DEC_TTL: + err = dec_ttl_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; -- cgit v1.2.3 From 6a757c07e51f80ac34325fcd558490d2d1439e1b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:07 +0100 Subject: netfilter: conntrack: allow insertion of clashing entries This patch further relaxes the need to drop an skb due to a clash with an existing conntrack entry. Current clash resolution handles the case where the clash occurs between two identical entries (distinct nf_conn objects with same tuples), i.e.: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 ... existing handling will discard the unconfirmed clashing entry and makes skb->_nfct point to the existing one. The skb can then be processed normally just as if the clash would not have existed in the first place. For other clashes, the skb needs to be dropped. This frequently happens with DNS resolvers that send A and AAAA queries back-to-back when NAT rules are present that cause packets to get different DNAT transformations applied, for example: -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.6:5353 -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.7:5353 In this case the A or AAAA query is dropped which incurs a costly delay during name resolution. This patch also allows this collision type: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.7:5353 In this case, clash is in original direction -- the reply direction is still unique. The change makes it so that when the 2nd colliding packet is received, the clashing conntrack is tagged with new IPS_NAT_CLASH_BIT, gets a fixed 1 second timeout and is inserted in the reply direction only. The entry is hidden from 'conntrack -L', it will time out quickly and it can be early dropped because it will never progress to the ASSURED state. To avoid special-casing the delete code path to special case the ORIGINAL hlist_nulls node, a new helper, "hlist_nulls_add_fake", is added so hlist_nulls_del() will work. Example: CPU A: CPU B: 1. 10.2.3.4:42 -> 10.8.8.8:53 (A) 2. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 3. Apply DNAT, reply changed to 10.0.0.6 4. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 5. Apply DNAT, reply changed to 10.0.0.7 6. confirm/commit to conntrack table, no collisions 7. commit clashing entry Reply comes in: 10.2.3.4:42 <- 10.0.0.6:5353 (A) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 10.2.3.4:42 <- 10.0.0.7:5353 (AAAA) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 The conntrack entry is deleted from table, as it has the NAT_CLASH bit set. In case of a retransmit from ORIGINAL dir, all further packets will get the DNAT transformation to 10.0.0.6. I tried to come up with other solutions but they all have worse problems. Alternatives considered were: 1. Confirm ct entries at allocation time, not in postrouting. a. will cause uneccesarry work when the skb that creates the conntrack is dropped by ruleset. b. in case nat is applied, ct entry would need to be moved in the table, which requires another spinlock pair to be taken. c. breaks the 'unconfirmed entry is private to cpu' assumption: we would need to guard all nfct->ext allocation requests with ct->lock spinlock. 2. Make the unconfirmed list a hash table instead of a pcpu list. Shares drawback c) of the first alternative. 3. Document this is expected and force users to rearrange their ruleset (e.g. by using "-m cluster" instead of "-m statistics"). nft has the 'jhash' expression which can be used instead of 'numgen'. Major drawback: doesn't fix what I consider a bug, not very realistic and I believe its reasonable to have the existing rulesets to 'just work'. 4. Document this is expected and force users to steer problematic packets to the same CPU -- this would serialize the "allocate new conntrack entry/nat table evaluation/perform nat/confirm entry", so no race can occur. Similar drawback to 3. Another advantage of this patch compared to 1) and 2) is that there are no changes to the hot path; things are handled in the udp tracker and the clash resolution path. Cc: rcu@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist_nulls.h | 7 ++ include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++- net/netfilter/nf_conntrack_core.c | 76 +++++++++++++++++++++- net/netfilter/nf_conntrack_proto_udp.c | 20 ++++-- 4 files changed, 108 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index e5b752027a03..9670b54b484a 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -145,6 +145,13 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, } } +/* after that hlist_nulls_del will work */ +static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) +{ + n->pprev = &n->next; + n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 336014bf8868..b6f0bb1dc799 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -97,6 +97,15 @@ enum ip_conntrack_status { IPS_UNTRACKED_BIT = 12, IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT), +#ifdef __KERNEL__ + /* Re-purposed for in-kernel use: + * Tags a conntrack entry that clashed with an existing entry + * on insert. + */ + IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT, + IPS_NAT_CLASH = IPS_UNTRACKED, +#endif + /* Conntrack got a helper explicitly attached via CT target. */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), @@ -110,7 +119,8 @@ enum ip_conntrack_status { */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | - IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD), + IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED | + IPS_OFFLOAD), __IPS_MAX_BIT = 15, }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3f069eb0f0fc..1927fc296f95 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -940,11 +940,71 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, return NF_DROP; } +/** + * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry + * + * @skb: skb that causes the collision + * @repl_idx: hash slot for reply direction + * + * Called when origin or reply direction had a clash. + * The skb can be handled without packet drop provided the reply direction + * is unique or there the existing entry has the identical tuple in both + * directions. + * + * Caller must hold conntrack table locks to prevent concurrent updates. + * + * Returns NF_DROP if the clash could not be handled. + */ +static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) +{ + struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); + const struct nf_conntrack_zone *zone; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct net *net; + + zone = nf_ct_zone(loser_ct); + net = nf_ct_net(loser_ct); + + /* Reply direction must never result in a clash, unless both origin + * and reply tuples are identical. + */ + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { + if (nf_ct_key_equal(h, + &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) + return __nf_ct_resolve_clash(skb, h); + } + + /* We want the clashing entry to go away real soon: 1 second timeout. */ + loser_ct->timeout = nfct_time_stamp + HZ; + + /* IPS_NAT_CLASH removes the entry automatically on the first + * reply. Also prevents UDP tracker from moving the entry to + * ASSURED state, i.e. the entry can always be evicted under + * pressure. + */ + loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; + + __nf_conntrack_insert_prepare(loser_ct); + + /* fake add for ORIGINAL dir: we want lookups to only find the entry + * already in the table. This also hides the clashing entry from + * ctnetlink iteration, i.e. conntrack -L won't show them. + */ + hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + + hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, + &nf_conntrack_hash[repl_idx]); + return NF_ACCEPT; +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table + * @hash_reply: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. @@ -963,10 +1023,18 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, * exactly the same, only the to-be-confirmed conntrack entry is discarded * and @skb is associated with the conntrack entry already in the table. * + * Failing that, the new, unconfirmed conntrack is still added to the table + * provided that the collision only occurs in the ORIGINAL direction. + * The new entry will be added after the existing one in the hash list, + * so packets in the ORIGINAL direction will continue to match the existing + * entry. The new entry will also have a fixed timeout so it expires -- + * due to the collision, it will not see bidirectional traffic. + * * Returns NF_DROP if the clash could not be resolved. */ static __cold noinline int -nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) +nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, + u32 reply_hash) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -987,6 +1055,10 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) if (ret == NF_ACCEPT) return ret; + ret = nf_ct_resolve_clash_harder(skb, reply_hash); + if (ret == NF_ACCEPT) + return ret; + drop: nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); @@ -1101,7 +1173,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - ret = nf_ct_resolve_clash(skb, h); + ret = nf_ct_resolve_clash(skb, h, reply_hash); dying: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 7365b43f8f98..760ca2422816 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -81,6 +81,18 @@ static bool udp_error(struct sk_buff *skb, return false; } +static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct, + struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + u32 extra_jiffies) +{ + if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY && + ct->status & IPS_NAT_CLASH)) + nf_ct_kill(ct); + else + nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies); +} + /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -116,8 +128,8 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -198,8 +210,8 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } -- cgit v1.2.3 From f25975f42f2f8f2a01303054d6a70c7ceb1fcf54 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 18 Feb 2020 14:03:34 +0100 Subject: bpf, uapi: Remove text about bpf_redirect_map() giving higher performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The performance of bpf_redirect() is now roughly the same as that of bpf_redirect_map(). However, David Ahern pointed out that the header file has not been updated to reflect this, and still says that a significant performance increase is possible when using bpf_redirect_map(). Remove this text from the bpf_redirect_map() description, and reword the description in bpf_redirect() slightly. Also fix the 'Return' section of the bpf_redirect_map() documentation. Fixes: 1d233886dd90 ("xdp: Use bulking for non-map XDP_REDIRECT and consolidate code paths") Reported-by: David Ahern Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200218130334.29889-1-toke@redhat.com --- include/uapi/linux/bpf.h | 16 +++++++--------- tools/include/uapi/linux/bpf.h | 16 +++++++--------- 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1d74a2bd234..22f235260a3a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1045,9 +1045,9 @@ union bpf_attr { * supports redirection to the egress interface, and accepts no * flag at all. * - * The same effect can be attained with the more generic - * **bpf_redirect_map**\ (), which requires specific maps to be - * used but offers better performance. + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. * Return * For XDP, the helper returns **XDP_REDIRECT** on success or * **XDP_ABORTED** on error. For other program types, the values @@ -1611,13 +1611,11 @@ union bpf_attr { * the caller. Any higher bits in the *flags* argument must be * unset. * - * When used to redirect packets to net devices, this helper - * provides a high performance increase over **bpf_redirect**\ (). - * This is due to various implementation details of the underlying - * mechanisms, one of which is the fact that **bpf_redirect_map**\ - * () tries to send packet as a "bulk" to the device. + * See also bpf_redirect(), which only supports redirecting to an + * ifindex, but doesn't require a map to do so. * Return - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the **flags* argument on error. * * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f1d74a2bd234..22f235260a3a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1045,9 +1045,9 @@ union bpf_attr { * supports redirection to the egress interface, and accepts no * flag at all. * - * The same effect can be attained with the more generic - * **bpf_redirect_map**\ (), which requires specific maps to be - * used but offers better performance. + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. * Return * For XDP, the helper returns **XDP_REDIRECT** on success or * **XDP_ABORTED** on error. For other program types, the values @@ -1611,13 +1611,11 @@ union bpf_attr { * the caller. Any higher bits in the *flags* argument must be * unset. * - * When used to redirect packets to net devices, this helper - * provides a high performance increase over **bpf_redirect**\ (). - * This is due to various implementation details of the underlying - * mechanisms, one of which is the fact that **bpf_redirect_map**\ - * () tries to send packet as a "bulk" to the device. + * See also bpf_redirect(), which only supports redirecting to an + * ifindex, but doesn't require a map to do so. * Return - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the **flags* argument on error. * * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description -- cgit v1.2.3 From f623e5970501449b475a8dc007f0fe24743ab9d3 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 11 Feb 2020 14:32:52 -0800 Subject: ethtool: Add support for low latency RS FEC Add support for low latency Reed Solomon FEC as LLRS. The LL-FEC is defined by the 25G/50G ethernet consortium, in the document titled "Low Latency Reed Solomon Forward Error Correction" Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha CC: Andrew Lunn Signed-off-by: Saeed Mahameed Reviewed-by: Andrew Lunn --- drivers/net/phy/phy-core.c | 2 +- include/uapi/linux/ethtool.h | 4 +++- net/ethtool/common.c | 1 + net/ethtool/linkmodes.c | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index a4d2d59fceca..e083e7a76ada 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -8,7 +8,7 @@ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 74, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 75, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 4295ebfa2f91..d586ee5e10a1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1330,6 +1330,7 @@ enum ethtool_fec_config_bits { ETHTOOL_FEC_OFF_BIT, ETHTOOL_FEC_RS_BIT, ETHTOOL_FEC_BASER_BIT, + ETHTOOL_FEC_LLRS_BIT, }; #define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) @@ -1337,6 +1338,7 @@ enum ethtool_fec_config_bits { #define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) #define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) #define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) +#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) /* CMDs currently supported */ #define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. @@ -1521,7 +1523,7 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, - + ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 636ec6d5110e..7b6969af5ae7 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -168,6 +168,7 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full), __DEFINE_LINK_MODE_NAME(400000, DR8, Full), __DEFINE_LINK_MODE_NAME(400000, CR8, Full), + __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 96f20be64553..f049b97072fe 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -237,6 +237,7 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), + __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), }; static const struct nla_policy -- cgit v1.2.3 From fff7b64355eac6e29b50229ad1512315bc04b44e Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 17 Feb 2020 19:04:31 -0800 Subject: bpf: Add bpf_read_branch_records() helper Branch records are a CPU feature that can be configured to record certain branches that are taken during code execution. This data is particularly interesting for profile guided optimizations. perf has had branch record support for a while but the data collection can be a bit coarse grained. We (Facebook) have seen in experiments that associating metadata with branch records can improve results (after postprocessing). We generally use bpf_probe_read_*() to get metadata out of userspace. That's why bpf support for branch records is useful. Aside from this particular use case, having branch data available to bpf progs can be useful to get stack traces out of userspace applications that omit frame pointers. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200218030432.4600-2-dxu@dxuuu.xyz --- include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- kernel/trace/bpf_trace.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1d74a2bd234..a7e59756853f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2892,6 +2892,25 @@ union bpf_attr { * Obtain the 64bit jiffies * Return * The 64 bit jiffies + * + * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (struct perf_branch_entry) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of sizeof(struct perf_branch_entry). + * + * **-ENOENT** if architecture does not support branch records. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3012,7 +3031,8 @@ union bpf_attr { FN(probe_read_kernel_str), \ FN(tcp_send_ack), \ FN(send_signal_thread), \ - FN(jiffies64), + FN(jiffies64), \ + FN(read_branch_records), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3091,6 +3111,9 @@ enum bpf_func_id { /* BPF_FUNC_sk_storage_get flags */ #define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +/* BPF_FUNC_read_branch_records flags. */ +#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4ddd5ac46094..b8661bd0d028 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1028,6 +1028,45 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ +#ifndef CONFIG_X86 + return -ENOENT; +#else + static const u32 br_entry_size = sizeof(struct perf_branch_entry); + struct perf_branch_stack *br_stack = ctx->data->br_stack; + u32 to_copy; + + if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) + return -EINVAL; + + if (unlikely(!br_stack)) + return -EINVAL; + + if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) + return br_stack->nr * br_entry_size; + + if (!buf || (size % br_entry_size != 0)) + return -EINVAL; + + to_copy = min_t(u32, br_stack->nr * br_entry_size, size); + memcpy(buf, br_stack->entries, to_copy); + + return to_copy; +#endif +} + +static const struct bpf_func_proto bpf_read_branch_records_proto = { + .func = bpf_read_branch_records, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1040,6 +1079,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; + case BPF_FUNC_read_branch_records: + return &bpf_read_branch_records_proto; default: return tracing_func_proto(func_id, prog); } -- cgit v1.2.3 From 202853595e53f981c86656c49fc1cc1e3620f558 Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Fri, 25 Oct 2019 15:00:45 -0400 Subject: PCI: pciehp: Disable in-band presence detect when possible The presence detect state (PDS) is normally a logical OR of in-band and out-of-band (OOB) presence detect. As of PCIe 4.0, there is the option to disable in-band presence so that the PDS bit always reflects the state of the out-of-band presence. The recommendation of the PCIe spec is to disable in-band presence whenever supported (PCIe r5.0, appendix I implementation note): Due to architectural issues, the in-band (Physical-Layer-based) portion of the PD mechanism is deprecated for use with async hot-plug. One issue is that in-band PD as architected does not detect adapter removal during certain LTSSM states, notably the L1 and Disabled States. Another issue is that when both in-band and OOB PD are being used together, the Presence Detect State bit and its associated interrupt mechanism always reflect the logical OR of the inband and OOB PD states, and with some hot-plug hardware configurations, it is important for software to detect and respond to in-band and OOB PD events independently. If OOB PD is being used and the associated DSP supports In-Band PD Disable, it is recommended that the In-Band PD Disable bit be Set, and the Presence Detect State bit and its associated interrupt mechanism be used exclusively for OOB PD. As a substitute for in-band PD with async hot-plug, the reference model uses either the DPC or the DLL Link Active mechanism. Link: https://lore.kernel.org/r/20191025190047.38130-2-stuart.w.hayes@gmail.com [bhelgaas: move PCI_EXP_SLTCAP2 read earlier & print PCI_EXP_SLTCAP2_IBPD value (suggested by Lukas)] Signed-off-by: Alexandru Gagniuc Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Reviewed-by: Lukas Wunner --- drivers/pci/hotplug/pciehp.h | 1 + drivers/pci/hotplug/pciehp_hpc.c | 12 ++++++++++-- include/uapi/linux/pci_regs.h | 2 ++ 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index aa61d4c219d7..ae44f46d1bf3 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -84,6 +84,7 @@ struct controller { struct pcie_device *pcie; u32 slot_cap; /* capabilities and quirks */ + unsigned int inband_presence_disabled:1; u16 slot_ctrl; /* control register access */ struct mutex ctrl_lock; diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 8a2cb1764386..a573490289c3 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -848,7 +848,7 @@ static inline void dbg_ctrl(struct controller *ctrl) struct controller *pcie_init(struct pcie_device *dev) { struct controller *ctrl; - u32 slot_cap, link_cap; + u32 slot_cap, slot_cap2, link_cap; u8 poweron; struct pci_dev *pdev = dev->port; struct pci_bus *subordinate = pdev->subordinate; @@ -883,6 +883,13 @@ struct controller *pcie_init(struct pcie_device *dev) ctrl->state = list_empty(&subordinate->devices) ? OFF_STATE : ON_STATE; up_read(&pci_bus_sem); + pcie_capability_read_dword(pdev, PCI_EXP_SLTCAP2, &slot_cap2); + if (slot_cap2 & PCI_EXP_SLTCAP2_IBPD) { + pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_IBPD_DISABLE, + PCI_EXP_SLTCTL_IBPD_DISABLE); + ctrl->inband_presence_disabled = 1; + } + /* Check if Data Link Layer Link Active Reporting is implemented */ pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, &link_cap); @@ -892,7 +899,7 @@ struct controller *pcie_init(struct pcie_device *dev) PCI_EXP_SLTSTA_MRLSC | PCI_EXP_SLTSTA_CC | PCI_EXP_SLTSTA_DLLSC | PCI_EXP_SLTSTA_PDC); - ctrl_info(ctrl, "Slot #%d AttnBtn%c PwrCtrl%c MRL%c AttnInd%c PwrInd%c HotPlug%c Surprise%c Interlock%c NoCompl%c LLActRep%c%s\n", + ctrl_info(ctrl, "Slot #%d AttnBtn%c PwrCtrl%c MRL%c AttnInd%c PwrInd%c HotPlug%c Surprise%c Interlock%c NoCompl%c IbPresDis%c LLActRep%c%s\n", (slot_cap & PCI_EXP_SLTCAP_PSN) >> 19, FLAG(slot_cap, PCI_EXP_SLTCAP_ABP), FLAG(slot_cap, PCI_EXP_SLTCAP_PCP), @@ -903,6 +910,7 @@ struct controller *pcie_init(struct pcie_device *dev) FLAG(slot_cap, PCI_EXP_SLTCAP_HPS), FLAG(slot_cap, PCI_EXP_SLTCAP_EIP), FLAG(slot_cap, PCI_EXP_SLTCAP_NCCS), + FLAG(slot_cap2, PCI_EXP_SLTCAP2_IBPD), FLAG(link_cap, PCI_EXP_LNKCAP_DLLLARC), pdev->broken_cmd_compl ? " (with Cmd Compl erratum)" : ""); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 5437690483cd..f9701410d3b5 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -605,6 +605,7 @@ #define PCI_EXP_SLTCTL_PWR_OFF 0x0400 /* Power Off */ #define PCI_EXP_SLTCTL_EIC 0x0800 /* Electromechanical Interlock Control */ #define PCI_EXP_SLTCTL_DLLSCE 0x1000 /* Data Link Layer State Changed Enable */ +#define PCI_EXP_SLTCTL_IBPD_DISABLE 0x4000 /* In-band PD disable */ #define PCI_EXP_SLTSTA 26 /* Slot Status */ #define PCI_EXP_SLTSTA_ABP 0x0001 /* Attention Button Pressed */ #define PCI_EXP_SLTSTA_PFD 0x0002 /* Power Fault Detected */ @@ -680,6 +681,7 @@ #define PCI_EXP_LNKSTA2 50 /* Link Status 2 */ #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */ #define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */ +#define PCI_EXP_SLTCAP2_IBPD 0x00000001 /* In-band PD Disable Supported */ #define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ #define PCI_EXP_SLTSTA2 58 /* Slot Status 2 */ -- cgit v1.2.3 From c766d1472c70d25ad475cf56042af1652e792b23 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:57 -0800 Subject: y2038: hide timeval/timespec/itimerval/itimerspec types There are no in-kernel users remaining, but there may still be users that include linux/time.h instead of sys/time.h from user space, so leave the types available to user space while hiding them from kernel space. Only the __kernel_old_* versions of these types remain now. Link: http://lkml.kernel.org/r/20200110154232.4104492-4-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/posix_types.h | 2 ++ include/uapi/linux/time.h | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h index 2f9c80595ba7..b5f7594eee7a 100644 --- a/include/uapi/asm-generic/posix_types.h +++ b/include/uapi/asm-generic/posix_types.h @@ -87,7 +87,9 @@ typedef struct { typedef __kernel_long_t __kernel_off_t; typedef long long __kernel_loff_t; typedef __kernel_long_t __kernel_old_time_t; +#ifndef __KERNEL__ typedef __kernel_long_t __kernel_time_t; +#endif typedef long long __kernel_time64_t; typedef __kernel_long_t __kernel_clock_t; typedef int __kernel_timer_t; diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index a655aa28dc6e..4f4b6e48e01c 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -5,6 +5,7 @@ #include #include +#ifndef __KERNEL__ #ifndef _STRUCT_TIMESPEC #define _STRUCT_TIMESPEC struct timespec { @@ -18,6 +19,17 @@ struct timeval { __kernel_suseconds_t tv_usec; /* microseconds */ }; +struct itimerspec { + struct timespec it_interval;/* timer period */ + struct timespec it_value; /* timer expiration */ +}; + +struct itimerval { + struct timeval it_interval;/* timer interval */ + struct timeval it_value; /* current value */ +}; +#endif + struct timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ @@ -31,16 +43,6 @@ struct timezone { #define ITIMER_VIRTUAL 1 #define ITIMER_PROF 2 -struct itimerspec { - struct timespec it_interval; /* timer period */ - struct timespec it_value; /* timer expiration */ -}; - -struct itimerval { - struct timeval it_interval; /* timer interval */ - struct timeval it_value; /* current value */ -}; - /* * The IDs of the various system clocks (for POSIX.1b interval timers): */ -- cgit v1.2.3 From 467d12f5c7842896d2de3ced74e4147ee29e97c8 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 20 Feb 2020 20:04:03 -0800 Subject: include/uapi/linux/swab.h: fix userspace breakage, use __BITS_PER_LONG for swap QEMU has a funny new build error message when I use the upstream kernel headers: CC block/file-posix.o In file included from /home/cborntra/REPOS/qemu/include/qemu/timer.h:4, from /home/cborntra/REPOS/qemu/include/qemu/timed-average.h:29, from /home/cborntra/REPOS/qemu/include/block/accounting.h:28, from /home/cborntra/REPOS/qemu/include/block/block_int.h:27, from /home/cborntra/REPOS/qemu/block/file-posix.c:30: /usr/include/linux/swab.h: In function `__swab': /home/cborntra/REPOS/qemu/include/qemu/bitops.h:20:34: error: "sizeof" is not defined, evaluates to 0 [-Werror=undef] 20 | #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) | ^~~~~~ /home/cborntra/REPOS/qemu/include/qemu/bitops.h:20:41: error: missing binary operator before token "(" 20 | #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) | ^ cc1: all warnings being treated as errors make: *** [/home/cborntra/REPOS/qemu/rules.mak:69: block/file-posix.o] Error 1 rm tests/qemu-iotests/socket_scm_helper.o This was triggered by commit d5767057c9a ("uapi: rename ext2_swab() to swab() and share globally in swab.h"). That patch is doing #include but it uses BITS_PER_LONG. The kernel file asm/bitsperlong.h provide only __BITS_PER_LONG. Let us use the __ variant in swap.h Link: http://lkml.kernel.org/r/20200213142147.17604-1-borntraeger@de.ibm.com Fixes: d5767057c9a ("uapi: rename ext2_swab() to swab() and share globally in swab.h") Signed-off-by: Christian Borntraeger Cc: Yury Norov Cc: Allison Randal Cc: Joe Perches Cc: Thomas Gleixner Cc: William Breathitt Gray Cc: Torsten Hilbrich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/swab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h index fa7f97da5b76..7272f85d6d6a 100644 --- a/include/uapi/linux/swab.h +++ b/include/uapi/linux/swab.h @@ -135,9 +135,9 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) static __always_inline unsigned long __swab(const unsigned long y) { -#if BITS_PER_LONG == 64 +#if __BITS_PER_LONG == 64 return __swab64(y); -#else /* BITS_PER_LONG == 32 */ +#else /* __BITS_PER_LONG == 32 */ return __swab32(y); #endif } -- cgit v1.2.3 From 8d74a623cc3cecda89da628b8f3d115d8cf1ee8f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Feb 2020 10:19:12 +0100 Subject: Revert "nl80211: add src and dst addr attributes for control port tx/rx" This reverts commit 8c3ed7aa2b9ef666195b789e9b02e28383243fa8. As Jouni points out, there's really no need for this, since the RSN pre-authentication frames are normal data frames, not port control frames (locally). We can still revert this now since it hasn't actually gone beyond -next. Fixes: 8c3ed7aa2b9e ("nl80211: add src and dst addr attributes for control port tx/rx") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200224101910.b746e263287a.I9eb15d6895515179d50964dec3550c9dc784bb93@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +-- include/uapi/linux/nl80211.h | 16 +--------------- net/mac80211/ieee80211_i.h | 3 +-- net/mac80211/tx.c | 3 +-- net/wireless/nl80211.c | 18 +++--------------- net/wireless/rdev-ops.h | 8 ++++---- net/wireless/trace.h | 27 ++++++++++----------------- 7 files changed, 21 insertions(+), 57 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7ea23caa7301..089ffdd6dba5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3974,8 +3974,7 @@ struct cfg80211_ops { int (*tx_control_port)(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, const u8 *src, - const __be16 proto, + const u8 *dest, const __be16 proto, const bool noencrypt); int (*get_ftm_responder_stats)(struct wiphy *wiphy, diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 158bccb4a47b..350ab86fe20e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1039,14 +1039,11 @@ * a control port frame and as a notification that a control port frame * has been received. %NL80211_ATTR_FRAME is used to specify the * frame contents. The frame is the raw EAPoL data, without ethernet or - * 802.11 headers. An optional %NL80211_ATTR_SRC_MAC can be used to send - * pre-auth frames to STAs on behalf of other APs. + * 802.11 headers. * When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added * indicating the protocol type of the received frame; whether the frame * was received unencrypted and the MAC address of the peer respectively. - * %NL80211_ATTR_DST_MAC can be used to forward pre-auth frames in - * userspace while using AP mode. * * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. * @@ -2412,9 +2409,6 @@ enum nl80211_commands { * %NL80211_ATTR_AKM_SUITES are default capabilities if AKM suites not * advertised for a specific interface type. * - * @NL80211_ATTR_SRC_MAC: MAC address used in control port over nl80211 transmit - * @NL80211_ATTR_DST_MAC: MAC address used in control port over nl80211 receive - * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2883,9 +2877,6 @@ enum nl80211_attrs { NL80211_ATTR_IFTYPE_AKM_SUITES, - NL80211_ATTR_SRC_MAC, - NL80211_ATTR_DST_MAC, - /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5548,10 +5539,6 @@ enum nl80211_feature_flags { * feature, which prevents bufferbloat by using the expected transmission * time to limit the amount of data buffered in the hardware. * - * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_MAC_ADDRS: The driver - * can use src and dst MAC addresses with control port over nl80211 rx - * and tx operations. - * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5599,7 +5586,6 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SAE_OFFLOAD, NL80211_EXT_FEATURE_VLAN_OFFLOAD, NL80211_EXT_FEATURE_AQL, - NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_MAC_ADDRS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index da9eaa9ee37e..8a49d78ad7c9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1792,8 +1792,7 @@ void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, const u8 *src, __be16 proto, - bool unencrypted); + const u8 *dest, __be16 proto, bool unencrypted); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2645a39cce88..cddaacaa31a3 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5283,8 +5283,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, const u8 *src, __be16 proto, - bool unencrypted) + const u8 *dest, __be16 proto, bool unencrypted) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f0112dabe21e..d795552f97b2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -634,8 +634,6 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), [NL80211_ATTR_HE_BSS_COLOR] = NLA_POLICY_NESTED(he_bss_color_policy), - [NL80211_ATTR_SRC_MAC] = NLA_POLICY_ETH_ADDR, - [NL80211_ATTR_DST_MAC] = NLA_POLICY_ETH_ADDR, }; /* policy for the key attributes */ @@ -13698,7 +13696,6 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) const u8 *buf; size_t len; u8 *dest; - u8 src[ETH_ALEN]; u16 proto; bool noencrypt; int err; @@ -13736,13 +13733,6 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) goto out; } - /* copy src address under wdev_lock, as we may copy wdev_address */ - if (info->attrs[NL80211_ATTR_SRC_MAC]) - ether_addr_copy(src, - nla_data(info->attrs[NL80211_ATTR_SRC_MAC])); - else - ether_addr_copy(src, wdev_address(wdev)); - wdev_unlock(wdev); buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); @@ -13753,7 +13743,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); return rdev_tx_control_port(rdev, dev, buf, len, - dest, src, cpu_to_be16(proto), noencrypt); + dest, cpu_to_be16(proto), noencrypt); out: wdev_unlock(wdev); @@ -16010,8 +16000,7 @@ static int __nl80211_rx_control_port(struct net_device *dev, struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ethhdr *ehdr = eth_hdr(skb); - const u8 *daddr = ehdr->h_dest; - const u8 *saddr = ehdr->h_source; + const u8 *addr = ehdr->h_source; u16 proto = be16_to_cpu(skb->protocol); struct sk_buff *msg; void *hdr; @@ -16036,8 +16025,7 @@ static int __nl80211_rx_control_port(struct net_device *dev, nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, saddr) || - nla_put(msg, NL80211_ATTR_DST_MAC, ETH_ALEN, daddr) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) || (unencrypted && nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 5ea34c1b60fe..e0d34f796d0b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -734,14 +734,14 @@ static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, - const u8 *dest, const u8 *src, - __be16 proto, const bool noencrypt) + const u8 *dest, __be16 proto, + const bool noencrypt) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, - dest, src, proto, noencrypt); + dest, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, src, proto, noencrypt); + dest, proto, noencrypt); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b6b60e3aea41..3ef1679b0e66 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1928,31 +1928,27 @@ TRACE_EVENT(rdev_mgmt_tx, TRACE_EVENT(rdev_tx_control_port, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - const u8 *buf, size_t len, - const u8 *dest, const u8 *src, __be16 proto, + const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted), - TP_ARGS(wiphy, netdev, buf, len, dest, src, proto, unencrypted), + TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dest) - MAC_ENTRY(src) - __field(u16, proto) + __field(__be16, proto) __field(bool, unencrypted) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(dest, dest); - MAC_ASSIGN(src, src); - __entry->proto = be16_to_cpu(proto); + __entry->proto = proto; __entry->unencrypted = unencrypted; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", dest: " MAC_PR_FMT - ", src: " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", - WIPHY_PR_ARG, NETDEV_PR_ARG, - MAC_PR_ARG(dest), MAC_PR_ARG(src), - __entry->proto, + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT "," + " proto: 0x%x, unencrypted: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest), + be16_to_cpu(__entry->proto), BOOL_TO_STR(__entry->unencrypted)) ); @@ -2844,7 +2840,6 @@ TRACE_EVENT(cfg80211_rx_control_port, TP_STRUCT__entry( NETDEV_ENTRY __field(int, len) - MAC_ENTRY(to) MAC_ENTRY(from) __field(u16, proto) __field(bool, unencrypted) @@ -2852,14 +2847,12 @@ TRACE_EVENT(cfg80211_rx_control_port, TP_fast_assign( NETDEV_ASSIGN; __entry->len = skb->len; - MAC_ASSIGN(to, eth_hdr(skb)->h_dest); MAC_ASSIGN(from, eth_hdr(skb)->h_source); __entry->proto = be16_to_cpu(skb->protocol); __entry->unencrypted = unencrypted; ), - TP_printk(NETDEV_PR_FMT ", len=%d, dest: " MAC_PR_FMT - ", src: " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", - NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(to), MAC_PR_ARG(from), + TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s", + NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from), __entry->proto, BOOL_TO_STR(__entry->unencrypted)) ); -- cgit v1.2.3 From 56be393fa8b40db2d4f54f97614f645eb8d3c32e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 22 Feb 2020 15:25:43 +0200 Subject: cfg80211: Support key configuration for Beacon protection (BIGTK) IEEE P802.11-REVmd/D3.0 adds support for protecting Beacon frames using a new set of keys (BIGTK; key index 6..7) similarly to the way group-addressed Robust Management frames are protected (IGTK; key index 4..5). Extend cfg80211 and nl80211 to allow the new BIGTK to be configured. Add an extended feature flag to indicate driver support for the new key index values to avoid array overflows in driver implementations and also to indicate to user space when this functionality is available. Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20200222132548.20835-2-jouni@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 ++++ include/uapi/linux/nl80211.h | 6 +++++ net/wireless/nl80211.c | 56 +++++++++++++++++++++++++++++++++++--------- net/wireless/rdev-ops.h | 13 ++++++++++ net/wireless/sme.c | 11 +++++++-- net/wireless/trace.h | 17 ++++++++++++++ net/wireless/util.c | 7 +++++- 7 files changed, 101 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 682ee4dd6963..04b6df15706c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3369,6 +3369,8 @@ struct cfg80211_update_owe_info { * @set_default_key: set the default key on an interface * * @set_default_mgmt_key: set the default management frame key on an interface + + * @set_default_beacon_key: set the default Beacon frame key on an interface * * @set_rekey_data: give the data necessary for GTK rekeying to the driver * @@ -3702,6 +3704,9 @@ struct cfg80211_ops { int (*set_default_mgmt_key)(struct wiphy *wiphy, struct net_device *netdev, u8 key_index); + int (*set_default_beacon_key)(struct wiphy *wiphy, + struct net_device *netdev, + u8 key_index); int (*start_ap)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *settings); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 350ab86fe20e..934e62fe8705 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4550,6 +4550,7 @@ enum nl80211_key_default_types { * See &enum nl80211_key_default_types. * @NL80211_KEY_MODE: the mode from enum nl80211_key_mode. * Defaults to @NL80211_KEY_RX_TX. + * @NL80211_KEY_DEFAULT_BEACON: flag indicating default Beacon frame key * * @__NL80211_KEY_AFTER_LAST: internal * @NL80211_KEY_MAX: highest key attribute @@ -4565,6 +4566,7 @@ enum nl80211_key_attributes { NL80211_KEY_TYPE, NL80211_KEY_DEFAULT_TYPES, NL80211_KEY_MODE, + NL80211_KEY_DEFAULT_BEACON, /* keep last */ __NL80211_KEY_AFTER_LAST, @@ -5539,6 +5541,9 @@ enum nl80211_feature_flags { * feature, which prevents bufferbloat by using the expected transmission * time to limit the amount of data buffered in the hardware. * + * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection + * and can receive key configuration for BIGTK using key indexes 6 and 7. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5586,6 +5591,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SAE_OFFLOAD, NL80211_EXT_FEATURE_VLAN_OFFLOAD, NL80211_EXT_FEATURE_AQL, + NL80211_EXT_FEATURE_BEACON_PROTECTION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ce55a2c05fe9..a75f72288139 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -368,7 +368,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN }, - [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 7), [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 }, [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG }, [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 }, @@ -1037,7 +1037,7 @@ struct key_parse { struct key_params p; int idx; int type; - bool def, defmgmt; + bool def, defmgmt, defbeacon; bool def_uni, def_multi; }; @@ -1053,12 +1053,13 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key, k->def = !!tb[NL80211_KEY_DEFAULT]; k->defmgmt = !!tb[NL80211_KEY_DEFAULT_MGMT]; + k->defbeacon = !!tb[NL80211_KEY_DEFAULT_BEACON]; if (k->def) { k->def_uni = true; k->def_multi = true; } - if (k->defmgmt) + if (k->defmgmt || k->defbeacon) k->def_multi = true; if (tb[NL80211_KEY_IDX]) @@ -1165,14 +1166,17 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k) if (err) return err; - if (k->def && k->defmgmt) { - GENL_SET_ERR_MSG(info, "key with def && defmgmt is invalid"); + if ((k->def ? 1 : 0) + (k->defmgmt ? 1 : 0) + + (k->defbeacon ? 1 : 0) > 1) { + GENL_SET_ERR_MSG(info, + "key with multiple default flags is invalid"); return -EINVAL; } - if (k->defmgmt) { + if (k->defmgmt || k->defbeacon) { if (k->def_uni || !k->def_multi) { - GENL_SET_ERR_MSG(info, "defmgmt key must be mcast"); + GENL_SET_ERR_MSG(info, + "defmgmt/defbeacon key must be mcast"); return -EINVAL; } } @@ -1184,14 +1188,20 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k) "defmgmt key idx not 4 or 5"); return -EINVAL; } + } else if (k->defbeacon) { + if (k->idx < 6 || k->idx > 7) { + GENL_SET_ERR_MSG(info, + "defbeacon key idx not 6 or 7"); + return -EINVAL; + } } else if (k->def) { if (k->idx < 0 || k->idx > 3) { GENL_SET_ERR_MSG(info, "def key idx not 0-3"); return -EINVAL; } } else { - if (k->idx < 0 || k->idx > 5) { - GENL_SET_ERR_MSG(info, "key idx not 0-5"); + if (k->idx < 0 || k->idx > 7) { + GENL_SET_ERR_MSG(info, "key idx not 0-7"); return -EINVAL; } } @@ -3817,8 +3827,14 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) void *hdr; struct sk_buff *msg; - if (info->attrs[NL80211_ATTR_KEY_IDX]) + if (info->attrs[NL80211_ATTR_KEY_IDX]) { key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); + if (key_idx > 5 && + !wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + return -EINVAL; + } if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); @@ -3894,7 +3910,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) /* Only support setting default key and * Extended Key ID action NL80211_KEY_SET_TX. */ - if (!key.def && !key.defmgmt && + if (!key.def && !key.defmgmt && !key.defbeacon && !(key.p.mode == NL80211_KEY_SET_TX)) return -EINVAL; @@ -3941,6 +3957,24 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) #ifdef CONFIG_CFG80211_WEXT dev->ieee80211_ptr->wext.default_mgmt_key = key.idx; #endif + } else if (key.defbeacon) { + if (key.def_uni || !key.def_multi) { + err = -EINVAL; + goto out; + } + + if (!rdev->ops->set_default_beacon_key) { + err = -EOPNOTSUPP; + goto out; + } + + err = nl80211_key_allowed(dev->ieee80211_ptr); + if (err) + goto out; + + err = rdev_set_default_beacon_key(rdev, dev, key.idx); + if (err) + goto out; } else if (key.p.mode == NL80211_KEY_SET_TX && wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index e0d34f796d0b..af7fcf2a3b4a 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -136,6 +136,19 @@ rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u8 key_index) +{ + int ret; + + trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, key_index); + ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, + key_index); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_start_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_settings *settings) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index d32a2ec4d96a..ac3e60aa1fc8 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1111,9 +1111,16 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. */ - if (rdev->ops->del_key) - for (i = 0; i < 6; i++) + if (rdev->ops->del_key) { + int max_key_idx = 5; + + if (wiphy_ext_feature_isset( + wdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + max_key_idx = 7; + for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, i, false, NULL); + } rdev_set_qos_map(rdev, dev, NULL); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 3ef1679b0e66..56b78222746c 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -510,6 +510,23 @@ TRACE_EVENT(rdev_set_default_mgmt_key, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index) ); +TRACE_EVENT(rdev_set_default_beacon_key, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index), + TP_ARGS(wiphy, netdev, key_index), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u8, key_index) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->key_index = key_index; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index) +); + TRACE_EVENT(rdev_start_ap, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_ap_settings *settings), diff --git a/net/wireless/util.c b/net/wireless/util.c index 8481e9ac33da..72926f87c913 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -231,7 +231,12 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { - if (key_idx < 0 || key_idx > 5) + int max_key_idx = 5; + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + max_key_idx = 7; + if (key_idx < 0 || key_idx > max_key_idx) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) -- cgit v1.2.3 From 77f576deaa393b54a0f2ca8ab1ab5b2d3c6b971b Mon Sep 17 00:00:00 2001 From: Tamizh chelvam Date: Mon, 20 Jan 2020 13:21:22 +0530 Subject: nl80211: Add NL command to support TID speicific configurations Add the new NL80211_CMD_SET_TID_CONFIG command to support data TID specific configuration. Per TID configuration is passed in the nested NL80211_ATTR_TID_CONFIG attribute. This patch adds support to configure per TID noack policy through the NL80211_TID_CONFIG_ATTR_NOACK attribute. Signed-off-by: Tamizh chelvam Link: https://lore.kernel.org/r/1579506687-18296-2-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 40 ++++++++++++ include/uapi/linux/nl80211.h | 71 +++++++++++++++++++++ net/wireless/nl80211.c | 148 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 24 +++++++ net/wireless/trace.h | 37 +++++++++++ 5 files changed, 320 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 04b6df15706c..65a8bf73d21e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -626,6 +626,38 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; }; +enum ieee80211_tid_conf_mask { + IEEE80211_TID_CONF_NOACK = BIT(NL80211_TID_CONFIG_ATTR_NOACK), +}; + +/** + * struct ieee80211_tid_cfg - TID specific configuration + * @config_override: Flag to notify driver to reset TID configuration + * of the peer. + * @tid: TID number + * @tid_conf_mask: bitmap indicating which parameter changed + * see &enum ieee80211_tid_conf_mask + * @noack: noack configuration value for the TID + */ +struct ieee80211_tid_cfg { + bool config_override; + u8 tid; + u32 tid_conf_mask; + enum nl80211_tid_config noack; +}; + +/** + * struct ieee80211_tid_config - TID configuration + * @peer: Station's MAC address + * @n_tid_conf: Number of TID specific configurations to be applied + * @tid_conf: Configuration change info + */ +struct ieee80211_tid_config { + const u8 *peer; + u32 n_tid_conf; + struct ieee80211_tid_cfg tid_conf[]; +}; + /** * cfg80211_get_chandef_type - return old channel type from chandef * @chandef: the channel definition @@ -3671,6 +3703,10 @@ struct cfg80211_update_owe_info { * * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame * and overrule HWMP path selection algorithm. + * @set_tid_config: TID specific configuration, this can be peer or BSS specific + * This callback may sleep. + * @reset_tid_config: Reset TID specific configuration for the peer. + * This callback may sleep. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3994,6 +4030,10 @@ struct cfg80211_ops { struct cfg80211_update_owe_info *owe_info); int (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); + int (*set_tid_config)(struct wiphy *wiphy, struct net_device *dev, + struct ieee80211_tid_config *tid_conf); + int (*reset_tid_config)(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 tid); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 934e62fe8705..0b12fcffb518 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -264,6 +264,27 @@ * %NL80211_ATTR_VLAN_ID. */ +/** + * DOC: TID configuration + * + * TID configuration support can be advertised by drivers by setting + * @NL80211_EXT_FEATURE_PER_TID_* and/or @NL80211_EXT_FEATURE_PER_STA_* config + * mentioned in &enum nl80211_tid_config_attr. + * Needed configuration parameters are mentioned in + * &enum nl80211_tid_config_attr and it will be passed using + * %NL80211_CMD_SET_TID_CONFIG through %NL80211_ATTR_TID_CONFIG. + * If the configuration needs to be applied for specific peer then MAC address + * of the peer needs to be passed in %NL80211_ATT_MAC, otherwise the + * configuration will be applied for all the connected peers in the vif except + * the peer which has peer specific configuration for the TID. + * And the peer specific configuration will be overridden if + * %NL80211_TID_CONFIG_ATTR_OVERRIDE flag is set. + * All this configurations are valid only for STA's current connection + * i.e. the configurations will be reset to default when the STA connects back + * after disconnection/roaming, and this configuration will be cleared when + * the interface goes down. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -1125,6 +1146,9 @@ * peer MAC address and %NL80211_ATTR_FRAME is used to specify the frame * content. The frame is ethernet data. * + * @NL80211_CMD_SET_TID_CONFIG: Data frame TID specific configuration + * is passed using %NL80211_ATTR_TID_CONFIG attribute. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1349,6 +1373,8 @@ enum nl80211_commands { NL80211_CMD_PROBE_MESH_LINK, + NL80211_CMD_SET_TID_CONFIG, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2409,6 +2435,9 @@ enum nl80211_commands { * %NL80211_ATTR_AKM_SUITES are default capabilities if AKM suites not * advertised for a specific interface type. * + * @NL80211_ATTR_TID_CONFIG: TID specific configuration in a + * nested attribute with &enum nl80211_tid_config_attr sub-attributes. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2877,6 +2906,8 @@ enum nl80211_attrs { NL80211_ATTR_IFTYPE_AKM_SUITES, + NL80211_ATTR_TID_CONFIG, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4722,6 +4753,40 @@ enum nl80211_tx_power_setting { NL80211_TX_POWER_FIXED, }; +/** + * enum nl80211_tid_config - TID config state + * @NL80211_TID_CONFIG_ENABLE: Enable config for the TID + * @NL80211_TID_CONFIG_DISABLE: Disable config for the TID + */ +enum nl80211_tid_config { + NL80211_TID_CONFIG_ENABLE, + NL80211_TID_CONFIG_DISABLE, +}; + +/* enum nl80211_tid_config_attr - TID specific configuration. + * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if no peer + * is selected, if set indicates that the new configuration overrides + * all previous peer configurations, otherwise previous peer specific + * configurations should be left untouched. If peer is selected then + * it will reset particular TID configuration of that peer and it will + * not accept other TID config attributes along with peer. + * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs(bit 0 to 7) + * Its type is u8. + * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TID. + * specified in %NL80211_TID_CONFIG_ATTR_TID. see %enum nl80211_tid_config. + * Its type is u8. + */ +enum nl80211_tid_config_attr { + __NL80211_TID_CONFIG_ATTR_INVALID, + NL80211_TID_CONFIG_ATTR_OVERRIDE, + NL80211_TID_CONFIG_ATTR_TIDS, + NL80211_TID_CONFIG_ATTR_NOACK, + + /* keep last */ + __NL80211_TID_CONFIG_ATTR_AFTER_LAST, + NL80211_TID_CONFIG_ATTR_MAX = __NL80211_TID_CONFIG_ATTR_AFTER_LAST - 1 +}; + /** * enum nl80211_packet_pattern_attr - packet pattern attribute * @__NL80211_PKTPAT_INVALID: invalid number for nested attribute @@ -5540,6 +5605,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_AQL: The driver supports the Airtime Queue Limit (AQL) * feature, which prevents bufferbloat by using the expected transmission * time to limit the amount of data buffered in the hardware. + * @NL80211_EXT_FEATURE_PER_TID_NOACK_CONFIG: Driver supports per TID NoAck + * policy functionality. + * @NL80211_EXT_FEATURE_PER_STA_NOACK_CONFIG: Driver supports STA specific NoAck + * policy functionality. * * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection * and can receive key configuration for BIGTK using key indexes 6 and 7. @@ -5592,6 +5661,8 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_VLAN_OFFLOAD, NL80211_EXT_FEATURE_AQL, NL80211_EXT_FEATURE_BEACON_PROTECTION, + NL80211_EXT_FEATURE_PER_TID_NOACK_CONFIG, + NL80211_EXT_FEATURE_PER_STA_NOACK_CONFIG, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a75f72288139..a0839fae6555 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -328,6 +328,14 @@ he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG }, }; +static const struct nla_policy +nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { + [NL80211_TID_CONFIG_ATTR_OVERRIDE] = { .type = NLA_FLAG }, + [NL80211_TID_CONFIG_ATTR_TIDS] = { .type = NLA_U8 }, + [NL80211_TID_CONFIG_ATTR_NOACK] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), +}; + const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -634,6 +642,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy), [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), [NL80211_ATTR_HE_BSS_COLOR] = NLA_POLICY_NESTED(he_bss_color_policy), + [NL80211_ATTR_TID_CONFIG] = + NLA_POLICY_NESTED_ARRAY(nl80211_tid_config_attr_policy), }; /* policy for the key attributes */ @@ -13934,6 +13944,137 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) return rdev_probe_mesh_link(rdev, dev, dest, buf, len); } +static int +__nl80211_check_tid_conf_support(struct cfg80211_registered_device *rdev, + struct netlink_ext_ack *extack, + const u8 *peer, struct nlattr *attrs[], + struct ieee80211_tid_cfg *tid_conf, + enum nl80211_tid_config_attr attr, + enum nl80211_ext_feature_index per_tid_config, + enum nl80211_ext_feature_index per_sta_config) +{ + if (!wiphy_ext_feature_isset(&rdev->wiphy, per_tid_config)) { + NL_SET_ERR_MSG_ATTR(extack, attrs[attr], + "TID specific configuration not supported"); + return -ENOTSUPP; + } + + if (peer && !wiphy_ext_feature_isset(&rdev->wiphy, per_sta_config)) { + NL_SET_ERR_MSG_ATTR(extack, attrs[attr], + "peer specific TID configuration not supported"); + return -ENOTSUPP; + } + + tid_conf->tid_conf_mask |= BIT(attr); + return 0; +} + +#define nl80211_check_tid_config_support(rdev, extack, peer, attrs, tid_conf, \ + conf) \ + __nl80211_check_tid_conf_support(rdev, extack, peer, attrs, tid_conf, \ + NL80211_TID_CONFIG_ATTR_##conf, \ + NL80211_EXT_FEATURE_PER_TID_##conf##_CONFIG, \ + NL80211_EXT_FEATURE_PER_STA_##conf##_CONFIG) + +static int parse_tid_conf(struct cfg80211_registered_device *rdev, + struct nlattr *attrs[], struct net_device *dev, + struct ieee80211_tid_cfg *tid_conf, + struct genl_info *info, const u8 *peer) +{ + struct netlink_ext_ack *extack = info->extack; + int err; + + if (!attrs[NL80211_TID_CONFIG_ATTR_TIDS]) + return -EINVAL; + + tid_conf->config_override = + nla_get_flag(attrs[NL80211_TID_CONFIG_ATTR_OVERRIDE]); + tid_conf->tid = nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_TIDS]); + + if (tid_conf->config_override) { + if (rdev->ops->reset_tid_config) { + err = rdev_reset_tid_config(rdev, dev, peer, + tid_conf->tid); + /* If peer is there no other configuration will be + * allowed + */ + if (err || peer) + return err; + } else { + return -EINVAL; + } + } + + if (attrs[NL80211_TID_CONFIG_ATTR_NOACK]) { + err = nl80211_check_tid_config_support(rdev, extack, peer, + attrs, tid_conf, + NOACK); + if (err) + return err; + + tid_conf->noack = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_NOACK]); + } + + return 0; +} + +static int nl80211_set_tid_config(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct nlattr *attrs[NL80211_TID_CONFIG_ATTR_MAX + 1]; + struct net_device *dev = info->user_ptr[1]; + struct ieee80211_tid_config *tid_config; + struct nlattr *tid; + int conf_idx = 0, rem_conf; + int ret = -EINVAL; + u32 num_conf = 0; + + if (!info->attrs[NL80211_ATTR_TID_CONFIG]) + return -EINVAL; + + if (!rdev->ops->set_tid_config) + return -EOPNOTSUPP; + + nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG], + rem_conf) + num_conf++; + + tid_config = kzalloc(struct_size(tid_config, tid_conf, num_conf), + GFP_KERNEL); + if (!tid_config) + return -ENOMEM; + + tid_config->n_tid_conf = num_conf; + + if (info->attrs[NL80211_ATTR_MAC]) + tid_config->peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG], + rem_conf) { + ret = nla_parse_nested(attrs, NL80211_TID_CONFIG_ATTR_MAX, + tid, NULL, NULL); + + if (ret) + goto bad_tid_conf; + + ret = parse_tid_conf(rdev, attrs, dev, + &tid_config->tid_conf[conf_idx], + info, tid_config->peer); + if (ret) + goto bad_tid_conf; + + conf_idx++; + } + + ret = rdev_set_tid_config(rdev, dev, tid_config); + +bad_tid_conf: + kfree(tid_config); + return ret; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -14888,6 +15029,13 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_SET_TID_CONFIG, + .doit = nl80211_set_tid_config, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index af7fcf2a3b4a..a754e0496b6c 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1326,4 +1326,28 @@ rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_tid_config *tid_conf) +{ + int ret; + + trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf); + ret = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *peer, + u8 tid) +{ + int ret; + + trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tid); + ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tid); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 56b78222746c..167b18896cd6 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3480,6 +3480,43 @@ TRACE_EVENT(rdev_probe_mesh_link, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest)) ); +TRACE_EVENT(rdev_set_tid_config, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct ieee80211_tid_config *tid_conf), + TP_ARGS(wiphy, netdev, tid_conf), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, tid_conf->peer); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) +); + +TRACE_EVENT(rdev_reset_tid_config, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *peer, u8 tid), + TP_ARGS(wiphy, netdev, peer, tid), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tid) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tid = tid; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tid: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tid) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 3710a8a6284f58a78ba4fe9c4b6672207636a223 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Feb 2020 11:34:25 +0100 Subject: nl80211: modify TID-config API Make some changes to the TID-config API: * use u16 in nl80211 (only, and restrict to using 8 bits for now), to avoid issues in the future if we ever want to use higher TIDs. * reject empty TIDs mask (via netlink policy) * change feature advertising to not use extended feature flags but have own mechanism for this, which simplifies the code * fix all variable names from 'tid' to 'tids' since it's a mask * change to cfg80211_ name prefixes, not ieee80211_ * fix some minor docs/spelling things. Change-Id: Ia234d464b3f914cdeab82f540e018855be580dce Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 43 +++++++++++-------- include/uapi/linux/nl80211.h | 51 +++++++++++++---------- net/wireless/nl80211.c | 99 +++++++++++++++++++++++++------------------- net/wireless/rdev-ops.h | 8 ++-- net/wireless/trace.h | 14 +++---- 5 files changed, 121 insertions(+), 94 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 65a8bf73d21e..bbe4acef729d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -626,36 +626,32 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; }; -enum ieee80211_tid_conf_mask { - IEEE80211_TID_CONF_NOACK = BIT(NL80211_TID_CONFIG_ATTR_NOACK), -}; - /** - * struct ieee80211_tid_cfg - TID specific configuration + * struct cfg80211_tid_cfg - TID specific configuration * @config_override: Flag to notify driver to reset TID configuration * of the peer. - * @tid: TID number - * @tid_conf_mask: bitmap indicating which parameter changed - * see &enum ieee80211_tid_conf_mask + * @tids: bitmap of TIDs to modify + * @mask: bitmap of attributes indicating which parameter changed, + * similar to &nl80211_tid_config_supp. * @noack: noack configuration value for the TID */ -struct ieee80211_tid_cfg { +struct cfg80211_tid_cfg { bool config_override; - u8 tid; - u32 tid_conf_mask; + u8 tids; + u32 mask; enum nl80211_tid_config noack; }; /** - * struct ieee80211_tid_config - TID configuration + * struct cfg80211_tid_config - TID configuration * @peer: Station's MAC address * @n_tid_conf: Number of TID specific configurations to be applied * @tid_conf: Configuration change info */ -struct ieee80211_tid_config { +struct cfg80211_tid_config { const u8 *peer; u32 n_tid_conf; - struct ieee80211_tid_cfg tid_conf[]; + struct cfg80211_tid_cfg tid_conf[]; }; /** @@ -3705,8 +3701,8 @@ struct cfg80211_update_owe_info { * and overrule HWMP path selection algorithm. * @set_tid_config: TID specific configuration, this can be peer or BSS specific * This callback may sleep. - * @reset_tid_config: Reset TID specific configuration for the peer. - * This callback may sleep. + * @reset_tid_config: Reset TID specific configuration for the peer, for the + * given TIDs. This callback may sleep. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4031,9 +4027,9 @@ struct cfg80211_ops { int (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); int (*set_tid_config)(struct wiphy *wiphy, struct net_device *dev, - struct ieee80211_tid_config *tid_conf); + struct cfg80211_tid_config *tid_conf); int (*reset_tid_config)(struct wiphy *wiphy, struct net_device *dev, - const u8 *peer, u8 tid); + const u8 *peer, u8 tids); }; /* @@ -4641,6 +4637,13 @@ struct wiphy_iftype_akm_suites { * @support_mbssid must be set for this to have any effect. * * @pmsr_capa: peer measurement capabilities + * + * @tid_config_support: describes the per-TID config support that the + * device has + * @tid_config_support.vif: bitmap of attributes (configurations) + * supported by the driver for each vif + * @tid_config_support.peer: bitmap of attributes (configurations) + * supported by the driver for each peer */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -4772,6 +4775,10 @@ struct wiphy { const struct cfg80211_pmsr_capabilities *pmsr_capa; + struct { + u64 peer, vif; + } tid_config_support; + char priv[0] __aligned(NETDEV_ALIGN); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0b12fcffb518..591d843eda72 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -267,20 +267,22 @@ /** * DOC: TID configuration * - * TID configuration support can be advertised by drivers by setting - * @NL80211_EXT_FEATURE_PER_TID_* and/or @NL80211_EXT_FEATURE_PER_STA_* config - * mentioned in &enum nl80211_tid_config_attr. - * Needed configuration parameters are mentioned in - * &enum nl80211_tid_config_attr and it will be passed using - * %NL80211_CMD_SET_TID_CONFIG through %NL80211_ATTR_TID_CONFIG. - * If the configuration needs to be applied for specific peer then MAC address - * of the peer needs to be passed in %NL80211_ATT_MAC, otherwise the + * TID config support can be checked in the %NL80211_ATTR_TID_CONFIG + * attribute given in wiphy capabilities. + * + * The necessary configuration parameters are mentioned in + * &enum nl80211_tid_config_attr and it will be passed to the + * %NL80211_CMD_SET_TID_CONFIG command in %NL80211_ATTR_TID_CONFIG. + * + * If the configuration needs to be applied for specific peer then the MAC + * address of the peer needs to be passed in %NL80211_ATTR_MAC, otherwise the * configuration will be applied for all the connected peers in the vif except - * the peer which has peer specific configuration for the TID. - * And the peer specific configuration will be overridden if - * %NL80211_TID_CONFIG_ATTR_OVERRIDE flag is set. - * All this configurations are valid only for STA's current connection - * i.e. the configurations will be reset to default when the STA connects back + * any peers that have peer specific configuration for the TID by default; if + * the %NL80211_TID_CONFIG_ATTR_OVERRIDE flag is set, peer specific values + * will be overwritten. + * + * All this configuration is valid only for STA's current connection + * i.e. the configuration will be reset to default when the STA connects back * after disconnection/roaming, and this configuration will be cleared when * the interface goes down. */ @@ -2436,7 +2438,9 @@ enum nl80211_commands { * advertised for a specific interface type. * * @NL80211_ATTR_TID_CONFIG: TID specific configuration in a - * nested attribute with &enum nl80211_tid_config_attr sub-attributes. + * nested attribute with &enum nl80211_tid_config_attr sub-attributes; + * on output (in wiphy attributes) it contains only the feature sub- + * attributes. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -4764,20 +4768,29 @@ enum nl80211_tid_config { }; /* enum nl80211_tid_config_attr - TID specific configuration. + * @NL80211_TID_CONFIG_ATTR_PAD: pad attribute for 64-bit values + * @NL80211_TID_CONFIG_ATTR_VIF_SUPP: a bitmap (u64) of attributes supported + * for per-vif configuration; doesn't list the ones that are generic + * (%NL80211_TID_CONFIG_ATTR_TIDS, %NL80211_TID_CONFIG_ATTR_OVERRIDE). + * @NL80211_TID_CONFIG_ATTR_PEER_SUPP: same as the previous per-vif one, but + * per peer instead. * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if no peer * is selected, if set indicates that the new configuration overrides * all previous peer configurations, otherwise previous peer specific * configurations should be left untouched. If peer is selected then * it will reset particular TID configuration of that peer and it will * not accept other TID config attributes along with peer. - * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs(bit 0 to 7) - * Its type is u8. + * @NL80211_TID_CONFIG_ATTR_TIDS: a bitmask value of TIDs (bit 0 to 7) + * Its type is u16. * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TID. * specified in %NL80211_TID_CONFIG_ATTR_TID. see %enum nl80211_tid_config. * Its type is u8. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, + NL80211_TID_CONFIG_ATTR_PAD, + NL80211_TID_CONFIG_ATTR_VIF_SUPP, + NL80211_TID_CONFIG_ATTR_PEER_SUPP, NL80211_TID_CONFIG_ATTR_OVERRIDE, NL80211_TID_CONFIG_ATTR_TIDS, NL80211_TID_CONFIG_ATTR_NOACK, @@ -5605,10 +5618,6 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_AQL: The driver supports the Airtime Queue Limit (AQL) * feature, which prevents bufferbloat by using the expected transmission * time to limit the amount of data buffered in the hardware. - * @NL80211_EXT_FEATURE_PER_TID_NOACK_CONFIG: Driver supports per TID NoAck - * policy functionality. - * @NL80211_EXT_FEATURE_PER_STA_NOACK_CONFIG: Driver supports STA specific NoAck - * policy functionality. * * @NL80211_EXT_FEATURE_BEACON_PROTECTION: The driver supports Beacon protection * and can receive key configuration for BIGTK using key indexes 6 and 7. @@ -5661,8 +5670,6 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_VLAN_OFFLOAD, NL80211_EXT_FEATURE_AQL, NL80211_EXT_FEATURE_BEACON_PROTECTION, - NL80211_EXT_FEATURE_PER_TID_NOACK_CONFIG, - NL80211_EXT_FEATURE_PER_STA_NOACK_CONFIG, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a0839fae6555..56ac851ccee1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -330,8 +330,10 @@ he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { static const struct nla_policy nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { + [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 }, + [NL80211_TID_CONFIG_ATTR_PEER_SUPP] = { .type = NLA_U64 }, [NL80211_TID_CONFIG_ATTR_OVERRIDE] = { .type = NLA_FLAG }, - [NL80211_TID_CONFIG_ATTR_TIDS] = { .type = NLA_U8 }, + [NL80211_TID_CONFIG_ATTR_TIDS] = NLA_POLICY_RANGE(NLA_U16, 1, 0xff), [NL80211_TID_CONFIG_ATTR_NOACK] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; @@ -1957,6 +1959,40 @@ nl80211_put_iftype_akm_suites(struct cfg80211_registered_device *rdev, return 0; } +static int +nl80211_put_tid_config_support(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) +{ + struct nlattr *supp; + + if (!rdev->wiphy.tid_config_support.vif && + !rdev->wiphy.tid_config_support.peer) + return 0; + + supp = nla_nest_start(msg, NL80211_ATTR_TID_CONFIG); + if (!supp) + return -ENOSPC; + + if (rdev->wiphy.tid_config_support.vif && + nla_put_u64_64bit(msg, NL80211_TID_CONFIG_ATTR_VIF_SUPP, + rdev->wiphy.tid_config_support.vif, + NL80211_TID_CONFIG_ATTR_PAD)) + goto fail; + + if (rdev->wiphy.tid_config_support.peer && + nla_put_u64_64bit(msg, NL80211_TID_CONFIG_ATTR_PEER_SUPP, + rdev->wiphy.tid_config_support.peer, + NL80211_TID_CONFIG_ATTR_PAD)) + goto fail; + + nla_nest_end(msg, supp); + + return 0; +fail: + nla_nest_cancel(msg, supp); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -2518,6 +2554,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_iftype_akm_suites(rdev, msg)) goto nla_put_failure; + if (nl80211_put_tid_config_support(rdev, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -13944,44 +13983,13 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) return rdev_probe_mesh_link(rdev, dev, dest, buf, len); } -static int -__nl80211_check_tid_conf_support(struct cfg80211_registered_device *rdev, - struct netlink_ext_ack *extack, - const u8 *peer, struct nlattr *attrs[], - struct ieee80211_tid_cfg *tid_conf, - enum nl80211_tid_config_attr attr, - enum nl80211_ext_feature_index per_tid_config, - enum nl80211_ext_feature_index per_sta_config) -{ - if (!wiphy_ext_feature_isset(&rdev->wiphy, per_tid_config)) { - NL_SET_ERR_MSG_ATTR(extack, attrs[attr], - "TID specific configuration not supported"); - return -ENOTSUPP; - } - - if (peer && !wiphy_ext_feature_isset(&rdev->wiphy, per_sta_config)) { - NL_SET_ERR_MSG_ATTR(extack, attrs[attr], - "peer specific TID configuration not supported"); - return -ENOTSUPP; - } - - tid_conf->tid_conf_mask |= BIT(attr); - return 0; -} - -#define nl80211_check_tid_config_support(rdev, extack, peer, attrs, tid_conf, \ - conf) \ - __nl80211_check_tid_conf_support(rdev, extack, peer, attrs, tid_conf, \ - NL80211_TID_CONFIG_ATTR_##conf, \ - NL80211_EXT_FEATURE_PER_TID_##conf##_CONFIG, \ - NL80211_EXT_FEATURE_PER_STA_##conf##_CONFIG) - static int parse_tid_conf(struct cfg80211_registered_device *rdev, struct nlattr *attrs[], struct net_device *dev, - struct ieee80211_tid_cfg *tid_conf, + struct cfg80211_tid_cfg *tid_conf, struct genl_info *info, const u8 *peer) { struct netlink_ext_ack *extack = info->extack; + u64 mask; int err; if (!attrs[NL80211_TID_CONFIG_ATTR_TIDS]) @@ -13989,12 +13997,12 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, tid_conf->config_override = nla_get_flag(attrs[NL80211_TID_CONFIG_ATTR_OVERRIDE]); - tid_conf->tid = nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_TIDS]); + tid_conf->tids = nla_get_u16(attrs[NL80211_TID_CONFIG_ATTR_TIDS]); if (tid_conf->config_override) { if (rdev->ops->reset_tid_config) { err = rdev_reset_tid_config(rdev, dev, peer, - tid_conf->tid); + tid_conf->tids); /* If peer is there no other configuration will be * allowed */ @@ -14006,16 +14014,21 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, } if (attrs[NL80211_TID_CONFIG_ATTR_NOACK]) { - err = nl80211_check_tid_config_support(rdev, extack, peer, - attrs, tid_conf, - NOACK); - if (err) - return err; - + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_NOACK); tid_conf->noack = nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_NOACK]); } + if (peer) + mask = rdev->wiphy.tid_config_support.peer; + else + mask = rdev->wiphy.tid_config_support.vif; + + if (tid_conf->mask & ~mask) { + NL_SET_ERR_MSG(extack, "unsupported TID configuration"); + return -ENOTSUPP; + } + return 0; } @@ -14025,7 +14038,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct nlattr *attrs[NL80211_TID_CONFIG_ATTR_MAX + 1]; struct net_device *dev = info->user_ptr[1]; - struct ieee80211_tid_config *tid_config; + struct cfg80211_tid_config *tid_config; struct nlattr *tid; int conf_idx = 0, rem_conf; int ret = -EINVAL; diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index a754e0496b6c..99462f0c4e08 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1328,7 +1328,7 @@ rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct ieee80211_tid_config *tid_conf) + struct cfg80211_tid_config *tid_conf) { int ret; @@ -1340,12 +1340,12 @@ static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, - u8 tid) + u8 tids) { int ret; - trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tid); - ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tid); + trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids); + ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 167b18896cd6..839df54cee21 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3482,7 +3482,7 @@ TRACE_EVENT(rdev_probe_mesh_link, TRACE_EVENT(rdev_set_tid_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct ieee80211_tid_config *tid_conf), + struct cfg80211_tid_config *tid_conf), TP_ARGS(wiphy, netdev, tid_conf), TP_STRUCT__entry( WIPHY_ENTRY @@ -3500,22 +3500,22 @@ TRACE_EVENT(rdev_set_tid_config, TRACE_EVENT(rdev_reset_tid_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - const u8 *peer, u8 tid), - TP_ARGS(wiphy, netdev, peer, tid), + const u8 *peer, u8 tids), + TP_ARGS(wiphy, netdev, peer, tids), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(peer) - __field(u8, tid) + __field(u8, tids) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(peer, peer); - __entry->tid = tid; + __entry->tids = tids; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tid: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tid) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids) ); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ -- cgit v1.2.3 From 6a21d16c4db08398c737e0ffd03e4eca7131ac78 Mon Sep 17 00:00:00 2001 From: Tamizh chelvam Date: Mon, 20 Jan 2020 13:21:23 +0530 Subject: nl80211: Add support to configure TID specific retry configuration This patch adds support to configure per TID retry configuration through the NL80211_TID_CONFIG_ATTR_RETRY_SHORT and NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes. This TID specific retry configuration will have more precedence than phy level configuration. Signed-off-by: Tamizh chelvam Link: https://lore.kernel.org/r/1579506687-18296-3-git-send-email-tamizhr@codeaurora.org [rebase completely on top of my previous API changes] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ include/uapi/linux/nl80211.h | 12 ++++++++++++ net/wireless/nl80211.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bbe4acef729d..98981d1a026b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -634,12 +634,15 @@ struct cfg80211_chan_def { * @mask: bitmap of attributes indicating which parameter changed, * similar to &nl80211_tid_config_supp. * @noack: noack configuration value for the TID + * @retry_long: retry count value + * @retry_short: retry count value */ struct cfg80211_tid_cfg { bool config_override; u8 tids; u32 mask; enum nl80211_tid_config noack; + u8 retry_long, retry_short; }; /** @@ -4644,6 +4647,8 @@ struct wiphy_iftype_akm_suites { * supported by the driver for each vif * @tid_config_support.peer: bitmap of attributes (configurations) * supported by the driver for each peer + * @tid_config_support.max_retry: maximum supported retry count for + * long/short retry configuration */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -4777,8 +4782,11 @@ struct wiphy { struct { u64 peer, vif; + u8 max_retry; } tid_config_support; + u8 max_data_retry_count; + char priv[0] __aligned(NETDEV_ALIGN); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 591d843eda72..c3481e1feebe 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4785,6 +4785,16 @@ enum nl80211_tid_config { * @NL80211_TID_CONFIG_ATTR_NOACK: Configure ack policy for the TID. * specified in %NL80211_TID_CONFIG_ATTR_TID. see %enum nl80211_tid_config. * Its type is u8. + * @NL80211_TID_CONFIG_ATTR_RETRY_SHORT: Number of retries used with data frame + * transmission, user-space sets this configuration in + * &NL80211_CMD_SET_TID_CONFIG. It is u8 type, min value is 1 and + * the max value is advertised by the driver in this attribute on + * output in wiphy capabilities. + * @NL80211_TID_CONFIG_ATTR_RETRY_LONG: Number of retries used with data frame + * transmission, user-space sets this configuration in + * &NL80211_CMD_SET_TID_CONFIG. Its type is u8, min value is 1 and + * the max value is advertised by the driver in this attribute on + * output in wiphy capabilities. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4794,6 +4804,8 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_OVERRIDE, NL80211_TID_CONFIG_ATTR_TIDS, NL80211_TID_CONFIG_ATTR_NOACK, + NL80211_TID_CONFIG_ATTR_RETRY_SHORT, + NL80211_TID_CONFIG_ATTR_RETRY_LONG, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 56ac851ccee1..4c79ba685992 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -336,6 +336,8 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { [NL80211_TID_CONFIG_ATTR_TIDS] = NLA_POLICY_RANGE(NLA_U16, 1, 0xff), [NL80211_TID_CONFIG_ATTR_NOACK] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), + [NL80211_TID_CONFIG_ATTR_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1), }; const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -1985,6 +1987,14 @@ nl80211_put_tid_config_support(struct cfg80211_registered_device *rdev, NL80211_TID_CONFIG_ATTR_PAD)) goto fail; + /* for now we just use the same value ... makes more sense */ + if (nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RETRY_SHORT, + rdev->wiphy.tid_config_support.max_retry)) + goto fail; + if (nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RETRY_LONG, + rdev->wiphy.tid_config_support.max_retry)) + goto fail; + nla_nest_end(msg, supp); return 0; @@ -14019,6 +14029,24 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_NOACK]); } + if (attrs[NL80211_TID_CONFIG_ATTR_RETRY_SHORT]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RETRY_SHORT); + tid_conf->retry_short = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RETRY_SHORT]); + + if (tid_conf->retry_short > rdev->wiphy.max_data_retry_count) + return -EINVAL; + } + + if (attrs[NL80211_TID_CONFIG_ATTR_RETRY_LONG]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RETRY_LONG); + tid_conf->retry_long = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RETRY_LONG]); + + if (tid_conf->retry_long > rdev->wiphy.max_data_retry_count) + return -EINVAL; + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From ade274b23e41886091a7e105ab3de71baef112e7 Mon Sep 17 00:00:00 2001 From: Tamizh chelvam Date: Mon, 20 Jan 2020 13:21:24 +0530 Subject: nl80211: Add support to configure TID specific AMPDU configuration This patch adds support to configure per TID AMPDU control configuration to enable/disable aggregation through the NL80211_TID_CONFIG_ATTR_AMPDU_CTRL attribute. Signed-off-by: Tamizh chelvam Link: https://lore.kernel.org/r/1579506687-18296-4-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 8 ++++++++ 3 files changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 98981d1a026b..78171ae01406 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -636,6 +636,7 @@ struct cfg80211_chan_def { * @noack: noack configuration value for the TID * @retry_long: retry count value * @retry_short: retry count value + * @ampdu: Enable/Disable aggregation */ struct cfg80211_tid_cfg { bool config_override; @@ -643,6 +644,7 @@ struct cfg80211_tid_cfg { u32 mask; enum nl80211_tid_config noack; u8 retry_long, retry_short; + enum nl80211_tid_config ampdu; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c3481e1feebe..71cae33e6679 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4795,6 +4795,9 @@ enum nl80211_tid_config { * &NL80211_CMD_SET_TID_CONFIG. Its type is u8, min value is 1 and * the max value is advertised by the driver in this attribute on * output in wiphy capabilities. + * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable aggregation for the TIDs + * specified in %NL80211_TID_CONFIG_ATTR_TIDS. Its type is u8, using + * the values from &nl80211_tid_config. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4806,6 +4809,7 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_NOACK, NL80211_TID_CONFIG_ATTR_RETRY_SHORT, NL80211_TID_CONFIG_ATTR_RETRY_LONG, + NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4c79ba685992..078d30756b3e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -338,6 +338,8 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_TID_CONFIG_ATTR_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1), + [NL80211_TID_CONFIG_ATTR_AMPDU_CTRL] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -14047,6 +14049,12 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (attrs[NL80211_TID_CONFIG_ATTR_AMPDU_CTRL]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMPDU_CTRL); + tid_conf->ampdu = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMPDU_CTRL]); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From 04f7d142f51c6019a695cfd70c09bb60233472c5 Mon Sep 17 00:00:00 2001 From: Tamizh chelvam Date: Mon, 20 Jan 2020 13:21:25 +0530 Subject: nl80211: Add support to configure TID specific RTSCTS configuration This patch adds support to configure per TID RTSCTS control configuration to enable/disable through the NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL attribute. Signed-off-by: Tamizh chelvam Link: https://lore.kernel.org/r/1579506687-18296-5-git-send-email-tamizhr@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 8 ++++++++ 3 files changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 78171ae01406..f7c84c32ba39 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -637,6 +637,7 @@ struct cfg80211_chan_def { * @retry_long: retry count value * @retry_short: retry count value * @ampdu: Enable/Disable aggregation + * @rtscts: Enable/Disable RTS/CTS */ struct cfg80211_tid_cfg { bool config_override; @@ -645,6 +646,7 @@ struct cfg80211_tid_cfg { enum nl80211_tid_config noack; u8 retry_long, retry_short; enum nl80211_tid_config ampdu; + enum nl80211_tid_config rtscts; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 71cae33e6679..b002ef2060fa 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4798,6 +4798,9 @@ enum nl80211_tid_config { * @NL80211_TID_CONFIG_ATTR_AMPDU_CTRL: Enable/Disable aggregation for the TIDs * specified in %NL80211_TID_CONFIG_ATTR_TIDS. Its type is u8, using * the values from &nl80211_tid_config. + * @NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL: Enable/Disable RTS_CTS for the TIDs + * specified in %NL80211_TID_CONFIG_ATTR_TIDS. It is u8 type, using + * the values from &nl80211_tid_config. */ enum nl80211_tid_config_attr { __NL80211_TID_CONFIG_ATTR_INVALID, @@ -4810,6 +4813,7 @@ enum nl80211_tid_config_attr { NL80211_TID_CONFIG_ATTR_RETRY_SHORT, NL80211_TID_CONFIG_ATTR_RETRY_LONG, NL80211_TID_CONFIG_ATTR_AMPDU_CTRL, + NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL, /* keep last */ __NL80211_TID_CONFIG_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 078d30756b3e..ae5e10fe1196 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -340,6 +340,8 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { [NL80211_TID_CONFIG_ATTR_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_TID_CONFIG_ATTR_AMPDU_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), }; const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { @@ -14055,6 +14057,12 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMPDU_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL); + tid_conf->rtscts = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else -- cgit v1.2.3 From 4e52889f48fedfa5f2e443dc8490550cb49fbdc3 Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Mon, 13 Jan 2020 10:48:53 +0100 Subject: media: atmel: atmel-isc-base: expose white balance as v4l2 controls This exposes the white balance configuration of the ISC as v4l2 controls into userspace. There are 8 controls available: 4 gain controls, sliders, for each of the BAYER components: R, B, GR, GB. These gains are multipliers for each component, in format unsigned 0:4:9 with a default value of 512 (1.0 multiplier). 4 offset controls, sliders, for each of the BAYER components: R, B, GR, GB. These offsets are added/substracted from each component, in format signed 1:12:0 with a default value of 0 (+/- 0) To expose this to userspace, added 8 custom controls, in an auto cluster. To summarize the functionality: The auto cluster switch is the auto white balance control, and it works like this: AWB == 1: autowhitebalance is on, the do_white_balance button is inactive, the gains/offsets are inactive, but volatile and readable. Thus, the results of the whitebalance algorithm are available to userspace to read at any time. AWB == 0: autowhitebalance is off, cluster is in manual mode, user can configure the gain/offsets directly. More than that, if the do_white_balance button is pressed, the driver will perform one-time-adjustment, (preferably with color checker card) and the userspace can read again the new values. With this feature, the userspace can save the coefficients and reinstall them for example after reboot or reprobing the driver. [hverkuil: fix checkpatch warning] [hverkuil: minor spacing adjustments in the functionality description] Signed-off-by: Eugen Hristev Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/atmel/atmel-isc-base.c | 222 ++++++++++++++++++++++---- drivers/media/platform/atmel/atmel-isc.h | 23 ++- include/linux/atmel-isc-media.h | 58 +++++++ include/uapi/linux/v4l2-controls.h | 6 + 4 files changed, 281 insertions(+), 28 deletions(-) create mode 100644 include/linux/atmel-isc-media.h (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/atmel/atmel-isc-base.c b/drivers/media/platform/atmel/atmel-isc-base.c index d7669a03e98e..b6c9a78c9198 100644 --- a/drivers/media/platform/atmel/atmel-isc-base.c +++ b/drivers/media/platform/atmel/atmel-isc-base.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -224,10 +225,35 @@ const u32 isc_gamma_table[GAMMA_MAX + 1][GAMMA_ENTRIES] = { (((mbus_code) == MEDIA_BUS_FMT_Y10_1X10) | \ (((mbus_code) == MEDIA_BUS_FMT_Y8_1X8))) +#define ISC_CTRL_ISC_TO_V4L2(x) ((x) == ISC_WB_O_ZERO_VAL ? 0 : (x)) +#define ISC_CTRL_V4L2_TO_ISC(x) ((x) ? (x) : ISC_WB_O_ZERO_VAL) + +static inline void isc_update_v4l2_ctrls(struct isc_device *isc) +{ + struct isc_ctrls *ctrls = &isc->ctrls; + + /* In here we set the v4l2 controls w.r.t. our pipeline config */ + v4l2_ctrl_s_ctrl(isc->r_gain_ctrl, ctrls->gain[ISC_HIS_CFG_MODE_R]); + v4l2_ctrl_s_ctrl(isc->b_gain_ctrl, ctrls->gain[ISC_HIS_CFG_MODE_B]); + v4l2_ctrl_s_ctrl(isc->gr_gain_ctrl, ctrls->gain[ISC_HIS_CFG_MODE_GR]); + v4l2_ctrl_s_ctrl(isc->gb_gain_ctrl, ctrls->gain[ISC_HIS_CFG_MODE_GB]); + + v4l2_ctrl_s_ctrl(isc->r_off_ctrl, + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_R])); + v4l2_ctrl_s_ctrl(isc->b_off_ctrl, + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_B])); + v4l2_ctrl_s_ctrl(isc->gr_off_ctrl, + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_GR])); + v4l2_ctrl_s_ctrl(isc->gb_off_ctrl, + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_GB])); +} + static inline void isc_update_awb_ctrls(struct isc_device *isc) { struct isc_ctrls *ctrls = &isc->ctrls; + /* In here we set our actual hw pipeline config */ + regmap_write(isc->regmap, ISC_WB_O_RGR, (ISC_WB_O_ZERO_VAL - (ctrls->offset[ISC_HIS_CFG_MODE_R])) | ((ISC_WB_O_ZERO_VAL - ctrls->offset[ISC_HIS_CFG_MODE_GR]) << 16)); @@ -662,11 +688,9 @@ static void isc_set_pipeline(struct isc_device *isc, u32 pipeline) bay_cfg = isc->config.sd_format->cfa_baycfg; - if (ctrls->awb == ISC_WB_NONE) - isc_reset_awb_ctrls(isc); - regmap_write(regmap, ISC_WB_CFG, bay_cfg); isc_update_awb_ctrls(isc); + isc_update_v4l2_ctrls(isc); regmap_write(regmap, ISC_CFA_CFG, bay_cfg | ISC_CFA_CFG_EITPOL); @@ -1396,6 +1420,7 @@ static int isc_set_fmt(struct isc_device *isc, struct v4l2_format *f) isc->try_config.sd_format != isc->config.sd_format) { isc->ctrls.hist_stat = HIST_INIT; isc_reset_awb_ctrls(isc); + isc_update_v4l2_ctrls(isc); } /* make the try configuration active */ isc->config = isc->try_config; @@ -1814,10 +1839,6 @@ static void isc_awb_work(struct work_struct *w) ctrls->hist_id = hist_id; baysel = isc->config.sd_format->cfa_baycfg << ISC_HIS_CFG_BAYSEL_SHIFT; - /* if no more auto white balance, reset controls. */ - if (ctrls->awb == ISC_WB_NONE) - isc_reset_awb_ctrls(isc); - pm_runtime_get_sync(isc->dev); /* @@ -1842,6 +1863,8 @@ static void isc_awb_work(struct work_struct *w) if (ctrls->awb == ISC_WB_ONETIME) { v4l2_info(&isc->v4l2_dev, "Completed one time white-balance adjustment.\n"); + /* update the v4l2 controls values */ + isc_update_v4l2_ctrls(isc); ctrls->awb = ISC_WB_NONE; } } @@ -1873,6 +1896,27 @@ static int isc_s_ctrl(struct v4l2_ctrl *ctrl) case V4L2_CID_GAMMA: ctrls->gamma_index = ctrl->val; break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct v4l2_ctrl_ops isc_ctrl_ops = { + .s_ctrl = isc_s_ctrl, +}; + +static int isc_s_awb_ctrl(struct v4l2_ctrl *ctrl) +{ + struct isc_device *isc = container_of(ctrl->handler, + struct isc_device, ctrls.handler); + struct isc_ctrls *ctrls = &isc->ctrls; + + if (ctrl->flags & V4L2_CTRL_FLAG_INACTIVE) + return 0; + + switch (ctrl->id) { case V4L2_CID_AUTO_WHITE_BALANCE: if (ctrl->val == 1) ctrls->awb = ISC_WB_AUTO; @@ -1883,36 +1927,142 @@ static int isc_s_ctrl(struct v4l2_ctrl *ctrl) if (!isc->config.sd_format) break; - if (ctrls->hist_stat != HIST_ENABLED) - isc_reset_awb_ctrls(isc); + /* configure the controls with new values from v4l2 */ + if (ctrl->cluster[ISC_CTRL_R_GAIN]->is_new) + ctrls->gain[ISC_HIS_CFG_MODE_R] = isc->r_gain_ctrl->val; + if (ctrl->cluster[ISC_CTRL_B_GAIN]->is_new) + ctrls->gain[ISC_HIS_CFG_MODE_B] = isc->b_gain_ctrl->val; + if (ctrl->cluster[ISC_CTRL_GR_GAIN]->is_new) + ctrls->gain[ISC_HIS_CFG_MODE_GR] = isc->gr_gain_ctrl->val; + if (ctrl->cluster[ISC_CTRL_GB_GAIN]->is_new) + ctrls->gain[ISC_HIS_CFG_MODE_GB] = isc->gb_gain_ctrl->val; + + if (ctrl->cluster[ISC_CTRL_R_OFF]->is_new) + ctrls->offset[ISC_HIS_CFG_MODE_R] = + ISC_CTRL_V4L2_TO_ISC(isc->r_off_ctrl->val); + if (ctrl->cluster[ISC_CTRL_B_OFF]->is_new) + ctrls->offset[ISC_HIS_CFG_MODE_B] = + ISC_CTRL_V4L2_TO_ISC(isc->b_off_ctrl->val); + if (ctrl->cluster[ISC_CTRL_GR_OFF]->is_new) + ctrls->offset[ISC_HIS_CFG_MODE_GR] = + ISC_CTRL_V4L2_TO_ISC(isc->gr_off_ctrl->val); + if (ctrl->cluster[ISC_CTRL_GB_OFF]->is_new) + ctrls->offset[ISC_HIS_CFG_MODE_GB] = + ISC_CTRL_V4L2_TO_ISC(isc->gb_off_ctrl->val); - if (isc->ctrls.awb == ISC_WB_AUTO && + isc_update_awb_ctrls(isc); + + if (vb2_is_streaming(&isc->vb2_vidq)) { + /* + * If we are streaming, we can update profile to + * have the new settings in place. + */ + isc_update_profile(isc); + } else { + /* + * The auto cluster will activate automatically this + * control. This has to be deactivated when not + * streaming. + */ + v4l2_ctrl_activate(isc->do_wb_ctrl, false); + } + + /* if we have autowhitebalance on, start histogram procedure */ + if (ctrls->awb == ISC_WB_AUTO && vb2_is_streaming(&isc->vb2_vidq) && ISC_IS_FORMAT_RAW(isc->config.sd_format->mbus_code)) isc_set_histogram(isc, true); - break; - case V4L2_CID_DO_WHITE_BALANCE: - /* if AWB is enabled, do nothing */ - if (ctrls->awb == ISC_WB_AUTO) - return 0; + /* + * for one time whitebalance adjustment, check the button, + * if it's pressed, perform the one time operation. + */ + if (ctrls->awb == ISC_WB_NONE && + ctrl->cluster[ISC_CTRL_DO_WB]->is_new && + !(ctrl->cluster[ISC_CTRL_DO_WB]->flags & + V4L2_CTRL_FLAG_INACTIVE)) { + ctrls->awb = ISC_WB_ONETIME; + isc_set_histogram(isc, true); + v4l2_dbg(1, debug, &isc->v4l2_dev, + "One time white-balance started.\n"); + } + return 0; + } + return 0; +} - ctrls->awb = ISC_WB_ONETIME; - isc_set_histogram(isc, true); - v4l2_dbg(1, debug, &isc->v4l2_dev, - "One time white-balance started.\n"); +static int isc_g_volatile_awb_ctrl(struct v4l2_ctrl *ctrl) +{ + struct isc_device *isc = container_of(ctrl->handler, + struct isc_device, ctrls.handler); + struct isc_ctrls *ctrls = &isc->ctrls; + + switch (ctrl->id) { + /* being a cluster, this id will be called for every control */ + case V4L2_CID_AUTO_WHITE_BALANCE: + ctrl->cluster[ISC_CTRL_R_GAIN]->val = + ctrls->gain[ISC_HIS_CFG_MODE_R]; + ctrl->cluster[ISC_CTRL_B_GAIN]->val = + ctrls->gain[ISC_HIS_CFG_MODE_B]; + ctrl->cluster[ISC_CTRL_GR_GAIN]->val = + ctrls->gain[ISC_HIS_CFG_MODE_GR]; + ctrl->cluster[ISC_CTRL_GB_GAIN]->val = + ctrls->gain[ISC_HIS_CFG_MODE_GB]; + + ctrl->cluster[ISC_CTRL_R_OFF]->val = + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_R]); + ctrl->cluster[ISC_CTRL_B_OFF]->val = + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_B]); + ctrl->cluster[ISC_CTRL_GR_OFF]->val = + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_GR]); + ctrl->cluster[ISC_CTRL_GB_OFF]->val = + ISC_CTRL_ISC_TO_V4L2(ctrls->offset[ISC_HIS_CFG_MODE_GB]); break; - default: - return -EINVAL; } - return 0; } -static const struct v4l2_ctrl_ops isc_ctrl_ops = { - .s_ctrl = isc_s_ctrl, +static const struct v4l2_ctrl_ops isc_awb_ops = { + .s_ctrl = isc_s_awb_ctrl, + .g_volatile_ctrl = isc_g_volatile_awb_ctrl, }; +#define ISC_CTRL_OFF(_name, _id, _name_str) \ + static const struct v4l2_ctrl_config _name = { \ + .ops = &isc_awb_ops, \ + .id = _id, \ + .name = _name_str, \ + .type = V4L2_CTRL_TYPE_INTEGER, \ + .flags = V4L2_CTRL_FLAG_SLIDER, \ + .min = -4095, \ + .max = 4095, \ + .step = 1, \ + .def = 0, \ + } + +ISC_CTRL_OFF(isc_r_off_ctrl, ISC_CID_R_OFFSET, "Red Component Offset"); +ISC_CTRL_OFF(isc_b_off_ctrl, ISC_CID_B_OFFSET, "Blue Component Offset"); +ISC_CTRL_OFF(isc_gr_off_ctrl, ISC_CID_GR_OFFSET, "Green Red Component Offset"); +ISC_CTRL_OFF(isc_gb_off_ctrl, ISC_CID_GB_OFFSET, "Green Blue Component Offset"); + +#define ISC_CTRL_GAIN(_name, _id, _name_str) \ + static const struct v4l2_ctrl_config _name = { \ + .ops = &isc_awb_ops, \ + .id = _id, \ + .name = _name_str, \ + .type = V4L2_CTRL_TYPE_INTEGER, \ + .flags = V4L2_CTRL_FLAG_SLIDER, \ + .min = 0, \ + .max = 8191, \ + .step = 1, \ + .def = 512, \ + } + +ISC_CTRL_GAIN(isc_r_gain_ctrl, ISC_CID_R_GAIN, "Red Component Gain"); +ISC_CTRL_GAIN(isc_b_gain_ctrl, ISC_CID_B_GAIN, "Blue Component Gain"); +ISC_CTRL_GAIN(isc_gr_gain_ctrl, ISC_CID_GR_GAIN, "Green Red Component Gain"); +ISC_CTRL_GAIN(isc_gb_gain_ctrl, ISC_CID_GB_GAIN, "Green Blue Component Gain"); + static int isc_ctrl_init(struct isc_device *isc) { const struct v4l2_ctrl_ops *ops = &isc_ctrl_ops; @@ -1923,7 +2073,7 @@ static int isc_ctrl_init(struct isc_device *isc) ctrls->hist_stat = HIST_INIT; isc_reset_awb_ctrls(isc); - ret = v4l2_ctrl_handler_init(hdl, 5); + ret = v4l2_ctrl_handler_init(hdl, 13); if (ret < 0) return ret; @@ -1933,10 +2083,13 @@ static int isc_ctrl_init(struct isc_device *isc) v4l2_ctrl_new_std(hdl, ops, V4L2_CID_BRIGHTNESS, -1024, 1023, 1, 0); v4l2_ctrl_new_std(hdl, ops, V4L2_CID_CONTRAST, -2048, 2047, 1, 256); v4l2_ctrl_new_std(hdl, ops, V4L2_CID_GAMMA, 0, GAMMA_MAX, 1, 2); - v4l2_ctrl_new_std(hdl, ops, V4L2_CID_AUTO_WHITE_BALANCE, 0, 1, 1, 1); + isc->awb_ctrl = v4l2_ctrl_new_std(hdl, &isc_awb_ops, + V4L2_CID_AUTO_WHITE_BALANCE, + 0, 1, 1, 1); /* do_white_balance is a button, so min,max,step,default are ignored */ - isc->do_wb_ctrl = v4l2_ctrl_new_std(hdl, ops, V4L2_CID_DO_WHITE_BALANCE, + isc->do_wb_ctrl = v4l2_ctrl_new_std(hdl, &isc_awb_ops, + V4L2_CID_DO_WHITE_BALANCE, 0, 0, 0, 0); if (!isc->do_wb_ctrl) { @@ -1947,6 +2100,21 @@ static int isc_ctrl_init(struct isc_device *isc) v4l2_ctrl_activate(isc->do_wb_ctrl, false); + isc->r_gain_ctrl = v4l2_ctrl_new_custom(hdl, &isc_r_gain_ctrl, NULL); + isc->b_gain_ctrl = v4l2_ctrl_new_custom(hdl, &isc_b_gain_ctrl, NULL); + isc->gr_gain_ctrl = v4l2_ctrl_new_custom(hdl, &isc_gr_gain_ctrl, NULL); + isc->gb_gain_ctrl = v4l2_ctrl_new_custom(hdl, &isc_gb_gain_ctrl, NULL); + isc->r_off_ctrl = v4l2_ctrl_new_custom(hdl, &isc_r_off_ctrl, NULL); + isc->b_off_ctrl = v4l2_ctrl_new_custom(hdl, &isc_b_off_ctrl, NULL); + isc->gr_off_ctrl = v4l2_ctrl_new_custom(hdl, &isc_gr_off_ctrl, NULL); + isc->gb_off_ctrl = v4l2_ctrl_new_custom(hdl, &isc_gb_off_ctrl, NULL); + + /* + * The cluster is in auto mode with autowhitebalance enabled + * and manual mode otherwise. + */ + v4l2_ctrl_auto_cluster(10, &isc->awb_ctrl, 0, true); + v4l2_ctrl_handler_setup(hdl); return 0; diff --git a/drivers/media/platform/atmel/atmel-isc.h b/drivers/media/platform/atmel/atmel-isc.h index bfaed2fad2b5..fc56a745c7d1 100644 --- a/drivers/media/platform/atmel/atmel-isc.h +++ b/drivers/media/platform/atmel/atmel-isc.h @@ -213,7 +213,6 @@ struct isc_device { struct fmt_config try_config; struct isc_ctrls ctrls; - struct v4l2_ctrl *do_wb_ctrl; struct work_struct awb_work; struct mutex lock; /* serialize access to file operations */ @@ -223,6 +222,28 @@ struct isc_device { struct isc_subdev_entity *current_subdev; struct list_head subdev_entities; + + struct { +#define ISC_CTRL_DO_WB 1 +#define ISC_CTRL_R_GAIN 2 +#define ISC_CTRL_B_GAIN 3 +#define ISC_CTRL_GR_GAIN 4 +#define ISC_CTRL_GB_GAIN 5 +#define ISC_CTRL_R_OFF 6 +#define ISC_CTRL_B_OFF 7 +#define ISC_CTRL_GR_OFF 8 +#define ISC_CTRL_GB_OFF 9 + struct v4l2_ctrl *awb_ctrl; + struct v4l2_ctrl *do_wb_ctrl; + struct v4l2_ctrl *r_gain_ctrl; + struct v4l2_ctrl *b_gain_ctrl; + struct v4l2_ctrl *gr_gain_ctrl; + struct v4l2_ctrl *gb_gain_ctrl; + struct v4l2_ctrl *r_off_ctrl; + struct v4l2_ctrl *b_off_ctrl; + struct v4l2_ctrl *gr_off_ctrl; + struct v4l2_ctrl *gb_off_ctrl; + }; }; #define GAMMA_MAX 2 diff --git a/include/linux/atmel-isc-media.h b/include/linux/atmel-isc-media.h new file mode 100644 index 000000000000..79a320fb724e --- /dev/null +++ b/include/linux/atmel-isc-media.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2019 Microchip Technology Inc. and its subsidiaries + * + * Author: Eugen Hristev + */ + +#ifndef __LINUX_ATMEL_ISC_MEDIA_H__ +#define __LINUX_ATMEL_ISC_MEDIA_H__ + +/* + * There are 8 controls available: + * 4 gain controls, sliders, for each of the BAYER components: R, B, GR, GB. + * These gains are multipliers for each component, in format unsigned 0:4:9 with + * a default value of 512 (1.0 multiplier). + * 4 offset controls, sliders, for each of the BAYER components: R, B, GR, GB. + * These offsets are added/substracted from each component, in format signed + * 1:12:0 with a default value of 0 (+/- 0) + * + * To expose this to userspace, added 8 custom controls, in an auto cluster. + * + * To summarize the functionality: + * The auto cluster switch is the auto white balance control, and it works + * like this: + * AWB == 1: autowhitebalance is on, the do_white_balance button is inactive, + * the gains/offsets are inactive, but volatile and readable. + * Thus, the results of the whitebalance algorithm are available to userspace to + * read at any time. + * AWB == 0: autowhitebalance is off, cluster is in manual mode, user can + * configure the gain/offsets directly. + * More than that, if the do_white_balance button is + * pressed, the driver will perform one-time-adjustment, (preferably with color + * checker card) and the userspace can read again the new values. + * + * With this feature, the userspace can save the coefficients and reinstall them + * for example after reboot or reprobing the driver. + */ + +enum atmel_isc_ctrl_id { + /* Red component gain control */ + ISC_CID_R_GAIN = (V4L2_CID_USER_ATMEL_ISC_BASE + 0), + /* Blue component gain control */ + ISC_CID_B_GAIN, + /* Green Red component gain control */ + ISC_CID_GR_GAIN, + /* Green Blue gain control */ + ISC_CID_GB_GAIN, + /* Red component offset control */ + ISC_CID_R_OFFSET, + /* Blue component offset control */ + ISC_CID_B_OFFSET, + /* Green Red component offset control */ + ISC_CID_GR_OFFSET, + /* Green Blue component offset control */ + ISC_CID_GB_OFFSET, +}; + +#endif diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 5a7bedee2b0e..1a58d7cc4ccc 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -192,6 +192,12 @@ enum v4l2_colorfx { * We reserve 16 controls for this driver. */ #define V4L2_CID_USER_IMX_BASE (V4L2_CID_USER_BASE + 0x10b0) +/* + * The base for the atmel isc driver controls. + * We reserve 32 controls for this driver. + */ +#define V4L2_CID_USER_ATMEL_ISC_BASE (V4L2_CID_USER_BASE + 0x10c0) + /* MPEG-class control IDs */ /* The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical */ -- cgit v1.2.3 From 571912c69f0ed731bd1e071ade9dc7ca4aa52065 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Mon, 24 Feb 2020 10:57:50 +0530 Subject: net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc. The Bareudp tunnel module provides a generic L3 encapsulation tunnelling module for tunnelling different protocols like MPLS, IP,NSH etc inside a UDP tunnel. Signed-off-by: Martin Varghese Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 34 ++ Documentation/networking/index.rst | 1 + drivers/net/Kconfig | 13 + drivers/net/Makefile | 1 + drivers/net/bareudp.c | 743 +++++++++++++++++++++++++++++++++++ include/net/bareudp.h | 19 + include/net/ipv6.h | 6 + include/net/route.h | 6 + include/uapi/linux/if_link.h | 11 + net/ipv4/route.c | 48 +++ net/ipv6/ip6_output.c | 70 ++++ 11 files changed, 952 insertions(+) create mode 100644 Documentation/networking/bareudp.rst create mode 100644 drivers/net/bareudp.c create mode 100644 include/net/bareudp.h (limited to 'include/uapi/linux') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst new file mode 100644 index 000000000000..6ee68c16cf5b --- /dev/null +++ b/Documentation/networking/bareudp.rst @@ -0,0 +1,34 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================== +Bare UDP Tunnelling Module Documentation +======================================== + +There are various L3 encapsulation standards using UDP being discussed to +leverage the UDP based load balancing capability of different networks. +MPLSoUDP (__ https://tools.ietf.org/html/rfc7510) is one among them. + +The Bareudp tunnel module provides a generic L3 encapsulation tunnelling +support for tunnelling different L3 protocols like MPLS, IP, NSH etc. inside +a UDP tunnel. + +Usage +------ + +1) Device creation & deletion + + a) ip link add dev bareudp0 type bareudp dstport 6635 ethertype 0x8847. + + This creates a bareudp tunnel device which tunnels L3 traffic with ethertype + 0x8847 (MPLS traffic). The destination port of the UDP header will be set to + 6635.The device will listen on UDP port 6635 to receive traffic. + + b) ip link delete bareudp0 + +2) Device Usage + +The bareudp device could be used along with OVS or flower filter in TC. +The OVS or TC flower layer must set the tunnel information in SKB dst field before +sending packet buffer to the bareudp device for transmission. On reception the +bareudp device extracts and stores the tunnel information in SKB dst field before +passing the packet buffer to the network stack. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index d07d9855dcd3..3a83cfb66704 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -8,6 +8,7 @@ Contents: netdev-FAQ af_xdp + bareudp batman-adv can can_ucan_protocol diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 25a8f9387d5a..66e410e58c8e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -258,6 +258,19 @@ config GENEVE To compile this driver as a module, choose M here: the module will be called geneve. +config BAREUDP + tristate "Bare UDP Encapsulation" + depends on INET + depends on IPV6 || !IPV6 + select NET_UDP_TUNNEL + select GRO_CELLS + help + This adds a bare UDP tunnel module for tunnelling different + kinds of traffic like MPLS, IP, etc. inside a UDP tunnel. + + To compile this driver as a module, choose M here: the module + will be called bareudp. + config GTP tristate "GPRS Tunneling Protocol datapath (GTP-U)" depends on INET diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 71b88ffc5587..65967246f240 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_VETH) += veth.o obj-$(CONFIG_VIRTIO_NET) += virtio_net.o obj-$(CONFIG_VXLAN) += vxlan.o obj-$(CONFIG_GENEVE) += geneve.o +obj-$(CONFIG_BAREUDP) += bareudp.o obj-$(CONFIG_GTP) += gtp.o obj-$(CONFIG_NLMON) += nlmon.o obj-$(CONFIG_NET_VRF) += vrf.o diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c new file mode 100644 index 000000000000..32518969139e --- /dev/null +++ b/drivers/net/bareudp.c @@ -0,0 +1,743 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Bareudp: UDP tunnel encasulation for different Payload types like + * MPLS, NSH, IP, etc. + * Copyright (c) 2019 Nokia, Inc. + * Authors: Martin Varghese, + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BAREUDP_BASE_HLEN sizeof(struct udphdr) +#define BAREUDP_IPV4_HLEN (sizeof(struct iphdr) + \ + sizeof(struct udphdr)) +#define BAREUDP_IPV6_HLEN (sizeof(struct ipv6hdr) + \ + sizeof(struct udphdr)) + +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); + +/* per-network namespace private data for this module */ + +static unsigned int bareudp_net_id; + +struct bareudp_net { + struct list_head bareudp_list; +}; + +/* Pseudo network device */ +struct bareudp_dev { + struct net *net; /* netns for packet i/o */ + struct net_device *dev; /* netdev for bareudp tunnel */ + __be16 ethertype; + __be16 port; + u16 sport_min; + struct socket __rcu *sock; + struct list_head next; /* bareudp node on namespace list */ + struct gro_cells gro_cells; +}; + +static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct metadata_dst *tun_dst = NULL; + struct pcpu_sw_netstats *stats; + struct bareudp_dev *bareudp; + unsigned short family; + unsigned int len; + __be16 proto; + void *oiph; + int err; + + bareudp = rcu_dereference_sk_user_data(sk); + if (!bareudp) + goto drop; + + if (skb->protocol == htons(ETH_P_IP)) + family = AF_INET; + else + family = AF_INET6; + + proto = bareudp->ethertype; + + if (iptunnel_pull_header(skb, BAREUDP_BASE_HLEN, + proto, + !net_eq(bareudp->net, + dev_net(bareudp->dev)))) { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + + tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); + if (!tun_dst) { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + skb_dst_set(skb, &tun_dst->dst); + skb->dev = bareudp->dev; + oiph = skb_network_header(skb); + skb_reset_network_header(skb); + + if (family == AF_INET) + err = IP_ECN_decapsulate(oiph, skb); +#if IS_ENABLED(CONFIG_IPV6) + else + err = IP6_ECN_decapsulate(oiph, skb); +#endif + + if (unlikely(err)) { + if (log_ecn_error) { + if (family == AF_INET) + net_info_ratelimited("non-ECT from %pI4 " + "with TOS=%#x\n", + &((struct iphdr *)oiph)->saddr, + ((struct iphdr *)oiph)->tos); +#if IS_ENABLED(CONFIG_IPV6) + else + net_info_ratelimited("non-ECT from %pI6\n", + &((struct ipv6hdr *)oiph)->saddr); +#endif + } + if (err > 1) { + ++bareudp->dev->stats.rx_frame_errors; + ++bareudp->dev->stats.rx_errors; + goto drop; + } + } + + len = skb->len; + err = gro_cells_receive(&bareudp->gro_cells, skb); + if (likely(err == NET_RX_SUCCESS)) { + stats = this_cpu_ptr(bareudp->dev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += len; + u64_stats_update_end(&stats->syncp); + } + return 0; +drop: + /* Consume bad packet */ + kfree_skb(skb); + + return 0; +} + +static int bareudp_err_lookup(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static int bareudp_init(struct net_device *dev) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + int err; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + err = gro_cells_init(&bareudp->gro_cells, dev); + if (err) { + free_percpu(dev->tstats); + return err; + } + return 0; +} + +static void bareudp_uninit(struct net_device *dev) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + + gro_cells_destroy(&bareudp->gro_cells); + free_percpu(dev->tstats); +} + +static struct socket *bareudp_create_sock(struct net *net, __be16 port) +{ + struct udp_port_cfg udp_conf; + struct socket *sock; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); +#if IS_ENABLED(CONFIG_IPV6) + udp_conf.family = AF_INET6; +#else + udp_conf.family = AF_INET; +#endif + udp_conf.local_udp_port = port; + /* Open UDP socket */ + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + return ERR_PTR(err); + + return sock; +} + +/* Create new listen socket if needed */ +static int bareudp_socket_create(struct bareudp_dev *bareudp, __be16 port) +{ + struct udp_tunnel_sock_cfg tunnel_cfg; + struct socket *sock; + + sock = bareudp_create_sock(bareudp->net, port); + if (IS_ERR(sock)) + return PTR_ERR(sock); + + /* Mark socket as an encapsulation socket */ + memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); + tunnel_cfg.sk_user_data = bareudp; + tunnel_cfg.encap_type = 1; + tunnel_cfg.encap_rcv = bareudp_udp_encap_recv; + tunnel_cfg.encap_err_lookup = bareudp_err_lookup; + tunnel_cfg.encap_destroy = NULL; + setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg); + + if (sock->sk->sk_family == AF_INET6) + udp_encap_enable(); + + rcu_assign_pointer(bareudp->sock, sock); + return 0; +} + +static int bareudp_open(struct net_device *dev) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + int ret = 0; + + ret = bareudp_socket_create(bareudp, bareudp->port); + return ret; +} + +static void bareudp_sock_release(struct bareudp_dev *bareudp) +{ + struct socket *sock; + + sock = bareudp->sock; + rcu_assign_pointer(bareudp->sock, NULL); + synchronize_net(); + udp_tunnel_sock_release(sock); +} + +static int bareudp_stop(struct net_device *dev) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + + bareudp_sock_release(bareudp); + return 0; +} + +static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, + struct bareudp_dev *bareudp, + const struct ip_tunnel_info *info) +{ + bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); + struct socket *sock = rcu_dereference(bareudp->sock); + bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); + const struct ip_tunnel_key *key = &info->key; + struct rtable *rt; + __be16 sport, df; + int min_headroom; + __u8 tos, ttl; + __be32 saddr; + int err; + + if (!sock) + return -ESHUTDOWN; + + rt = ip_route_output_tunnel(skb, dev, bareudp->net, &saddr, info, + IPPROTO_UDP, use_cache); + + if (IS_ERR(rt)) + return PTR_ERR(rt); + + skb_tunnel_check_pmtu(skb, &rt->dst, + BAREUDP_IPV4_HLEN + info->options_len); + + sport = udp_flow_src_port(bareudp->net, skb, + bareudp->sport_min, USHRT_MAX, + true); + tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); + ttl = key->ttl; + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + skb_scrub_packet(skb, xnet); + + if (!skb_pull(skb, skb_network_offset(skb))) + goto free_dst; + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct iphdr); + + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + goto free_dst; + + err = udp_tunnel_handle_offloads(skb, udp_sum); + if (err) + goto free_dst; + + skb_set_inner_protocol(skb, bareudp->ethertype); + udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst, + tos, ttl, df, sport, bareudp->port, + !net_eq(bareudp->net, dev_net(bareudp->dev)), + !(info->key.tun_flags & TUNNEL_CSUM)); + return 0; + +free_dst: + dst_release(&rt->dst); + return err; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, + struct bareudp_dev *bareudp, + const struct ip_tunnel_info *info) +{ + bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); + struct socket *sock = rcu_dereference(bareudp->sock); + bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); + const struct ip_tunnel_key *key = &info->key; + struct dst_entry *dst = NULL; + struct in6_addr saddr, daddr; + int min_headroom; + __u8 prio, ttl; + __be16 sport; + int err; + + if (!sock) + return -ESHUTDOWN; + + dst = ip6_dst_lookup_tunnel(skb, dev, bareudp->net, sock, &saddr, info, + IPPROTO_UDP, use_cache); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len); + + sport = udp_flow_src_port(bareudp->net, skb, + bareudp->sport_min, USHRT_MAX, + true); + prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); + ttl = key->ttl; + + skb_scrub_packet(skb, xnet); + + if (!skb_pull(skb, skb_network_offset(skb))) + goto free_dst; + + min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct iphdr); + + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + goto free_dst; + + err = udp_tunnel_handle_offloads(skb, udp_sum); + if (err) + goto free_dst; + + daddr = info->key.u.ipv6.dst; + udp_tunnel6_xmit_skb(dst, sock->sk, skb, dev, + &saddr, &daddr, prio, ttl, + info->key.label, sport, bareudp->port, + !(info->key.tun_flags & TUNNEL_CSUM)); + return 0; + +free_dst: + dst_release(dst); + return err; +} +#endif + +static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + struct ip_tunnel_info *info = NULL; + int err; + + if (skb->protocol != bareudp->ethertype) { + err = -EINVAL; + goto tx_error; + } + + info = skb_tunnel_info(skb); + if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { + err = -EINVAL; + goto tx_error; + } + + rcu_read_lock(); +#if IS_ENABLED(CONFIG_IPV6) + if (info->mode & IP_TUNNEL_INFO_IPV6) + err = bareudp6_xmit_skb(skb, dev, bareudp, info); + else +#endif + err = bareudp_xmit_skb(skb, dev, bareudp, info); + + rcu_read_unlock(); + + if (likely(!err)) + return NETDEV_TX_OK; +tx_error: + dev_kfree_skb(skb); + + if (err == -ELOOP) + dev->stats.collisions++; + else if (err == -ENETUNREACH) + dev->stats.tx_carrier_errors++; + + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} + +static int bareudp_fill_metadata_dst(struct net_device *dev, + struct sk_buff *skb) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct bareudp_dev *bareudp = netdev_priv(dev); + bool use_cache; + + use_cache = ip_tunnel_dst_cache_usable(skb, info); + + if (ip_tunnel_info_af(info) == AF_INET) { + struct rtable *rt; + __be32 saddr; + + rt = ip_route_output_tunnel(skb, dev, bareudp->net, &saddr, + info, IPPROTO_UDP, use_cache); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + info->key.u.ipv4.src = saddr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (ip_tunnel_info_af(info) == AF_INET6) { + struct dst_entry *dst; + struct in6_addr saddr; + struct socket *sock = rcu_dereference(bareudp->sock); + + dst = ip6_dst_lookup_tunnel(skb, dev, bareudp->net, sock, + &saddr, info, IPPROTO_UDP, + use_cache); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + dst_release(dst); + info->key.u.ipv6.src = saddr; +#endif + } else { + return -EINVAL; + } + + info->key.tp_src = udp_flow_src_port(bareudp->net, skb, + bareudp->sport_min, + USHRT_MAX, true); + info->key.tp_dst = bareudp->port; + return 0; +} + +static const struct net_device_ops bareudp_netdev_ops = { + .ndo_init = bareudp_init, + .ndo_uninit = bareudp_uninit, + .ndo_open = bareudp_open, + .ndo_stop = bareudp_stop, + .ndo_start_xmit = bareudp_xmit, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_fill_metadata_dst = bareudp_fill_metadata_dst, +}; + +static const struct nla_policy bareudp_policy[IFLA_BAREUDP_MAX + 1] = { + [IFLA_BAREUDP_PORT] = { .type = NLA_U16 }, + [IFLA_BAREUDP_ETHERTYPE] = { .type = NLA_U16 }, + [IFLA_BAREUDP_SRCPORT_MIN] = { .type = NLA_U16 }, +}; + +/* Info for udev, that this is a virtual tunnel endpoint */ +static struct device_type bareudp_type = { + .name = "bareudp", +}; + +/* Initialize the device structure. */ +static void bareudp_setup(struct net_device *dev) +{ + dev->netdev_ops = &bareudp_netdev_ops; + dev->needs_free_netdev = true; + SET_NETDEV_DEVTYPE(dev, &bareudp_type); + dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; + dev->features |= NETIF_F_RXCSUM; + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->mtu = ETH_DATA_LEN; + dev->min_mtu = IPV4_MIN_MTU; + dev->max_mtu = IP_MAX_MTU - BAREUDP_BASE_HLEN; + dev->type = ARPHRD_NONE; + netif_keep_dst(dev); + dev->priv_flags |= IFF_NO_QUEUE; + dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; +} + +static int bareudp_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + if (!data) { + NL_SET_ERR_MSG(extack, + "Not enough attributes provided to perform the operation"); + return -EINVAL; + } + return 0; +} + +static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf) +{ + if (!data[IFLA_BAREUDP_PORT] || !data[IFLA_BAREUDP_ETHERTYPE]) + return -EINVAL; + + if (data[IFLA_BAREUDP_PORT]) + conf->port = nla_get_u16(data[IFLA_BAREUDP_PORT]); + + if (data[IFLA_BAREUDP_ETHERTYPE]) + conf->ethertype = nla_get_u16(data[IFLA_BAREUDP_ETHERTYPE]); + + if (data[IFLA_BAREUDP_SRCPORT_MIN]) + conf->sport_min = nla_get_u16(data[IFLA_BAREUDP_SRCPORT_MIN]); + + return 0; +} + +static struct bareudp_dev *bareudp_find_dev(struct bareudp_net *bn, + const struct bareudp_conf *conf) +{ + struct bareudp_dev *bareudp, *t = NULL; + + list_for_each_entry(bareudp, &bn->bareudp_list, next) { + if (conf->port == bareudp->port) + t = bareudp; + } + return t; +} + +static int bareudp_configure(struct net *net, struct net_device *dev, + struct bareudp_conf *conf) +{ + struct bareudp_net *bn = net_generic(net, bareudp_net_id); + struct bareudp_dev *t, *bareudp = netdev_priv(dev); + int err; + + bareudp->net = net; + bareudp->dev = dev; + t = bareudp_find_dev(bn, conf); + if (t) + return -EBUSY; + + bareudp->port = conf->port; + bareudp->ethertype = conf->ethertype; + bareudp->sport_min = conf->sport_min; + err = register_netdevice(dev); + if (err) + return err; + + list_add(&bareudp->next, &bn->bareudp_list); + return 0; +} + +static int bareudp_link_config(struct net_device *dev, + struct nlattr *tb[]) +{ + int err; + + if (tb[IFLA_MTU]) { + err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + if (err) + return err; + } + return 0; +} + +static int bareudp_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct bareudp_conf conf; + int err; + + err = bareudp2info(data, &conf); + if (err) + return err; + + err = bareudp_configure(net, dev, &conf); + if (err) + return err; + + err = bareudp_link_config(dev, tb); + if (err) + return err; + + return 0; +} + +static void bareudp_dellink(struct net_device *dev, struct list_head *head) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + + list_del(&bareudp->next); + unregister_netdevice_queue(dev, head); +} + +static size_t bareudp_get_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_PORT */ + nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ + nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ + 0; +} + +static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + + if (nla_put_be16(skb, IFLA_BAREUDP_PORT, bareudp->port)) + goto nla_put_failure; + if (nla_put_be16(skb, IFLA_BAREUDP_ETHERTYPE, bareudp->ethertype)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_BAREUDP_SRCPORT_MIN, bareudp->sport_min)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static struct rtnl_link_ops bareudp_link_ops __read_mostly = { + .kind = "bareudp", + .maxtype = IFLA_BAREUDP_MAX, + .policy = bareudp_policy, + .priv_size = sizeof(struct bareudp_dev), + .setup = bareudp_setup, + .validate = bareudp_validate, + .newlink = bareudp_newlink, + .dellink = bareudp_dellink, + .get_size = bareudp_get_size, + .fill_info = bareudp_fill_info, +}; + +struct net_device *bareudp_dev_create(struct net *net, const char *name, + u8 name_assign_type, + struct bareudp_conf *conf) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + LIST_HEAD(list_kill); + int err; + + memset(tb, 0, sizeof(tb)); + dev = rtnl_create_link(net, name, name_assign_type, + &bareudp_link_ops, tb, NULL); + if (IS_ERR(dev)) + return dev; + + err = bareudp_configure(net, dev, conf); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + err = dev_set_mtu(dev, IP_MAX_MTU - BAREUDP_BASE_HLEN); + if (err) + goto err; + + err = rtnl_configure_link(dev, NULL); + if (err < 0) + goto err; + + return dev; +err: + bareudp_dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bareudp_dev_create); + +static __net_init int bareudp_init_net(struct net *net) +{ + struct bareudp_net *bn = net_generic(net, bareudp_net_id); + + INIT_LIST_HEAD(&bn->bareudp_list); + return 0; +} + +static void bareudp_destroy_tunnels(struct net *net, struct list_head *head) +{ + struct bareudp_net *bn = net_generic(net, bareudp_net_id); + struct bareudp_dev *bareudp, *next; + + list_for_each_entry_safe(bareudp, next, &bn->bareudp_list, next) + unregister_netdevice_queue(bareudp->dev, head); +} + +static void __net_exit bareudp_exit_batch_net(struct list_head *net_list) +{ + struct net *net; + LIST_HEAD(list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) + bareudp_destroy_tunnels(net, &list); + + /* unregister the devices gathered above */ + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations bareudp_net_ops = { + .init = bareudp_init_net, + .exit_batch = bareudp_exit_batch_net, + .id = &bareudp_net_id, + .size = sizeof(struct bareudp_net), +}; + +static int __init bareudp_init_module(void) +{ + int rc; + + rc = register_pernet_subsys(&bareudp_net_ops); + if (rc) + goto out1; + + rc = rtnl_link_register(&bareudp_link_ops); + if (rc) + goto out2; + + return 0; +out2: + unregister_pernet_subsys(&bareudp_net_ops); +out1: + return rc; +} +late_initcall(bareudp_init_module); + +static void __exit bareudp_cleanup_module(void) +{ + rtnl_link_unregister(&bareudp_link_ops); + unregister_pernet_subsys(&bareudp_net_ops); +} +module_exit(bareudp_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Varghese "); +MODULE_DESCRIPTION("Interface driver for UDP encapsulated traffic"); diff --git a/include/net/bareudp.h b/include/net/bareudp.h new file mode 100644 index 000000000000..513fae6f6ba1 --- /dev/null +++ b/include/net/bareudp.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NET_BAREUDP_H +#define __NET_BAREUDP_H + +#include +#include + +struct bareudp_conf { + __be16 ethertype; + __be16 port; + u16 sport_min; +}; + +struct net_device *bareudp_dev_create(struct net *net, const char *name, + u8 name_assign_type, + struct bareudp_conf *info); + +#endif diff --git a/include/net/ipv6.h b/include/net/ipv6.h index cec1a54401f2..1bf8065fe871 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1027,6 +1027,12 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, st struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); +struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, + struct net_device *dev, + struct net *net, struct socket *sock, + struct in6_addr *saddr, + const struct ip_tunnel_info *info, + u8 protocol, bool use_cache); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); diff --git a/include/net/route.h b/include/net/route.h index a9c60fc68e36..81750ae50833 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -128,6 +128,12 @@ static inline struct rtable *__ip_route_output_key(struct net *net, struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, const struct sock *sk); +struct rtable *ip_route_output_tunnel(struct sk_buff *skb, + struct net_device *dev, + struct net *net, __be32 *saddr, + const struct ip_tunnel_info *info, + u8 protocol, bool use_cache); + struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 024af2d1d0af..fb4b33af23d5 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -590,6 +590,17 @@ enum ifla_geneve_df { GENEVE_DF_MAX = __GENEVE_DF_END - 1, }; +/* Bareudp section */ +enum { + IFLA_BAREUDP_UNSPEC, + IFLA_BAREUDP_PORT, + IFLA_BAREUDP_ETHERTYPE, + IFLA_BAREUDP_SRCPORT_MIN, + __IFLA_BAREUDP_MAX +}; + +#define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1) + /* PPP section */ enum { IFLA_PPP_UNSPEC, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ebe7060d0fc9..042599cc691d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2774,6 +2774,54 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); +struct rtable *ip_route_output_tunnel(struct sk_buff *skb, + struct net_device *dev, + struct net *net, __be32 *saddr, + const struct ip_tunnel_info *info, + u8 protocol, bool use_cache) +{ +#ifdef CONFIG_DST_CACHE + struct dst_cache *dst_cache; +#endif + struct rtable *rt = NULL; + struct flowi4 fl4; + __u8 tos; + +#ifdef CONFIG_DST_CACHE + dst_cache = (struct dst_cache *)&info->dst_cache; + if (use_cache) { + rt = dst_cache_get_ip4(dst_cache, saddr); + if (rt) + return rt; + } +#endif + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_mark = skb->mark; + fl4.flowi4_proto = protocol; + fl4.daddr = info->key.u.ipv4.dst; + fl4.saddr = info->key.u.ipv4.src; + tos = info->key.tos; + fl4.flowi4_tos = RT_TOS(tos); + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); + return ERR_PTR(-ENETUNREACH); + } + if (rt->dst.dev == dev) { /* is this necessary? */ + netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); + ip_rt_put(rt); + return ERR_PTR(-ELOOP); + } +#ifdef CONFIG_DST_CACHE + if (use_cache) + dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); +#endif + *saddr = fl4.saddr; + return rt; +} +EXPORT_SYMBOL_GPL(ip_route_output_tunnel); + /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, struct flowi4 *fl4, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 087304427bbb..8a8c2d0cfcc8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -54,6 +54,7 @@ #include #include #include +#include static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -1196,6 +1197,75 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); +/** + * ip6_dst_lookup_tunnel - perform route lookup on tunnel + * @skb: Packet for which lookup is done + * @dev: Tunnel device + * @net: Network namespace of tunnel device + * @sk: Socket which provides route info + * @saddr: Memory to store the src ip address + * @info: Tunnel information + * @protocol: IP protocol + * @use_cahce: Flag to enable cache usage + * This function performs a route lookup on a tunnel + * + * It returns a valid dst pointer and stores src address to be used in + * tunnel in param saddr on success, else a pointer encoded error code. + */ + +struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, + struct net_device *dev, + struct net *net, + struct socket *sock, + struct in6_addr *saddr, + const struct ip_tunnel_info *info, + u8 protocol, + bool use_cache) +{ + struct dst_entry *dst = NULL; +#ifdef CONFIG_DST_CACHE + struct dst_cache *dst_cache; +#endif + struct flowi6 fl6; + __u8 prio; + +#ifdef CONFIG_DST_CACHE + dst_cache = (struct dst_cache *)&info->dst_cache; + if (use_cache) { + dst = dst_cache_get_ip6(dst_cache, saddr); + if (dst) + return dst; + } +#endif + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = protocol; + fl6.daddr = info->key.u.ipv6.dst; + fl6.saddr = info->key.u.ipv6.src; + prio = info->key.tos; + fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio), + info->key.label); + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, + NULL); + if (IS_ERR(dst)) { + netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); + return ERR_PTR(-ENETUNREACH); + } + if (dst->dev == dev) { /* is this necessary? */ + netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); + dst_release(dst); + return ERR_PTR(-ELOOP); + } +#ifdef CONFIG_DST_CACHE + if (use_cache) + dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); +#endif + *saddr = fl6.saddr; + return dst; +} +EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel); + static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, gfp_t gfp) { -- cgit v1.2.3 From 4b5f67232d956a5f837082578ba682410ccab498 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Mon, 24 Feb 2020 10:58:35 +0530 Subject: net: Special handling for IP & MPLS. Special handling is needed in bareudp module for IP & MPLS as they support more than one ethertypes. MPLS has 2 ethertypes. 0x8847 for MPLS unicast and 0x8848 for MPLS multicast. While decapsulating MPLS packet from UDP packet the tunnel destination IP address is checked to determine the ethertype. The ethertype of the packet will be set to 0x8848 if the tunnel destination IP address is a multicast IP address. The ethertype of the packet will be set to 0x8847 if the tunnel destination IP address is a unicast IP address. IP has 2 ethertypes.0x0800 for IPV4 and 0x86dd for IPv6. The version field of the IP header tunnelled will be checked to determine the ethertype. This special handling to tunnel additional ethertypes will be disabled by default and can be enabled using a flag called multiproto. This flag can be used only with ethertypes 0x8847 and 0x0800. Signed-off-by: Martin Varghese Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 20 ++++++++++- drivers/net/bareudp.c | 67 ++++++++++++++++++++++++++++++++++-- include/net/bareudp.h | 1 + include/uapi/linux/if_link.h | 1 + 4 files changed, 85 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst index 6ee68c16cf5b..465a8b251bfe 100644 --- a/Documentation/networking/bareudp.rst +++ b/Documentation/networking/bareudp.rst @@ -12,6 +12,15 @@ The Bareudp tunnel module provides a generic L3 encapsulation tunnelling support for tunnelling different L3 protocols like MPLS, IP, NSH etc. inside a UDP tunnel. +Special Handling +---------------- +The bareudp device supports special handling for MPLS & IP as they can have +multiple ethertypes. +MPLS procotcol can have ethertypes ETH_P_MPLS_UC (unicast) & ETH_P_MPLS_MC (multicast). +IP protocol can have ethertypes ETH_P_IP (v4) & ETH_P_IPV6 (v6). +This special handling can be enabled only for ethertypes ETH_P_IP & ETH_P_MPLS_UC +with a flag called multiproto mode. + Usage ------ @@ -25,7 +34,16 @@ Usage b) ip link delete bareudp0 -2) Device Usage +2) Device creation with multiple proto mode enabled + +There are two ways to create a bareudp device for MPLS & IP with multiproto mode +enabled. + + a) ip link add dev bareudp0 type bareudp dstport 6635 ethertype 0x8847 multiproto + + b) ip link add dev bareudp0 type bareudp dstport 6635 ethertype mpls + +3) Device Usage The bareudp device could be used along with OVS or flower filter in TC. The OVS or TC flower layer must set the tunnel information in SKB dst field before diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 32518969139e..77e72477499d 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -45,6 +45,7 @@ struct bareudp_dev { __be16 ethertype; __be16 port; u16 sport_min; + bool multi_proto_mode; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; @@ -70,7 +71,52 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) else family = AF_INET6; - proto = bareudp->ethertype; + if (bareudp->ethertype == htons(ETH_P_IP)) { + struct iphdr *iphdr; + + iphdr = (struct iphdr *)(skb->data + BAREUDP_BASE_HLEN); + if (iphdr->version == 4) { + proto = bareudp->ethertype; + } else if (bareudp->multi_proto_mode && (iphdr->version == 6)) { + proto = htons(ETH_P_IPV6); + } else { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { + struct iphdr *tunnel_hdr; + + tunnel_hdr = (struct iphdr *)skb_network_header(skb); + if (tunnel_hdr->version == 4) { + if (!ipv4_is_multicast(tunnel_hdr->daddr)) { + proto = bareudp->ethertype; + } else if (bareudp->multi_proto_mode && + ipv4_is_multicast(tunnel_hdr->daddr)) { + proto = htons(ETH_P_MPLS_MC); + } else { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + } else { + int addr_type; + struct ipv6hdr *tunnel_hdr_v6; + + tunnel_hdr_v6 = (struct ipv6hdr *)skb_network_header(skb); + addr_type = + ipv6_addr_type((struct in6_addr *)&tunnel_hdr_v6->daddr); + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + proto = bareudp->ethertype; + } else if (bareudp->multi_proto_mode && + (addr_type & IPV6_ADDR_MULTICAST)) { + proto = htons(ETH_P_MPLS_MC); + } else { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + } + } else { + proto = bareudp->ethertype; + } if (iptunnel_pull_header(skb, BAREUDP_BASE_HLEN, proto, @@ -369,8 +415,12 @@ static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) int err; if (skb->protocol != bareudp->ethertype) { - err = -EINVAL; - goto tx_error; + if (!bareudp->multi_proto_mode || + (skb->protocol != htons(ETH_P_MPLS_MC) && + skb->protocol != htons(ETH_P_IPV6))) { + err = -EINVAL; + goto tx_error; + } } info = skb_tunnel_info(skb); @@ -463,6 +513,7 @@ static const struct nla_policy bareudp_policy[IFLA_BAREUDP_MAX + 1] = { [IFLA_BAREUDP_PORT] = { .type = NLA_U16 }, [IFLA_BAREUDP_ETHERTYPE] = { .type = NLA_U16 }, [IFLA_BAREUDP_SRCPORT_MIN] = { .type = NLA_U16 }, + [IFLA_BAREUDP_MULTIPROTO_MODE] = { .type = NLA_FLAG }, }; /* Info for udev, that this is a virtual tunnel endpoint */ @@ -545,9 +596,15 @@ static int bareudp_configure(struct net *net, struct net_device *dev, if (t) return -EBUSY; + if (conf->multi_proto_mode && + (conf->ethertype != htons(ETH_P_MPLS_UC) && + conf->ethertype != htons(ETH_P_IP))) + return -EINVAL; + bareudp->port = conf->port; bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; + bareudp->multi_proto_mode = conf->multi_proto_mode; err = register_netdevice(dev); if (err) return err; @@ -604,6 +661,7 @@ static size_t bareudp_get_size(const struct net_device *dev) return nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_PORT */ nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ + nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ 0; } @@ -617,6 +675,9 @@ static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BAREUDP_SRCPORT_MIN, bareudp->sport_min)) goto nla_put_failure; + if (bareudp->multi_proto_mode && + nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) + goto nla_put_failure; return 0; diff --git a/include/net/bareudp.h b/include/net/bareudp.h index 513fae6f6ba1..cb03f6f15956 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -10,6 +10,7 @@ struct bareudp_conf { __be16 ethertype; __be16 port; u16 sport_min; + bool multi_proto_mode; }; struct net_device *bareudp_dev_create(struct net *net, const char *name, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index fb4b33af23d5..61e0801c82df 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -596,6 +596,7 @@ enum { IFLA_BAREUDP_PORT, IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, + IFLA_BAREUDP_MULTIPROTO_MODE, __IFLA_BAREUDP_MAX }; -- cgit v1.2.3 From 85b0589ede83d7b4aeb2bc3cb8910183876cd5ee Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 25 Feb 2020 11:45:19 +0100 Subject: devlink: add trap metadata type for cookie Allow driver to indicate cookie metadata for registered traps. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 3 +++ 3 files changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 07923e619206..014a8b3d1499 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -541,6 +541,7 @@ struct devlink_trap_group { }; #define DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT BIT(0) +#define DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE BIT(1) /** * struct devlink_trap - Immutable packet trap attributes. diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ae37fd4d194a..be2a2948f452 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -252,6 +252,8 @@ enum devlink_trap_type { enum { /* Trap can report input port as metadata */ DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT, + /* Trap can report flow action cookie as metadata */ + DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE, }; enum devlink_attr { diff --git a/net/core/devlink.c b/net/core/devlink.c index 0d7c5d3443d2..12e6ef749b8a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5540,6 +5540,9 @@ static int devlink_trap_metadata_put(struct sk_buff *msg, if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) goto nla_put_failure; + if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) + goto nla_put_failure; nla_nest_end(msg, attr); -- cgit v1.2.3 From 742b8cceaabc3ed09d4c4d527d9c6311c4925545 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 25 Feb 2020 11:45:20 +0100 Subject: drop_monitor: extend by passing cookie from driver If driver passed along the cookie, push it through Netlink. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/drop_monitor.h | 3 +++ include/uapi/linux/net_dropmon.h | 1 + net/core/drop_monitor.c | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/drop_monitor.h b/include/net/drop_monitor.h index 2ab668461463..ddd441a60e03 100644 --- a/include/net/drop_monitor.h +++ b/include/net/drop_monitor.h @@ -6,17 +6,20 @@ #include #include #include +#include /** * struct net_dm_hw_metadata - Hardware-supplied packet metadata. * @trap_group_name: Hardware trap group name. * @trap_name: Hardware trap name. * @input_dev: Input netdevice. + * @fa_cookie: Flow action user cookie. */ struct net_dm_hw_metadata { const char *trap_group_name; const char *trap_name; struct net_device *input_dev; + const struct flow_action_cookie *fa_cookie; }; #if IS_ENABLED(CONFIG_NET_DROP_MONITOR) diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 8bf79a9eb234..66048cc5d7b3 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -92,6 +92,7 @@ enum net_dm_attr { NET_DM_ATTR_HW_TRAP_COUNT, /* u32 */ NET_DM_ATTR_SW_DROPS, /* flag */ NET_DM_ATTR_HW_DROPS, /* flag */ + NET_DM_ATTR_FLOW_ACTION_COOKIE, /* binary */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 31700e0c3928..d58c1c45a895 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -700,6 +701,13 @@ static void net_dm_packet_work(struct work_struct *work) net_dm_packet_report(skb); } +static size_t +net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata) +{ + return hw_metadata->fa_cookie ? + nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; +} + static size_t net_dm_hw_packet_report_size(size_t payload_len, const struct net_dm_hw_metadata *hw_metadata) @@ -717,6 +725,8 @@ net_dm_hw_packet_report_size(size_t payload_len, nla_total_size(strlen(hw_metadata->trap_name) + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ + net_dm_flow_action_cookie_size(hw_metadata) + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ @@ -762,6 +772,12 @@ static int net_dm_hw_packet_report_fill(struct sk_buff *msg, goto nla_put_failure; } + if (hw_metadata->fa_cookie && + nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, + hw_metadata->fa_cookie->cookie_len, + hw_metadata->fa_cookie->cookie)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; @@ -794,11 +810,12 @@ nla_put_failure: static struct net_dm_hw_metadata * net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata) { + const struct flow_action_cookie *fa_cookie; struct net_dm_hw_metadata *n_hw_metadata; const char *trap_group_name; const char *trap_name; - n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC); + n_hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); if (!n_hw_metadata) return NULL; @@ -812,12 +829,25 @@ net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata) goto free_trap_group; n_hw_metadata->trap_name = trap_name; + if (hw_metadata->fa_cookie) { + size_t cookie_size = sizeof(*fa_cookie) + + hw_metadata->fa_cookie->cookie_len; + + fa_cookie = kmemdup(hw_metadata->fa_cookie, cookie_size, + GFP_ATOMIC); + if (!fa_cookie) + goto free_trap_name; + n_hw_metadata->fa_cookie = fa_cookie; + } + n_hw_metadata->input_dev = hw_metadata->input_dev; if (n_hw_metadata->input_dev) dev_hold(n_hw_metadata->input_dev); return n_hw_metadata; +free_trap_name: + kfree(trap_name); free_trap_group: kfree(trap_group_name); free_hw_metadata: @@ -830,6 +860,7 @@ net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata) { if (hw_metadata->input_dev) dev_put(hw_metadata->input_dev); + kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); kfree(hw_metadata); -- cgit v1.2.3 From 29b40f105ec8d555984c1f72dc9133b122e51903 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Mon, 30 Sep 2019 04:19:18 -0400 Subject: KVM: s390: protvirt: Add initial vm and cpu lifecycle handling This contains 3 main changes: 1. changes in SIE control block handling for secure guests 2. helper functions for create/destroy/unpack secure guests 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure machines Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck [borntraeger@de.ibm.com: patch merging, splitting, fixing] Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 24 +++- arch/s390/include/asm/uv.h | 69 ++++++++++ arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/kvm-s390.c | 214 ++++++++++++++++++++++++++++++- arch/s390/kvm/kvm-s390.h | 33 +++++ arch/s390/kvm/pv.c | 266 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 31 +++++ 7 files changed, 635 insertions(+), 4 deletions(-) create mode 100644 arch/s390/kvm/pv.c (limited to 'include/uapi/linux') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d058289385a5..1aa2382fe363 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -160,7 +160,13 @@ struct kvm_s390_sie_block { __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ - __u8 reserved10[16]; /* 0x0010 */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; #define PROG_BLOCK_SIE (1<<0) #define PROG_REQUEST (1<<1) atomic_t prog20; /* 0x0020 */ @@ -233,7 +239,7 @@ struct kvm_s390_sie_block { #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ - __u8 reserved68; /* 0x0068 */ + __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ __u8 reserved6a[2]; /* 0x006a */ __u32 todpr; /* 0x006c */ @@ -645,6 +651,11 @@ struct kvm_guestdbg_info_arch { unsigned long last_bp; }; +struct kvm_s390_pv_vcpu { + u64 handle; + unsigned long stor_base; +}; + struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; /* if vsie is active, currently executed shadow sie control block */ @@ -673,6 +684,7 @@ struct kvm_vcpu_arch { __u64 cputm_start; bool gs_enabled; bool skey_enabled; + struct kvm_s390_pv_vcpu pv; }; struct kvm_vm_stat { @@ -843,6 +855,13 @@ struct kvm_s390_gisa_interrupt { DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); }; +struct kvm_s390_pv { + u64 handle; + u64 guest_len; + unsigned long stor_base; + void *stor_var; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -878,6 +897,7 @@ struct kvm_arch{ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index d7aa91c89f6c..91ef26983bfd 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -23,11 +23,19 @@ #define UVC_RC_INV_STATE 0x0003 #define UVC_RC_INV_LEN 0x0005 #define UVC_RC_NO_RESUME 0x0007 +#define UVC_RC_NEED_DESTROY 0x8000 #define UVC_CMD_QUI 0x0001 #define UVC_CMD_INIT_UV 0x000f +#define UVC_CMD_CREATE_SEC_CONF 0x0100 +#define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_CREATE_SEC_CPU 0x0120 +#define UVC_CMD_DESTROY_SEC_CPU 0x0121 #define UVC_CMD_CONV_TO_SEC_STOR 0x0200 #define UVC_CMD_CONV_FROM_SEC_STOR 0x0201 +#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 +#define UVC_CMD_UNPACK_IMG 0x0301 +#define UVC_CMD_VERIFY_IMG 0x0302 #define UVC_CMD_PIN_PAGE_SHARED 0x0341 #define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 @@ -37,10 +45,17 @@ enum uv_cmds_inst { BIT_UVC_CMD_QUI = 0, BIT_UVC_CMD_INIT_UV = 1, + BIT_UVC_CMD_CREATE_SEC_CONF = 2, + BIT_UVC_CMD_DESTROY_SEC_CONF = 3, + BIT_UVC_CMD_CREATE_SEC_CPU = 4, + BIT_UVC_CMD_DESTROY_SEC_CPU = 5, BIT_UVC_CMD_CONV_TO_SEC_STOR = 6, BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7, BIT_UVC_CMD_SET_SHARED_ACCESS = 8, BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, + BIT_UVC_CMD_SET_SEC_PARMS = 11, + BIT_UVC_CMD_UNPACK_IMG = 13, + BIT_UVC_CMD_VERIFY_IMG = 14, BIT_UVC_CMD_PIN_PAGE_SHARED = 21, BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, }; @@ -52,6 +67,7 @@ struct uv_cb_header { u16 rrc; /* Return Reason Code */ } __packed __aligned(8); +/* Query Ultravisor Information */ struct uv_cb_qui { struct uv_cb_header header; u64 reserved08; @@ -71,6 +87,7 @@ struct uv_cb_qui { u8 reserveda0[200 - 160]; } __packed __aligned(8); +/* Initialize Ultravisor */ struct uv_cb_init { struct uv_cb_header header; u64 reserved08[2]; @@ -79,6 +96,35 @@ struct uv_cb_init { u64 reserved28[4]; } __packed __aligned(8); +/* Create Guest Configuration */ +struct uv_cb_cgc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 conf_base_stor_origin; + u64 conf_virt_stor_origin; + u64 reserved30; + u64 guest_stor_origin; + u64 guest_stor_len; + u64 guest_sca; + u64 guest_asce; + u64 reserved58[5]; +} __packed __aligned(8); + +/* Create Secure CPU */ +struct uv_cb_csc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 guest_handle; + u64 stor_origin; + u8 reserved30[6]; + u16 num; + u64 state_origin; + u64 reserved40[4]; +} __packed __aligned(8); + +/* Convert to Secure */ struct uv_cb_cts { struct uv_cb_header header; u64 reserved08[2]; @@ -86,12 +132,34 @@ struct uv_cb_cts { u64 gaddr; } __packed __aligned(8); +/* Convert from Secure / Pin Page Shared */ struct uv_cb_cfs { struct uv_cb_header header; u64 reserved08[2]; u64 paddr; } __packed __aligned(8); +/* Set Secure Config Parameter */ +struct uv_cb_ssc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 sec_header_origin; + u32 sec_header_len; + u32 reserved2c; + u64 reserved30[4]; +} __packed __aligned(8); + +/* Unpack */ +struct uv_cb_unp { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; + u64 tweak[2]; + u64 reserved38[3]; +} __packed __aligned(8); + /* * A common UV call struct for calls that take no payload * Examples: @@ -105,6 +173,7 @@ struct uv_cb_nodata { u64 reserved20[4]; } __packed __aligned(8); +/* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; u64 reserved08[3]; diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 05ee90a5ea08..12decca22e7c 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -9,6 +9,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o vsie.o +kvm-objs += diag.o gaccess.o guestdbg.o vsie.o pv.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7e4a982bfea3..87258bebb955 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" @@ -234,8 +235,10 @@ int kvm_arch_check_processor_compat(void) return 0; } +/* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); +static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -2165,6 +2168,160 @@ out: return r; } +static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) +{ + struct kvm_vcpu *vcpu; + u16 rc, rrc; + int ret = 0; + int i; + + /* + * We ignore failures and try to destroy as many CPUs as possible. + * At the same time we must not free the assigned resources when + * this fails, as the ultravisor has still access to that memory. + * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak + * behind. + * We want to return the first failure rc and rrc, though. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) { + *rcp = rc; + *rrcp = rrc; + ret = -EIO; + } + mutex_unlock(&vcpu->mutex); + } + return ret; +} + +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int i, r = 0; + u16 dummy; + + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + r = kvm_s390_pv_create_cpu(vcpu, rc, rrc); + mutex_unlock(&vcpu->mutex); + if (r) + break; + } + if (r) + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + return r; +} + +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) +{ + int r = 0; + u16 dummy; + void __user *argp = (void __user *)cmd->data; + + switch (cmd->cmd) { + case KVM_PV_ENABLE: { + r = -EINVAL; + if (kvm_s390_pv_is_protected(kvm)) + break; + + /* + * FMT 4 SIE needs esca. As we never switch back to bsca from + * esca, we need no cleanup in the error cases below + */ + r = sca_switch_to_extended(kvm); + if (r) + break; + + r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc); + if (r) + break; + + r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc); + if (r) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + break; + } + case KVM_PV_DISABLE: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_SET_SEC_PARMS: { + struct kvm_s390_pv_sec_parm parms = {}; + void *hdr; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&parms, argp, sizeof(parms))) + break; + + /* Currently restricted to 8KB */ + r = -EINVAL; + if (parms.length > PAGE_SIZE * 2) + break; + + r = -ENOMEM; + hdr = vmalloc(parms.length); + if (!hdr) + break; + + r = -EFAULT; + if (!copy_from_user(hdr, (void __user *)parms.origin, + parms.length)) + r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length, + &cmd->rc, &cmd->rrc); + + vfree(hdr); + break; + } + case KVM_PV_UNPACK: { + struct kvm_s390_pv_unp unp = {}; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&unp, argp, sizeof(unp))) + break; + + r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_VERIFY: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc, + cmd->rrc); + break; + } + default: + r = -ENOTTY; + } + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2262,6 +2419,31 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); break; } + case KVM_S390_PV_COMMAND: { + struct kvm_pv_cmd args; + + r = 0; + if (!is_prot_virt_host()) { + r = -EINVAL; + break; + } + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + if (args.flags) { + r = -EINVAL; + break; + } + mutex_lock(&kvm->lock); + r = kvm_s390_handle_pv(kvm, &args); + mutex_unlock(&kvm->lock); + if (copy_to_user(argp, &args, sizeof(args))) { + r = -EFAULT; + break; + } + break; + } default: r = -ENOTTY; } @@ -2525,6 +2707,8 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + u16 rc, rrc; + VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); kvm_s390_clear_local_irqs(vcpu); @@ -2537,6 +2721,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); + /* We can not hold the vcpu mutex here, we are already dying */ + if (kvm_s390_pv_cpu_get_handle(vcpu)) + kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); } @@ -2558,10 +2745,20 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u16 rc, rrc; + kvm_free_vcpus(kvm); sca_dispose(kvm); - debug_unregister(kvm->arch.dbf); kvm_s390_gisa_destroy(kvm); + /* + * We are already at the end of life and kvm->lock is not taken. + * This is ok as the file descriptor is closed by now and nobody + * can mess with the pv state. To avoid lockdep_assert_held from + * complaining we do not use kvm_s390_pv_is_protected. + */ + if (kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) gmap_remove(kvm->arch.gmap); @@ -2657,6 +2854,9 @@ static int sca_switch_to_extended(struct kvm *kvm) unsigned int vcpu_idx; u32 scaol, scaoh; + if (kvm->arch.use_esca) + return 0; + new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO); if (!new_sca) return -ENOMEM; @@ -2908,6 +3108,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; + u16 uvrc, uvrrc; atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | @@ -2975,6 +3176,14 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + mutex_lock(&vcpu->kvm->lock); + if (kvm_s390_pv_is_protected(vcpu->kvm)) { + rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); + if (rc) + kvm_s390_vcpu_unsetup_cmma(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); + return rc; } @@ -4540,6 +4749,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit) return -EINVAL; + /* When we are protected, we should not change the memory slots */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; return 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index be55b4b99bd3..13e6986596ed 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -207,6 +208,38 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +/* implemented in pv.c */ +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc); +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc); + +static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) +{ + return kvm->arch.pv.handle; +} + +static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv.handle; +} + +static inline bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} + +static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c new file mode 100644 index 000000000000..e9e020475f4a --- /dev/null +++ b/arch/s390/kvm/pv.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hosting Protected Virtual Machines + * + * Copyright IBM Corp. 2019, 2020 + * Author(s): Janosch Frank + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" + +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + int cc = 0; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", + *rc, *rrc); + } + /* Intended memory leak for something that should never happen. */ + if (!cc) + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + vcpu->arch.sie_block->pv_handle_cpu = 0; + vcpu->arch.sie_block->pv_handle_config = 0; + memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); + vcpu->arch.sie_block->sdf = 0; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + return cc ? EIO : 0; +} + +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + struct uv_cb_csc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CPU, + .header.len = sizeof(uvcb), + }; + int cc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) + return -EINVAL; + + vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, + get_order(uv_info.guest_cpu_stor_len)); + if (!vcpu->arch.pv.stor_base) + return -ENOMEM; + + /* Input */ + uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); + uvcb.num = vcpu->arch.sie_block->icpua; + uvcb.state_origin = (u64)vcpu->arch.sie_block; + uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x", + vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc, + uvcb.header.rrc); + + if (cc) { + u16 dummy; + + kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy); + return -EIO; + } + + /* Output */ + vcpu->arch.pv.handle = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm); + vcpu->arch.sie_block->sdf = 2; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +/* only free resources when the destroy was successful */ +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) +{ + vfree(kvm->arch.pv.stor_var); + free_pages(kvm->arch.pv.stor_base, + get_order(uv_info.guest_base_stor_len)); + memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv)); +} + +static int kvm_s390_pv_alloc_vm(struct kvm *kvm) +{ + unsigned long base = uv_info.guest_base_stor_len; + unsigned long virt = uv_info.guest_virt_var_stor_len; + unsigned long npages = 0, vlen = 0; + struct kvm_memory_slot *memslot; + + kvm->arch.pv.stor_var = NULL; + kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base)); + if (!kvm->arch.pv.stor_base) + return -ENOMEM; + + /* + * Calculate current guest storage for allocation of the + * variable storage, which is based on the length in MB. + * + * Slots are sorted by GFN + */ + mutex_lock(&kvm->slots_lock); + memslot = kvm_memslots(kvm)->memslots; + npages = memslot->base_gfn + memslot->npages; + mutex_unlock(&kvm->slots_lock); + + kvm->arch.pv.guest_len = npages * PAGE_SIZE; + + /* Allocate variable storage */ + vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); + vlen += uv_info.guest_virt_base_stor_len; + kvm->arch.pv.stor_var = vzalloc(vlen); + if (!kvm->arch.pv.stor_var) + goto out_err; + return 0; + +out_err: + kvm_s390_pv_dealloc_vm(kvm); + return -ENOMEM; +} + +/* this should not fail, but if it does, we must not free the donated memory */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + atomic_set(&kvm->mm->context.is_protected, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); + /* Inteded memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_cgc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CONF, + .header.len = sizeof(uvcb) + }; + int cc, ret; + u16 dummy; + + ret = kvm_s390_pv_alloc_vm(kvm); + if (ret) + return ret; + + /* Inputs */ + uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ + uvcb.guest_stor_len = kvm->arch.pv.guest_len; + uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_sca = (unsigned long)kvm->arch.sca; + uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base; + uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc); + + /* Outputs */ + kvm->arch.pv.handle = uvcb.guest_handle; + + if (cc) { + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + else + kvm_s390_pv_dealloc_vm(kvm); + return -EIO; + } + kvm->arch.gmap->guest_handle = uvcb.guest_handle; + atomic_set(&kvm->mm->context.is_protected, 1); + return 0; +} + +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc) +{ + struct uv_cb_ssc uvcb = { + .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS, + .header.len = sizeof(uvcb), + .sec_header_origin = (u64)hdr, + .sec_header_len = length, + .guest_handle = kvm_s390_pv_get_handle(kvm), + }; + int cc = uv_call(0, (u64)&uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", + *rc, *rrc); + return cc ? -EINVAL : 0; +} + +static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, + u64 offset, u16 *rc, u16 *rrc) +{ + struct uv_cb_unp uvcb = { + .header.cmd = UVC_CMD_UNPACK_IMG, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = addr, + .tweak[0] = tweak, + .tweak[1] = offset, + }; + int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + + if (ret && ret != -EAGAIN) + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", + uvcb.gaddr, *rc, *rrc); + return ret; +} + +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc) +{ + u64 offset = 0; + int ret = 0; + + if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK) + return -EINVAL; + + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", + addr, size); + + while (offset < size) { + ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); + if (ret == -EAGAIN) { + cond_resched(); + if (fatal_signal_pending(current)) + break; + continue; + } + if (ret) + break; + addr += PAGE_SIZE; + offset += PAGE_SIZE; + } + if (!ret) + KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful"); + return ret; +} diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4b95f9a31a2f..ad69817f7792 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1478,6 +1478,37 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) +struct kvm_s390_pv_sec_parm { + __u64 origin; + __u64 length; +}; + +struct kvm_s390_pv_unp { + __u64 addr; + __u64 size; + __u64 tweak; +}; + +enum pv_cmd_id { + KVM_PV_ENABLE, + KVM_PV_DISABLE, + KVM_PV_SET_SEC_PARMS, + KVM_PV_UNPACK, + KVM_PV_VERIFY, +}; + +struct kvm_pv_cmd { + __u32 cmd; /* Command to be executed */ + __u16 rc; /* Ultravisor return code */ + __u16 rrc; /* Ultravisor return reason code */ + __u64 data; /* Data or address */ + __u32 flags; /* flags for future extensions. Must be 0 for now */ + __u32 reserved[3]; +}; + +/* Available with KVM_CAP_S390_PROTECTED */ +#define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- cgit v1.2.3 From 19e1227768863a1469797c13ef8fea1af7beac2c Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 2 Apr 2019 09:21:06 +0200 Subject: KVM: S390: protvirt: Introduce instruction data area bounce buffer Now that we can't access guest memory anymore, we have a dedicated satellite block that's a bounce buffer for instruction data. We re-use the memop interface to copy the instruction data to / from userspace. This lets us re-use a lot of QEMU code which used that interface to make logical guest memory accesses which are not possible anymore in protected mode anyway. Signed-off-by: Janosch Frank Reviewed-by: Thomas Huth Reviewed-by: David Hildenbrand [borntraeger@de.ibm.com: patch merging, splitting, fixing] Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 11 ++++++- arch/s390/kvm/kvm-s390.c | 66 +++++++++++++++++++++++++++++++++++----- arch/s390/kvm/pv.c | 16 ++++++++++ include/uapi/linux/kvm.h | 9 ++++-- 4 files changed, 91 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 4fcbb055a565..aa945b101fff 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -127,6 +127,12 @@ struct mcck_volatile_info { #define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ CR14_EXTERNAL_DAMAGE_SUBMASK) +#define SIDAD_SIZE_MASK 0xff +#define sida_origin(sie_block) \ + ((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -315,7 +321,10 @@ struct kvm_s390_sie_block { #define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ - __u64 gbea; /* 0x0180 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; __u8 reserved188[8]; /* 0x0188 */ __u64 sdnxo; /* 0x0190 */ __u8 reserved198[8]; /* 0x0198 */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bd62312fdc0e..efbbcd2948a3 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4495,12 +4495,40 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + int r = 0; + + if (mop->flags || !mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset < mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) + return -E2BIG; + + switch (mop->op) { + case KVM_S390_MEMOP_SIDA_READ: + if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) + + mop->sida_offset), mop->size)) + r = -EFAULT; + + break; + case KVM_S390_MEMOP_SIDA_WRITE: + if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) + + mop->sida_offset), uaddr, mop->size)) + r = -EFAULT; + break; + } + return r; +} static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void *tmpbuf = NULL; - int r, srcu_idx; + int r = 0; const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION | KVM_S390_MEMOP_F_CHECK_ONLY; @@ -4510,14 +4538,15 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { @@ -4543,12 +4572,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, } r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); break; - default: - r = -EINVAL; } - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); @@ -4556,6 +4581,31 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + int r, srcu_idx; + + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (mop->op) { + case KVM_S390_MEMOP_LOGICAL_READ: + case KVM_S390_MEMOP_LOGICAL_WRITE: + r = kvm_s390_guest_mem_op(vcpu, mop); + break; + case KVM_S390_MEMOP_SIDA_READ: + case KVM_S390_MEMOP_SIDA_WRITE: + /* we are locked against sida going away by the vcpu->mutex */ + r = kvm_s390_guest_sida_op(vcpu, mop); + break; + default: + r = -EINVAL; + } + + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + return r; +} + long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4686,7 +4736,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_s390_mem_op mem_op; if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) - r = kvm_s390_guest_mem_op(vcpu, &mem_op); + r = kvm_s390_guest_memsida_op(vcpu, &mem_op); else r = -EFAULT; break; diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 9840ee49e572..13a41533eacf 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -33,10 +33,18 @@ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) if (!cc) free_pages(vcpu->arch.pv.stor_base, get_order(uv_info.guest_cpu_stor_len)); + + free_page(sida_origin(vcpu->arch.sie_block)); vcpu->arch.sie_block->pv_handle_cpu = 0; vcpu->arch.sie_block->pv_handle_config = 0; memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); vcpu->arch.sie_block->sdf = 0; + /* + * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0). + * Use the reset value of gbea to avoid leaking the kernel pointer of + * the just freed sida. + */ + vcpu->arch.sie_block->gbea = 1; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return cc ? EIO : 0; @@ -64,6 +72,14 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) uvcb.state_origin = (u64)vcpu->arch.sie_block; uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; + /* Alloc Secure Instruction Data Area Designation */ + vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!vcpu->arch.sie_block->sidad) { + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + return -ENOMEM; + } + cc = uv_call(0, (u64)&uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ad69817f7792..f4cac1c09e97 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -474,12 +474,17 @@ struct kvm_s390_mem_op { __u32 size; /* amount of bytes */ __u32 op; /* type of operation */ __u64 buf; /* buffer in userspace */ - __u8 ar; /* the access register number */ - __u8 reserved[31]; /* should be set to 0 */ + union { + __u8 ar; /* the access register number */ + __u32 sida_offset; /* offset into the sida */ + __u8 reserved[32]; /* should be set to 0 */ + }; }; /* types for kvm_s390_mem_op->op */ #define KVM_S390_MEMOP_LOGICAL_READ 0 #define KVM_S390_MEMOP_LOGICAL_WRITE 1 +#define KVM_S390_MEMOP_SIDA_READ 2 +#define KVM_S390_MEMOP_SIDA_WRITE 3 /* flags for kvm_s390_mem_op->flags */ #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) -- cgit v1.2.3 From e0d2773d487c2a41c99d9e256d51cc0a859aa9ab Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 9 May 2019 13:07:21 +0200 Subject: KVM: s390: protvirt: UV calls in support of diag308 0, 1 diag 308 subcode 0 and 1 require several KVM and Ultravisor interactions. Specific to these "soft" reboots are * The "unshare all" UVC * The "prepare for reset" UVC Signed-off-by: Janosch Frank Acked-by: David Hildenbrand Reviewed-by: Cornelia Huck [borntraeger@de.ibm.com: patch merging, splitting, fixing] Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/uv.h | 4 ++++ arch/s390/kvm/kvm-s390.c | 22 ++++++++++++++++++++++ include/uapi/linux/kvm.h | 2 ++ 3 files changed, 28 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 91ef26983bfd..f149c29ddb84 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -36,6 +36,8 @@ #define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 #define UVC_CMD_UNPACK_IMG 0x0301 #define UVC_CMD_VERIFY_IMG 0x0302 +#define UVC_CMD_PREPARE_RESET 0x0320 +#define UVC_CMD_SET_UNSHARE_ALL 0x0340 #define UVC_CMD_PIN_PAGE_SHARED 0x0341 #define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 @@ -56,6 +58,8 @@ enum uv_cmds_inst { BIT_UVC_CMD_SET_SEC_PARMS = 11, BIT_UVC_CMD_UNPACK_IMG = 13, BIT_UVC_CMD_VERIFY_IMG = 14, + BIT_UVC_CMD_PREPARE_RESET = 18, + BIT_UVC_CMD_UNSHARE_ALL = 20, BIT_UVC_CMD_PIN_PAGE_SHARED = 21, BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index abe295077d00..16531b251eab 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2328,6 +2328,28 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) cmd->rrc); break; } + case KVM_PV_PREP_RESET: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_UNSHARE_ALL: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f4cac1c09e97..2c354ba3a610 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1500,6 +1500,8 @@ enum pv_cmd_id { KVM_PV_SET_SEC_PARMS, KVM_PV_UNPACK, KVM_PV_VERIFY, + KVM_PV_PREP_RESET, + KVM_PV_UNSHARE_ALL, }; struct kvm_pv_cmd { -- cgit v1.2.3 From 13da9ae1cdbf1ec4ea36b7612e606681c27cca13 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 18 Feb 2020 15:08:07 -0500 Subject: KVM: s390: protvirt: introduce and enable KVM_CAP_S390_PROTECTED Now that everything is in place, we can announce the feature. Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand --- arch/s390/kvm/kvm-s390.c | 3 +++ include/uapi/linux/kvm.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bb060064cce0..f4cd436ba979 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -574,6 +574,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED: + r = is_prot_virt_host(); + break; default: r = 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2c354ba3a610..2ab168d6d2a8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1015,6 +1015,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_NISV_TO_USER 177 #define KVM_CAP_ARM_INJECT_EXT_DABT 178 #define KVM_CAP_S390_VCPU_RESETS 179 +#define KVM_CAP_S390_PROTECTED 180 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From d7f10df86202273155a9d8f8553bc2ad28e0dd46 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 26 Feb 2020 18:17:44 -0600 Subject: bpf: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200227001744.GA3317@embeddedor --- include/linux/bpf-cgroup.h | 2 +- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 2 +- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/lpm_trie.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a11d5b7dbbf3..a7cd5c7a2509 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -36,7 +36,7 @@ struct bpf_cgroup_storage_map; struct bpf_storage_buffer { struct rcu_head rcu; - char data[0]; + char data[]; }; struct bpf_cgroup_storage { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1acd5bf70350..9aa33b8f3d55 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -859,7 +859,7 @@ struct bpf_prog_array_item { struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog_array_item items[0]; + struct bpf_prog_array_item items[]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 906e9f2752db..8e98ced0963b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..c498f0fffb40 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,7 @@ enum bpf_struct_ops_state { struct bpf_struct_ops_value { BPF_STRUCT_OPS_COMMON_VALUE; - char data[0] ____cacheline_aligned_in_smp; + char data[] ____cacheline_aligned_in_smp; }; struct bpf_struct_ops_map { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 53d9483fee10..d541c8486c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -118,7 +118,7 @@ struct htab_elem { struct bpf_lru_node lru_node; }; u32 hash; - char key[0] __aligned(8); + char key[] __aligned(8); }; static inline bool htab_is_prealloc(const struct bpf_htab *htab) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 3b3c420bc8ed..65c236cf341e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -25,7 +25,7 @@ struct lpm_trie_node { struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; - u8 data[0]; + u8 data[]; }; struct lpm_trie { -- cgit v1.2.3 From 0df6d32842b9a5f97a29ea90c8adc5cfac38341d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:15 -0800 Subject: inet_diag: Move the INET_DIAG_REQ_BYTECODE nlattr to cb->data The INET_DIAG_REQ_BYTECODE nlattr is currently re-found every time when the "dump()" is re-started. In a latter patch, it will also need to parse the new INET_DIAG_REQ_SK_BPF_STORAGES nlattr to learn the map_fds. Thus, this patch takes this chance to store the parsed nlattr in cb->data during the "start" time of a dump. By doing this, the "bc" argument also becomes unnecessary and is removed. Also, the two copies of the INET_DIAG_REQ_BYTECODE parsing-audit logic between compat/current version can be consolidated to one. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230415.1975555-1-kafai@fb.com --- include/linux/inet_diag.h | 11 ++-- include/uapi/linux/inet_diag.h | 3 +- net/dccp/diag.c | 4 +- net/ipv4/inet_diag.c | 117 ++++++++++++++++++++++++----------------- net/ipv4/raw_diag.c | 6 ++- net/ipv4/tcp_diag.c | 4 +- net/ipv4/udp_diag.c | 15 +++--- net/sctp/diag.c | 2 +- 8 files changed, 98 insertions(+), 64 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 6b157ce07d74..1bb94cac265f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -15,8 +15,7 @@ struct netlink_callback; struct inet_diag_handler { void (*dump)(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); int (*dump_one)(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); @@ -39,6 +38,11 @@ struct inet_diag_handler { __u16 idiag_info_size; }; +struct inet_diag_dump_data { + struct nlattr *req_nlas[__INET_DIAG_REQ_MAX]; +#define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE] +}; + struct inet_connection_sock; int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, @@ -46,8 +50,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, u16 nlmsg_flags, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct netlink_callback *cb, const struct inet_diag_req_v2 *req); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index a1ff345b3f33..bab9a9f8da12 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -64,9 +64,10 @@ struct inet_diag_req_raw { enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, + __INET_DIAG_REQ_MAX, }; -#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE +#define INET_DIAG_REQ_MAX (__INET_DIAG_REQ_MAX - 1) /* Bytecode is sequence of 4 byte commands followed by variable arguments. * All the commands identified by "code" are conditional jumps forward: diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 8f1e2a653f6d..8a82c5a2c5a8 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -46,9 +46,9 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); } static int dccp_diag_dump_one(struct netlink_callback *cb, diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index d2ecff3195ba..4bce8a477699 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -495,9 +495,11 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { + struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, + .data = &empty_dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -863,14 +865,17 @@ static void twsk_build_assert(void) void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; int i, num, s_i, s_num; + struct nlattr *bc; struct sock *sk; + bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; @@ -1014,15 +1019,14 @@ out: EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { const struct inet_diag_handler *handler; int err = 0; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) - handler->dump(skb, cb, r, bc); + handler->dump(skb, cb, r); else err = PTR_ERR(handler); inet_diag_unlock_handler(handler); @@ -1032,13 +1036,57 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int hdrlen = sizeof(struct inet_diag_req_v2); - struct nlattr *bc = NULL; + return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh)); +} + +static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct inet_diag_dump_data *cb_data; + struct sk_buff *skb = cb->skb; + struct nlattr *nla; + int rem, err; + + cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); + if (!cb_data) + return -ENOMEM; + + nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), rem) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + cb_data->req_nlas[type] = nla; + } + + nla = cb_data->inet_diag_nla_bc; + if (nla) { + err = inet_diag_bc_audit(nla, skb); + if (err) { + kfree(cb_data); + return err; + } + } + + cb->data = cb_data; + return 0; +} + +static int inet_diag_dump_start(struct netlink_callback *cb) +{ + return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2)); +} + +static int inet_diag_dump_start_compat(struct netlink_callback *cb) +{ + return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req)); +} - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); +static int inet_diag_dump_done(struct netlink_callback *cb) +{ + kfree(cb->data); - return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); + return 0; } static int inet_diag_type2proto(int type) @@ -1057,9 +1105,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); - int hdrlen = sizeof(struct inet_diag_req); struct inet_diag_req_v2 req; - struct nlattr *bc = NULL; req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); @@ -1067,10 +1113,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - - return __inet_diag_dump(skb, cb, &req, bc); + return __inet_diag_dump(skb, cb, &req); } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, @@ -1098,22 +1141,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(nlh, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(nlh, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump_compat, - }; - return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start_compat, + .done = inet_diag_dump_done, + .dump = inet_diag_dump_compat, + }; + return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } return inet_diag_get_exact_compat(skb, nlh); @@ -1129,22 +1162,12 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(h, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(h, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump, - }; - return netlink_dump_start(net->diag_nlsk, skb, h, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start, + .done = inet_diag_dump_done, + .dump = inet_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index a2933eeabd91..d19cce39be1b 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -138,17 +138,21 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct sock *sk = NULL; + struct nlattr *bc; if (IS_ERR(hashinfo)) return; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index bcd3a26efff1..75a1c985f49a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -179,9 +179,9 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback *cb, diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 7d65a6a5cd51..93884696abdd 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -89,12 +89,16 @@ out_nosk: static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; + struct nlattr *bc; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -142,9 +146,9 @@ done: } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udp_table, skb, cb, r, bc); + udp_dump(&udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, @@ -245,10 +249,9 @@ static const struct inet_diag_handler udp_diag_handler = { }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udplite_table, skb, cb, r, bc); + udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, diff --git a/net/sctp/diag.c b/net/sctp/diag.c index bed6436cd0af..69743a6aaf6f 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -471,7 +471,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb, } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { u32 idiag_states = r->idiag_states; struct net *net = sock_net(skb->sk); -- cgit v1.2.3 From 1ed4d92458a969e71e7914550b6f0c730c14d84e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:21 -0800 Subject: bpf: INET_DIAG support in bpf_sk_storage This patch adds INET_DIAG support to bpf_sk_storage. 1. Although this series adds bpf_sk_storage diag capability to inet sk, bpf_sk_storage is in general applicable to all fullsock. Hence, the bpf_sk_storage logic will operate on SK_DIAG_* nlattr. The caller will pass in its specific nesting nlattr (e.g. INET_DIAG_*) as the argument. 2. The request will be like: INET_DIAG_REQ_SK_BPF_STORAGES (nla_nest) (defined in latter patch) SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32) SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32) ...... Considering there could have multiple bpf_sk_storages in a sk, instead of reusing INET_DIAG_INFO ("ss -i"), the user can select some specific bpf_sk_storage to dump by specifying an array of SK_DIAG_BPF_STORAGE_REQ_MAP_FD. If no SK_DIAG_BPF_STORAGE_REQ_MAP_FD is specified (i.e. an empty INET_DIAG_REQ_SK_BPF_STORAGES), it will dump all bpf_sk_storages of a sk. 3. The reply will be like: INET_DIAG_BPF_SK_STORAGES (nla_nest) (defined in latter patch) SK_DIAG_BPF_STORAGE (nla_nest) SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) SK_DIAG_BPF_STORAGE (nla_nest) SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) ...... 4. Unlike other INET_DIAG info of a sk which is pretty static, the size required to dump the bpf_sk_storage(s) of a sk is dynamic as the system adding more bpf_sk_storage_map. It is hard to set a static min_dump_alloc size. Hence, this series learns it at the runtime and adjust the cb->min_dump_alloc as it iterates all sk(s) of a system. The "unsigned int *res_diag_size" in bpf_sk_storage_diag_put() is for this purpose. The next patch will update the cb->min_dump_alloc as it iterates the sk(s). Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230421.1975729-1-kafai@fb.com --- include/linux/bpf.h | 1 + include/net/bpf_sk_storage.h | 27 ++++ include/uapi/linux/sock_diag.h | 26 ++++ kernel/bpf/syscall.c | 15 +++ net/core/bpf_sk_storage.c | 283 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 346 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9aa33b8f3d55..6015a4daf118 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1023,6 +1023,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); +struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 8e4f831d2e52..5036c94c0503 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -10,14 +10,41 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +struct bpf_sk_storage_diag; +struct sk_buff; +struct nlattr; +struct sock; + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs); +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag); +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size); #else static inline int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { return 0; } +static inline struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla) +{ + return NULL; +} +static inline void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ +} +static inline int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + return 0; +} #endif #endif /* _BPF_SK_STORAGE_H */ diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index e5925009a652..5f74a5f6091d 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -36,4 +36,30 @@ enum sknetlink_groups { }; #define SKNLGRP_MAX (__SKNLGRP_MAX - 1) +enum { + SK_DIAG_BPF_STORAGE_REQ_NONE, + SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + __SK_DIAG_BPF_STORAGE_REQ_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_REQ_MAX (__SK_DIAG_BPF_STORAGE_REQ_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_REP_NONE, + SK_DIAG_BPF_STORAGE, + __SK_DIAG_BPF_STORAGE_REP_MAX, +}; + +#define SK_DIAB_BPF_STORAGE_REP_MAX (__SK_DIAG_BPF_STORAGE_REP_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_NONE, + SK_DIAG_BPF_STORAGE_PAD, + SK_DIAG_BPF_STORAGE_MAP_ID, + SK_DIAG_BPF_STORAGE_MAP_VALUE, + __SK_DIAG_BPF_STORAGE_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_MAX (__SK_DIAG_BPF_STORAGE_MAX - 1) + #endif /* _UAPI__SOCK_DIAG_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a79743a89815..c536c65256ad 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -902,6 +902,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); +struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map); + fdput(f); + + return map; +} + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 3ab23f698221..3415a4896c59 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -8,6 +8,7 @@ #include #include #include +#include #include static atomic_t cache_idx; @@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) kfree(map); } +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_sk_storage_elem))) + static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || @@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || - /* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ - attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + if (attr->value_size > MAX_VALUE_SIZE) return -E2BIG; return 0; @@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_SOCKET, }; + +struct bpf_sk_storage_diag { + u32 nr_maps; + struct bpf_map *maps[]; +}; + +/* The reply will be like: + * INET_DIAG_BPF_SK_STORAGES (nla_nest) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * .... + */ +static int nla_value_size(u32 value_size) +{ + /* SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + */ + return nla_total_size(0) + nla_total_size(sizeof(u32)) + + nla_total_size_64bit(value_size); +} + +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ + u32 i; + + if (!diag) + return; + + for (i = 0; i < diag->nr_maps; i++) + bpf_map_put(diag->maps[i]); + + kfree(diag); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); + +static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, + const struct bpf_map *map) +{ + u32 i; + + for (i = 0; i < diag->nr_maps; i++) { + if (diag->maps[i] == map) + return true; + } + + return false; +} + +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) +{ + struct bpf_sk_storage_diag *diag; + struct nlattr *nla; + u32 nr_maps = 0; + int rem, err; + + /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as + * the map_alloc_check() side also does. + */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + nla_for_each_nested(nla, nla_stgs, rem) { + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + nr_maps++; + } + + diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, + GFP_KERNEL); + if (!diag) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nla, nla_stgs, rem) { + struct bpf_map *map; + int map_fd; + + if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + continue; + + map_fd = nla_get_u32(nla); + map = bpf_map_get(map_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_free; + } + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { + bpf_map_put(map); + err = -EINVAL; + goto err_free; + } + if (diag_check_dup(diag, map)) { + bpf_map_put(map); + err = -EEXIST; + goto err_free; + } + diag->maps[diag->nr_maps++] = map; + } + + return diag; + +err_free: + bpf_sk_storage_diag_free(diag); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); + +static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb) +{ + struct nlattr *nla_stg, *nla_value; + struct bpf_sk_storage_map *smap; + + /* It cannot exceed max nlattr's payload */ + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE); + + nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); + if (!nla_stg) + return -EMSGSIZE; + + smap = rcu_dereference(sdata->smap); + if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) + goto errout; + + nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, + smap->map.value_size, + SK_DIAG_BPF_STORAGE_PAD); + if (!nla_value) + goto errout; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, nla_data(nla_value), + sdata->data, true); + else + copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + + nla_nest_end(skb, nla_stg); + return 0; + +errout: + nla_nest_cancel(skb, nla_stg); + return -EMSGSIZE; +} + +static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage_map *smap; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + + rcu_read_lock(); + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + smap = rcu_dereference(SDATA(selem)->smap); + diag_size += nla_value_size(smap->map.value_size); + + if (nla_stgs && diag_get(SDATA(selem), skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} + +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_data *sdata; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + u32 i; + + *res_diag_size = 0; + + /* No map has been specified. Dump all. */ + if (!diag->nr_maps) + return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, + res_diag_size); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + for (i = 0; i < diag->nr_maps; i++) { + sdata = __sk_storage_lookup(sk_storage, + (struct bpf_sk_storage_map *)diag->maps[i], + false); + + if (!sdata) + continue; + + diag_size += nla_value_size(diag->maps[i]->value_size); + + if (nla_stgs && diag_get(sdata, skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); -- cgit v1.2.3 From 085c20cacf2b72991ce1c9d99a5e2f1d9e73bb68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:27 -0800 Subject: bpf: inet_diag: Dump bpf_sk_storages in inet_diag_dump() This patch will dump out the bpf_sk_storages of a sk if the request has the INET_DIAG_REQ_SK_BPF_STORAGES nlattr. An array of SK_DIAG_BPF_STORAGE_REQ_MAP_FD can be specified in INET_DIAG_REQ_SK_BPF_STORAGES to select which bpf_sk_storage to dump. If no map_fd is specified, all bpf_sk_storages of a sk will be dumped. bpf_sk_storages can be added to the system at runtime. It is difficult to find a proper static value for cb->min_dump_alloc. This patch learns the nlattr size required to dump the bpf_sk_storages of a sk. If it happens to be the very first nlmsg of a dump and it cannot fit the needed bpf_sk_storages, it will try to expand the skb by "pskb_expand_head()". Instead of expanding it in inet_sk_diag_fill(), it is expanded at a sleepable context in __inet_diag_dump() so __GFP_DIRECT_RECLAIM can be used. In __inet_diag_dump(), it will retry as long as the skb is empty and the cb->min_dump_alloc becomes larger than before. cb->min_dump_alloc is bounded by KMALLOC_MAX_SIZE. The min_dump_alloc is also changed from 'u16' to 'u32' to accommodate a sk that may have a few large bpf_sk_storages. The updated cb->min_dump_alloc will also be used to allocate the skb in the next dump. This logic already exists in netlink_dump(). Here is the sample output of a locally modified 'ss' and it could be made more readable by using BTF later: [root@arch-fb-vm1 ~]# ss --bpf-map-id 14 --bpf-map-id 13 -t6an 'dst [::1]:8989' State Recv-Q Send-Q Local Address:Port Peer Address:PortProcess ESTAB 0 0 [::1]:51072 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] ESTAB 0 0 [::1]:51070 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] [root@arch-fb-vm1 ~]# ~/devshare/github/iproute2/misc/ss --bpf-maps -t6an 'dst [::1]:8989' State Recv-Q Send-Q Local Address:Port Peer Address:Port Process ESTAB 0 0 [::1]:51072 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] bpf_map_id:12 value:[ 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000... total:65407 ] ESTAB 0 0 [::1]:51070 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] bpf_map_id:12 value:[ 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000... total:65407 ] Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230427.1976129-1-kafai@fb.com --- include/linux/inet_diag.h | 4 +++ include/linux/netlink.h | 4 +-- include/uapi/linux/inet_diag.h | 2 ++ net/ipv4/inet_diag.c | 74 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 1bb94cac265f..e4ba25d63913 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -38,9 +38,13 @@ struct inet_diag_handler { __u16 idiag_info_size; }; +struct bpf_sk_storage_diag; struct inet_diag_dump_data { struct nlattr *req_nlas[__INET_DIAG_REQ_MAX]; #define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE] +#define inet_diag_nla_bpf_stgs req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES] + + struct bpf_sk_storage_diag *bpf_stg_diag; }; struct inet_connection_sock; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 205fa7b1f07a..788969ccbbde 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -188,10 +188,10 @@ struct netlink_callback { struct module *module; struct netlink_ext_ack *extack; u16 family; - u16 min_dump_alloc; - bool strict_check; u16 answer_flags; + u32 min_dump_alloc; unsigned int prev_seq, seq; + bool strict_check; union { u8 ctx[48]; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bab9a9f8da12..75dffd78363a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -64,6 +64,7 @@ struct inet_diag_req_raw { enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, + INET_DIAG_REQ_SK_BPF_STORAGES, __INET_DIAG_REQ_MAX, }; @@ -155,6 +156,7 @@ enum { INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, + INET_DIAG_SK_BPF_STORAGES, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4bce8a477699..e1cad25909df 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -156,6 +157,8 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, @@ -163,12 +166,14 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; + struct inet_diag_dump_data *cb_data; int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + cb_data = cb->data; handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(!handler); @@ -302,6 +307,48 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } + /* Keep it at the end for potential retry with a larger skb, + * or else do best-effort fitting, which is only done for the + * first_nlmsg. + */ + if (cb_data->bpf_stg_diag) { + bool first_nlmsg = ((unsigned char *)nlh == skb->data); + unsigned int prev_min_dump_alloc; + unsigned int total_nla_size = 0; + unsigned int msg_len; + int err; + + msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; + err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb, + INET_DIAG_SK_BPF_STORAGES, + &total_nla_size); + + if (!err) + goto out; + + total_nla_size += msg_len; + prev_min_dump_alloc = cb->min_dump_alloc; + if (total_nla_size > prev_min_dump_alloc) + cb->min_dump_alloc = min_t(u32, total_nla_size, + MAX_DUMP_ALLOC_SIZE); + + if (!first_nlmsg) + goto errout; + + if (cb->min_dump_alloc > prev_min_dump_alloc) + /* Retry with pskb_expand_head() with + * __GFP_DIRECT_RECLAIM + */ + goto errout; + + WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc); + + /* Send what we have for this sk + * and move on to the next sk in the following + * dump() + */ + } + out: nlmsg_end(skb, nlh); return 0; @@ -1022,8 +1069,11 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { const struct inet_diag_handler *handler; + u32 prev_min_dump_alloc; int err = 0; +again: + prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); @@ -1031,6 +1081,15 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, err = PTR_ERR(handler); inet_diag_unlock_handler(handler); + /* The skb is not large enough to fit one sk info and + * inet_sk_diag_fill() has requested for a larger skb. + */ + if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) { + err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL); + if (!err) + goto again; + } + return err ? : skb->len; } @@ -1068,6 +1127,18 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) } } + nla = cb_data->inet_diag_nla_bpf_stgs; + if (nla) { + struct bpf_sk_storage_diag *bpf_stg_diag; + + bpf_stg_diag = bpf_sk_storage_diag_alloc(nla); + if (IS_ERR(bpf_stg_diag)) { + kfree(cb_data); + return PTR_ERR(bpf_stg_diag); + } + cb_data->bpf_stg_diag = bpf_stg_diag; + } + cb->data = cb_data; return 0; } @@ -1084,6 +1155,9 @@ static int inet_diag_dump_start_compat(struct netlink_callback *cb) static int inet_diag_dump_done(struct netlink_callback *cb) { + struct inet_diag_dump_data *cb_data = cb->data; + + bpf_sk_storage_diag_free(cb_data->bpf_stg_diag); kfree(cb->data); return 0; -- cgit v1.2.3 From 5a8b7c4b7f95c8936cd3dcba01169cd5840e291f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 28 Feb 2020 19:07:01 -0600 Subject: arcnet: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/uapi/linux/if_arcnet.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index 683878036d76..b122cfac7128 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -60,7 +60,7 @@ struct arc_rfc1201 { __u8 proto; /* protocol ID field - varies */ __u8 split_flag; /* for use with split packets */ __be16 sequence; /* sequence number */ - __u8 payload[0]; /* space remaining in packet (504 bytes)*/ + __u8 payload[]; /* space remaining in packet (504 bytes)*/ }; #define RFC1201_HDR_SIZE 4 @@ -69,7 +69,7 @@ struct arc_rfc1201 { */ struct arc_rfc1051 { __u8 proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ - __u8 payload[0]; /* 507 bytes */ + __u8 payload[]; /* 507 bytes */ }; #define RFC1051_HDR_SIZE 1 @@ -80,7 +80,7 @@ struct arc_rfc1051 { struct arc_eth_encap { __u8 proto; /* Always ARC_P_ETHER */ struct ethhdr eth; /* standard ethernet header (yuck!) */ - __u8 payload[0]; /* 493 bytes */ + __u8 payload[]; /* 493 bytes */ }; #define ETH_ENCAP_HDR_SIZE 14 -- cgit v1.2.3 From 1776658da830eb024b63ac34167dfd4bd6c21cf9 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 2 Mar 2020 06:02:09 -0600 Subject: drop_monitor: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/uapi/linux/net_dropmon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 66048cc5d7b3..67e31f329190 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -29,12 +29,12 @@ struct net_dm_config_entry { struct net_dm_config_msg { __u32 entries; - struct net_dm_config_entry options[0]; + struct net_dm_config_entry options[]; }; struct net_dm_alert_msg { __u32 entries; - struct net_dm_drop_point points[0]; + struct net_dm_drop_point points[]; }; struct net_dm_user_msg { -- cgit v1.2.3 From 7d67af2c013402537385dae343a2d0f6a4cb3bfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2020 11:32:45 +0300 Subject: io_uring: add splice(2) support Add support for splice(2). - output file is specified as sqe->fd, so it's handled by generic code - hash_reg_file handled by generic code as well - len is 32bit, but should be fine - the fd_in is registered file, when SPLICE_F_FD_IN_FIXED is set, which is a splice flag (i.e. sqe->splice_flags). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 14 +++++- 2 files changed, 122 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1a3de7337274..1ef20a2af10b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -76,6 +76,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -428,6 +429,15 @@ struct io_epoll { struct epoll_event event; }; +struct io_splice { + struct file *file_out; + struct file *file_in; + loff_t off_out; + loff_t off_in; + u64 len; + unsigned int flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -544,6 +554,7 @@ struct io_kiocb { struct io_fadvise fadvise; struct io_madvise madvise; struct io_epoll epoll; + struct io_splice splice; }; struct io_async_ctx *io; @@ -744,6 +755,11 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .file_table = 1, }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + } }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -758,6 +774,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_ring_file_ref_flush(struct fixed_file_data *data); static void io_cleanup_req(struct io_kiocb *req); +static int io_file_get(struct io_submit_state *state, + struct io_kiocb *req, + int fd, struct file **out_file, + bool fixed); static struct kmem_cache *req_cachep; @@ -2404,6 +2424,77 @@ out_free: return ret; } +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + int ret; + + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; + + sp->file_in = NULL; + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + + ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, + (sp->flags & SPLICE_F_FD_IN_FIXED)); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + + if (!S_ISREG(file_inode(sp->file_in)->i_mode)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + + return 0; +} + +static bool io_splice_punt(struct file *file) +{ + if (get_pipe_info(file)) + return false; + if (!io_file_supports_async(file)) + return true; + return !(file->f_mode & O_NONBLOCK); +} + +static int io_splice(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + long ret; + + if (force_nonblock) { + if (io_splice_punt(in) || io_splice_punt(out)) + return -EAGAIN; + flags |= SPLICE_F_NONBLOCK; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_cqring_add_event(req, ret); + if (ret != sp->len) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +} + /* * IORING_OP_NOP just posts a completion event, nothing else. */ @@ -4230,6 +4321,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_EPOLL_CTL: ret = io_epoll_ctl_prep(req, sqe); break; + case IORING_OP_SPLICE: + ret = io_splice_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4292,6 +4386,10 @@ static void io_cleanup_req(struct io_kiocb *req) case IORING_OP_STATX: putname(req->open.filename); break; + case IORING_OP_SPLICE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4495,6 +4593,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_epoll_ctl(req, nxt, force_nonblock); break; + case IORING_OP_SPLICE: + if (sqe) { + ret = io_splice_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_splice(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -7230,6 +7336,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(16, __u64, addr); + BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); @@ -7244,9 +7351,11 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); + BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(42, __u16, personality); + BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 3f7961c1c243..08891cc1c1e7 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -23,7 +23,10 @@ struct io_uring_sqe { __u64 off; /* offset into file */ __u64 addr2; }; - __u64 addr; /* pointer to buffer or iovecs */ + union { + __u64 addr; /* pointer to buffer or iovecs */ + __u64 splice_off_in; + }; __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; @@ -37,6 +40,7 @@ struct io_uring_sqe { __u32 open_flags; __u32 statx_flags; __u32 fadvise_advice; + __u32 splice_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -45,6 +49,7 @@ struct io_uring_sqe { __u16 buf_index; /* personality to use, if used */ __u16 personality; + __s32 splice_fd_in; }; __u64 __pad2[3]; }; @@ -113,6 +118,7 @@ enum { IORING_OP_RECV, IORING_OP_OPENAT2, IORING_OP_EPOLL_CTL, + IORING_OP_SPLICE, /* this goes last, obviously */ IORING_OP_LAST, @@ -128,6 +134,12 @@ enum { */ #define IORING_TIMEOUT_ABS (1U << 0) +/* + * sqe->splice_flags + * extends splice(2) flags + */ +#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From d7718a9d25a61442da8ee8aeeff6a0097f0ccfd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 14 Feb 2020 22:23:12 -0700 Subject: io_uring: use poll driven retry for files that support it Currently io_uring tries any request in a non-blocking manner, if it can, and then retries from a worker thread if we get -EAGAIN. Now that we have a new and fancy poll based retry backend, use that to retry requests if the file supports it. This means that, for example, an IORING_OP_RECVMSG on a socket no longer requires an async thread to complete the IO. If we get -EAGAIN reading from the socket in a non-blocking manner, we arm a poll handler for notification on when the socket becomes readable. When it does, the pending read is executed directly by the task again, through the io_uring task work handlers. Not only is this faster and more efficient, it also means we're not generating potentially tons of async threads that just sit and block, waiting for the IO to complete. The feature is marked with IORING_FEAT_FAST_POLL, meaning that async pollable IO is fast, and that pollother_op is fast as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 354 ++++++++++++++++++++++++++++++---------- include/trace/events/io_uring.h | 103 ++++++++++++ include/uapi/linux/io_uring.h | 1 + 3 files changed, 375 insertions(+), 83 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d973de75127..8c976fde40bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -487,6 +487,7 @@ enum { REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, + REQ_F_POLLED_BIT, }; enum { @@ -529,6 +530,13 @@ enum { REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), /* in overflow list */ REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), +}; + +struct async_poll { + struct io_poll_iocb poll; + struct io_wq_work work; }; /* @@ -562,27 +570,29 @@ struct io_kiocb { u8 opcode; struct io_ring_ctx *ctx; - union { - struct list_head list; - struct hlist_node hash_node; - }; - struct list_head link_list; + struct list_head list; unsigned int flags; refcount_t refs; + struct task_struct *task; u64 user_data; u32 result; u32 sequence; + struct list_head link_list; + struct list_head inflight_entry; union { /* * Only commands that never go async can use the below fields, - * obviously. Right now only IORING_OP_POLL_ADD uses them. + * obviously. Right now only IORING_OP_POLL_ADD uses them, and + * async armed poll handlers for regular commands. The latter + * restore the work, if needed. */ struct { - struct task_struct *task; struct callback_head task_work; + struct hlist_node hash_node; + struct async_poll *apoll; }; struct io_wq_work work; }; @@ -3563,9 +3573,209 @@ out: #endif } -static bool io_poll_remove_one(struct io_kiocb *req) +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; + +static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, + struct wait_queue_head *head) +{ + if (unlikely(poll->head)) { + pt->error = -EINVAL; + return; + } + + pt->error = 0; + poll->head = head; + add_wait_queue(head, &poll->wait); +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + __io_queue_proc(&pt->req->apoll->poll, pt, head); +} + +static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, + __poll_t mask, task_work_func_t func) +{ + struct task_struct *tsk; + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + + list_del_init(&poll->wait.entry); + + tsk = req->task; + req->result = mask; + init_task_work(&req->task_work, func); + /* + * If this fails, then the task is exiting. If that is the case, then + * the exit check will ultimately cancel these work items. Hence we + * don't need to check here and handle it specifically. + */ + task_work_add(tsk, &req->task_work, true); + wake_up_process(tsk); + return 1; +} + +static void io_async_task_func(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct async_poll *apoll = req->apoll; + struct io_ring_ctx *ctx = req->ctx; + + trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); + + WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry)); + + if (hash_hashed(&req->hash_node)) { + spin_lock_irq(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock_irq(&ctx->completion_lock); + } + + /* restore ->work in case we need to retry again */ + memcpy(&req->work, &apoll->work, sizeof(req->work)); + + __set_current_state(TASK_RUNNING); + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + + kfree(apoll); +} + +static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = &req->apoll->poll; + + trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, + key_to_poll(key)); + + return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static __poll_t __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask, + wait_queue_func_t wake_func) + __acquires(&ctx->completion_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool cancel = false; + + poll->file = req->file; + poll->head = NULL; + poll->done = poll->canceled = false; + poll->events = mask; + + ipt->pt._key = mask; + ipt->req = req; + ipt->error = -EINVAL; + + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); + poll->wait.private = req; + + mask = vfs_poll(req->file, &ipt->pt) & poll->events; + + spin_lock_irq(&ctx->completion_lock); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt->error) + cancel = true; + ipt->error = 0; + mask = 0; + } + if (mask || ipt->error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + io_poll_req_insert(req); + spin_unlock(&poll->head->lock); + } + + return mask; +} + +static bool io_arm_poll_handler(struct io_kiocb *req) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask, ret; + + if (!req->file || !file_can_poll(req->file)) + return false; + if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + return false; + if (!def->pollin && !def->pollout) + return false; + + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return false; + + req->flags |= REQ_F_POLLED; + memcpy(&apoll->work, &req->work, sizeof(req->work)); + + /* + * Don't need a reference here, as we're adding it to the task + * task_works list. If the task exits, the list is pruned. + */ + req->task = current; + req->apoll = apoll; + INIT_HLIST_NODE(&req->hash_node); + + if (def->pollin) + mask = POLLIN | POLLRDNORM; + if (def->pollout) + mask |= POLLOUT | POLLWRNORM; + mask |= POLLERR | POLLPRI; + + ipt.pt._qproc = io_async_queue_proc; + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, + io_async_wake); + if (ret) { + ipt.error = 0; + apoll->poll.done = true; + spin_unlock_irq(&ctx->completion_lock); + memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll); + return false; + } + spin_unlock_irq(&ctx->completion_lock); + trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, + apoll->poll.events); + return true; +} + +static bool __io_poll_remove_one(struct io_kiocb *req, + struct io_poll_iocb *poll) { - struct io_poll_iocb *poll = &req->poll; bool do_complete = false; spin_lock(&poll->head->lock); @@ -3575,7 +3785,24 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = true; } spin_unlock(&poll->head->lock); + return do_complete; +} + +static bool io_poll_remove_one(struct io_kiocb *req) +{ + bool do_complete; + + if (req->opcode == IORING_OP_POLL_ADD) { + do_complete = __io_poll_remove_one(req, &req->poll); + } else { + /* non-poll requests have submit ref still */ + do_complete = __io_poll_remove_one(req, &req->apoll->poll); + if (do_complete) + io_put_req(req); + } + hash_del(&req->hash_node); + if (do_complete) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); @@ -3686,8 +3913,13 @@ static void io_poll_task_func(struct callback_head *cb) struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); - if (nxt) + if (nxt) { + struct io_ring_ctx *ctx = nxt->ctx; + + mutex_lock(&ctx->uring_lock); __io_queue_sqe(nxt, NULL); + mutex_unlock(&ctx->uring_lock); + } } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -3695,51 +3927,16 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, { struct io_kiocb *req = wait->private; struct io_poll_iocb *poll = &req->poll; - __poll_t mask = key_to_poll(key); - struct task_struct *tsk; - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; - - list_del_init(&poll->wait.entry); - - tsk = req->task; - req->result = mask; - init_task_work(&req->task_work, io_poll_task_func); - task_work_add(tsk, &req->task_work, true); - wake_up_process(tsk); - return 1; + return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); } -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int error; -}; - static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - if (unlikely(pt->req->poll.head)) { - pt->error = -EINVAL; - return; - } - - pt->error = 0; - pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + __io_queue_proc(&pt->req->poll, pt, head); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -3757,7 +3954,10 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - /* task will wait for requests on exit, don't need a ref */ + /* + * Don't need a reference here, as we're adding it to the task + * task_works list. If the task exits, the list is pruned. + */ req->task = current; return 0; } @@ -3767,46 +3967,15 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - bool cancel = false; __poll_t mask; INIT_HLIST_NODE(&req->hash_node); - - poll->head = NULL; - poll->done = false; - poll->canceled = false; - - ipt.pt._qproc = io_poll_queue_proc; - ipt.pt._key = poll->events; - ipt.req = req; - ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ - - /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); - poll->wait.private = req; - INIT_LIST_HEAD(&req->list); + ipt.pt._qproc = io_poll_queue_proc; - mask = vfs_poll(poll->file, &ipt.pt) & poll->events; + mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, + io_poll_wake); - spin_lock_irq(&ctx->completion_lock); - if (likely(poll->head)) { - spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { - if (ipt.error) - cancel = true; - ipt.error = 0; - mask = 0; - } - if (mask || ipt.error) - list_del_init(&poll->wait.entry); - else if (cancel) - WRITE_ONCE(poll->canceled, true); - else if (!poll->done) /* actually waiting for an event */ - io_poll_req_insert(req); - spin_unlock(&poll->head->lock); - } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; io_poll_complete(req, mask, 0); @@ -4751,6 +4920,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK)) return NULL; + /* for polled retry, if flag is set, we already went through here */ + if (req->flags & REQ_F_POLLED) + return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, link_list); @@ -4788,6 +4960,11 @@ again: */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { + if (io_arm_poll_handler(req)) { + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + goto done_req; + } punt: if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); @@ -6782,6 +6959,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "Personalities:\n"); idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); } + seq_printf(m, "PollList:\n"); + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_kiocb *req; + + hlist_for_each_entry(req, list, hash_node) + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, + req->task->task_works != NULL); + } + spin_unlock_irq(&ctx->completion_lock); mutex_unlock(&ctx->uring_lock); } @@ -6998,7 +7186,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 27bd9e4f927b..9f0d3b7d56b0 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->force_nonblock, __entry->sq_thread) ); +TRACE_EVENT(io_uring_poll_arm, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events), + + TP_ARGS(ctx, opcode, user_data, mask, events), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + __field( int, events ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + __entry->events = events; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask, __entry->events) +); + +TRACE_EVENT(io_uring_poll_wake, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_add, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask %x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_run, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data), + + TP_ARGS(ctx, opcode, user_data), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + ), + + TP_printk("ring %p, op %d, data 0x%llx", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 08891cc1c1e7..53b36311cdac 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -216,6 +216,7 @@ struct io_uring_params { #define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) +#define IORING_FEAT_FAST_POLL (1U << 5) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From 636be4241bdd88fec273b38723e44bad4e1c4fae Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 27 Feb 2020 14:25:31 -0500 Subject: dm: bump version of core and various targets Changes made during the 5.6 cycle warrant bumping the version number for DM core and the targets modified by this commit. It should be noted that dm-thin, dm-crypt and dm-raid already had their target version bumped during the 5.6 merge window. Signed-off-by; Mike Snitzer --- drivers/md/dm-cache-target.c | 2 +- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-writecache.c | 2 +- drivers/md/dm-zoned-target.c | 2 +- include/uapi/linux/dm-ioctl.h | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index f4be63671233..d3bb355819a4 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3492,7 +3492,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 1, 0}, + .version = {2, 2, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index a82a9c257744..2f03fecd312d 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4202,7 +4202,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2bc18c9c3abc..58fd137b6ae1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -2053,7 +2053,7 @@ static int multipath_busy(struct dm_target *ti) *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 0d61e9c67986..eec9f252e935 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1221,7 +1221,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d692d2a00745..a09bdc000e64 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2320,7 +2320,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b1e64cd31647..f4f83d39b3dc 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -967,7 +967,7 @@ static int dmz_iterate_devices(struct dm_target *ti, static struct target_type dmz_type = { .name = "zoned", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = dmz_ctr, diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 2df8ceca1f9b..6622912c2342 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -272,9 +272,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 41 +#define DM_VERSION_MINOR 42 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2019-09-16)" +#define DM_VERSION_EXTRA "-ioctl (2020-02-27)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From acf1ee44ca5da39755d2aa9080392eae46a0eb34 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 3 Mar 2020 08:12:42 -0600 Subject: devlink: Introduce devlink port flavour virtual Currently mlx5 PCI PF and VF devlink devices register their ports as physical port in non-representors mode. Introduce a new port flavour as virtual so that virtual devices can register 'virtual' flavour to make it more clear to users. An example of one PCI PF and 2 PCI virtual functions, each having one devlink port. $ devlink port show pci/0000:06:00.0/1: type eth netdev ens2f0 flavour physical port 0 pci/0000:06:00.2/1: type eth netdev ens2f2 flavour virtual port 0 pci/0000:06:00.3/1: type eth netdev ens2f3 flavour virtual port 0 Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index be2a2948f452..dfdffc42e87d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -187,6 +187,7 @@ enum devlink_port_flavour { * for the PCI VF. It is an internal * port that faces the PCI VF. */ + DEVLINK_PORT_FLAVOUR_VIRTUAL, /* Any virtual port facing the user. */ }; enum devlink_param_cmode { diff --git a/net/core/devlink.c b/net/core/devlink.c index 295d761cbfb1..e8ccea9035c8 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -545,6 +545,7 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: + case DEVLINK_PORT_FLAVOUR_VIRTUAL: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; @@ -6806,6 +6807,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: + case DEVLINK_PORT_FLAVOUR_VIRTUAL: if (!attrs->split) n = snprintf(name, len, "p%u", attrs->phys.port_number); else -- cgit v1.2.3 From cf62089b0edd7e74a1f474844b4d9f7b5697fb5c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 3 Mar 2020 15:05:01 -0500 Subject: bpf: Add gso_size to __sk_buff BPF programs may want to know whether an skb is gso. The canonical answer is skb_is_gso(skb), which tests that gso_size != 0. Expose this field in the same manner as gso_segs. That field itself is not a sufficient signal, as the comment in skb_shared_info makes clear: gso_segs may be zero, e.g., from dodgy sources. Also prepare net/bpf/test_run for upcoming BPF_PROG_TEST_RUN tests of the feature. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200303200503.226217-2-willemdebruijn.kernel@gmail.com --- include/uapi/linux/bpf.h | 1 + net/bpf/test_run.c | 7 +++++++ net/core/filter.c | 44 ++++++++++++++++++++++++++++++-------------- 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8e98ced0963b..180337fae97e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3176,6 +3176,7 @@ struct __sk_buff { __u32 wire_len; __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; }; struct bpf_tunnel_key { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 562443f94133..1cd7a1c2f8b2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -277,6 +277,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* gso_segs is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), + offsetof(struct __sk_buff, gso_size))) + return -EINVAL; + + /* gso_size is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), sizeof(struct __sk_buff))) return -EINVAL; @@ -297,6 +303,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; skb_shinfo(skb)->gso_segs = __skb->gso_segs; + skb_shinfo(skb)->gso_size = __skb->gso_size; return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index 4a08c9fb2be7..cd0a532db4e7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7139,6 +7139,27 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -7461,26 +7482,21 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, gso_segs): - /* si->dst_reg = skb_shinfo(SKB); */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - BPF_REG_AX, si->src_reg, - offsetof(struct sk_buff, end)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, head)); - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); -#else - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - si->dst_reg, si->src_reg, - offsetof(struct sk_buff, end)); -#endif + insn = bpf_convert_shinfo_access(si, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; + case offsetof(struct __sk_buff, gso_size): + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_size, 2, + target_size)); + break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); -- cgit v1.2.3 From 1aae4bdd787998ea331a56f3db9d8595790fe2f9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 2 Mar 2020 16:32:31 -0800 Subject: bpf: Switch BPF UAPI #define constants used from BPF program side to enums Switch BPF UAPI constants, previously defined as #define macro, to anonymous enum values. This preserves constants values and behavior in expressions, but has added advantaged of being captured as part of DWARF and, subsequently, BTF type info. Which, in turn, greatly improves usefulness of generated vmlinux.h for BPF applications, as it will not require BPF users to copy/paste various flags and constants, which are frequently used with BPF helpers. Only those constants that are used/useful from BPF program side are converted. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200303003233.3496043-2-andriin@fb.com --- include/uapi/linux/bpf.h | 175 +++++++++++++++++++++++++--------------- tools/include/uapi/linux/bpf.h | 177 +++++++++++++++++++++++++---------------- 2 files changed, 219 insertions(+), 133 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 180337fae97e..d6b33ea27bcc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -325,44 +325,46 @@ enum bpf_attach_type { #define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; /* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) +enum { + BPF_F_NO_PREALLOC = (1U << 0), /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. */ -#define BPF_F_NO_COMMON_LRU (1U << 1) + BPF_F_NO_COMMON_LRU = (1U << 1), /* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U + BPF_F_NUMA_NODE = (1U << 2), /* Flags for accessing BPF object from syscall side. */ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), /* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) + BPF_F_STACK_BUILD_ID = (1U << 5), /* Zero-initialize hash function seed. This should only be used for testing. */ -#define BPF_F_ZERO_SEED (1U << 6) + BPF_F_ZERO_SEED = (1U << 6), /* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) -#define BPF_F_WRONLY_PROG (1U << 8) + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), /* Clone map from listener for newly accepted socket */ -#define BPF_F_CLONE (1U << 9) + BPF_F_CLONE = (1U << 9), /* Enable memory-mapping BPF map */ -#define BPF_F_MMAPABLE (1U << 10) + BPF_F_MMAPABLE = (1U << 10), +}; /* Flags for BPF_PROG_QUERY. */ @@ -391,6 +393,8 @@ struct bpf_stack_build_id { }; }; +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -3045,72 +3049,100 @@ enum bpf_func_id { /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. */ -#define BPF_F_HDR_FIELD_MASK 0xfULL +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; /* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) +enum { + BPF_F_INGRESS = (1ULL << 0), +}; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; /* BPF_FUNC_skb_set_tunnel_key flags. */ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. */ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; /* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; /* BPF_FUNC_skb_adjust_room flags. */ -#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), +}; -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) -#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) -#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ -#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; /* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +enum { + BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), +}; /* BPF_FUNC_read_branch_records flags. */ -#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { @@ -3529,13 +3561,14 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently - * supported cb flags - */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, +}; /* List of known BPF sock_ops operators. * New entries can only be added at the end @@ -3614,8 +3647,10 @@ enum { BPF_TCP_MAX_STATES /* Leave at the end! */ }; -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ +}; struct bpf_perf_event_value { __u64 counter; @@ -3623,12 +3658,16 @@ struct bpf_perf_event_value { __u64 running; }; -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ @@ -3644,8 +3683,10 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT (1U << 0) -#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ @@ -3717,9 +3758,11 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; -#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; struct bpf_flow_keys { __u16 nhoff; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7c689f4552dd..d6b33ea27bcc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { @@ -325,44 +325,46 @@ enum bpf_attach_type { #define BPF_PSEUDO_CALL 1 /* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ -#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; /* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) +enum { + BPF_F_NO_PREALLOC = (1U << 0), /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. */ -#define BPF_F_NO_COMMON_LRU (1U << 1) + BPF_F_NO_COMMON_LRU = (1U << 1), /* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U + BPF_F_NUMA_NODE = (1U << 2), /* Flags for accessing BPF object from syscall side. */ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), /* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) + BPF_F_STACK_BUILD_ID = (1U << 5), /* Zero-initialize hash function seed. This should only be used for testing. */ -#define BPF_F_ZERO_SEED (1U << 6) + BPF_F_ZERO_SEED = (1U << 6), /* Flags for accessing BPF object from program side. */ -#define BPF_F_RDONLY_PROG (1U << 7) -#define BPF_F_WRONLY_PROG (1U << 8) + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), /* Clone map from listener for newly accepted socket */ -#define BPF_F_CLONE (1U << 9) + BPF_F_CLONE = (1U << 9), /* Enable memory-mapping BPF map */ -#define BPF_F_MMAPABLE (1U << 10) + BPF_F_MMAPABLE = (1U << 10), +}; /* Flags for BPF_PROG_QUERY. */ @@ -391,6 +393,8 @@ struct bpf_stack_build_id { }; }; +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -3045,72 +3049,100 @@ enum bpf_func_id { /* All flags used by eBPF helper functions, placed here. */ /* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. */ -#define BPF_F_HDR_FIELD_MASK 0xfULL +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; /* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) +enum { + BPF_F_INGRESS = (1ULL << 0), +}; /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), /* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), /* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; /* BPF_FUNC_skb_set_tunnel_key flags. */ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), +}; /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* BPF_FUNC_perf_event_output for sk_buff input context. */ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; /* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; /* BPF_FUNC_skb_adjust_room flags. */ -#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), +}; -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) -#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) -#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) -#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) #define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* BPF_FUNC_sysctl_get_name flags. */ -#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; /* BPF_FUNC_sk_storage_get flags */ -#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +enum { + BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), +}; /* BPF_FUNC_read_branch_records flags. */ -#define BPF_F_GET_BRANCH_RECORDS_SIZE (1ULL << 0) +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { @@ -3529,13 +3561,14 @@ struct bpf_sock_ops { }; /* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently - * supported cb flags - */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, +}; /* List of known BPF sock_ops operators. * New entries can only be added at the end @@ -3614,8 +3647,10 @@ enum { BPF_TCP_MAX_STATES /* Leave at the end! */ }; -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ +}; struct bpf_perf_event_value { __u64 counter; @@ -3623,12 +3658,16 @@ struct bpf_perf_event_value { __u64 running; }; -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; struct bpf_cgroup_dev_ctx { /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ @@ -3644,8 +3683,10 @@ struct bpf_raw_tracepoint_args { /* DIRECT: Skip the FIB rules and go to FIB table associated with device * OUTPUT: Do lookup from egress perspective; default is ingress */ -#define BPF_FIB_LOOKUP_DIRECT (1U << 0) -#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), +}; enum { BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ @@ -3717,9 +3758,11 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; -#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) -#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; struct bpf_flow_keys { __u16 nhoff; -- cgit v1.2.3 From ae24082331d9bbaae283aafbe930a8f0eb85605a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:49 +0100 Subject: bpf: Introduce BPF_MODIFY_RETURN When multiple programs are attached, each program receives the return value from the previous program on the stack and the last program provides the return value to the attached function. The fmod_ret bpf programs are run after the fentry programs and before the fexit programs. The original function is only called if all the fmod_ret programs return 0 to avoid any unintended side-effects. The success value, i.e. 0 is not currently configurable but can be made so where user-space can specify it at load time. For example: int func_to_be_attached(int a, int b) { <--- do_fentry do_fmod_ret: if (ret != 0) goto do_fexit; original_function: } <--- do_fexit The fmod_ret program attached to this function can be defined as: SEC("fmod_ret/func_to_be_attached") int BPF_PROG(func_name, int a, int b, int ret) { // This will skip the original function logic. return 1; } The first fmod_ret program is passed 0 in its return argument. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-4-kpsingh@chromium.org --- arch/x86/net/bpf_jit_comp.c | 130 +++++++++++++++++++++++++++++++++++++---- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 3 +- kernel/bpf/syscall.c | 1 + kernel/bpf/trampoline.c | 5 +- kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 130 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d6349e930b06..b1fd000feb89 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1362,7 +1362,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size) + struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; int cnt = 0; @@ -1383,6 +1383,13 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; + /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + * of the previous call which is then passed on the stack to + * the next BPF program. + */ + if (mod_ret) + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1442,6 +1449,23 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) return 0; } +static int emit_mod_ret_check_imm8(u8 **pprog, int value) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (!is_imm8(value)) + return -EINVAL; + + if (value == 0) + EMIT2(0x85, add_2reg(0xC0, BPF_REG_0, BPF_REG_0)); + else + EMIT3(0x83, add_1reg(0xF8, BPF_REG_0), value); + + *pprog = prog; + return 0; +} + static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_progs *tp, int stack_size) { @@ -1449,9 +1473,49 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, + struct bpf_tramp_progs *tp, int stack_size, + u8 **branches) +{ + u8 *prog = *pprog; + int i; + + /* The first fmod_ret program will receive a garbage return value. + * Set this to 0 to avoid confusing the program. + */ + emit_mov_imm32(&prog, false, BPF_REG_0, 0); + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + for (i = 0; i < tp->nr_progs; i++) { + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) return -EINVAL; + + /* Generate a branch: + * + * if (ret != 0) + * goto do_fexit; + * + * If needed this can be extended to any integer value which can + * be passed by user-space when the program is loaded. + */ + if (emit_mod_ret_check_imm8(&prog, 0)) + return -EINVAL; + + /* Save the location of the branch and Generate 6 nops + * (4 bytes for an offset and 2 bytes for the jump) These nops + * are replaced with a conditional jump once do_fexit (i.e. the + * start of the fexit invocation) is finalized. + */ + branches[i] = prog; + emit_nops(&prog, 4 + 2); } + *pprog = prog; return 0; } @@ -1521,10 +1585,12 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, struct bpf_tramp_progs *tprogs, void *orig_call) { - int cnt = 0, nr_args = m->nr_args; + int ret, i, cnt = 0, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; + struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + u8 **branches = NULL; u8 *prog; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ @@ -1557,24 +1623,60 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; + if (fmod_ret->nr_progs) { + branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + GFP_KERNEL); + if (!branches) + return -ENOMEM; + + if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size, + branches)) { + ret = -EINVAL; + goto cleanup; + } + } + if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs) + if (fentry->nr_progs || fmod_ret->nr_progs) restore_regs(m, &prog, nr_args, stack_size); /* call original function */ - if (emit_call(&prog, orig_call, prog)) - return -EINVAL; + if (emit_call(&prog, orig_call, prog)) { + ret = -EINVAL; + goto cleanup; + } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); } + if (fmod_ret->nr_progs) { + /* From Intel 64 and IA-32 Architectures Optimization + * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler + * Coding Rule 11: All branch targets should be 16-byte + * aligned. + */ + emit_align(&prog, 16); + /* Update the branches saved in invoke_bpf_mod_ret with the + * aligned address of do_fexit. + */ + for (i = 0; i < fmod_ret->nr_progs; i++) + emit_cond_near_jump(&branches[i], prog, branches[i], + X86_JNE); + } + if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) - return -EINVAL; + if (invoke_bpf(m, &prog, fexit, stack_size)) { + ret = -EINVAL; + goto cleanup; + } if (flags & BPF_TRAMP_F_RESTORE_REGS) restore_regs(m, &prog, nr_args, stack_size); + /* This needs to be done regardless. If there were fmod_ret programs, + * the return value is only updated on the stack and still needs to be + * restored to R0. + */ if (flags & BPF_TRAMP_F_CALL_ORIG) /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); @@ -1586,9 +1688,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ EMIT1(0xC3); /* ret */ /* Make sure the trampoline generation logic doesn't overflow */ - if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) - return -EFAULT; - return prog - (u8 *)image; + if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) { + ret = -EFAULT; + goto cleanup; + } + ret = prog - (u8 *)image; + +cleanup: + kfree(branches); + return ret; } static int emit_fallback_jump(u8 **pprog) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 98ec10b23dbb..f748b31e5888 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -474,6 +474,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, + BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..30841fb8b3c0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3710,7 +3710,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, nr_args--; } - if (prog->expected_attach_type == BPF_TRACE_FEXIT && + if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && arg == nr_args) { if (!t) /* Default prog with 5 args. 6th arg is retval. */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 13de65363ba2..7ce0815793dd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2324,6 +2324,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN && prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 546198f6f307..221a17af1f81 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -232,7 +232,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || + tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -269,6 +270,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_MODIFY_RETURN: + return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 289383edfc8c..2460c8e6b5be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9950,6 +9950,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!prog_extension) return -EINVAL; /* fallthrough */ + case BPF_MODIFY_RETURN: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 51891498f2da78ee64dfad88fa53c9e85fb50abf Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 4 Mar 2020 11:05:17 -0700 Subject: seccomp: allow TSYNC and USER_NOTIF together The restriction introduced in 7a0df7fbc145 ("seccomp: Make NEW_LISTENER and TSYNC flags exclusive") is mostly artificial: there is enough information in a seccomp user notification to tell which thread triggered a notification. The reason it was introduced is because TSYNC makes the syscall return a thread-id on failure, and NEW_LISTENER returns an fd, and there's no way to distinguish between these two cases (well, I suppose the caller could check all fds it has, then do the syscall, and if the return value was an fd that already existed, then it must be a thread id, but bleh). Matthew would like to use these two flags together in the Chrome sandbox which wants to use TSYNC for video drivers and NEW_LISTENER to proxy syscalls. So, let's fix this ugliness by adding another flag, TSYNC_ESRCH, which tells the kernel to just return -ESRCH on a TSYNC error. This way, NEW_LISTENER (and any subsequent seccomp() commands that want to return positive values) don't conflict with each other. Suggested-by: Matthew Denton Signed-off-by: Tycho Andersen Link: https://lore.kernel.org/r/20200304180517.23867-1-tycho@tycho.ws Signed-off-by: Kees Cook --- include/linux/seccomp.h | 3 +- include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 14 +++-- tools/testing/selftests/seccomp/seccomp_bpf.c | 74 ++++++++++++++++++++++++++- 4 files changed, 86 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 03583b6d1416..4192369b8418 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -7,7 +7,8 @@ #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ SECCOMP_FILTER_FLAG_LOG | \ SECCOMP_FILTER_FLAG_SPEC_ALLOW | \ - SECCOMP_FILTER_FLAG_NEW_LISTENER) + SECCOMP_FILTER_FLAG_NEW_LISTENER | \ + SECCOMP_FILTER_FLAG_TSYNC_ESRCH) #ifdef CONFIG_SECCOMP diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index be84d87f1f46..c1735455bc53 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -22,6 +22,7 @@ #define SECCOMP_FILTER_FLAG_LOG (1UL << 1) #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4) /* * All BPF programs must return a 32-bit value. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b6ea3dcb57bf..29022c1bbe18 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -528,8 +528,12 @@ static long seccomp_attach_filter(unsigned int flags, int ret; ret = seccomp_can_sync_threads(); - if (ret) - return ret; + if (ret) { + if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) + return -ESRCH; + else + return ret; + } } /* Set log flag, if present. */ @@ -1288,10 +1292,12 @@ static long seccomp_set_mode_filter(unsigned int flags, * In the successful case, NEW_LISTENER returns the new listener fd. * But in the failure case, TSYNC returns the thread that died. If you * combine these two flags, there's no way to tell whether something - * succeeded or failed. So, let's disallow this combination. + * succeeded or failed. So, let's disallow this combination if the user + * has not explicitly requested no errors from TSYNC. */ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && - (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) + (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) && + ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; /* Prepare the new filter before holding any locks. */ diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index ee1b727ede04..a9ad3bd8b2ad 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -212,6 +212,10 @@ struct seccomp_notif_sizes { #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001 #endif +#ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4) +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -2187,7 +2191,8 @@ TEST(detect_seccomp_filter_flags) unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_LOG, SECCOMP_FILTER_FLAG_SPEC_ALLOW, - SECCOMP_FILTER_FLAG_NEW_LISTENER }; + SECCOMP_FILTER_FLAG_NEW_LISTENER, + SECCOMP_FILTER_FLAG_TSYNC_ESRCH }; unsigned int exclusive[] = { SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_FILTER_FLAG_NEW_LISTENER }; @@ -2645,6 +2650,55 @@ TEST_F(TSYNC, two_siblings_with_one_divergence) EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); } +TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err) +{ + long ret, flags; + void *status; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!"); + } + self->sibling[0].diverge = 1; + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + flags = SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_TSYNC_ESRCH; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog); + ASSERT_EQ(ESRCH, errno) { + TH_LOG("Did not return ESRCH for diverged sibling."); + } + ASSERT_EQ(-1, ret) { + TH_LOG("Did not fail on diverged sibling."); + } + + /* Wake the threads */ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + + /* Ensure they are both unkilled. */ + PTHREAD_JOIN(self->sibling[0].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); + PTHREAD_JOIN(self->sibling[1].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); +} + TEST_F(TSYNC, two_siblings_not_under_filter) { long ret, sib; @@ -3196,6 +3250,24 @@ TEST(user_notification_basic) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(user_notification_with_tsync) +{ + int ret; + unsigned int flags; + + /* these were exclusive */ + flags = SECCOMP_FILTER_FLAG_NEW_LISTENER | + SECCOMP_FILTER_FLAG_TSYNC; + ASSERT_EQ(-1, user_trap_syscall(__NR_getppid, flags)); + ASSERT_EQ(EINVAL, errno); + + /* but now they're not */ + flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH; + ret = user_trap_syscall(__NR_getppid, flags); + close(ret); + ASSERT_LE(0, ret); +} + TEST(user_notification_kill_in_middle) { pid_t pid; -- cgit v1.2.3 From 44f8658017419dccbeefe64f30122fa191d0e173 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 7 Mar 2020 12:40:20 +0100 Subject: sched: act: allow user to specify type of HW stats for a filter Currently, user who is adding an action expects HW to report stats, however it does not have exact expectations about the stats types. That is aligned with TCA_ACT_HW_STATS_TYPE_ANY. Allow user to specify the type of HW stats for an action and require it. Pass the information down to flow_offload layer. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 4 ++++ include/uapi/linux/pkt_cls.h | 22 ++++++++++++++++++++++ net/sched/act_api.c | 36 ++++++++++++++++++++++++++++++++++++ net/sched/cls_api.c | 7 +++++++ 4 files changed, 69 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/act_api.h b/include/net/act_api.h index 71347a90a9d1..41337c7fc728 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -41,6 +41,7 @@ struct tc_action { struct tc_cookie __rcu *act_cookie; struct tcf_chain __rcu *goto_chain; u32 tcfa_flags; + u8 hw_stats_type; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt @@ -52,6 +53,9 @@ struct tc_action { #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock +#define TCA_ACT_HW_STATS_TYPE_ANY (TCA_ACT_HW_STATS_TYPE_IMMEDIATE | \ + TCA_ACT_HW_STATS_TYPE_DELAYED) + /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 449a63971451..81cc1a869588 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -17,6 +17,7 @@ enum { TCA_ACT_PAD, TCA_ACT_COOKIE, TCA_ACT_FLAGS, + TCA_ACT_HW_STATS_TYPE, __TCA_ACT_MAX }; @@ -24,6 +25,27 @@ enum { * actions stats. */ +/* tca HW stats type + * When user does not pass the attribute, he does not care. + * It is the same as if he would pass the attribute with + * all supported bits set. + * In case no bits are set, user is not interested in getting any HW statistics. + */ +#define TCA_ACT_HW_STATS_TYPE_IMMEDIATE (1 << 0) /* Means that in dump, user + * gets the current HW stats + * state from the device + * queried at the dump time. + */ +#define TCA_ACT_HW_STATS_TYPE_DELAYED (1 << 1) /* Means that in dump, user gets + * HW stats that might be out + * of date for some time, maybe + * couple of seconds. This is + * the case when driver polls + * stats updates periodically + * or when it gets async stats update + * from the device. + */ + #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) #define TCA_ACT_MAX_PRIO 32 diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8c466a712cda..aa7b737fed2e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -185,6 +185,7 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS_TYPE */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ @@ -788,6 +789,17 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + if (a->hw_stats_type != TCA_ACT_HW_STATS_TYPE_ANY) { + struct nla_bitfield32 hw_stats_type = { + a->hw_stats_type, + TCA_ACT_HW_STATS_TYPE_ANY, + }; + + if (nla_put(skb, TCA_ACT_HW_STATS_TYPE, sizeof(hw_stats_type), + &hw_stats_type)) + goto nla_put_failure; + } + if (a->tcfa_flags) { struct nla_bitfield32 flags = { a->tcfa_flags, a->tcfa_flags, }; @@ -854,7 +866,23 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static u8 tcf_action_hw_stats_type_get(struct nlattr *hw_stats_type_attr) +{ + struct nla_bitfield32 hw_stats_type_bf; + + /* If the user did not pass the attr, that means he does + * not care about the type. Return "any" in that case + * which is setting on all supported types. + */ + if (!hw_stats_type_attr) + return TCA_ACT_HW_STATS_TYPE_ANY; + hw_stats_type_bf = nla_get_bitfield32(hw_stats_type_attr); + return hw_stats_type_bf.value; +} + static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; +static const u32 tca_act_hw_stats_type_allowed = TCA_ACT_HW_STATS_TYPE_ANY; + static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, @@ -863,6 +891,8 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, .validation_data = &tca_act_flags_allowed }, + [TCA_ACT_HW_STATS_TYPE] = { .type = NLA_BITFIELD32, + .validation_data = &tca_act_hw_stats_type_allowed }, }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -871,6 +901,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { + u8 hw_stats_type = TCA_ACT_HW_STATS_TYPE_ANY; struct nla_bitfield32 flags = { 0, 0 }; struct tc_action *a; struct tc_action_ops *a_o; @@ -903,6 +934,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } } + hw_stats_type = + tcf_action_hw_stats_type_get(tb[TCA_ACT_HW_STATS_TYPE]); if (tb[TCA_ACT_FLAGS]) flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { @@ -953,6 +986,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); + if (!name) + a->hw_stats_type = hw_stats_type; + /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then * ACT_P_CREATED is not returned (a zero is). diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 4e766c5ab77a..e91448640a4f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3464,6 +3464,10 @@ int tc_setup_flow_action(struct flow_action *flow_action, struct tc_action *act; int i, j, k, err = 0; + BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_ANY != FLOW_ACTION_HW_STATS_TYPE_ANY); + BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_IMMEDIATE != FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE); + BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_DELAYED != FLOW_ACTION_HW_STATS_TYPE_DELAYED); + if (!exts) return 0; @@ -3476,6 +3480,9 @@ int tc_setup_flow_action(struct flow_action *flow_action, err = tcf_act_get_cookie(entry, act); if (err) goto err_out_locked; + + entry->hw_stats_type = act->hw_stats_type; + if (is_tcf_gact_ok(act)) { entry->id = FLOW_ACTION_ACCEPT; } else if (is_tcf_gact_shot(act)) { -- cgit v1.2.3 From e08ab0b377a1489760533424437c5f4be7f484a4 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Mon, 9 Mar 2020 13:16:40 -0700 Subject: tcp: add bytes not sent to SCM_TIMESTAMPING_OPT_STATS Add TCP_NLA_BYTES_NOTSENT to SCM_TIMESTAMPING_OPT_STATS that reports bytes in the write queue but not sent. This is the same metric as what is exported with tcp_info.tcpi_notsent_bytes. Signed-off-by: Yousuk Seung Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 1a7fc856e237..f2acb2566333 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -312,6 +312,7 @@ enum { TCP_NLA_REORD_SEEN, /* reordering events seen */ TCP_NLA_SRTT, /* smoothed RTT in usecs */ TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ + TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 48aa457a9516..b7134f76f840 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3344,6 +3344,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ 0; } @@ -3399,6 +3400,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); + nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, + max_t(int, 0, tp->write_seq - tp->snd_nxt)); return stats; } -- cgit v1.2.3 From ddf0322db79c5984dc1a1db890f946dd19b7d6d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 23 Feb 2020 16:41:33 -0700 Subject: io_uring: add IORING_OP_PROVIDE_BUFFERS IORING_OP_PROVIDE_BUFFERS uses the buffer registration infrastructure to support passing in an addr/len that is associated with a buffer ID and buffer group ID. The group ID is used to index and lookup the buffers, while the buffer ID can be used to notify the application which buffer in the group was used. The addr passed in is the starting buffer address, and length is each buffer length. A number of buffers to add with can be specified, in which case addr is incremented by length for each addition, and each buffer increments the buffer ID specified. No validation is done of the buffer ID. If the application provides buffers within the same group with identical buffer IDs, then it'll have a hard time telling which buffer ID was used. The only restriction is that the buffer ID can be a max of 16-bits in size, so USHRT_MAX is the maximum ID that can be used. Signed-off-by: Jens Axboe --- fs/io_uring.c | 138 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/io_uring.h | 10 ++- 2 files changed, 145 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1f3ae208f6a6..1a58f2042815 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -447,6 +447,15 @@ struct io_splice { unsigned int flags; }; +struct io_provide_buf { + struct file *file; + __u64 addr; + __s32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -572,6 +581,7 @@ struct io_kiocb { struct io_madvise madvise; struct io_epoll epoll; struct io_splice splice; + struct io_provide_buf pbuf; }; struct io_async_ctx *io; @@ -799,7 +809,8 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, - } + }, + [IORING_OP_PROVIDE_BUFFERS] = {}, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2785,6 +2796,120 @@ static int io_openat(struct io_kiocb *req, bool force_nonblock) return io_openat2(req, force_nonblock); } +static int io_provide_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -E2BIG; + p->nbufs = tmp; + p->addr = READ_ONCE(sqe->addr); + p->len = READ_ONCE(sqe->len); + + if (!access_ok(u64_to_user_ptr(p->addr), p->len)) + return -EFAULT; + + p->bgid = READ_ONCE(sqe->buf_group); + tmp = READ_ONCE(sqe->off); + if (tmp > USHRT_MAX) + return -E2BIG; + p->bid = tmp; + return 0; +} + +static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) +{ + struct io_buffer *buf; + u64 addr = pbuf->addr; + int i, bid = pbuf->bid; + + for (i = 0; i < pbuf->nbufs; i++) { + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + break; + + buf->addr = addr; + buf->len = pbuf->len; + buf->bid = bid; + addr += pbuf->len; + bid++; + if (!*head) { + INIT_LIST_HEAD(&buf->list); + *head = buf; + } else { + list_add_tail(&buf->list, &(*head)->list); + } + } + + return i ? i : -ENOMEM; +} + +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) +{ + if (needs_lock) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (needs_lock) + mutex_lock(&ctx->uring_lock); +} + +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head, *list; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + list = head = idr_find(&ctx->io_buffer_idr, p->bgid); + + ret = io_add_buffers(p, &head); + if (ret < 0) + goto out; + + if (!list) { + ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, + GFP_KERNEL); + if (ret < 0) { + while (!list_empty(&head->list)) { + struct io_buffer *buf; + + buf = list_first_entry(&head->list, + struct io_buffer, list); + list_del(&buf->list); + kfree(buf); + } + kfree(head); + goto out; + } + } +out: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4392,6 +4517,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_SPLICE: ret = io_splice_prep(req, sqe); break; + case IORING_OP_PROVIDE_BUFFERS: + ret = io_provide_buffers_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -4669,6 +4797,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_splice(req, force_nonblock); break; + case IORING_OP_PROVIDE_BUFFERS: + if (sqe) { + ret = io_provide_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_provide_buffers(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 53b36311cdac..bc34a57a660b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -45,8 +45,13 @@ struct io_uring_sqe { __u64 user_data; /* data to be passed back at completion time */ union { struct { - /* index into fixed buffers, if used */ - __u16 buf_index; + /* pack this to avoid bogus arm OABI complaints */ + union { + /* index into fixed buffers, if used */ + __u16 buf_index; + /* for grouped buffer selection */ + __u16 buf_group; + } __attribute__((packed)); /* personality to use, if used */ __u16 personality; __s32 splice_fd_in; @@ -119,6 +124,7 @@ enum { IORING_OP_OPENAT2, IORING_OP_EPOLL_CTL, IORING_OP_SPLICE, + IORING_OP_PROVIDE_BUFFERS, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From bcda7baaa3f15c7a95db3c024bb046d6e298f76b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 23 Feb 2020 16:42:51 -0700 Subject: io_uring: support buffer selection for OP_READ and OP_RECV If a server process has tons of pending socket connections, generally it uses epoll to wait for activity. When the socket is ready for reading (or writing), the task can select a buffer and issue a recv/send on the given fd. Now that we have fast (non-async thread) support, a task can have tons of pending reads or writes pending. But that means they need buffers to back that data, and if the number of connections is high enough, having them preallocated for all possible connections is unfeasible. With IORING_OP_PROVIDE_BUFFERS, an application can register buffers to use for any request. The request then sets IOSQE_BUFFER_SELECT in the sqe, and a given group ID in sqe->buf_group. When the fd becomes ready, a free buffer from the specified group is selected. If none are available, the request is terminated with -ENOBUFS. If successful, the CQE on completion will contain the buffer ID chosen in the cqe->flags member, encoded as: (buffer_id << IORING_CQE_BUFFER_SHIFT) | IORING_CQE_F_BUFFER; Once a buffer has been consumed by a request, it is no longer available and must be registered again with IORING_OP_PROVIDE_BUFFERS. Requests need to support this feature. For now, IORING_OP_READ and IORING_OP_RECV support it. This is checked on SQE submission, a CQE with res == -EOPNOTSUPP will be posted if attempted on unsupported requests. Signed-off-by: Jens Axboe --- fs/io_uring.c | 224 ++++++++++++++++++++++++++++++++++-------- include/uapi/linux/io_uring.h | 14 +++ 2 files changed, 199 insertions(+), 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1a58f2042815..a80b5c189c14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -395,7 +395,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; + int bgid; size_t len; + struct io_buffer *kbuf; }; struct io_open { @@ -490,6 +492,7 @@ enum { REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, @@ -506,6 +509,7 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, }; enum { @@ -519,6 +523,8 @@ enum { REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), /* already grabbed next link */ REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), @@ -550,6 +556,8 @@ enum { REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), /* already went through poll handler */ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), }; struct async_poll { @@ -612,6 +620,7 @@ struct io_kiocb { struct callback_head task_work; struct hlist_node hash_node; struct async_poll *apoll; + int cflags; }; struct io_wq_work work; }; @@ -661,6 +670,8 @@ struct io_op_def { /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; }; static const struct io_op_def io_op_defs[] = { @@ -770,6 +781,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .buffer_select = 1, }, [IORING_OP_WRITE] = { .needs_mm = 1, @@ -794,6 +806,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .buffer_select = 1, }, [IORING_OP_OPENAT2] = { .needs_file = 1, @@ -1170,7 +1183,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, req->cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1194,7 +1207,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return cqe != NULL; } -static void io_cqring_fill_event(struct io_kiocb *req, long res) +static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; @@ -1210,7 +1223,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) if (likely(cqe)) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); - WRITE_ONCE(cqe->flags, 0); + WRITE_ONCE(cqe->flags, cflags); } else if (ctx->cq_overflow_flushed) { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1222,23 +1235,34 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); req->result = res; + req->cflags = cflags; list_add_tail(&req->list, &ctx->cq_overflow_list); } } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void io_cqring_fill_event(struct io_kiocb *req, long res) +{ + __io_cqring_fill_event(req, res, 0); +} + +static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(req, res); + __io_cqring_fill_event(req, res, cflags); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } +static void io_cqring_add_event(struct io_kiocb *req, long res) +{ + __io_cqring_add_event(req, res, 0); +} + static inline bool io_is_fallback_req(struct io_kiocb *req) { return req == (struct io_kiocb *) @@ -1660,6 +1684,18 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) return true; } +static int io_put_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf = (struct io_buffer *) req->rw.addr; + int cflags; + + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + cflags |= IORING_CQE_F_BUFFER; + req->rw.addr = 0; + kfree(kbuf); + return cflags; +} + /* * Find and free completed poll iocbs */ @@ -1671,10 +1707,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { + int cflags = 0; + req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(req, req->result); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + + __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; if (refcount_dec_and_test(&req->refs) && @@ -1849,13 +1890,16 @@ static inline void req_set_fail_links(struct io_kiocb *req) static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + int cflags = 0; if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) req_set_fail_links(req); - io_cqring_add_event(req, res); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_kbuf(req); + __io_cqring_add_event(req, res, cflags); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -2033,7 +2077,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); - /* we own ->private, reuse it for the buffer index */ + /* we own ->private, reuse it for the buffer index / buffer ID */ req->rw.kiocb.private = (void *) (unsigned long) READ_ONCE(sqe->buf_index); return 0; @@ -2146,8 +2190,61 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, return len; } +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) +{ + if (needs_lock) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (needs_lock) + mutex_lock(&ctx->uring_lock); +} + +static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, + int bgid, struct io_buffer *kbuf, + bool needs_lock) +{ + struct io_buffer *head; + + if (req->flags & REQ_F_BUFFER_SELECTED) + return kbuf; + + io_ring_submit_lock(req->ctx, needs_lock); + + lockdep_assert_held(&req->ctx->uring_lock); + + head = idr_find(&req->ctx->io_buffer_idr, bgid); + if (head) { + if (!list_empty(&head->list)) { + kbuf = list_last_entry(&head->list, struct io_buffer, + list); + list_del(&kbuf->list); + } else { + kbuf = head; + idr_remove(&req->ctx->io_buffer_idr, bgid); + } + if (*len > kbuf->len) + *len = kbuf->len; + } else { + kbuf = ERR_PTR(-ENOBUFS); + } + + io_ring_submit_unlock(req->ctx, needs_lock); + + return kbuf; +} + static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter) + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock) { void __user *buf = u64_to_user_ptr(req->rw.addr); size_t sqe_len = req->rw.len; @@ -2159,12 +2256,29 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return io_import_fixed(req, rw, iter); } - /* buffer index only valid with fixed read/write */ - if (req->rw.kiocb.private) + /* buffer index only valid with fixed read/write, or buffer select */ + if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { ssize_t ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + struct io_buffer *kbuf = (struct io_buffer *) req->rw.addr; + int bgid; + + bgid = (int) (unsigned long) req->rw.kiocb.private; + kbuf = io_buffer_select(req, &sqe_len, bgid, kbuf, + needs_lock); + if (IS_ERR(kbuf)) { + *iovec = NULL; + return PTR_ERR(kbuf); + } + req->rw.addr = (u64) kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + buf = u64_to_user_ptr(kbuf->addr); + } + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; return ret < 0 ? ret : sqe_len; @@ -2307,7 +2421,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2324,7 +2438,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) size_t iov_count; ssize_t io_size, ret; - ret = io_import_iovec(READ, req, &iovec, &iter); + ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; @@ -2396,7 +2510,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, io = req->io; io->rw.iov = io->rw.fast_iov; req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); req->io = io; if (ret < 0) return ret; @@ -2413,7 +2527,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) size_t iov_count; ssize_t ret, io_size; - ret = io_import_iovec(WRITE, req, &iovec, &iter); + ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; @@ -2850,24 +2964,6 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) return i ? i : -ENOMEM; } -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) -{ - if (needs_lock) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (needs_lock) - mutex_lock(&ctx->uring_lock); -} - static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) { struct io_provide_buf *p = &req->pbuf; @@ -3390,6 +3486,27 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) #endif } +static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, + int *cflags, bool needs_lock) +{ + struct io_sr_msg *sr = &req->sr_msg; + struct io_buffer *kbuf; + + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return NULL; + + kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); + if (IS_ERR(kbuf)) + return kbuf; + + sr->kbuf = kbuf; + req->flags |= REQ_F_BUFFER_SELECTED; + + *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; + *cflags |= IORING_CQE_F_BUFFER; + return kbuf; +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3401,6 +3518,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->bgid = READ_ONCE(sqe->buf_group); #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -3490,8 +3608,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) static int io_recv(struct io_kiocb *req, bool force_nonblock) { #if defined(CONFIG_NET) + struct io_buffer *kbuf = NULL; struct socket *sock; - int ret; + int ret, cflags = 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3499,15 +3618,25 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) sock = sock_from_file(req->file, &ret); if (sock) { struct io_sr_msg *sr = &req->sr_msg; + void __user *buf = sr->buf; struct msghdr msg; struct iovec iov; unsigned flags; - ret = import_single_range(READ, sr->buf, sr->len, &iov, + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + else if (kbuf) + buf = u64_to_user_ptr(kbuf->addr); + + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); - if (ret) + if (ret) { + kfree(kbuf); return ret; + } + req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -3528,7 +3657,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; } - io_cqring_add_event(req, ret); + kfree(kbuf); + req->flags &= ~REQ_F_NEED_CLEANUP; + __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); io_put_req(req); @@ -4566,6 +4697,9 @@ static void io_cleanup_req(struct io_kiocb *req) case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree((void *)(unsigned long)req->rw.addr); + /* fallthrough */ case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: @@ -4577,6 +4711,10 @@ static void io_cleanup_req(struct io_kiocb *req) if (io->msg.iov != io->msg.fast_iov) kfree(io->msg.iov); break; + case IORING_OP_RECV: + if (req->flags & REQ_F_BUFFER_SELECTED) + kfree(req->sr_msg.kbuf); + break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: case IORING_OP_STATX: @@ -5154,7 +5292,8 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_BUFFER_SELECT) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) @@ -5171,6 +5310,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, goto err_req; } + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[req->opcode].buffer_select) { + ret = -EOPNOTSUPP; + goto err_req; + } + id = READ_ONCE(sqe->personality); if (id) { req->work.creds = idr_find(&ctx->personality_idr, id); @@ -5183,7 +5328,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK | - IOSQE_ASYNC | IOSQE_FIXED_FILE); + IOSQE_ASYNC | IOSQE_FIXED_FILE | + IOSQE_BUFFER_SELECT); ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index bc34a57a660b..9b263d9b24e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -66,6 +66,7 @@ enum { IOSQE_IO_LINK_BIT, IOSQE_IO_HARDLINK_BIT, IOSQE_ASYNC_BIT, + IOSQE_BUFFER_SELECT_BIT, }; /* @@ -81,6 +82,8 @@ enum { #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) /* always go async */ #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) +/* select buffer from sqe->buf_group */ +#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT) /* * io_uring_setup() flags @@ -155,6 +158,17 @@ struct io_uring_cqe { __u32 flags; }; +/* + * cqe->flags + * + * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID + */ +#define IORING_CQE_F_BUFFER (1U << 0) + +enum { + IORING_CQE_BUFFER_SHIFT = 16, +}; + /* * Magic offsets for the application to mmap the data it needs */ -- cgit v1.2.3 From 067524e914cb23e20d59480b318fe2625eaee7c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 2 Mar 2020 16:32:28 -0700 Subject: io_uring: provide means of removing buffers We have IORING_OP_PROVIDE_BUFFERS, but the only way to remove buffers is to trigger IO on them. The usual case of shrinking a buffer pool would be to just not replenish the buffers when IO completes, and instead just free it. But it may be nice to have a way to manually remove a number of buffers from a given group, and IORING_OP_REMOVE_BUFFERS provides that functionality. Signed-off-by: Jens Axboe --- fs/io_uring.c | 102 ++++++++++++++++++++++++++++++++++-------- include/uapi/linux/io_uring.h | 1 + 2 files changed, 84 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 455d53fd840f..f131105fa12a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -827,6 +827,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, }, [IORING_OP_PROVIDE_BUFFERS] = {}, + [IORING_OP_REMOVE_BUFFERS] = {}, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -2995,6 +2996,75 @@ static int io_openat(struct io_kiocb *req, bool force_nonblock) return io_openat2(req, force_nonblock); } +static int io_remove_buffers_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = &req->pbuf; + u64 tmp; + + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -EINVAL; + + memset(p, 0, sizeof(*p)); + p->nbufs = tmp; + p->bgid = READ_ONCE(sqe->buf_group); + return 0; +} + +static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, + int bgid, unsigned nbufs) +{ + unsigned i = 0; + + /* shouldn't happen */ + if (!nbufs) + return 0; + + /* the head kbuf is the list itself */ + while (!list_empty(&buf->list)) { + struct io_buffer *nxt; + + nxt = list_first_entry(&buf->list, struct io_buffer, list); + list_del(&nxt->list); + kfree(nxt); + if (++i == nbufs) + return i; + } + i++; + kfree(buf); + idr_remove(&ctx->io_buffer_idr, bgid); + + return i; +} + +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +{ + struct io_provide_buf *p = &req->pbuf; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer *head; + int ret = 0; + + io_ring_submit_lock(ctx, !force_nonblock); + + lockdep_assert_held(&ctx->uring_lock); + + ret = -ENOENT; + head = idr_find(&ctx->io_buffer_idr, p->bgid); + if (head) + ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); + + io_ring_submit_lock(ctx, !force_nonblock); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3070,15 +3140,7 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, GFP_KERNEL); if (ret < 0) { - while (!list_empty(&head->list)) { - struct io_buffer *buf; - - buf = list_first_entry(&head->list, - struct io_buffer, list); - list_del(&buf->list); - kfree(buf); - } - kfree(head); + __io_remove_buffers(ctx, head, p->bgid, -1U); goto out; } } @@ -4825,6 +4887,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_PROVIDE_BUFFERS: ret = io_provide_buffers_prep(req, sqe); break; + case IORING_OP_REMOVE_BUFFERS: + ret = io_remove_buffers_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -5120,6 +5185,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_provide_buffers(req, force_nonblock); break; + case IORING_OP_REMOVE_BUFFERS: + if (sqe) { + ret = io_remove_buffers_prep(req, sqe); + if (ret) + break; + } + ret = io_remove_buffers(req, force_nonblock); + break; default: ret = -EINVAL; break; @@ -6998,16 +7071,7 @@ static int __io_destroy_buffers(int id, void *p, void *data) struct io_ring_ctx *ctx = data; struct io_buffer *buf = p; - /* the head kbuf is the list itself */ - while (!list_empty(&buf->list)) { - struct io_buffer *nxt; - - nxt = list_first_entry(&buf->list, struct io_buffer, list); - list_del(&nxt->list); - kfree(nxt); - } - kfree(buf); - idr_remove(&ctx->io_buffer_idr, id); + __io_remove_buffers(ctx, buf, id, -1U); return 0; } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 9b263d9b24e6..cef4c0c0f26b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -128,6 +128,7 @@ enum { IORING_OP_EPOLL_CTL, IORING_OP_SPLICE, IORING_OP_PROVIDE_BUFFERS, + IORING_OP_REMOVE_BUFFERS, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 7c4a4d088283e02cc04252e88dd08b5cdf54e70f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 10 Mar 2020 15:18:02 -0700 Subject: dmaengine: idxd: Merge definition of dsa_batch_desc into dsa_hw_desc We don't need a special structure just for batch descriptors. The layout matches the general form for other descriptors. Merge the desc_list_addr field into the union of other aliases for the the third quadword in the structure. Create a union to alias "xfer_size" with "desc_count". Signed-off-by: Tony Luck Acked-by: Dave Jiang Link: https://lore.kernel.org/r/158387868208.35922.5895104426944263789.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- include/uapi/linux/idxd.h | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index 849ef1515d04..1f412fbf561b 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -83,21 +83,6 @@ enum dsa_completion_status { #define DSA_COMP_STATUS_MASK 0x7f #define DSA_COMP_STATUS_WRITE 0x80 -struct dsa_batch_desc { - uint32_t pasid:20; - uint32_t rsvd:11; - uint32_t priv:1; - uint32_t flags:24; - uint32_t opcode:8; - uint64_t completion_addr; - uint64_t desc_list_addr; - uint64_t rsvd1; - uint32_t desc_count; - uint16_t interrupt_handle; - uint16_t rsvd2; - uint8_t rsvd3[24]; -} __attribute__((packed)); - struct dsa_hw_desc { uint32_t pasid:20; uint32_t rsvd:11; @@ -109,6 +94,7 @@ struct dsa_hw_desc { uint64_t src_addr; uint64_t rdback_addr; uint64_t pattern; + uint64_t desc_list_addr; }; union { uint64_t dst_addr; @@ -116,7 +102,10 @@ struct dsa_hw_desc { uint64_t src2_addr; uint64_t comp_pattern; }; - uint32_t xfer_size; + union { + uint32_t xfer_size; + uint32_t desc_count; + }; uint16_t int_handle; uint16_t rsvd1; union { -- cgit v1.2.3 From bbbdeb4720a0759ec90e3bcb20ad28d19e531346 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Mar 2020 07:45:46 -0600 Subject: io_uring: dual license io_uring.h uapi header This just syncs the header it with the liburing version, so there's no confusion on the license of the header parts. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cef4c0c0f26b..6d9d2b1cc523 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT */ /* * Header file for the io_uring interface. * -- cgit v1.2.3 From 2677625387056136e256c743e3285b4fe3da87bb Mon Sep 17 00:00:00 2001 From: Paolo Lungaroni Date: Wed, 11 Mar 2020 17:54:06 +0100 Subject: seg6: fix SRv6 L2 tunnels to use IANA-assigned protocol number The Internet Assigned Numbers Authority (IANA) has recently assigned a protocol number value of 143 for Ethernet [1]. Before this assignment, encapsulation mechanisms such as Segment Routing used the IPv6-NoNxt protocol number (59) to indicate that the encapsulated payload is an Ethernet frame. In this patch, we add the definition of the Ethernet protocol number to the kernel headers and update the SRv6 L2 tunnels to use it. [1] https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml Signed-off-by: Paolo Lungaroni Reviewed-by: Andrea Mayer Acked-by: Ahmed Abdelsalam Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 2 ++ net/ipv6/seg6_iptunnel.c | 2 +- net/ipv6/seg6_local.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 1521073b6348..8533bf07450f 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -74,6 +74,8 @@ enum { #define IPPROTO_UDPLITE IPPROTO_UDPLITE IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */ #define IPPROTO_MPLS IPPROTO_MPLS + IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ +#define IPPROTO_ETHERNET IPPROTO_ETHERNET IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_MPTCP = 262, /* Multipath TCP connection */ diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index ab7f124ff5d7..8c52efe299cc 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -268,7 +268,7 @@ static int seg6_do_srh(struct sk_buff *skb) skb_mac_header_rebuild(skb); skb_push(skb, skb->mac_len); - err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE); + err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET); if (err) return err; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 7cbc19731997..8165802d8e05 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -282,7 +282,7 @@ static int input_action_end_dx2(struct sk_buff *skb, struct net_device *odev; struct ethhdr *eth; - if (!decap_and_validate(skb, NEXTHDR_NONE)) + if (!decap_and_validate(skb, IPPROTO_ETHERNET)) goto drop; if (!pskb_may_pull(skb, ETH_HLEN)) -- cgit v1.2.3 From f8c8ee61189b8151525e4f10828d53865ef847e4 Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Thu, 12 Mar 2020 11:31:38 +0100 Subject: media: v4l: link dt-bindings and uapi Since we expose the definition to the dt-bindings we need to keep those definitions in sync. To address this the patch adds a simple cross reference to the dt-bindings. Signed-off-by: Marco Felsch Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 5f9357dcb060..f48e47e230c3 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1242,6 +1242,10 @@ struct v4l2_selection { typedef __u64 v4l2_std_id; +/* + * Attention: Keep the V4L2_STD_* bit definitions in sync with + * include/dt-bindings/display/sdtv-standards.h SDTV_STD_* bit definitions. + */ /* one bit for each */ #define V4L2_STD_PAL_B ((v4l2_std_id)0x00000001) #define V4L2_STD_PAL_B1 ((v4l2_std_id)0x00000002) -- cgit v1.2.3 From 0524399d4612f5af38b8383680dde4df4bc4eea2 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:07:48 +0100 Subject: ethtool: provide netdev features with FEATURES_GET request Implement FEATURES_GET request to get network device features. These are traditionally available via ETHTOOL_GFEATURES ioctl request. v2: - style cleanup suggested by Jakub Kicinski Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 51 +++++++++-- include/uapi/linux/ethtool_netlink.h | 17 ++++ net/ethtool/Makefile | 2 +- net/ethtool/common.h | 2 + net/ethtool/features.c | 131 +++++++++++++++++++++++++++ net/ethtool/ioctl.c | 2 - net/ethtool/netlink.c | 8 ++ net/ethtool/netlink.h | 1 + 8 files changed, 202 insertions(+), 12 deletions(-) create mode 100644 net/ethtool/features.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index f1f868479ceb..5713abf98534 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -189,6 +189,7 @@ Userspace to kernel: ``ETHTOOL_MSG_DEBUG_SET`` set debugging settings ``ETHTOOL_MSG_WOL_GET`` get wake-on-lan settings ``ETHTOOL_MSG_WOL_SET`` set wake-on-lan settings + ``ETHTOOL_MSG_FEATURES_GET`` get device features ===================================== ================================ Kernel to userspace: @@ -204,6 +205,7 @@ Kernel to userspace: ``ETHTOOL_MSG_DEBUG_NTF`` debugging settings notification ``ETHTOOL_MSG_WOL_GET_REPLY`` wake-on-lan settings ``ETHTOOL_MSG_WOL_NTF`` wake-on-lan settings notification + ``ETHTOOL_MSG_FEATURES_GET_REPLY`` device features ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -521,6 +523,37 @@ Request contents: ``WAKE_MAGICSECURE`` mode. +FEATURES_GET +============ + +Gets netdev features like ``ETHTOOL_GFEATURES`` ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_FEATURES_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_FEATURES_HEADER`` nested reply header + ``ETHTOOL_A_FEATURES_HW`` bitset dev->hw_features + ``ETHTOOL_A_FEATURES_WANTED`` bitset dev->wanted_features + ``ETHTOOL_A_FEATURES_ACTIVE`` bitset dev->features + ``ETHTOOL_A_FEATURES_NOCHANGE`` bitset NETIF_F_NEVER_CHANGE + ==================================== ====== ========================== + +Bitmaps in kernel response have the same meaning as bitmaps used in ioctl +interference but attribute names are different (they are based on +corresponding members of struct net_device). Legacy "flags" are not provided, +if userspace needs them (most likely only ethtool for backward compatibility), +it can calculate their values from related feature bits itself. +ETHA_FEATURES_HW uses mask consisting of all features recognized by kernel (to +provide all names when using verbose bitmap format), the other three use no +mask (simple bit lists). + + Request translation =================== @@ -551,30 +584,30 @@ have their netlink replacement yet. ``ETHTOOL_SRINGPARAM`` n/a ``ETHTOOL_GPAUSEPARAM`` n/a ``ETHTOOL_SPAUSEPARAM`` n/a - ``ETHTOOL_GRXCSUM`` n/a + ``ETHTOOL_GRXCSUM`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SRXCSUM`` n/a - ``ETHTOOL_GTXCSUM`` n/a + ``ETHTOOL_GTXCSUM`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_STXCSUM`` n/a - ``ETHTOOL_GSG`` n/a + ``ETHTOOL_GSG`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SSG`` n/a ``ETHTOOL_TEST`` n/a ``ETHTOOL_GSTRINGS`` ``ETHTOOL_MSG_STRSET_GET`` ``ETHTOOL_PHYS_ID`` n/a ``ETHTOOL_GSTATS`` n/a - ``ETHTOOL_GTSO`` n/a + ``ETHTOOL_GTSO`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_STSO`` n/a ``ETHTOOL_GPERMADDR`` rtnetlink ``RTM_GETLINK`` - ``ETHTOOL_GUFO`` n/a + ``ETHTOOL_GUFO`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SUFO`` n/a - ``ETHTOOL_GGSO`` n/a + ``ETHTOOL_GGSO`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SGSO`` n/a - ``ETHTOOL_GFLAGS`` n/a + ``ETHTOOL_GFLAGS`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SFLAGS`` n/a ``ETHTOOL_GPFLAGS`` n/a ``ETHTOOL_SPFLAGS`` n/a ``ETHTOOL_GRXFH`` n/a ``ETHTOOL_SRXFH`` n/a - ``ETHTOOL_GGRO`` n/a + ``ETHTOOL_GGRO`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SGRO`` n/a ``ETHTOOL_GRXRINGS`` n/a ``ETHTOOL_GRXCLSRLCNT`` n/a @@ -589,7 +622,7 @@ have their netlink replacement yet. ``ETHTOOL_GSSET_INFO`` ``ETHTOOL_MSG_STRSET_GET`` ``ETHTOOL_GRXFHINDIR`` n/a ``ETHTOOL_SRXFHINDIR`` n/a - ``ETHTOOL_GFEATURES`` n/a + ``ETHTOOL_GFEATURES`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SFEATURES`` n/a ``ETHTOOL_GCHANNELS`` n/a ``ETHTOOL_SCHANNELS`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7e0b460f872c..d0cc7a0334c8 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -24,6 +24,7 @@ enum { ETHTOOL_MSG_DEBUG_SET, ETHTOOL_MSG_WOL_GET, ETHTOOL_MSG_WOL_SET, + ETHTOOL_MSG_FEATURES_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -43,6 +44,7 @@ enum { ETHTOOL_MSG_DEBUG_NTF, ETHTOOL_MSG_WOL_GET_REPLY, ETHTOOL_MSG_WOL_NTF, + ETHTOOL_MSG_FEATURES_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -228,6 +230,21 @@ enum { ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 }; +/* FEATURES */ + +enum { + ETHTOOL_A_FEATURES_UNSPEC, + ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEATURES_HW, /* bitset */ + ETHTOOL_A_FEATURES_WANTED, /* bitset */ + ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ + ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_FEATURES_CNT, + ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 424545a4aaec..5be8c9ab26d1 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -5,4 +5,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ - linkstate.o debug.o wol.o + linkstate.o debug.o wol.o features.o diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 40ba74e0b9bb..7dc1163800a7 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -6,6 +6,8 @@ #include #include +#define ETHTOOL_DEV_FEATURE_WORDS DIV_ROUND_UP(NETDEV_FEATURE_COUNT, 32) + /* compose link mode index from speed, type and duplex */ #define ETHTOOL_LINK_MODE(speed, type, duplex) \ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT diff --git a/net/ethtool/features.c b/net/ethtool/features.c new file mode 100644 index 000000000000..a0cc2b969053 --- /dev/null +++ b/net/ethtool/features.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct features_req_info { + struct ethnl_req_info base; +}; + +struct features_reply_data { + struct ethnl_reply_data base; + u32 hw[ETHTOOL_DEV_FEATURE_WORDS]; + u32 wanted[ETHTOOL_DEV_FEATURE_WORDS]; + u32 active[ETHTOOL_DEV_FEATURE_WORDS]; + u32 nochange[ETHTOOL_DEV_FEATURE_WORDS]; + u32 all[ETHTOOL_DEV_FEATURE_WORDS]; +}; + +#define FEATURES_REPDATA(__reply_base) \ + container_of(__reply_base, struct features_reply_data, base) + +static const struct nla_policy +features_get_policy[ETHTOOL_A_FEATURES_MAX + 1] = { + [ETHTOOL_A_FEATURES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_FEATURES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_FEATURES_HW] = { .type = NLA_REJECT }, + [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_REJECT }, + [ETHTOOL_A_FEATURES_ACTIVE] = { .type = NLA_REJECT }, + [ETHTOOL_A_FEATURES_NOCHANGE] = { .type = NLA_REJECT }, +}; + +static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src) +{ + unsigned int i; + + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; i++) + dest[i] = src >> (32 * i); +} + +static int features_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct features_reply_data *data = FEATURES_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + netdev_features_t all_features; + + ethnl_features_to_bitmap32(data->hw, dev->hw_features); + ethnl_features_to_bitmap32(data->wanted, dev->wanted_features); + ethnl_features_to_bitmap32(data->active, dev->features); + ethnl_features_to_bitmap32(data->nochange, NETIF_F_NEVER_CHANGE); + all_features = GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0); + ethnl_features_to_bitmap32(data->all, all_features); + + return 0; +} + +static int features_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct features_reply_data *data = FEATURES_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + unsigned int len = 0; + int ret; + + ret = ethnl_bitset32_size(data->hw, data->all, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + return ret; + len += ret; + ret = ethnl_bitset32_size(data->wanted, NULL, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + return ret; + len += ret; + ret = ethnl_bitset32_size(data->active, NULL, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + return ret; + len += ret; + ret = ethnl_bitset32_size(data->nochange, NULL, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + return ret; + len += ret; + + return len; +} + +static int features_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct features_reply_data *data = FEATURES_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int ret; + + ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_HW, data->hw, + data->all, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + return ret; + ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_WANTED, data->wanted, + NULL, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + return ret; + ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_ACTIVE, data->active, + NULL, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + return ret; + return ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_NOCHANGE, + data->nochange, NULL, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); +} + +const struct ethnl_request_ops ethnl_features_request_ops = { + .request_cmd = ETHTOOL_MSG_FEATURES_GET, + .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY, + .hdr_attr = ETHTOOL_A_FEATURES_HEADER, + .max_attr = ETHTOOL_A_FEATURES_MAX, + .req_info_size = sizeof(struct features_req_info), + .reply_data_size = sizeof(struct features_reply_data), + .request_policy = features_get_policy, + + .prepare_data = features_prepare_data, + .reply_size = features_reply_size, + .fill_reply = features_fill_reply, +}; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index ae97c82c7052..45d1bf1764b7 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -56,8 +56,6 @@ EXPORT_SYMBOL(ethtool_op_get_ts_info); /* Handlers for each ethtool command */ -#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) - static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 8eca55122ef3..e451a75e9577 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -215,6 +215,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, + [ETHTOOL_MSG_FEATURES_GET] = ðnl_features_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -695,6 +696,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_wol, }, + { + .cmd = ETHTOOL_MSG_FEATURES_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 961708290074..be2325ea8493 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -337,6 +337,7 @@ extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; +extern const struct ethnl_request_ops ethnl_features_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From 0980bfcd6954f124e40a000b85335c197764de14 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:07:58 +0100 Subject: ethtool: set netdev features with FEATURES_SET request Implement FEATURES_SET netlink request to set network device features. These are traditionally set using ETHTOOL_SFEATURES ioctl request. Actual change is subject to netdev_change_features() sanity checks so that it can differ from what was requested. Unlike with most other SET requests, in addition to error code and optional extack, kernel provides an optional reply message (ETHTOOL_MSG_FEATURES_SET_REPLY) in the same format but with different semantics: information about difference between user request and actual result and difference between old and new state of dev->features. This reply message can be suppressed by setting ETHTOOL_FLAG_OMIT_REPLY flag in request header. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 56 +++++++-- include/uapi/linux/ethtool_netlink.h | 2 + net/ethtool/features.c | 169 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 5 + net/ethtool/netlink.h | 1 + 5 files changed, 224 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 5713abf98534..d6706c4aa972 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -190,6 +190,7 @@ Userspace to kernel: ``ETHTOOL_MSG_WOL_GET`` get wake-on-lan settings ``ETHTOOL_MSG_WOL_SET`` set wake-on-lan settings ``ETHTOOL_MSG_FEATURES_GET`` get device features + ``ETHTOOL_MSG_FEATURES_SET`` set device features ===================================== ================================ Kernel to userspace: @@ -206,6 +207,7 @@ Kernel to userspace: ``ETHTOOL_MSG_WOL_GET_REPLY`` wake-on-lan settings ``ETHTOOL_MSG_WOL_NTF`` wake-on-lan settings notification ``ETHTOOL_MSG_FEATURES_GET_REPLY`` device features + ``ETHTOOL_MSG_FEATURES_SET_REPLY`` optional reply to FEATURES_SET ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -554,6 +556,42 @@ provide all names when using verbose bitmap format), the other three use no mask (simple bit lists). +FEATURES_SET +============ + +Request to set netdev features like ``ETHTOOL_SFEATURES`` ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_FEATURES_HEADER`` nested request header + ``ETHTOOL_A_FEATURES_WANTED`` bitset requested features + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_FEATURES_HEADER`` nested reply header + ``ETHTOOL_A_FEATURES_WANTED`` bitset diff wanted vs. result + ``ETHTOOL_A_FEATURES_ACTIVE`` bitset diff old vs. new active + ==================================== ====== ========================== + +Request constains only one bitset which can be either value/mask pair (request +to change specific feature bits and leave the rest) or only a value (request +to set all features to specified set). + +As request is subject to netdev_change_features() sanity checks, optional +kernel reply (can be suppressed by ``ETHTOOL_FLAG_OMIT_REPLY`` flag in request +header) informs client about the actual result. ``ETHTOOL_A_FEATURES_WANTED`` +reports the difference between client request and actual result: mask consists +of bits which differ between requested features and result (dev->features +after the operation), value consists of values of these bits in the request +(i.e. negated values from resulting features). ``ETHTOOL_A_FEATURES_ACTIVE`` +reports the difference between old and new dev->features: mask consists of +bits which have changed, values are their values in new dev->features (after +the operation). + + Request translation =================== @@ -585,30 +623,30 @@ have their netlink replacement yet. ``ETHTOOL_GPAUSEPARAM`` n/a ``ETHTOOL_SPAUSEPARAM`` n/a ``ETHTOOL_GRXCSUM`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_SRXCSUM`` n/a + ``ETHTOOL_SRXCSUM`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GTXCSUM`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_STXCSUM`` n/a + ``ETHTOOL_STXCSUM`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GSG`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_SSG`` n/a + ``ETHTOOL_SSG`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_TEST`` n/a ``ETHTOOL_GSTRINGS`` ``ETHTOOL_MSG_STRSET_GET`` ``ETHTOOL_PHYS_ID`` n/a ``ETHTOOL_GSTATS`` n/a ``ETHTOOL_GTSO`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_STSO`` n/a + ``ETHTOOL_STSO`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GPERMADDR`` rtnetlink ``RTM_GETLINK`` ``ETHTOOL_GUFO`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_SUFO`` n/a + ``ETHTOOL_SUFO`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GGSO`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_SGSO`` n/a + ``ETHTOOL_SGSO`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GFLAGS`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_SFLAGS`` n/a + ``ETHTOOL_SFLAGS`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GPFLAGS`` n/a ``ETHTOOL_SPFLAGS`` n/a ``ETHTOOL_GRXFH`` n/a ``ETHTOOL_SRXFH`` n/a ``ETHTOOL_GGRO`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_SGRO`` n/a + ``ETHTOOL_SGRO`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GRXRINGS`` n/a ``ETHTOOL_GRXCLSRLCNT`` n/a ``ETHTOOL_GRXCLSRULE`` n/a @@ -623,7 +661,7 @@ have their netlink replacement yet. ``ETHTOOL_GRXFHINDIR`` n/a ``ETHTOOL_SRXFHINDIR`` n/a ``ETHTOOL_GFEATURES`` ``ETHTOOL_MSG_FEATURES_GET`` - ``ETHTOOL_SFEATURES`` n/a + ``ETHTOOL_SFEATURES`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GCHANNELS`` n/a ``ETHTOOL_SCHANNELS`` n/a ``ETHTOOL_SET_DUMP`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index d0cc7a0334c8..6f7aaa6b7f42 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -25,6 +25,7 @@ enum { ETHTOOL_MSG_WOL_GET, ETHTOOL_MSG_WOL_SET, ETHTOOL_MSG_FEATURES_GET, + ETHTOOL_MSG_FEATURES_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -45,6 +46,7 @@ enum { ETHTOOL_MSG_WOL_GET_REPLY, ETHTOOL_MSG_WOL_NTF, ETHTOOL_MSG_FEATURES_GET_REPLY, + ETHTOOL_MSG_FEATURES_SET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/features.c b/net/ethtool/features.c index a0cc2b969053..4ac1e05684ce 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -129,3 +129,172 @@ const struct ethnl_request_ops ethnl_features_request_ops = { .reply_size = features_reply_size, .fill_reply = features_fill_reply, }; + +/* FEATURES_SET */ + +static const struct nla_policy +features_set_policy[ETHTOOL_A_FEATURES_MAX + 1] = { + [ETHTOOL_A_FEATURES_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_FEATURES_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_FEATURES_HW] = { .type = NLA_REJECT }, + [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED }, + [ETHTOOL_A_FEATURES_ACTIVE] = { .type = NLA_REJECT }, + [ETHTOOL_A_FEATURES_NOCHANGE] = { .type = NLA_REJECT }, +}; + +static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val) +{ + const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); + unsigned int i; + + bitmap_zero(dest, NETDEV_FEATURE_COUNT); + for (i = 0; i < words; i++) + dest[i] = (unsigned long)(val >> (i * BITS_PER_LONG)); +} + +static netdev_features_t ethnl_bitmap_to_features(unsigned long *src) +{ + const unsigned int nft_bits = sizeof(netdev_features_t) * BITS_PER_BYTE; + const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); + netdev_features_t ret = 0; + unsigned int i; + + for (i = 0; i < words; i++) + ret |= (netdev_features_t)(src[i]) << (i * BITS_PER_LONG); + ret &= ~(netdev_features_t)0 >> (nft_bits - NETDEV_FEATURE_COUNT); + return ret; +} + +static int features_send_reply(struct net_device *dev, struct genl_info *info, + const unsigned long *wanted, + const unsigned long *wanted_mask, + const unsigned long *active, + const unsigned long *active_mask, bool compact) +{ + struct sk_buff *rskb; + void *reply_payload; + int reply_len = 0; + int ret; + + reply_len = ethnl_reply_header_size(); + ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + goto err; + reply_len += ret; + ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + goto err; + reply_len += ret; + + ret = -ENOMEM; + rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY, + ETHTOOL_A_FEATURES_HEADER, info, + &reply_payload); + if (!rskb) + goto err; + + ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_WANTED, wanted, + wanted_mask, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + goto nla_put_failure; + ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_ACTIVE, active, + active_mask, NETDEV_FEATURE_COUNT, + netdev_features_strings, compact); + if (ret < 0) + goto nla_put_failure; + + genlmsg_end(rskb, reply_payload); + ret = genlmsg_reply(rskb, info); + return ret; + +nla_put_failure: + nlmsg_free(rskb); + WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n", + reply_len); +err: + GENL_SET_ERR_MSG(info, "failed to send reply message"); + return ret; +} + +int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) +{ + DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT); + DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT); + DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT); + DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT); + DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT); + DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT); + struct nlattr *tb[ETHTOOL_A_FEATURES_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_FEATURES_MAX, features_set_policy, + info->extack); + if (ret < 0) + return ret; + if (!tb[ETHTOOL_A_FEATURES_WANTED]) + return -EINVAL; + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_FEATURES_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + + rtnl_lock(); + ethnl_features_to_bitmap(old_active, dev->features); + ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, + tb[ETHTOOL_A_FEATURES_WANTED], + netdev_features_strings, info->extack); + if (ret < 0) + goto out_rtnl; + if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) { + GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features"); + ret = -EINVAL; + goto out_rtnl; + } + + /* set req_wanted bits not in req_mask from old_active */ + bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT); + bitmap_andnot(new_active, old_active, req_mask, NETDEV_FEATURE_COUNT); + bitmap_or(req_wanted, new_active, req_wanted, NETDEV_FEATURE_COUNT); + if (bitmap_equal(req_wanted, old_active, NETDEV_FEATURE_COUNT)) { + ret = 0; + goto out_rtnl; + } + + dev->wanted_features = ethnl_bitmap_to_features(req_wanted); + __netdev_update_features(dev); + ethnl_features_to_bitmap(new_active, dev->features); + + ret = 0; + if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) { + bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS; + + bitmap_xor(wanted_diff_mask, req_wanted, new_active, + NETDEV_FEATURE_COUNT); + bitmap_xor(active_diff_mask, old_active, new_active, + NETDEV_FEATURE_COUNT); + bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask, + NETDEV_FEATURE_COUNT); + bitmap_and(req_wanted, req_wanted, wanted_diff_mask, + NETDEV_FEATURE_COUNT); + bitmap_and(new_active, new_active, active_diff_mask, + NETDEV_FEATURE_COUNT); + + ret = features_send_reply(dev, info, req_wanted, + wanted_diff_mask, new_active, + active_diff_mask, compact); + } + +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index e451a75e9577..757ea3fc98a0 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -703,6 +703,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_FEATURES_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_features, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index be2325ea8493..135836201e89 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -343,5 +343,6 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 9c6451ef4881c2f528c625766e8bb9af5e40941a Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:03 +0100 Subject: ethtool: add FEATURES_NTF notification Send ETHTOOL_MSG_FEATURES_NTF notification whenever network device features are modified using ETHTOOL_MSG_FEATURES_SET netlink message, ethtool ioctl request or any other way resulting in call to netdev_update_features() or netdev_change_features() Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 6 ++++++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/features.c | 4 ++++ net/ethtool/netlink.c | 29 +++++++++++++++++++++++++++- 4 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d6706c4aa972..47542f042e9d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -208,6 +208,7 @@ Kernel to userspace: ``ETHTOOL_MSG_WOL_NTF`` wake-on-lan settings notification ``ETHTOOL_MSG_FEATURES_GET_REPLY`` device features ``ETHTOOL_MSG_FEATURES_SET_REPLY`` optional reply to FEATURES_SET + ``ETHTOOL_MSG_FEATURES_NTF`` netdev features notification ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -591,6 +592,11 @@ reports the difference between old and new dev->features: mask consists of bits which have changed, values are their values in new dev->features (after the operation). +``ETHTOOL_MSG_FEATURES_NTF`` notification is sent not only if device features +are modified using ``ETHTOOL_MSG_FEATURES_SET`` request or on of ethtool ioctl +request but also each time features are modified with netdev_update_features() +or netdev_change_features(). + Request translation =================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 6f7aaa6b7f42..3d0204cf96a6 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -47,6 +47,7 @@ enum { ETHTOOL_MSG_WOL_NTF, ETHTOOL_MSG_FEATURES_GET_REPLY, ETHTOOL_MSG_FEATURES_SET_REPLY, + ETHTOOL_MSG_FEATURES_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/features.c b/net/ethtool/features.c index 4ac1e05684ce..4e632dc987d8 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -230,6 +230,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) struct nlattr *tb[ETHTOOL_A_FEATURES_MAX + 1]; struct ethnl_req_info req_info = {}; struct net_device *dev; + bool mod; int ret; ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, @@ -272,6 +273,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) dev->wanted_features = ethnl_bitmap_to_features(req_wanted); __netdev_update_features(dev); ethnl_features_to_bitmap(new_active, dev->features); + mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT); ret = 0; if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) { @@ -292,6 +294,8 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) wanted_diff_mask, new_active, active_diff_mask, compact); } + if (mod) + ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); out_rtnl: rtnl_unlock(); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 757ea3fc98a0..5c0e361bfd66 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -528,6 +528,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKMODES_NTF] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_DEBUG_NTF] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_NTF] = ðnl_wol_request_ops, + [ETHTOOL_MSG_FEATURES_NTF] = ðnl_features_request_ops, }; /* default notification handler */ @@ -613,6 +614,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -630,6 +632,29 @@ void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) } EXPORT_SYMBOL(ethtool_notify); +static void ethnl_notify_features(struct netdev_notifier_info *info) +{ + struct net_device *dev = netdev_notifier_info_to_dev(info); + + ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); +} + +static int ethnl_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + switch (event) { + case NETDEV_FEAT_CHANGE: + ethnl_notify_features(ptr); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block ethnl_netdev_notifier = { + .notifier_call = ethnl_netdev_event, +}; + /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { @@ -736,7 +761,9 @@ static int __init ethnl_init(void) return ret; ethnl_ok = true; - return 0; + ret = register_netdevice_notifier(ðnl_netdev_notifier); + WARN(ret < 0, "ethtool: net device notifier registration failed"); + return ret; } subsys_initcall(ethnl_init); -- cgit v1.2.3 From e16c3386fc4d9611b6a2abaa639318c7ae793370 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:08 +0100 Subject: ethtool: provide private flags with PRIVFLAGS_GET request Implement PRIVFLAGS_GET request to get private flags for a network device. These are traditionally available via ETHTOOL_GPFLAGS ioctl request. Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 30 +++++- include/uapi/linux/ethtool_netlink.h | 14 +++ net/ethtool/Makefile | 2 +- net/ethtool/netlink.c | 8 ++ net/ethtool/netlink.h | 1 + net/ethtool/privflags.c | 136 +++++++++++++++++++++++++++ 6 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/privflags.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 47542f042e9d..7bba4c940ef7 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -191,6 +191,7 @@ Userspace to kernel: ``ETHTOOL_MSG_WOL_SET`` set wake-on-lan settings ``ETHTOOL_MSG_FEATURES_GET`` get device features ``ETHTOOL_MSG_FEATURES_SET`` set device features + ``ETHTOOL_MSG_PRIVFLAGS_GET`` get private flags ===================================== ================================ Kernel to userspace: @@ -209,6 +210,7 @@ Kernel to userspace: ``ETHTOOL_MSG_FEATURES_GET_REPLY`` device features ``ETHTOOL_MSG_FEATURES_SET_REPLY`` optional reply to FEATURES_SET ``ETHTOOL_MSG_FEATURES_NTF`` netdev features notification + ``ETHTOOL_MSG_PRIVFLAGS_GET_REPLY`` private flags ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -598,6 +600,32 @@ request but also each time features are modified with netdev_update_features() or netdev_change_features(). +PRIVFLAGS_GET +============= + +Gets private flags like ``ETHTOOL_GPFLAGS`` ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_PRIVFLAGS_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_PRIVFLAGS_HEADER`` nested reply header + ``ETHTOOL_A_PRIVFLAGS_FLAGS`` bitset private flags + ==================================== ====== ========================== + +``ETHTOOL_A_PRIVFLAGS_FLAGS`` is a bitset with values of device private flags. +These flags are defined by driver, their number and names (and also meaning) +are device dependent. For compact bitset format, names can be retrieved as +``ETH_SS_PRIV_FLAGS`` string set. If verbose bitset format is requested, +response uses all private flags supported by the device as mask so that client +gets the full information without having to fetch the string set with names. + + Request translation =================== @@ -647,7 +675,7 @@ have their netlink replacement yet. ``ETHTOOL_SGSO`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GFLAGS`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SFLAGS`` ``ETHTOOL_MSG_FEATURES_SET`` - ``ETHTOOL_GPFLAGS`` n/a + ``ETHTOOL_GPFLAGS`` ``ETHTOOL_MSG_PRIVFLAGS_GET`` ``ETHTOOL_SPFLAGS`` n/a ``ETHTOOL_GRXFH`` n/a ``ETHTOOL_SRXFH`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 3d0204cf96a6..d94bbf5e4d1c 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -26,6 +26,7 @@ enum { ETHTOOL_MSG_WOL_SET, ETHTOOL_MSG_FEATURES_GET, ETHTOOL_MSG_FEATURES_SET, + ETHTOOL_MSG_PRIVFLAGS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -48,6 +49,7 @@ enum { ETHTOOL_MSG_FEATURES_GET_REPLY, ETHTOOL_MSG_FEATURES_SET_REPLY, ETHTOOL_MSG_FEATURES_NTF, + ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -248,6 +250,18 @@ enum { ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 }; +/* PRIVFLAGS */ + +enum { + ETHTOOL_A_PRIVFLAGS_UNSPEC, + ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_PRIVFLAGS_CNT, + ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 5be8c9ab26d1..58708bcc2968 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -5,4 +5,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ - linkstate.o debug.o wol.o features.o + linkstate.o debug.o wol.o features.o privflags.o diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 5c0e361bfd66..9cbb1d8b4d23 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -216,6 +216,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = ðnl_features_request_ops, + [ETHTOOL_MSG_PRIVFLAGS_GET] = ðnl_privflags_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -733,6 +734,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_features, }, + { + .cmd = ETHTOOL_MSG_PRIVFLAGS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 135836201e89..36cf077c3085 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -338,6 +338,7 @@ extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; +extern const struct ethnl_request_ops ethnl_privflags_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c new file mode 100644 index 000000000000..169dd4a832f6 --- /dev/null +++ b/net/ethtool/privflags.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct privflags_req_info { + struct ethnl_req_info base; +}; + +struct privflags_reply_data { + struct ethnl_reply_data base; + const char (*priv_flag_names)[ETH_GSTRING_LEN]; + unsigned int n_priv_flags; + u32 priv_flags; +}; + +#define PRIVFLAGS_REPDATA(__reply_base) \ + container_of(__reply_base, struct privflags_reply_data, base) + +static const struct nla_policy +privflags_get_policy[ETHTOOL_A_PRIVFLAGS_MAX + 1] = { + [ETHTOOL_A_PRIVFLAGS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_PRIVFLAGS_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_REJECT }, +}; + +static int ethnl_get_priv_flags_info(struct net_device *dev, + unsigned int *count, + const char (**names)[ETH_GSTRING_LEN]) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + int nflags; + + nflags = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); + if (nflags < 0) + return nflags; + + if (names) { + *names = kcalloc(nflags, ETH_GSTRING_LEN, GFP_KERNEL); + if (!*names) + return -ENOMEM; + ops->get_strings(dev, ETH_SS_PRIV_FLAGS, (u8 *)*names); + } + + /* We can pass more than 32 private flags to userspace via netlink but + * we cannot get more with ethtool_ops::get_priv_flags(). Note that we + * must not adjust nflags before allocating the space for flag names + * as the buffer must be large enough for all flags. + */ + if (WARN_ONCE(nflags > 32, + "device %s reports more than 32 private flags (%d)\n", + netdev_name(dev), nflags)) + nflags = 32; + *count = nflags; + + return 0; +} + +static int privflags_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + const char (*names)[ETH_GSTRING_LEN]; + const struct ethtool_ops *ops; + unsigned int nflags; + int ret; + + ops = dev->ethtool_ops; + if (!ops->get_priv_flags || !ops->get_sset_count || !ops->get_strings) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = ethnl_get_priv_flags_info(dev, &nflags, &names); + if (ret < 0) + goto out_ops; + data->priv_flags = ops->get_priv_flags(dev); + data->priv_flag_names = names; + data->n_priv_flags = nflags; + +out_ops: + ethnl_ops_complete(dev); + return ret; +} + +static int privflags_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); + + return ethnl_bitset32_size(&data->priv_flags, &all_flags, + data->n_priv_flags, + data->priv_flag_names, compact); +} + +static int privflags_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); + + return ethnl_put_bitset32(skb, ETHTOOL_A_PRIVFLAGS_FLAGS, + &data->priv_flags, &all_flags, + data->n_priv_flags, data->priv_flag_names, + compact); +} + +static void privflags_cleanup_data(struct ethnl_reply_data *reply_data) +{ + struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_data); + + kfree(data->priv_flag_names); +} + +const struct ethnl_request_ops ethnl_privflags_request_ops = { + .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, + .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, + .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, + .max_attr = ETHTOOL_A_PRIVFLAGS_MAX, + .req_info_size = sizeof(struct privflags_req_info), + .reply_data_size = sizeof(struct privflags_reply_data), + .request_policy = privflags_get_policy, + + .prepare_data = privflags_prepare_data, + .reply_size = privflags_reply_size, + .fill_reply = privflags_fill_reply, + .cleanup_data = privflags_cleanup_data, +}; -- cgit v1.2.3 From f265d799596aff80fe19b0b3896a4abe13cdb534 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:13 +0100 Subject: ethtool: set device private flags with PRIVFLAGS_SET request Implement PRIVFLAGS_SET netlink request to set private flags of a network device. These are traditionally set with ETHTOOL_SPFLAGS ioctl request. Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 21 ++++++++- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/netlink.c | 5 ++ net/ethtool/netlink.h | 1 + net/ethtool/privflags.c | 70 ++++++++++++++++++++++++++++ 5 files changed, 97 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 7bba4c940ef7..e553112fa2d7 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -192,6 +192,7 @@ Userspace to kernel: ``ETHTOOL_MSG_FEATURES_GET`` get device features ``ETHTOOL_MSG_FEATURES_SET`` set device features ``ETHTOOL_MSG_PRIVFLAGS_GET`` get private flags + ``ETHTOOL_MSG_PRIVFLAGS_SET`` set private flags ===================================== ================================ Kernel to userspace: @@ -211,6 +212,7 @@ Kernel to userspace: ``ETHTOOL_MSG_FEATURES_SET_REPLY`` optional reply to FEATURES_SET ``ETHTOOL_MSG_FEATURES_NTF`` netdev features notification ``ETHTOOL_MSG_PRIVFLAGS_GET_REPLY`` private flags + ``ETHTOOL_MSG_PRIVFLAGS_NTF`` private flags ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -626,6 +628,23 @@ response uses all private flags supported by the device as mask so that client gets the full information without having to fetch the string set with names. +PRIVFLAGS_SET +============= + +Sets or modifies values of device private flags like ``ETHTOOL_SPFLAGS`` +ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_PRIVFLAGS_HEADER`` nested request header + ``ETHTOOL_A_PRIVFLAGS_FLAGS`` bitset private flags + ==================================== ====== ========================== + +``ETHTOOL_A_PRIVFLAGS_FLAGS`` can either set the whole set of private flags or +modify only values of some of them. + + Request translation =================== @@ -676,7 +695,7 @@ have their netlink replacement yet. ``ETHTOOL_GFLAGS`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SFLAGS`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GPFLAGS`` ``ETHTOOL_MSG_PRIVFLAGS_GET`` - ``ETHTOOL_SPFLAGS`` n/a + ``ETHTOOL_SPFLAGS`` ``ETHTOOL_MSG_PRIVFLAGS_SET`` ``ETHTOOL_GRXFH`` n/a ``ETHTOOL_SRXFH`` n/a ``ETHTOOL_GGRO`` ``ETHTOOL_MSG_FEATURES_GET`` diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index d94bbf5e4d1c..13e631c86825 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -27,6 +27,7 @@ enum { ETHTOOL_MSG_FEATURES_GET, ETHTOOL_MSG_FEATURES_SET, ETHTOOL_MSG_PRIVFLAGS_GET, + ETHTOOL_MSG_PRIVFLAGS_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 9cbb1d8b4d23..b6795aad7ccb 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -741,6 +741,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_PRIVFLAGS_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_privflags, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 36cf077c3085..ca789f587ef3 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -345,5 +345,6 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c index 169dd4a832f6..dfa76d552277 100644 --- a/net/ethtool/privflags.c +++ b/net/ethtool/privflags.c @@ -134,3 +134,73 @@ const struct ethnl_request_ops ethnl_privflags_request_ops = { .fill_reply = privflags_fill_reply, .cleanup_data = privflags_cleanup_data, }; + +/* PRIVFLAGS_SET */ + +static const struct nla_policy +privflags_set_policy[ETHTOOL_A_PRIVFLAGS_MAX + 1] = { + [ETHTOOL_A_PRIVFLAGS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_PRIVFLAGS_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED }, +}; + +int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_PRIVFLAGS_MAX + 1]; + const char (*names)[ETH_GSTRING_LEN] = NULL; + struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + unsigned int nflags; + bool mod = false; + bool compact; + u32 flags; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_PRIVFLAGS_MAX, privflags_set_policy, + info->extack); + if (ret < 0) + return ret; + if (!tb[ETHTOOL_A_PRIVFLAGS_FLAGS]) + return -EINVAL; + ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact); + if (ret < 0) + return ret; + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_PRIVFLAGS_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + if (!ops->get_priv_flags || !ops->set_priv_flags || + !ops->get_sset_count || !ops->get_strings) + return -EOPNOTSUPP; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names); + if (ret < 0) + goto out_ops; + flags = ops->get_priv_flags(dev); + + ret = ethnl_update_bitset32(&flags, nflags, + tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names, + info->extack, &mod); + if (ret < 0 || !mod) + goto out_free; + ret = ops->set_priv_flags(dev, flags); + +out_free: + kfree(names); +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} -- cgit v1.2.3 From 111dcba3c694b4ca7dc8b6abfd575745dd51be3d Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:18 +0100 Subject: ethtool: add PRIVFLAGS_NTF notification Send ETHTOOL_MSG_PRIVFLAGS_NTF notification whenever private flags of a network device are modified using ETHTOOL_MSG_PRIVFLAGS_SET netlink message or ETHTOOL_SPFLAGS ioctl request. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/ioctl.c | 2 ++ net/ethtool/netlink.c | 2 ++ net/ethtool/privflags.c | 3 +++ 4 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 13e631c86825..7f23a7f0fca1 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -51,6 +51,7 @@ enum { ETHTOOL_MSG_FEATURES_SET_REPLY, ETHTOOL_MSG_FEATURES_NTF, ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, + ETHTOOL_MSG_PRIVFLAGS_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 45d1bf1764b7..298822289496 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2716,6 +2716,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_priv_flags); + if (!rc) + ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); break; case ETHTOOL_SPFLAGS: rc = ethtool_set_value(dev, useraddr, diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index b6795aad7ccb..dec82b0b80bd 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -530,6 +530,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_DEBUG_NTF] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_NTF] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_NTF] = ðnl_features_request_ops, + [ETHTOOL_MSG_PRIVFLAGS_NTF] = ðnl_privflags_request_ops, }; /* default notification handler */ @@ -616,6 +617,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c index dfa76d552277..e8f03b33db9b 100644 --- a/net/ethtool/privflags.c +++ b/net/ethtool/privflags.c @@ -194,6 +194,9 @@ int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info) if (ret < 0 || !mod) goto out_free; ret = ops->set_priv_flags(dev, flags); + if (ret < 0) + goto out_free; + ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); out_free: kfree(names); -- cgit v1.2.3 From e4a1717b677c5cb285e9b425c569e261084a484c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:23 +0100 Subject: ethtool: provide ring sizes with RINGS_GET request Implement RINGS_GET request to get ring sizes of a network device. These are traditionally available via ETHTOOL_GRINGPARAM ioctl request. Omit attributes for ring types which are not supported by driver or device (zero reported for maximum). v2: (all suggested by Jakub Kicinski) - minor cleanup in rings_prepare_data() - more descriptive rings_reply_size() - omit attributes with zero max size Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 30 +++++++- include/uapi/linux/ethtool_netlink.h | 21 ++++++ net/ethtool/Makefile | 2 +- net/ethtool/netlink.c | 8 ++ net/ethtool/netlink.h | 1 + net/ethtool/rings.c | 108 +++++++++++++++++++++++++++ 6 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/rings.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index e553112fa2d7..798c2f97d89b 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -193,6 +193,7 @@ Userspace to kernel: ``ETHTOOL_MSG_FEATURES_SET`` set device features ``ETHTOOL_MSG_PRIVFLAGS_GET`` get private flags ``ETHTOOL_MSG_PRIVFLAGS_SET`` set private flags + ``ETHTOOL_MSG_RINGS_GET`` get ring sizes ===================================== ================================ Kernel to userspace: @@ -213,6 +214,7 @@ Kernel to userspace: ``ETHTOOL_MSG_FEATURES_NTF`` netdev features notification ``ETHTOOL_MSG_PRIVFLAGS_GET_REPLY`` private flags ``ETHTOOL_MSG_PRIVFLAGS_NTF`` private flags + ``ETHTOOL_MSG_RINGS_GET_REPLY`` ring sizes ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -645,6 +647,32 @@ Request contents: modify only values of some of them. +RINGS_GET +========= + +Gets ring sizes like ``ETHTOOL_GRINGPARAM`` ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_RINGS_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_RINGS_HEADER`` nested reply header + ``ETHTOOL_A_RINGS_RX_MAX`` u32 max size of RX ring + ``ETHTOOL_A_RINGS_RX_MINI_MAX`` u32 max size of RX mini ring + ``ETHTOOL_A_RINGS_RX_JUMBO_MAX`` u32 max size of RX jumbo ring + ``ETHTOOL_A_RINGS_TX_MAX`` u32 max size of TX ring + ``ETHTOOL_A_RINGS_RX`` u32 size of RX ring + ``ETHTOOL_A_RINGS_RX_MINI`` u32 size of RX mini ring + ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring + ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring + ==================================== ====== ========================== + + Request translation =================== @@ -671,7 +699,7 @@ have their netlink replacement yet. ``ETHTOOL_SEEPROM`` n/a ``ETHTOOL_GCOALESCE`` n/a ``ETHTOOL_SCOALESCE`` n/a - ``ETHTOOL_GRINGPARAM`` n/a + ``ETHTOOL_GRINGPARAM`` ``ETHTOOL_MSG_RINGS_GET`` ``ETHTOOL_SRINGPARAM`` n/a ``ETHTOOL_GPAUSEPARAM`` n/a ``ETHTOOL_SPAUSEPARAM`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7f23a7f0fca1..7cd220f8cf73 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -28,6 +28,7 @@ enum { ETHTOOL_MSG_FEATURES_SET, ETHTOOL_MSG_PRIVFLAGS_GET, ETHTOOL_MSG_PRIVFLAGS_SET, + ETHTOOL_MSG_RINGS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -52,6 +53,7 @@ enum { ETHTOOL_MSG_FEATURES_NTF, ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, ETHTOOL_MSG_PRIVFLAGS_NTF, + ETHTOOL_MSG_RINGS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -264,6 +266,25 @@ enum { ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 }; +/* RINGS */ + +enum { + ETHTOOL_A_RINGS_UNSPEC, + ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_RINGS_RX_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ + ETHTOOL_A_RINGS_TX_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX, /* u32 */ + ETHTOOL_A_RINGS_RX_MINI, /* u32 */ + ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ + ETHTOOL_A_RINGS_TX, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_RINGS_CNT, + ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 58708bcc2968..b48b70ef4ed0 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -5,4 +5,4 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ - linkstate.o debug.o wol.o features.o privflags.o + linkstate.o debug.o wol.o features.o privflags.o rings.o diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index dec82b0b80bd..0dc25a490450 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -217,6 +217,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = ðnl_privflags_request_ops, + [ETHTOOL_MSG_RINGS_GET] = ðnl_rings_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -748,6 +749,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_privflags, }, + { + .cmd = ETHTOOL_MSG_RINGS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index ca789f587ef3..01761932ed15 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -339,6 +339,7 @@ extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; +extern const struct ethnl_request_ops ethnl_rings_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c new file mode 100644 index 000000000000..d3129d8a252d --- /dev/null +++ b/net/ethtool/rings.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct rings_req_info { + struct ethnl_req_info base; +}; + +struct rings_reply_data { + struct ethnl_reply_data base; + struct ethtool_ringparam ringparam; +}; + +#define RINGS_REPDATA(__reply_base) \ + container_of(__reply_base, struct rings_reply_data, base) + +static const struct nla_policy +rings_get_policy[ETHTOOL_A_RINGS_MAX + 1] = { + [ETHTOOL_A_RINGS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_RINGS_RX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX_MINI_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX_JUMBO_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_TX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_TX] = { .type = NLA_REJECT }, +}; + +static int rings_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct rings_reply_data *data = RINGS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + dev->ethtool_ops->get_ringparam(dev, &data->ringparam); + ethnl_ops_complete(dev); + + return 0; +} + +static int rings_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_RX */ + nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ + nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ + nla_total_size(sizeof(u32)); /* _RINGS_TX */ +} + +static int rings_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct rings_reply_data *data = RINGS_REPDATA(reply_base); + const struct ethtool_ringparam *ringparam = &data->ringparam; + + if ((ringparam->rx_max_pending && + (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, + ringparam->rx_max_pending) || + nla_put_u32(skb, ETHTOOL_A_RINGS_RX, + ringparam->rx_pending))) || + (ringparam->rx_mini_max_pending && + (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX, + ringparam->rx_mini_max_pending) || + nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI, + ringparam->rx_mini_pending))) || + (ringparam->rx_jumbo_max_pending && + (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX, + ringparam->rx_jumbo_max_pending) || + nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO, + ringparam->rx_jumbo_pending))) || + (ringparam->tx_max_pending && + (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, + ringparam->tx_max_pending) || + nla_put_u32(skb, ETHTOOL_A_RINGS_TX, + ringparam->tx_pending)))) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_rings_request_ops = { + .request_cmd = ETHTOOL_MSG_RINGS_GET, + .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, + .hdr_attr = ETHTOOL_A_RINGS_HEADER, + .max_attr = ETHTOOL_A_RINGS_MAX, + .req_info_size = sizeof(struct rings_req_info), + .reply_data_size = sizeof(struct rings_reply_data), + .request_policy = rings_get_policy, + + .prepare_data = rings_prepare_data, + .reply_size = rings_reply_size, + .fill_reply = rings_fill_reply, +}; -- cgit v1.2.3 From 2fc2929e807211a9535a6541f24b57fa1c469728 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:28 +0100 Subject: ethtool: set device ring sizes with RINGS_SET request Implement RINGS_SET netlink request to set ring sizes of a network device. These are traditionally set with ETHTOOL_SRINGPARAM ioctl request. Like the ioctl implementation, the generic ethtool code checks if supplied values do not exceed driver defined limits; if they do, first offending attribute is reported using extack. v2: - fix netdev reference leak in error path (found by Jakub Kicinsky) Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 23 ++++++- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/netlink.c | 5 ++ net/ethtool/netlink.h | 1 + net/ethtool/rings.c | 89 ++++++++++++++++++++++++++++ 5 files changed, 118 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 798c2f97d89b..ba31ae8f1feb 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -194,6 +194,7 @@ Userspace to kernel: ``ETHTOOL_MSG_PRIVFLAGS_GET`` get private flags ``ETHTOOL_MSG_PRIVFLAGS_SET`` set private flags ``ETHTOOL_MSG_RINGS_GET`` get ring sizes + ``ETHTOOL_MSG_RINGS_SET`` set ring sizes ===================================== ================================ Kernel to userspace: @@ -673,6 +674,26 @@ Kernel response contents: ==================================== ====== ========================== +RINGS_SET +========= + +Sets ring sizes like ``ETHTOOL_SRINGPARAM`` ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_RINGS_HEADER`` nested reply header + ``ETHTOOL_A_RINGS_RX`` u32 size of RX ring + ``ETHTOOL_A_RINGS_RX_MINI`` u32 size of RX mini ring + ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring + ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring + ==================================== ====== ========================== + +Kernel checks that requested ring sizes do not exceed limits reported by +driver. Driver may impose additional constraints and may not suspport all +attributes. + + Request translation =================== @@ -700,7 +721,7 @@ have their netlink replacement yet. ``ETHTOOL_GCOALESCE`` n/a ``ETHTOOL_SCOALESCE`` n/a ``ETHTOOL_GRINGPARAM`` ``ETHTOOL_MSG_RINGS_GET`` - ``ETHTOOL_SRINGPARAM`` n/a + ``ETHTOOL_SRINGPARAM`` ``ETHTOOL_MSG_RINGS_SET`` ``ETHTOOL_GPAUSEPARAM`` n/a ``ETHTOOL_SPAUSEPARAM`` n/a ``ETHTOOL_GRXCSUM`` ``ETHTOOL_MSG_FEATURES_GET`` diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7cd220f8cf73..ae71801b7aac 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -29,6 +29,7 @@ enum { ETHTOOL_MSG_PRIVFLAGS_GET, ETHTOOL_MSG_PRIVFLAGS_SET, ETHTOOL_MSG_RINGS_GET, + ETHTOOL_MSG_RINGS_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 0dc25a490450..6a1ac8897a7e 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -756,6 +756,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_RINGS_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_rings, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 01761932ed15..b30426d01890 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -347,5 +347,6 @@ int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index d3129d8a252d..93f428e9a6c2 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -106,3 +106,92 @@ const struct ethnl_request_ops ethnl_rings_request_ops = { .reply_size = rings_reply_size, .fill_reply = rings_fill_reply, }; + +/* RINGS_SET */ + +static const struct nla_policy +rings_set_policy[ETHTOOL_A_RINGS_MAX + 1] = { + [ETHTOOL_A_RINGS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_RINGS_RX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX_MINI_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX_JUMBO_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_TX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 }, + [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, + [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, + [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, +}; + +int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_RINGS_MAX + 1]; + struct ethtool_ringparam ringparam = {}; + struct ethnl_req_info req_info = {}; + const struct nlattr *err_attr; + const struct ethtool_ops *ops; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_RINGS_MAX, rings_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_RINGS_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_ringparam || !ops->set_ringparam) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ops->get_ringparam(dev, &ringparam); + + ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); + ethnl_update_u32(&ringparam.rx_mini_pending, + tb[ETHTOOL_A_RINGS_RX_MINI], &mod); + ethnl_update_u32(&ringparam.rx_jumbo_pending, + tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); + ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); + ret = 0; + if (!mod) + goto out_ops; + + /* ensure new ring parameters are within limits */ + if (ringparam.rx_pending > ringparam.rx_max_pending) + err_attr = tb[ETHTOOL_A_RINGS_RX]; + else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending) + err_attr = tb[ETHTOOL_A_RINGS_RX_MINI]; + else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending) + err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; + else if (ringparam.tx_pending > ringparam.tx_max_pending) + err_attr = tb[ETHTOOL_A_RINGS_TX]; + else + err_attr = NULL; + if (err_attr) { + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, err_attr, + "requested ring size exceeeds maximum"); + goto out_ops; + } + + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} -- cgit v1.2.3 From bc9d1c995ecb8cccaac8ead2ed570544c2c885eb Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:33 +0100 Subject: ethtool: add RINGS_NTF notification Send ETHTOOL_MSG_RINGS_NTF notification whenever ring sizes of a network device are modified using ETHTOOL_MSG_RINGS_SET netlink message or ETHTOOL_SRINGPARAM ioctl request. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 1 + include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/ioctl.c | 6 +++++- net/ethtool/netlink.c | 2 ++ net/ethtool/rings.c | 3 +++ 5 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ba31ae8f1feb..026a5fd4a08b 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -216,6 +216,7 @@ Kernel to userspace: ``ETHTOOL_MSG_PRIVFLAGS_GET_REPLY`` private flags ``ETHTOOL_MSG_PRIVFLAGS_NTF`` private flags ``ETHTOOL_MSG_RINGS_GET_REPLY`` ring sizes + ``ETHTOOL_MSG_RINGS_NTF`` ring sizes ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index ae71801b7aac..abfc8fd626da 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -55,6 +55,7 @@ enum { ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, ETHTOOL_MSG_PRIVFLAGS_NTF, ETHTOOL_MSG_RINGS_GET_REPLY, + ETHTOOL_MSG_RINGS_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 298822289496..1d5c1b6b81a4 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1635,6 +1635,7 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; + int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; @@ -1651,7 +1652,10 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) ringparam.tx_pending > max.tx_max_pending) return -EINVAL; - return dev->ethtool_ops->set_ringparam(dev, &ringparam); + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam); + if (!ret) + ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); + return ret; } static noinline_for_stack int ethtool_get_channels(struct net_device *dev, diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 6a1ac8897a7e..653e009216cd 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -532,6 +532,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_WOL_NTF] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_NTF] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ðnl_privflags_request_ops, + [ETHTOOL_MSG_RINGS_NTF] = ðnl_rings_request_ops, }; /* default notification handler */ @@ -619,6 +620,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 93f428e9a6c2..c2ebf72be217 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -186,6 +186,9 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) } ret = dev->ethtool_ops->set_ringparam(dev, &ringparam); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); out_ops: ethnl_ops_complete(dev); -- cgit v1.2.3 From 0c84979c951a85fcb5c77ac2480fd476e283a07c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:38 +0100 Subject: ethtool: provide channel counts with CHANNELS_GET request Implement CHANNELS_GET request to get channel counts of a network device. These are traditionally available via ETHTOOL_GCHANNELS ioctl request. Omit attributes for channel types which are not supported by driver or device (zero reported for maximum). v2: (all suggested by Jakub Kicinski) - minor cleanup in channels_prepare_data() - more descriptive channels_reply_size() - omit attributes with zero max count Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 30 +++++++- include/uapi/linux/ethtool_netlink.h | 21 ++++++ net/ethtool/Makefile | 3 +- net/ethtool/channels.c | 108 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 8 ++ net/ethtool/netlink.h | 1 + 6 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/channels.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 026a5fd4a08b..decbbddfd8be 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -195,6 +195,7 @@ Userspace to kernel: ``ETHTOOL_MSG_PRIVFLAGS_SET`` set private flags ``ETHTOOL_MSG_RINGS_GET`` get ring sizes ``ETHTOOL_MSG_RINGS_SET`` set ring sizes + ``ETHTOOL_MSG_CHANNELS_GET`` get channel counts ===================================== ================================ Kernel to userspace: @@ -217,6 +218,7 @@ Kernel to userspace: ``ETHTOOL_MSG_PRIVFLAGS_NTF`` private flags ``ETHTOOL_MSG_RINGS_GET_REPLY`` ring sizes ``ETHTOOL_MSG_RINGS_NTF`` ring sizes + ``ETHTOOL_MSG_CHANNELS_GET_REPLY`` channel counts ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -695,6 +697,32 @@ driver. Driver may impose additional constraints and may not suspport all attributes. +CHANNELS_GET +============ + +Gets channel counts like ``ETHTOOL_GCHANNELS`` ioctl request. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_CHANNELS_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_CHANNELS_HEADER`` nested reply header + ``ETHTOOL_A_CHANNELS_RX_MAX`` u32 max receive channels + ``ETHTOOL_A_CHANNELS_TX_MAX`` u32 max transmit channels + ``ETHTOOL_A_CHANNELS_OTHER_MAX`` u32 max other channels + ``ETHTOOL_A_CHANNELS_COMBINED_MAX`` u32 max combined channels + ``ETHTOOL_A_CHANNELS_RX_COUNT`` u32 receive channel count + ``ETHTOOL_A_CHANNELS_TX_COUNT`` u32 transmit channel count + ``ETHTOOL_A_CHANNELS_OTHER_COUNT`` u32 other channel count + ``ETHTOOL_A_CHANNELS_COMBINED_COUNT`` u32 combined channel count + ===================================== ====== ========================== + + Request translation =================== @@ -765,7 +793,7 @@ have their netlink replacement yet. ``ETHTOOL_SRXFHINDIR`` n/a ``ETHTOOL_GFEATURES`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SFEATURES`` ``ETHTOOL_MSG_FEATURES_SET`` - ``ETHTOOL_GCHANNELS`` n/a + ``ETHTOOL_GCHANNELS`` ``ETHTOOL_MSG_CHANNELS_GET`` ``ETHTOOL_SCHANNELS`` n/a ``ETHTOOL_SET_DUMP`` n/a ``ETHTOOL_GET_DUMP_FLAG`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index abfc8fd626da..2270eb115eca 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -30,6 +30,7 @@ enum { ETHTOOL_MSG_PRIVFLAGS_SET, ETHTOOL_MSG_RINGS_GET, ETHTOOL_MSG_RINGS_SET, + ETHTOOL_MSG_CHANNELS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -56,6 +57,7 @@ enum { ETHTOOL_MSG_PRIVFLAGS_NTF, ETHTOOL_MSG_RINGS_GET_REPLY, ETHTOOL_MSG_RINGS_NTF, + ETHTOOL_MSG_CHANNELS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -287,6 +289,25 @@ enum { ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) }; +/* CHANNELS */ + +enum { + ETHTOOL_A_CHANNELS_UNSPEC, + ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_CHANNELS_CNT, + ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index b48b70ef4ed0..b0bd3decad02 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -5,4 +5,5 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ - linkstate.o debug.o wol.o features.o privflags.o rings.o + linkstate.o debug.o wol.o features.o privflags.o rings.o \ + channels.o diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c new file mode 100644 index 000000000000..500dd5ad250b --- /dev/null +++ b/net/ethtool/channels.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct channels_req_info { + struct ethnl_req_info base; +}; + +struct channels_reply_data { + struct ethnl_reply_data base; + struct ethtool_channels channels; +}; + +#define CHANNELS_REPDATA(__reply_base) \ + container_of(__reply_base, struct channels_reply_data, base) + +static const struct nla_policy +channels_get_policy[ETHTOOL_A_CHANNELS_MAX + 1] = { + [ETHTOOL_A_CHANNELS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CHANNELS_RX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_TX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_OTHER_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_COMBINED_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_REJECT }, +}; + +static int channels_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + dev->ethtool_ops->get_channels(dev, &data->channels); + ethnl_ops_complete(dev); + + return 0; +} + +static int channels_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_MAX */ + nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_MAX */ + nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_MAX */ + nla_total_size(sizeof(u32)) + /* _CHANNELS_COMBINED_MAX */ + nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_COUNT */ + nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_COUNT */ + nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_COUNT */ + nla_total_size(sizeof(u32)); /* _CHANNELS_COMBINED_COUNT */ +} + +static int channels_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); + const struct ethtool_channels *channels = &data->channels; + + if ((channels->max_rx && + (nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_MAX, + channels->max_rx) || + nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_COUNT, + channels->rx_count))) || + (channels->max_tx && + (nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_MAX, + channels->max_tx) || + nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_COUNT, + channels->tx_count))) || + (channels->max_other && + (nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_MAX, + channels->max_other) || + nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_COUNT, + channels->other_count))) || + (channels->max_combined && + (nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_MAX, + channels->max_combined) || + nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_COUNT, + channels->combined_count)))) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_channels_request_ops = { + .request_cmd = ETHTOOL_MSG_CHANNELS_GET, + .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, + .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, + .max_attr = ETHTOOL_A_CHANNELS_MAX, + .req_info_size = sizeof(struct channels_req_info), + .reply_data_size = sizeof(struct channels_reply_data), + .request_policy = channels_get_policy, + + .prepare_data = channels_prepare_data, + .reply_size = channels_reply_size, + .fill_reply = channels_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 653e009216cd..ac35513abeed 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -218,6 +218,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_FEATURES_GET] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = ðnl_rings_request_ops, + [ETHTOOL_MSG_CHANNELS_GET] = ðnl_channels_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -763,6 +764,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_rings, }, + { + .cmd = ETHTOOL_MSG_CHANNELS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index b30426d01890..53bfe720ff1a 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -340,6 +340,7 @@ extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; extern const struct ethnl_request_ops ethnl_rings_request_ops; +extern const struct ethnl_request_ops ethnl_channels_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From e19c591eafad4d723a2eed385e253eca0506cc25 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:43 +0100 Subject: ethtool: set device channel counts with CHANNELS_SET request Implement CHANNELS_SET netlink request to set channel counts of a network device. These are traditionally set with ETHTOOL_SCHANNELS ioctl request. Like the ioctl implementation, the generic ethtool code checks if supplied values do not exceed driver defined limits; if they do, first offending attribute is reported using extack. Checks preventing removing channels used for RX indirection table or zerocopy AF_XDP socket are also implemented. Move ethtool_get_max_rxfh_channel() helper into common.c so that it can be used by both ioctl and netlink code. v2: - fix netdev reference leak in error path (found by Jakub Kicinsky) Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 23 +++++- include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/channels.c | 116 +++++++++++++++++++++++++++ net/ethtool/common.c | 31 +++++++ net/ethtool/common.h | 1 + net/ethtool/ioctl.c | 31 ------- net/ethtool/netlink.c | 5 ++ net/ethtool/netlink.h | 1 + 8 files changed, 177 insertions(+), 32 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index decbbddfd8be..7df7476cf310 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -196,6 +196,7 @@ Userspace to kernel: ``ETHTOOL_MSG_RINGS_GET`` get ring sizes ``ETHTOOL_MSG_RINGS_SET`` set ring sizes ``ETHTOOL_MSG_CHANNELS_GET`` get channel counts + ``ETHTOOL_MSG_CHANNELS_SET`` set channel counts ===================================== ================================ Kernel to userspace: @@ -723,6 +724,26 @@ Kernel response contents: ===================================== ====== ========================== +CHANNELS_SET +============ + +Sets channel counts like ``ETHTOOL_SCHANNELS`` ioctl request. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_CHANNELS_HEADER`` nested request header + ``ETHTOOL_A_CHANNELS_RX_COUNT`` u32 receive channel count + ``ETHTOOL_A_CHANNELS_TX_COUNT`` u32 transmit channel count + ``ETHTOOL_A_CHANNELS_OTHER_COUNT`` u32 other channel count + ``ETHTOOL_A_CHANNELS_COMBINED_COUNT`` u32 combined channel count + ===================================== ====== ========================== + +Kernel checks that requested channel counts do not exceed limits reported by +driver. Driver may impose additional constraints and may not suspport all +attributes. + + Request translation =================== @@ -794,7 +815,7 @@ have their netlink replacement yet. ``ETHTOOL_GFEATURES`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SFEATURES`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GCHANNELS`` ``ETHTOOL_MSG_CHANNELS_GET`` - ``ETHTOOL_SCHANNELS`` n/a + ``ETHTOOL_SCHANNELS`` ``ETHTOOL_MSG_CHANNELS_SET`` ``ETHTOOL_SET_DUMP`` n/a ``ETHTOOL_GET_DUMP_FLAG`` n/a ``ETHTOOL_GET_DUMP_DATA`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 2270eb115eca..f1384a8f3534 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -31,6 +31,7 @@ enum { ETHTOOL_MSG_RINGS_GET, ETHTOOL_MSG_RINGS_SET, ETHTOOL_MSG_CHANNELS_GET, + ETHTOOL_MSG_CHANNELS_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 500dd5ad250b..ee232c11acae 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include + #include "netlink.h" #include "common.h" @@ -106,3 +108,117 @@ const struct ethnl_request_ops ethnl_channels_request_ops = { .reply_size = channels_reply_size, .fill_reply = channels_fill_reply, }; + +/* CHANNELS_SET */ + +static const struct nla_policy +channels_set_policy[ETHTOOL_A_CHANNELS_MAX + 1] = { + [ETHTOOL_A_CHANNELS_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CHANNELS_RX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_TX_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_OTHER_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_COMBINED_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 }, + [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 }, + [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 }, + [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 }, +}; + +int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CHANNELS_MAX + 1]; + unsigned int from_channel, old_total, i; + struct ethtool_channels channels = {}; + struct ethnl_req_info req_info = {}; + const struct nlattr *err_attr; + const struct ethtool_ops *ops; + struct net_device *dev; + u32 max_rx_in_use = 0; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CHANNELS_MAX, channels_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CHANNELS_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_channels || !ops->set_channels) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ops->get_channels(dev, &channels); + old_total = channels.combined_count + + max(channels.rx_count, channels.tx_count); + + ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT], + &mod); + ethnl_update_u32(&channels.tx_count, tb[ETHTOOL_A_CHANNELS_TX_COUNT], + &mod); + ethnl_update_u32(&channels.other_count, + tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod); + ethnl_update_u32(&channels.combined_count, + tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod); + ret = 0; + if (!mod) + goto out_ops; + + /* ensure new channel counts are within limits */ + if (channels.rx_count > channels.max_rx) + err_attr = tb[ETHTOOL_A_CHANNELS_RX_COUNT]; + else if (channels.tx_count > channels.max_tx) + err_attr = tb[ETHTOOL_A_CHANNELS_TX_COUNT]; + else if (channels.other_count > channels.max_other) + err_attr = tb[ETHTOOL_A_CHANNELS_OTHER_COUNT]; + else if (channels.combined_count > channels.max_combined) + err_attr = tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT]; + else + err_attr = NULL; + if (err_attr) { + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, err_attr, + "requested channel count exceeeds maximum"); + goto out_ops; + } + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table settings + */ + if (netif_is_rxfh_configured(dev) && + !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && + (channels.combined_count + channels.rx_count) <= max_rx_in_use) { + GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); + return -EINVAL; + } + + /* Disabling channels, query zero-copy AF_XDP sockets */ + from_channel = channels.combined_count + + min(channels.rx_count, channels.tx_count); + for (i = from_channel; i < old_total; i++) + if (xdp_get_umem_from_qid(dev, i)) { + GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); + return -EINVAL; + } + + ret = dev->ethtool_ops->set_channels(dev, &channels); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 7b6969af5ae7..0b22741b2f8f 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -258,3 +258,34 @@ int __ethtool_get_link(struct net_device *dev) return netif_running(dev) && dev->ethtool_ops->get_link(dev); } + +int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) +{ + u32 dev_size, current_max = 0; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + if (ret) + goto out; + + while (dev_size--) + current_max = max(current_max, indir[dev_size]); + + *max = current_max; + +out: + kfree(indir); + return ret; +} diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 7dc1163800a7..03946e16e623 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -29,5 +29,6 @@ int __ethtool_get_link(struct net_device *dev); bool convert_legacy_settings_to_link_ksettings( struct ethtool_link_ksettings *link_ksettings, const struct ethtool_cmd *legacy_settings); +int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max); #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 1d5c1b6b81a4..06224a03139e 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -929,37 +929,6 @@ void netdev_rss_key_fill(void *buffer, size_t len) } EXPORT_SYMBOL(netdev_rss_key_fill); -static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) -{ - u32 dev_size, current_max = 0; - u32 *indir; - int ret; - - if (!dev->ethtool_ops->get_rxfh_indir_size || - !dev->ethtool_ops->get_rxfh) - return -EOPNOTSUPP; - dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); - if (dev_size == 0) - return -EOPNOTSUPP; - - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) - return -ENOMEM; - - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); - if (ret) - goto out; - - while (dev_size--) - current_max = max(current_max, indir[dev_size]); - - *max = current_max; - -out: - kfree(indir); - return ret; -} - static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index ac35513abeed..f61654b8f210 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -771,6 +771,11 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_CHANNELS_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_channels, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 53bfe720ff1a..45aad99a6021 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -349,5 +349,6 @@ int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 546379b9a01b6bdb5b267f8e7433944d5364cbf2 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 12 Mar 2020 21:08:48 +0100 Subject: ethtool: add CHANNELS_NTF notification Send ETHTOOL_MSG_CHANNELS_NTF notification whenever channel counts of a network device are modified using ETHTOOL_MSG_CHANNELS_SET netlink message or ETHTOOL_SCHANNELS ioctl request. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 1 + include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/channels.c | 3 +++ net/ethtool/ioctl.c | 6 +++++- net/ethtool/netlink.c | 2 ++ 5 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 7df7476cf310..31a601cafa3f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -220,6 +220,7 @@ Kernel to userspace: ``ETHTOOL_MSG_RINGS_GET_REPLY`` ring sizes ``ETHTOOL_MSG_RINGS_NTF`` ring sizes ``ETHTOOL_MSG_CHANNELS_GET_REPLY`` channel counts + ``ETHTOOL_MSG_CHANNELS_NTF`` channel counts ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index f1384a8f3534..c7c7a1a550af 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -59,6 +59,7 @@ enum { ETHTOOL_MSG_RINGS_GET_REPLY, ETHTOOL_MSG_RINGS_NTF, ETHTOOL_MSG_CHANNELS_GET_REPLY, + ETHTOOL_MSG_CHANNELS_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index ee232c11acae..8dc5485333a4 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -213,6 +213,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) } ret = dev->ethtool_ops->set_channels(dev, &channels); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); out_ops: ethnl_ops_complete(dev); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 06224a03139e..258840b19fb5 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1649,6 +1649,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, u16 from_channel, to_channel; u32 max_rx_in_use = 0; unsigned int i; + int ret; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; @@ -1680,7 +1681,10 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (xdp_get_umem_from_qid(dev, i)) return -EINVAL; - return dev->ethtool_ops->set_channels(dev, &channels); + ret = dev->ethtool_ops->set_channels(dev, &channels); + if (!ret) + ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); + return ret; } static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index f61654b8f210..55c8ce4019d9 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -534,6 +534,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_FEATURES_NTF] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_NTF] = ðnl_rings_request_ops, + [ETHTOOL_MSG_CHANNELS_NTF] = ðnl_channels_request_ops, }; /* default notification handler */ @@ -622,6 +623,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) -- cgit v1.2.3 From b4490c5c4e023f09b7d27c9a9d3e7ad7d09ea6bf Mon Sep 17 00:00:00 2001 From: Carlos Neira Date: Wed, 4 Mar 2020 17:41:56 -0300 Subject: bpf: Added new helper bpf_get_ns_current_pid_tgid New bpf helper bpf_get_ns_current_pid_tgid, This helper will return pid and tgid from current task which namespace matches dev_t and inode number provided, this will allows us to instrument a process inside a container. Signed-off-by: Carlos Neira Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200304204157.58695-3-cneirabustos@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 20 ++++++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 45 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 20 ++++++++++++++++++- 7 files changed, 88 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fd91b7c95ea..4ec835334a1f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1497,6 +1497,7 @@ extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; +extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 973a20d49749..0f9ca46e1978 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2149,6 +2149,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d8b7b110a1c5..01878db15eaf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "../../lib/kstrtox.h" @@ -499,3 +501,46 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg4_type = ARG_PTR_TO_LONG, }; #endif + +BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, + struct bpf_pidns_info *, nsdata, u32, size) +{ + struct task_struct *task = current; + struct pid_namespace *pidns; + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_pidns_info))) + goto clear; + + if (unlikely((u64)(dev_t)dev != dev)) + goto clear; + + if (unlikely(!task)) + goto clear; + + pidns = task_active_pid_ns(task); + if (unlikely(!pidns)) { + err = -ENOENT; + goto clear; + } + + if (!ns_match(&pidns->ns, (dev_t)dev, ino)) + goto clear; + + nsdata->pid = task_pid_nr_ns(task, pidns); + nsdata->tgid = task_tgid_nr_ns(task, pidns); + return 0; +clear: + memset((void *)nsdata, 0, (size_t) size); + return err; +} + +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { + .func = bpf_get_ns_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6a490d8ce9de..b5071c7e93ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -843,6 +843,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index cebed6fb5bbb..c1e2b5410faa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -435,6 +435,7 @@ class PrinterHelpers(Printer): 'struct bpf_fib_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From d831ee84bfc9173eecf30dbbc2553ae81b996c60 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 6 Mar 2020 08:59:23 +0000 Subject: bpf: Add bpf_xdp_output() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce new helper that reuses existing xdp perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct xdp_buff *' as a tracepoint argument. Signed-off-by: Eelco Chaudron Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158348514556.2239.11050972434793741444.stgit@xdp-tutorial --- include/uapi/linux/bpf.h | 26 ++++++++++- kernel/bpf/verifier.c | 4 +- kernel/trace/bpf_trace.c | 3 ++ net/core/filter.c | 16 ++++++- tools/include/uapi/linux/bpf.h | 26 ++++++++++- .../testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c | 53 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/test_xdp_bpf2bpf.c | 24 ++++++++++ 7 files changed, 148 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 15b239da775b..5d01c5c7e598 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2927,6 +2927,29 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * + * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3049,7 +3072,8 @@ union bpf_attr { FN(send_signal_thread), \ FN(jiffies64), \ FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), + FN(get_ns_current_pid_tgid), \ + FN(xdp_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 55d376c53f7d..745f3cfdf3b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3650,7 +3650,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && func_id != BPF_FUNC_skb_output && - func_id != BPF_FUNC_perf_event_read_value) + func_id != BPF_FUNC_perf_event_read_value && + func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -3740,6 +3741,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: + case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b5071c7e93ca..e619eedb5919 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1145,6 +1145,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { }; extern const struct bpf_func_proto bpf_skb_output_proto; +extern const struct bpf_func_proto bpf_xdp_output_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1220,6 +1221,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; + case BPF_FUNC_xdp_output: + return &bpf_xdp_output_proto; #endif default: return raw_tp_prog_func_proto(func_id, prog); diff --git a/net/core/filter.c b/net/core/filter.c index cd0a532db4e7..22219544410f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4061,7 +4061,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + if (unlikely(!xdp || + xdp_size > (unsigned long)(xdp->data_end - xdp->data))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp->data, @@ -4079,6 +4080,19 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static int bpf_xdp_output_btf_ids[5]; +const struct bpf_func_proto bpf_xdp_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_xdp_output_btf_ids, +}; + BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? sock_gen_cookie(skb->sk) : 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 15b239da775b..5d01c5c7e598 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2927,6 +2927,29 @@ union bpf_attr { * * **-ENOENT** if pidns does not exists for the current task. * + * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3049,7 +3072,8 @@ union bpf_attr { FN(send_signal_thread), \ FN(jiffies64), \ FN(read_branch_records), \ - FN(get_ns_current_pid_tgid), + FN(get_ns_current_pid_tgid), \ + FN(xdp_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index 4ba011031d4c..a0f688c37023 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -4,17 +4,51 @@ #include "test_xdp.skel.h" #include "test_xdp_bpf2bpf.skel.h" +struct meta { + int ifindex; + int pkt_len; +}; + +static void on_sample(void *ctx, int cpu, void *data, __u32 size) +{ + int duration = 0; + struct meta *meta = (struct meta *)data; + struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta); + + if (CHECK(size < sizeof(pkt_v4) + sizeof(*meta), + "check_size", "size %u < %zu\n", + size, sizeof(pkt_v4) + sizeof(*meta))) + return; + + if (CHECK(meta->ifindex != if_nametoindex("lo"), "check_meta_ifindex", + "meta->ifindex = %d\n", meta->ifindex)) + return; + + if (CHECK(meta->pkt_len != sizeof(pkt_v4), "check_meta_pkt_len", + "meta->pkt_len = %zd\n", sizeof(pkt_v4))) + return; + + if (CHECK(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)), + "check_packet_content", "content not the same\n")) + return; + + *(bool *)ctx = true; +} + void test_xdp_bpf2bpf(void) { __u32 duration = 0, retval, size; char buf[128]; int err, pkt_fd, map_fd; + bool passed = false; struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); struct iptnl_info value4 = {.family = AF_INET}; struct test_xdp *pkt_skel = NULL; struct test_xdp_bpf2bpf *ftrace_skel = NULL; struct vip key4 = {.protocol = 6, .family = AF_INET}; struct bpf_program *prog; + struct perf_buffer *pb = NULL; + struct perf_buffer_opts pb_opts = {}; /* Load XDP program to introspect */ pkt_skel = test_xdp__open_and_load(); @@ -50,6 +84,14 @@ void test_xdp_bpf2bpf(void) if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err)) goto out; + /* Set up perf buffer */ + pb_opts.sample_cb = on_sample; + pb_opts.ctx = &passed; + pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), + 1, &pb_opts); + if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + goto out; + /* Run test program */ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); @@ -60,6 +102,15 @@ void test_xdp_bpf2bpf(void) err, errno, retval, size)) goto out; + /* Make sure bpf_xdp_output() was triggered and it sent the expected + * data to the perf ring buffer. + */ + err = perf_buffer__poll(pb, 100); + if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) + goto out; + + CHECK_FAIL(!passed); + /* Verify test results */ if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"), "result", "fentry failed err %llu\n", @@ -70,6 +121,8 @@ void test_xdp_bpf2bpf(void) "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit); out: + if (pb) + perf_buffer__free(pb); test_xdp__destroy(pkt_skel); test_xdp_bpf2bpf__destroy(ftrace_skel); } diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c index 42dd2fedd588..a038e827f850 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c @@ -3,6 +3,8 @@ #include #include +char _license[] SEC("license") = "GPL"; + struct net_device { /* Structure does not need to contain all entries, * as "preserve_access_index" will use BTF to fix this... @@ -27,10 +29,32 @@ struct xdp_buff { struct xdp_rxq_info *rxq; } __attribute__((preserve_access_index)); +struct meta { + int ifindex; + int pkt_len; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} perf_buf_map SEC(".maps"); + __u64 test_result_fentry = 0; SEC("fentry/FUNC") int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) { + struct meta meta; + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + + meta.ifindex = xdp->rxq->dev->ifindex; + meta.pkt_len = data_end - data; + bpf_xdp_output(xdp, &perf_buf_map, + ((__u64) meta.pkt_len << 32) | + BPF_F_CURRENT_CPU, + &meta, sizeof(meta)); + test_result_fentry = xdp->rxq->dev->ifindex; return 0; } -- cgit v1.2.3 From 14bc175d9c885c86239de3d730eea85ad67bfe7b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 13 Mar 2020 01:10:56 +0200 Subject: net: sched: Allow extending set of supported RED flags The qdiscs RED, GRED, SFQ and CHOKE use different subsets of the same pool of global RED flags. These are passed in tc_red_qopt.flags. However none of these qdiscs validate the flag field, and just copy it over wholesale to internal structures, and later dump it back. (An exception is GRED, which does validate for VQs -- however not for the main setup.) A broken userspace can therefore configure a qdisc with arbitrary unsupported flags, and later expect to see the flags on qdisc dump. The current ABI therefore allows storage of several bits of custom data to qdisc instances of the types mentioned above. How many bits, depends on which flags are meaningful for the qdisc in question. E.g. SFQ recognizes flags ECN and HARDDROP, and the rest is not interpreted. If SFQ ever needs to support ADAPTATIVE, it needs another way of doing it, and at the same time it needs to retain the possibility to store 6 bits of uninterpreted data. Likewise RED, which adds a new flag later in this patchset. To that end, this patch adds a new function, red_get_flags(), to split the passed flags of RED-like qdiscs to flags and user bits, and red_validate_flags() to validate the resulting configuration. It further adds a new attribute, TCA_RED_FLAGS, to pass arbitrary flags. Signed-off-by: Petr Machata Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/red.h | 33 ++++++++++++++++++++++++++++++++ include/uapi/linux/pkt_sched.h | 16 ++++++++++++++++ net/sched/sch_red.c | 43 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 89 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/red.h b/include/net/red.h index 9665582c4687..6a2aaa6c7c41 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -179,6 +179,39 @@ static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog) return true; } +static inline int red_get_flags(unsigned char qopt_flags, + unsigned char historic_mask, + struct nlattr *flags_attr, + unsigned char supported_mask, + struct nla_bitfield32 *p_flags, + unsigned char *p_userbits, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 flags; + + if (qopt_flags && flags_attr) { + NL_SET_ERR_MSG_MOD(extack, "flags should be passed either through qopt, or through a dedicated attribute"); + return -EINVAL; + } + + if (flags_attr) { + flags = nla_get_bitfield32(flags_attr); + } else { + flags.selector = historic_mask; + flags.value = qopt_flags & historic_mask; + } + + *p_flags = flags; + *p_userbits = qopt_flags & ~historic_mask; + return 0; +} + +static inline int red_validate_flags(unsigned char flags, + struct netlink_ext_ack *extack) +{ + return 0; +} + static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, u8 Scell_log, u8 *stab, u32 max_P) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index bbe791b24168..6325507935ea 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -256,6 +256,7 @@ enum { TCA_RED_PARMS, TCA_RED_STAB, TCA_RED_MAX_P, + TCA_RED_FLAGS, /* bitfield32 */ __TCA_RED_MAX, }; @@ -268,12 +269,27 @@ struct tc_red_qopt { unsigned char Wlog; /* log(W) */ unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ unsigned char Scell_log; /* cell size for idle damping */ + + /* This field can be used for flags that a RED-like qdisc has + * historically supported. E.g. when configuring RED, it can be used for + * ECN, HARDDROP and ADAPTATIVE. For SFQ it can be used for ECN, + * HARDDROP. Etc. Because this field has not been validated, and is + * copied back on dump, any bits besides those to which a given qdisc + * has assigned a historical meaning need to be considered for free use + * by userspace tools. + * + * Any further flags need to be passed differently, e.g. through an + * attribute (such as TCA_RED_FLAGS above). Such attribute should allow + * passing both recent and historic flags in one value. + */ unsigned char flags; #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 }; +#define TC_RED_HISTORIC_FLAGS (TC_RED_ECN | TC_RED_HARDDROP | TC_RED_ADAPTATIVE) + struct tc_red_xstats { __u32 early; /* Early drops */ __u32 pdrop; /* Drops due to queue limits */ diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 1695421333e3..d4ce111704dc 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -35,7 +35,11 @@ struct red_sched_data { u32 limit; /* HARD maximal queue length */ + unsigned char flags; + /* Non-flags in tc_red_qopt.flags. */ + unsigned char userbits; + struct timer_list adapt_timer; struct Qdisc *sch; struct red_parms parms; @@ -44,6 +48,8 @@ struct red_sched_data { struct Qdisc *qdisc; }; +static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS; + static inline int red_use_ecn(struct red_sched_data *q) { return q->flags & TC_RED_ECN; @@ -183,9 +189,12 @@ static void red_destroy(struct Qdisc *sch) } static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { + [TCA_RED_UNSPEC] = { .strict_start_type = TCA_RED_FLAGS }, [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, + [TCA_RED_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &red_supported_flags }, }; static int red_change(struct Qdisc *sch, struct nlattr *opt, @@ -194,7 +203,10 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *old_child = NULL, *child = NULL; struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; + struct nla_bitfield32 flags_bf; struct tc_red_qopt *ctl; + unsigned char userbits; + unsigned char flags; int err; u32 max_P; @@ -216,6 +228,12 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) return -EINVAL; + err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, + tb[TCA_RED_FLAGS], red_supported_flags, + &flags_bf, &userbits, extack); + if (err) + return err; + if (ctl->limit > 0) { child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit, extack); @@ -227,7 +245,14 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, } sch_tree_lock(sch); - q->flags = ctl->flags; + + flags = (q->flags & ~flags_bf.selector) | flags_bf.value; + err = red_validate_flags(flags, extack); + if (err) + goto unlock_out; + + q->flags = flags; + q->userbits = userbits; q->limit = ctl->limit; if (child) { qdisc_tree_flush_backlog(q->qdisc); @@ -256,6 +281,12 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (old_child) qdisc_put(old_child); return 0; + +unlock_out: + sch_tree_unlock(sch); + if (child) + qdisc_put(child); + return err; } static inline void red_adaptative_timer(struct timer_list *t) @@ -299,10 +330,15 @@ static int red_dump_offload_stats(struct Qdisc *sch) static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); + struct nla_bitfield32 flags_bf = { + .selector = red_supported_flags, + .value = q->flags, + }; struct nlattr *opts = NULL; struct tc_red_qopt opt = { .limit = q->limit, - .flags = q->flags, + .flags = (q->flags & TC_RED_HISTORIC_FLAGS) | + q->userbits, .qth_min = q->parms.qth_min >> q->parms.Wlog, .qth_max = q->parms.qth_max >> q->parms.Wlog, .Wlog = q->parms.Wlog, @@ -319,7 +355,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || - nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P)) + nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || + nla_put(skb, TCA_RED_FLAGS, sizeof(flags_bf), &flags_bf)) goto nla_put_failure; return nla_nest_end(skb, opts); -- cgit v1.2.3 From 0a7fad2376ba6b37c6b1a1072ed2a2381d82cd18 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 13 Mar 2020 01:10:57 +0200 Subject: net: sched: RED: Introduce an ECN nodrop mode When the RED Qdisc is currently configured to enable ECN, the RED algorithm is used to decide whether a certain SKB should be marked. If that SKB is not ECN-capable, it is early-dropped. It is also possible to keep all traffic in the queue, and just mark the ECN-capable subset of it, as appropriate under the RED algorithm. Some switches support this mode, and some installations make use of it. To that end, add a new RED flag, TC_RED_NODROP. When the Qdisc is configured with this flag, non-ECT traffic is enqueued instead of being early-dropped. Signed-off-by: Petr Machata Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + include/net/red.h | 5 +++++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_red.c | 31 +++++++++++++++++++++++++------ 4 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index dbc89452f90b..1db8b27d4515 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -740,6 +740,7 @@ struct tc_red_qopt_offload_params { u32 limit; bool is_ecn; bool is_harddrop; + bool is_nodrop; struct gnet_stats_queue *qstats; }; diff --git a/include/net/red.h b/include/net/red.h index 6a2aaa6c7c41..fc455445f4b2 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -209,6 +209,11 @@ static inline int red_get_flags(unsigned char qopt_flags, static inline int red_validate_flags(unsigned char flags, struct netlink_ext_ack *extack) { + if ((flags & TC_RED_NODROP) && !(flags & TC_RED_ECN)) { + NL_SET_ERR_MSG_MOD(extack, "nodrop mode is only meaningful with ECN"); + return -EINVAL; + } + return 0; } diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 6325507935ea..ea39287d59c8 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -286,6 +286,7 @@ struct tc_red_qopt { #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 +#define TC_RED_NODROP 8 }; #define TC_RED_HISTORIC_FLAGS (TC_RED_ECN | TC_RED_HARDDROP | TC_RED_ADAPTATIVE) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index d4ce111704dc..3ef0a4f7399b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -48,7 +48,7 @@ struct red_sched_data { struct Qdisc *qdisc; }; -static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS; +static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP; static inline int red_use_ecn(struct red_sched_data *q) { @@ -60,6 +60,11 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } +static int red_use_nodrop(struct red_sched_data *q) +{ + return q->flags & TC_RED_NODROP; +} + static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -80,23 +85,36 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_PROB_MARK: qdisc_qstats_overlimit(sch); - if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + if (!red_use_ecn(q)) { q->stats.prob_drop++; goto congestion_drop; } - q->stats.prob_mark++; + if (INET_ECN_set_ce(skb)) { + q->stats.prob_mark++; + } else if (!red_use_nodrop(q)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + /* Non-ECT packet in ECN nodrop mode: queue it. */ break; case RED_HARD_MARK: qdisc_qstats_overlimit(sch); - if (red_use_harddrop(q) || !red_use_ecn(q) || - !INET_ECN_set_ce(skb)) { + if (red_use_harddrop(q) || !red_use_ecn(q)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + if (INET_ECN_set_ce(skb)) { + q->stats.forced_mark++; + } else if (!red_use_nodrop(q)) { q->stats.forced_drop++; goto congestion_drop; } - q->stats.forced_mark++; + /* Non-ECT packet in ECN nodrop mode: queue it. */ break; } @@ -171,6 +189,7 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.set.limit = q->limit; opt.set.is_ecn = red_use_ecn(q); opt.set.is_harddrop = red_use_harddrop(q); + opt.set.is_nodrop = red_use_nodrop(q); opt.set.qstats = &sch->qstats; } else { opt.command = TC_RED_DESTROY; -- cgit v1.2.3 From f2c2e717642c66f7fe7e5dd69b2e8ff5849f4d10 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 24 Feb 2020 17:13:03 +0100 Subject: usb: gadget: add raw-gadget interface USB Raw Gadget is a kernel module that provides a userspace interface for the USB Gadget subsystem. Essentially it allows to emulate USB devices from userspace. Enabled with CONFIG_USB_RAW_GADGET. Raw Gadget is currently a strictly debugging feature and shouldn't be used in production. Raw Gadget is similar to GadgetFS, but provides a more low-level and direct access to the USB Gadget layer for the userspace. The key differences are: 1. Every USB request is passed to the userspace to get a response, while GadgetFS responds to some USB requests internally based on the provided descriptors. However note, that the UDC driver might respond to some requests on its own and never forward them to the Gadget layer. 2. GadgetFS performs some sanity checks on the provided USB descriptors, while Raw Gadget allows you to provide arbitrary data as responses to USB requests. 3. Raw Gadget provides a way to select a UDC device/driver to bind to, while GadgetFS currently binds to the first available UDC. 4. Raw Gadget uses predictable endpoint names (handles) across different UDCs (as long as UDCs have enough endpoints of each required transfer type). 5. Raw Gadget has ioctl-based interface instead of a filesystem-based one. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Andrey Konovalov Signed-off-by: Felipe Balbi --- Documentation/usb/index.rst | 1 + Documentation/usb/raw-gadget.rst | 61 ++ drivers/usb/gadget/legacy/Kconfig | 11 + drivers/usb/gadget/legacy/Makefile | 1 + drivers/usb/gadget/legacy/raw_gadget.c | 1078 ++++++++++++++++++++++++++++++++ include/uapi/linux/usb/raw_gadget.h | 167 +++++ 6 files changed, 1319 insertions(+) create mode 100644 Documentation/usb/raw-gadget.rst create mode 100644 drivers/usb/gadget/legacy/raw_gadget.c create mode 100644 include/uapi/linux/usb/raw_gadget.h (limited to 'include/uapi/linux') diff --git a/Documentation/usb/index.rst b/Documentation/usb/index.rst index 36b6ebd9a9d9..b656c9be23ed 100644 --- a/Documentation/usb/index.rst +++ b/Documentation/usb/index.rst @@ -22,6 +22,7 @@ USB support misc_usbsevseg mtouchusb ohci + raw-gadget usbip_protocol usbmon usb-serial diff --git a/Documentation/usb/raw-gadget.rst b/Documentation/usb/raw-gadget.rst new file mode 100644 index 000000000000..9e78cb858f86 --- /dev/null +++ b/Documentation/usb/raw-gadget.rst @@ -0,0 +1,61 @@ +============== +USB Raw Gadget +============== + +USB Raw Gadget is a kernel module that provides a userspace interface for +the USB Gadget subsystem. Essentially it allows to emulate USB devices +from userspace. Enabled with CONFIG_USB_RAW_GADGET. Raw Gadget is +currently a strictly debugging feature and shouldn't be used in +production, use GadgetFS instead. + +Comparison to GadgetFS +~~~~~~~~~~~~~~~~~~~~~~ + +Raw Gadget is similar to GadgetFS, but provides a more low-level and +direct access to the USB Gadget layer for the userspace. The key +differences are: + +1. Every USB request is passed to the userspace to get a response, while + GadgetFS responds to some USB requests internally based on the provided + descriptors. However note, that the UDC driver might respond to some + requests on its own and never forward them to the Gadget layer. + +2. GadgetFS performs some sanity checks on the provided USB descriptors, + while Raw Gadget allows you to provide arbitrary data as responses to + USB requests. + +3. Raw Gadget provides a way to select a UDC device/driver to bind to, + while GadgetFS currently binds to the first available UDC. + +4. Raw Gadget uses predictable endpoint names (handles) across different + UDCs (as long as UDCs have enough endpoints of each required transfer + type). + +5. Raw Gadget has ioctl-based interface instead of a filesystem-based one. + +Userspace interface +~~~~~~~~~~~~~~~~~~~ + +To create a Raw Gadget instance open /dev/raw-gadget. Multiple raw-gadget +instances (bound to different UDCs) can be used at the same time. The +interaction with the opened file happens through the ioctl() calls, see +comments in include/uapi/linux/usb/raw_gadget.h for details. + +The typical usage of Raw Gadget looks like: + +1. Open Raw Gadget instance via /dev/raw-gadget. +2. Initialize the instance via USB_RAW_IOCTL_INIT. +3. Launch the instance with USB_RAW_IOCTL_RUN. +4. In a loop issue USB_RAW_IOCTL_EVENT_FETCH calls to receive events from + Raw Gadget and react to those depending on what kind of USB device + needs to be emulated. + +Potential future improvements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Implement ioctl's for setting/clearing halt status on endpoints. + +- Reporting more events (suspend, resume, etc.) through + USB_RAW_IOCTL_EVENT_FETCH. + +- Support O_NONBLOCK I/O. diff --git a/drivers/usb/gadget/legacy/Kconfig b/drivers/usb/gadget/legacy/Kconfig index 6e7e1a9202e6..a7064e21d9f2 100644 --- a/drivers/usb/gadget/legacy/Kconfig +++ b/drivers/usb/gadget/legacy/Kconfig @@ -516,4 +516,15 @@ config USB_G_WEBCAM Say "y" to link the driver statically, or "m" to build a dynamically linked module called "g_webcam". +config USB_RAW_GADGET + tristate "USB Raw Gadget" + help + USB Raw Gadget is a kernel module that provides a userspace interface + for the USB Gadget subsystem. Essentially it allows to emulate USB + devices from userspace. See Documentation/usb/raw-gadget.rst for + details. + + Say "y" to link the driver statically, or "m" to build a + dynamically linked module called "raw_gadget". + endchoice diff --git a/drivers/usb/gadget/legacy/Makefile b/drivers/usb/gadget/legacy/Makefile index abd0c3e66a05..4d864bf82799 100644 --- a/drivers/usb/gadget/legacy/Makefile +++ b/drivers/usb/gadget/legacy/Makefile @@ -43,3 +43,4 @@ obj-$(CONFIG_USB_G_WEBCAM) += g_webcam.o obj-$(CONFIG_USB_G_NCM) += g_ncm.o obj-$(CONFIG_USB_G_ACM_MS) += g_acm_ms.o obj-$(CONFIG_USB_GADGET_TARGET) += tcm_usb_gadget.o +obj-$(CONFIG_USB_RAW_GADGET) += raw_gadget.o diff --git a/drivers/usb/gadget/legacy/raw_gadget.c b/drivers/usb/gadget/legacy/raw_gadget.c new file mode 100644 index 000000000000..76406343fbe5 --- /dev/null +++ b/drivers/usb/gadget/legacy/raw_gadget.c @@ -0,0 +1,1078 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * USB Raw Gadget driver. + * See Documentation/usb/raw-gadget.rst for more details. + * + * Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define DRIVER_DESC "USB Raw Gadget" +#define DRIVER_NAME "raw-gadget" + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_AUTHOR("Andrey Konovalov"); +MODULE_LICENSE("GPL"); + +/*----------------------------------------------------------------------*/ + +#define RAW_EVENT_QUEUE_SIZE 16 + +struct raw_event_queue { + /* See the comment in raw_event_queue_fetch() for locking details. */ + spinlock_t lock; + struct semaphore sema; + struct usb_raw_event *events[RAW_EVENT_QUEUE_SIZE]; + int size; +}; + +static void raw_event_queue_init(struct raw_event_queue *queue) +{ + spin_lock_init(&queue->lock); + sema_init(&queue->sema, 0); + queue->size = 0; +} + +static int raw_event_queue_add(struct raw_event_queue *queue, + enum usb_raw_event_type type, size_t length, const void *data) +{ + unsigned long flags; + struct usb_raw_event *event; + + spin_lock_irqsave(&queue->lock, flags); + if (WARN_ON(queue->size >= RAW_EVENT_QUEUE_SIZE)) { + spin_unlock_irqrestore(&queue->lock, flags); + return -ENOMEM; + } + event = kmalloc(sizeof(*event) + length, GFP_ATOMIC); + if (!event) { + spin_unlock_irqrestore(&queue->lock, flags); + return -ENOMEM; + } + event->type = type; + event->length = length; + if (event->length) + memcpy(&event->data[0], data, length); + queue->events[queue->size] = event; + queue->size++; + up(&queue->sema); + spin_unlock_irqrestore(&queue->lock, flags); + return 0; +} + +static struct usb_raw_event *raw_event_queue_fetch( + struct raw_event_queue *queue) +{ + unsigned long flags; + struct usb_raw_event *event; + + /* + * This function can be called concurrently. We first check that + * there's at least one event queued by decrementing the semaphore, + * and then take the lock to protect queue struct fields. + */ + if (down_interruptible(&queue->sema)) + return NULL; + spin_lock_irqsave(&queue->lock, flags); + if (WARN_ON(!queue->size)) + return NULL; + event = queue->events[0]; + queue->size--; + memmove(&queue->events[0], &queue->events[1], + queue->size * sizeof(queue->events[0])); + spin_unlock_irqrestore(&queue->lock, flags); + return event; +} + +static void raw_event_queue_destroy(struct raw_event_queue *queue) +{ + int i; + + for (i = 0; i < queue->size; i++) + kfree(queue->events[i]); + queue->size = 0; +} + +/*----------------------------------------------------------------------*/ + +struct raw_dev; + +#define USB_RAW_MAX_ENDPOINTS 32 + +enum ep_state { + STATE_EP_DISABLED, + STATE_EP_ENABLED, +}; + +struct raw_ep { + struct raw_dev *dev; + enum ep_state state; + struct usb_ep *ep; + struct usb_request *req; + bool urb_queued; + bool disabling; + ssize_t status; +}; + +enum dev_state { + STATE_DEV_INVALID = 0, + STATE_DEV_OPENED, + STATE_DEV_INITIALIZED, + STATE_DEV_RUNNING, + STATE_DEV_CLOSED, + STATE_DEV_FAILED +}; + +struct raw_dev { + struct kref count; + spinlock_t lock; + + const char *udc_name; + struct usb_gadget_driver driver; + + /* Reference to misc device: */ + struct device *dev; + + /* Protected by lock: */ + enum dev_state state; + bool gadget_registered; + struct usb_gadget *gadget; + struct usb_request *req; + bool ep0_in_pending; + bool ep0_out_pending; + bool ep0_urb_queued; + ssize_t ep0_status; + struct raw_ep eps[USB_RAW_MAX_ENDPOINTS]; + + struct completion ep0_done; + struct raw_event_queue queue; +}; + +static struct raw_dev *dev_new(void) +{ + struct raw_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + /* Matches kref_put() in raw_release(). */ + kref_init(&dev->count); + spin_lock_init(&dev->lock); + init_completion(&dev->ep0_done); + raw_event_queue_init(&dev->queue); + return dev; +} + +static void dev_free(struct kref *kref) +{ + struct raw_dev *dev = container_of(kref, struct raw_dev, count); + int i; + + kfree(dev->udc_name); + kfree(dev->driver.udc_name); + if (dev->req) { + if (dev->ep0_urb_queued) + usb_ep_dequeue(dev->gadget->ep0, dev->req); + usb_ep_free_request(dev->gadget->ep0, dev->req); + } + raw_event_queue_destroy(&dev->queue); + for (i = 0; i < USB_RAW_MAX_ENDPOINTS; i++) { + if (dev->eps[i].state != STATE_EP_ENABLED) + continue; + usb_ep_disable(dev->eps[i].ep); + usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); + kfree(dev->eps[i].ep->desc); + dev->eps[i].state = STATE_EP_DISABLED; + } + kfree(dev); +} + +/*----------------------------------------------------------------------*/ + +static int raw_queue_event(struct raw_dev *dev, + enum usb_raw_event_type type, size_t length, const void *data) +{ + int ret = 0; + unsigned long flags; + + ret = raw_event_queue_add(&dev->queue, type, length, data); + if (ret < 0) { + spin_lock_irqsave(&dev->lock, flags); + dev->state = STATE_DEV_FAILED; + spin_unlock_irqrestore(&dev->lock, flags); + } + return ret; +} + +static void gadget_ep0_complete(struct usb_ep *ep, struct usb_request *req) +{ + struct raw_dev *dev = req->context; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + if (req->status) + dev->ep0_status = req->status; + else + dev->ep0_status = req->actual; + if (dev->ep0_in_pending) + dev->ep0_in_pending = false; + else + dev->ep0_out_pending = false; + spin_unlock_irqrestore(&dev->lock, flags); + + complete(&dev->ep0_done); +} + +static int gadget_bind(struct usb_gadget *gadget, + struct usb_gadget_driver *driver) +{ + int ret = 0; + struct raw_dev *dev = container_of(driver, struct raw_dev, driver); + struct usb_request *req; + unsigned long flags; + + if (strcmp(gadget->name, dev->udc_name) != 0) + return -ENODEV; + + set_gadget_data(gadget, dev); + req = usb_ep_alloc_request(gadget->ep0, GFP_KERNEL); + if (!req) { + dev_err(&gadget->dev, "usb_ep_alloc_request failed\n"); + set_gadget_data(gadget, NULL); + return -ENOMEM; + } + + spin_lock_irqsave(&dev->lock, flags); + dev->req = req; + dev->req->context = dev; + dev->req->complete = gadget_ep0_complete; + dev->gadget = gadget; + spin_unlock_irqrestore(&dev->lock, flags); + + /* Matches kref_put() in gadget_unbind(). */ + kref_get(&dev->count); + + ret = raw_queue_event(dev, USB_RAW_EVENT_CONNECT, 0, NULL); + if (ret < 0) + dev_err(&gadget->dev, "failed to queue event\n"); + + return ret; +} + +static void gadget_unbind(struct usb_gadget *gadget) +{ + struct raw_dev *dev = get_gadget_data(gadget); + + set_gadget_data(gadget, NULL); + /* Matches kref_get() in gadget_bind(). */ + kref_put(&dev->count, dev_free); +} + +static int gadget_setup(struct usb_gadget *gadget, + const struct usb_ctrlrequest *ctrl) +{ + int ret = 0; + struct raw_dev *dev = get_gadget_data(gadget); + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_err(&gadget->dev, "ignoring, device is not running\n"); + ret = -ENODEV; + goto out_unlock; + } + if (dev->ep0_in_pending || dev->ep0_out_pending) { + dev_dbg(&gadget->dev, "stalling, request already pending\n"); + ret = -EBUSY; + goto out_unlock; + } + if ((ctrl->bRequestType & USB_DIR_IN) && ctrl->wLength) + dev->ep0_in_pending = true; + else + dev->ep0_out_pending = true; + spin_unlock_irqrestore(&dev->lock, flags); + + ret = raw_queue_event(dev, USB_RAW_EVENT_CONTROL, sizeof(*ctrl), ctrl); + if (ret < 0) + dev_err(&gadget->dev, "failed to queue event\n"); + goto out; + +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); +out: + return ret; +} + +/* These are currently unused but present in case UDC driver requires them. */ +static void gadget_disconnect(struct usb_gadget *gadget) { } +static void gadget_suspend(struct usb_gadget *gadget) { } +static void gadget_resume(struct usb_gadget *gadget) { } +static void gadget_reset(struct usb_gadget *gadget) { } + +/*----------------------------------------------------------------------*/ + +static struct miscdevice raw_misc_device; + +static int raw_open(struct inode *inode, struct file *fd) +{ + struct raw_dev *dev; + + /* Nonblocking I/O is not supported yet. */ + if (fd->f_flags & O_NONBLOCK) + return -EINVAL; + + dev = dev_new(); + if (!dev) + return -ENOMEM; + fd->private_data = dev; + dev->state = STATE_DEV_OPENED; + dev->dev = raw_misc_device.this_device; + return 0; +} + +static int raw_release(struct inode *inode, struct file *fd) +{ + int ret = 0; + struct raw_dev *dev = fd->private_data; + unsigned long flags; + bool unregister = false; + + spin_lock_irqsave(&dev->lock, flags); + dev->state = STATE_DEV_CLOSED; + if (!dev->gadget) { + spin_unlock_irqrestore(&dev->lock, flags); + goto out_put; + } + if (dev->gadget_registered) + unregister = true; + dev->gadget_registered = false; + spin_unlock_irqrestore(&dev->lock, flags); + + if (unregister) { + ret = usb_gadget_unregister_driver(&dev->driver); + if (ret != 0) + dev_err(dev->dev, + "usb_gadget_unregister_driver() failed with %d\n", + ret); + /* Matches kref_get() in raw_ioctl_run(). */ + kref_put(&dev->count, dev_free); + } + +out_put: + /* Matches dev_new() in raw_open(). */ + kref_put(&dev->count, dev_free); + return ret; +} + +/*----------------------------------------------------------------------*/ + +static int raw_ioctl_init(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + struct usb_raw_init arg; + char *udc_driver_name; + char *udc_device_name; + unsigned long flags; + + ret = copy_from_user(&arg, (void __user *)value, sizeof(arg)); + if (ret) + return ret; + + switch (arg.speed) { + case USB_SPEED_UNKNOWN: + arg.speed = USB_SPEED_HIGH; + break; + case USB_SPEED_LOW: + case USB_SPEED_FULL: + case USB_SPEED_HIGH: + case USB_SPEED_SUPER: + break; + default: + return -EINVAL; + } + + udc_driver_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); + if (!udc_driver_name) + return -ENOMEM; + ret = strscpy(udc_driver_name, &arg.driver_name[0], + UDC_NAME_LENGTH_MAX); + if (ret < 0) { + kfree(udc_driver_name); + return ret; + } + ret = 0; + + udc_device_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); + if (!udc_device_name) { + kfree(udc_driver_name); + return -ENOMEM; + } + ret = strscpy(udc_device_name, &arg.device_name[0], + UDC_NAME_LENGTH_MAX); + if (ret < 0) { + kfree(udc_driver_name); + kfree(udc_device_name); + return ret; + } + ret = 0; + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_OPENED) { + dev_dbg(dev->dev, "fail, device is not opened\n"); + kfree(udc_driver_name); + kfree(udc_device_name); + ret = -EINVAL; + goto out_unlock; + } + dev->udc_name = udc_driver_name; + + dev->driver.function = DRIVER_DESC; + dev->driver.max_speed = arg.speed; + dev->driver.setup = gadget_setup; + dev->driver.disconnect = gadget_disconnect; + dev->driver.bind = gadget_bind; + dev->driver.unbind = gadget_unbind; + dev->driver.suspend = gadget_suspend; + dev->driver.resume = gadget_resume; + dev->driver.reset = gadget_reset; + dev->driver.driver.name = DRIVER_NAME; + dev->driver.udc_name = udc_device_name; + dev->driver.match_existing_only = 1; + + dev->state = STATE_DEV_INITIALIZED; + +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static int raw_ioctl_run(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + unsigned long flags; + + if (value) + return -EINVAL; + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_INITIALIZED) { + dev_dbg(dev->dev, "fail, device is not initialized\n"); + ret = -EINVAL; + goto out_unlock; + } + spin_unlock_irqrestore(&dev->lock, flags); + + ret = usb_gadget_probe_driver(&dev->driver); + + spin_lock_irqsave(&dev->lock, flags); + if (ret) { + dev_err(dev->dev, + "fail, usb_gadget_probe_driver returned %d\n", ret); + dev->state = STATE_DEV_FAILED; + goto out_unlock; + } + dev->gadget_registered = true; + dev->state = STATE_DEV_RUNNING; + /* Matches kref_put() in raw_release(). */ + kref_get(&dev->count); + +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static int raw_ioctl_event_fetch(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + struct usb_raw_event arg; + unsigned long flags; + struct usb_raw_event *event; + uint32_t length; + + ret = copy_from_user(&arg, (void __user *)value, sizeof(arg)); + if (ret) + return ret; + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_dbg(dev->dev, "fail, device is not running\n"); + spin_unlock_irqrestore(&dev->lock, flags); + return -EINVAL; + } + if (!dev->gadget) { + dev_dbg(dev->dev, "fail, gadget is not bound\n"); + spin_unlock_irqrestore(&dev->lock, flags); + return -EBUSY; + } + spin_unlock_irqrestore(&dev->lock, flags); + + event = raw_event_queue_fetch(&dev->queue); + if (!event) { + dev_dbg(&dev->gadget->dev, "event fetching interrupted\n"); + return -EINTR; + } + length = min(arg.length, event->length); + ret = copy_to_user((void __user *)value, event, + sizeof(*event) + length); + return ret; +} + +static void *raw_alloc_io_data(struct usb_raw_ep_io *io, void __user *ptr, + bool get_from_user) +{ + int ret; + void *data; + + ret = copy_from_user(io, ptr, sizeof(*io)); + if (ret) + return ERR_PTR(ret); + if (io->ep >= USB_RAW_MAX_ENDPOINTS) + return ERR_PTR(-EINVAL); + if (!usb_raw_io_flags_valid(io->flags)) + return ERR_PTR(-EINVAL); + if (io->length > PAGE_SIZE) + return ERR_PTR(-EINVAL); + if (get_from_user) + data = memdup_user(ptr + sizeof(*io), io->length); + else { + data = kmalloc(io->length, GFP_KERNEL); + if (!data) + data = ERR_PTR(-ENOMEM); + } + return data; +} + +static int raw_process_ep0_io(struct raw_dev *dev, struct usb_raw_ep_io *io, + void *data, bool in) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_dbg(dev->dev, "fail, device is not running\n"); + ret = -EINVAL; + goto out_unlock; + } + if (!dev->gadget) { + dev_dbg(dev->dev, "fail, gadget is not bound\n"); + ret = -EBUSY; + goto out_unlock; + } + if (dev->ep0_urb_queued) { + dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); + ret = -EBUSY; + goto out_unlock; + } + if ((in && !dev->ep0_in_pending) || + (!in && !dev->ep0_out_pending)) { + dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); + ret = -EBUSY; + goto out_unlock; + } + if (WARN_ON(in && dev->ep0_out_pending)) { + ret = -ENODEV; + dev->state = STATE_DEV_FAILED; + goto out_done; + } + if (WARN_ON(!in && dev->ep0_in_pending)) { + ret = -ENODEV; + dev->state = STATE_DEV_FAILED; + goto out_done; + } + + dev->req->buf = data; + dev->req->length = io->length; + dev->req->zero = usb_raw_io_flags_zero(io->flags); + dev->ep0_urb_queued = true; + spin_unlock_irqrestore(&dev->lock, flags); + + ret = usb_ep_queue(dev->gadget->ep0, dev->req, GFP_KERNEL); + if (ret) { + dev_err(&dev->gadget->dev, + "fail, usb_ep_queue returned %d\n", ret); + spin_lock_irqsave(&dev->lock, flags); + dev->state = STATE_DEV_FAILED; + goto out_done; + } + + ret = wait_for_completion_interruptible(&dev->ep0_done); + if (ret) { + dev_dbg(&dev->gadget->dev, "wait interrupted\n"); + usb_ep_dequeue(dev->gadget->ep0, dev->req); + wait_for_completion(&dev->ep0_done); + spin_lock_irqsave(&dev->lock, flags); + goto out_done; + } + + spin_lock_irqsave(&dev->lock, flags); + ret = dev->ep0_status; + +out_done: + dev->ep0_urb_queued = false; +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static int raw_ioctl_ep0_write(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + void *data; + struct usb_raw_ep_io io; + + data = raw_alloc_io_data(&io, (void __user *)value, true); + if (IS_ERR(data)) + return PTR_ERR(data); + ret = raw_process_ep0_io(dev, &io, data, true); + kfree(data); + return ret; +} + +static int raw_ioctl_ep0_read(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + void *data; + struct usb_raw_ep_io io; + unsigned int length; + + data = raw_alloc_io_data(&io, (void __user *)value, false); + if (IS_ERR(data)) + return PTR_ERR(data); + ret = raw_process_ep0_io(dev, &io, data, false); + if (ret < 0) { + kfree(data); + return ret; + } + length = min(io.length, (unsigned int)ret); + ret = copy_to_user((void __user *)(value + sizeof(io)), data, length); + kfree(data); + return ret; +} + +static bool check_ep_caps(struct usb_ep *ep, + struct usb_endpoint_descriptor *desc) +{ + switch (usb_endpoint_type(desc)) { + case USB_ENDPOINT_XFER_ISOC: + if (!ep->caps.type_iso) + return false; + break; + case USB_ENDPOINT_XFER_BULK: + if (!ep->caps.type_bulk) + return false; + break; + case USB_ENDPOINT_XFER_INT: + if (!ep->caps.type_int) + return false; + break; + default: + return false; + } + + if (usb_endpoint_dir_in(desc) && !ep->caps.dir_in) + return false; + if (usb_endpoint_dir_out(desc) && !ep->caps.dir_out) + return false; + + return true; +} + +static int raw_ioctl_ep_enable(struct raw_dev *dev, unsigned long value) +{ + int ret = 0, i; + unsigned long flags; + struct usb_endpoint_descriptor *desc; + struct usb_ep *ep = NULL; + + desc = memdup_user((void __user *)value, sizeof(*desc)); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + /* + * Endpoints with a maxpacket length of 0 can cause crashes in UDC + * drivers. + */ + if (usb_endpoint_maxp(desc) == 0) { + dev_dbg(dev->dev, "fail, bad endpoint maxpacket\n"); + kfree(desc); + return -EINVAL; + } + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_dbg(dev->dev, "fail, device is not running\n"); + ret = -EINVAL; + goto out_free; + } + if (!dev->gadget) { + dev_dbg(dev->dev, "fail, gadget is not bound\n"); + ret = -EBUSY; + goto out_free; + } + + for (i = 0; i < USB_RAW_MAX_ENDPOINTS; i++) { + if (dev->eps[i].state == STATE_EP_ENABLED) + continue; + break; + } + if (i == USB_RAW_MAX_ENDPOINTS) { + dev_dbg(&dev->gadget->dev, + "fail, no device endpoints available\n"); + ret = -EBUSY; + goto out_free; + } + + gadget_for_each_ep(ep, dev->gadget) { + if (ep->enabled) + continue; + if (!check_ep_caps(ep, desc)) + continue; + ep->desc = desc; + ret = usb_ep_enable(ep); + if (ret < 0) { + dev_err(&dev->gadget->dev, + "fail, usb_ep_enable returned %d\n", ret); + goto out_free; + } + dev->eps[i].req = usb_ep_alloc_request(ep, GFP_ATOMIC); + if (!dev->eps[i].req) { + dev_err(&dev->gadget->dev, + "fail, usb_ep_alloc_request failed\n"); + usb_ep_disable(ep); + ret = -ENOMEM; + goto out_free; + } + dev->eps[i].ep = ep; + dev->eps[i].state = STATE_EP_ENABLED; + ep->driver_data = &dev->eps[i]; + ret = i; + goto out_unlock; + } + + dev_dbg(&dev->gadget->dev, "fail, no gadget endpoints available\n"); + ret = -EBUSY; + +out_free: + kfree(desc); +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static int raw_ioctl_ep_disable(struct raw_dev *dev, unsigned long value) +{ + int ret = 0, i = value; + unsigned long flags; + const void *desc; + + if (i < 0 || i >= USB_RAW_MAX_ENDPOINTS) + return -EINVAL; + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_dbg(dev->dev, "fail, device is not running\n"); + ret = -EINVAL; + goto out_unlock; + } + if (!dev->gadget) { + dev_dbg(dev->dev, "fail, gadget is not bound\n"); + ret = -EBUSY; + goto out_unlock; + } + if (dev->eps[i].state != STATE_EP_ENABLED) { + dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + if (dev->eps[i].disabling) { + dev_dbg(&dev->gadget->dev, + "fail, disable already in progress\n"); + ret = -EINVAL; + goto out_unlock; + } + if (dev->eps[i].urb_queued) { + dev_dbg(&dev->gadget->dev, + "fail, waiting for urb completion\n"); + ret = -EINVAL; + goto out_unlock; + } + dev->eps[i].disabling = true; + spin_unlock_irqrestore(&dev->lock, flags); + + usb_ep_disable(dev->eps[i].ep); + + spin_lock_irqsave(&dev->lock, flags); + usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); + desc = dev->eps[i].ep->desc; + dev->eps[i].ep = NULL; + dev->eps[i].state = STATE_EP_DISABLED; + kfree(desc); + dev->eps[i].disabling = false; + +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static void gadget_ep_complete(struct usb_ep *ep, struct usb_request *req) +{ + struct raw_ep *r_ep = (struct raw_ep *)ep->driver_data; + struct raw_dev *dev = r_ep->dev; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + if (req->status) + r_ep->status = req->status; + else + r_ep->status = req->actual; + spin_unlock_irqrestore(&dev->lock, flags); + + complete((struct completion *)req->context); +} + +static int raw_process_ep_io(struct raw_dev *dev, struct usb_raw_ep_io *io, + void *data, bool in) +{ + int ret = 0; + unsigned long flags; + struct raw_ep *ep = &dev->eps[io->ep]; + DECLARE_COMPLETION_ONSTACK(done); + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_dbg(dev->dev, "fail, device is not running\n"); + ret = -EINVAL; + goto out_unlock; + } + if (!dev->gadget) { + dev_dbg(dev->dev, "fail, gadget is not bound\n"); + ret = -EBUSY; + goto out_unlock; + } + if (ep->state != STATE_EP_ENABLED) { + dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); + ret = -EBUSY; + goto out_unlock; + } + if (ep->disabling) { + dev_dbg(&dev->gadget->dev, + "fail, endpoint is already being disabled\n"); + ret = -EBUSY; + goto out_unlock; + } + if (ep->urb_queued) { + dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); + ret = -EBUSY; + goto out_unlock; + } + if ((in && !ep->ep->caps.dir_in) || (!in && ep->ep->caps.dir_in)) { + dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); + ret = -EINVAL; + goto out_unlock; + } + + ep->dev = dev; + ep->req->context = &done; + ep->req->complete = gadget_ep_complete; + ep->req->buf = data; + ep->req->length = io->length; + ep->req->zero = usb_raw_io_flags_zero(io->flags); + ep->urb_queued = true; + spin_unlock_irqrestore(&dev->lock, flags); + + ret = usb_ep_queue(ep->ep, ep->req, GFP_KERNEL); + if (ret) { + dev_err(&dev->gadget->dev, + "fail, usb_ep_queue returned %d\n", ret); + spin_lock_irqsave(&dev->lock, flags); + dev->state = STATE_DEV_FAILED; + goto out_done; + } + + ret = wait_for_completion_interruptible(&done); + if (ret) { + dev_dbg(&dev->gadget->dev, "wait interrupted\n"); + usb_ep_dequeue(ep->ep, ep->req); + wait_for_completion(&done); + spin_lock_irqsave(&dev->lock, flags); + goto out_done; + } + + spin_lock_irqsave(&dev->lock, flags); + ret = ep->status; + +out_done: + ep->urb_queued = false; +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static int raw_ioctl_ep_write(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + char *data; + struct usb_raw_ep_io io; + + data = raw_alloc_io_data(&io, (void __user *)value, true); + if (IS_ERR(data)) + return PTR_ERR(data); + ret = raw_process_ep_io(dev, &io, data, true); + kfree(data); + return ret; +} + +static int raw_ioctl_ep_read(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + char *data; + struct usb_raw_ep_io io; + unsigned int length; + + data = raw_alloc_io_data(&io, (void __user *)value, false); + if (IS_ERR(data)) + return PTR_ERR(data); + ret = raw_process_ep_io(dev, &io, data, false); + if (ret < 0) { + kfree(data); + return ret; + } + length = min(io.length, (unsigned int)ret); + ret = copy_to_user((void __user *)(value + sizeof(io)), data, length); + kfree(data); + return ret; +} + +static int raw_ioctl_configure(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + unsigned long flags; + + if (value) + return -EINVAL; + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_dbg(dev->dev, "fail, device is not running\n"); + ret = -EINVAL; + goto out_unlock; + } + if (!dev->gadget) { + dev_dbg(dev->dev, "fail, gadget is not bound\n"); + ret = -EBUSY; + goto out_unlock; + } + usb_gadget_set_state(dev->gadget, USB_STATE_CONFIGURED); + +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static int raw_ioctl_vbus_draw(struct raw_dev *dev, unsigned long value) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + if (dev->state != STATE_DEV_RUNNING) { + dev_dbg(dev->dev, "fail, device is not running\n"); + ret = -EINVAL; + goto out_unlock; + } + if (!dev->gadget) { + dev_dbg(dev->dev, "fail, gadget is not bound\n"); + ret = -EBUSY; + goto out_unlock; + } + usb_gadget_vbus_draw(dev->gadget, 2 * value); + +out_unlock: + spin_unlock_irqrestore(&dev->lock, flags); + return ret; +} + +static long raw_ioctl(struct file *fd, unsigned int cmd, unsigned long value) +{ + struct raw_dev *dev = fd->private_data; + int ret = 0; + + if (!dev) + return -EBUSY; + + switch (cmd) { + case USB_RAW_IOCTL_INIT: + ret = raw_ioctl_init(dev, value); + break; + case USB_RAW_IOCTL_RUN: + ret = raw_ioctl_run(dev, value); + break; + case USB_RAW_IOCTL_EVENT_FETCH: + ret = raw_ioctl_event_fetch(dev, value); + break; + case USB_RAW_IOCTL_EP0_WRITE: + ret = raw_ioctl_ep0_write(dev, value); + break; + case USB_RAW_IOCTL_EP0_READ: + ret = raw_ioctl_ep0_read(dev, value); + break; + case USB_RAW_IOCTL_EP_ENABLE: + ret = raw_ioctl_ep_enable(dev, value); + break; + case USB_RAW_IOCTL_EP_DISABLE: + ret = raw_ioctl_ep_disable(dev, value); + break; + case USB_RAW_IOCTL_EP_WRITE: + ret = raw_ioctl_ep_write(dev, value); + break; + case USB_RAW_IOCTL_EP_READ: + ret = raw_ioctl_ep_read(dev, value); + break; + case USB_RAW_IOCTL_CONFIGURE: + ret = raw_ioctl_configure(dev, value); + break; + case USB_RAW_IOCTL_VBUS_DRAW: + ret = raw_ioctl_vbus_draw(dev, value); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/*----------------------------------------------------------------------*/ + +static const struct file_operations raw_fops = { + .open = raw_open, + .unlocked_ioctl = raw_ioctl, + .compat_ioctl = raw_ioctl, + .release = raw_release, + .llseek = no_llseek, +}; + +static struct miscdevice raw_misc_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = DRIVER_NAME, + .fops = &raw_fops, +}; + +module_misc_device(raw_misc_device); diff --git a/include/uapi/linux/usb/raw_gadget.h b/include/uapi/linux/usb/raw_gadget.h new file mode 100644 index 000000000000..00cbded71061 --- /dev/null +++ b/include/uapi/linux/usb/raw_gadget.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * USB Raw Gadget driver. + * + * See Documentation/usb/raw-gadget.rst for more details. + */ + +#ifndef _UAPI__LINUX_USB_RAW_GADGET_H +#define _UAPI__LINUX_USB_RAW_GADGET_H + +#include +#include +#include + +/* Maximum length of driver_name/device_name in the usb_raw_init struct. */ +#define UDC_NAME_LENGTH_MAX 128 + +/* + * struct usb_raw_init - argument for USB_RAW_IOCTL_INIT ioctl. + * @speed: The speed of the emulated USB device, takes the same values as + * the usb_device_speed enum: USB_SPEED_FULL, USB_SPEED_HIGH, etc. + * @driver_name: The name of the UDC driver. + * @device_name: The name of a UDC instance. + * + * The last two fields identify a UDC the gadget driver should bind to. + * For example, Dummy UDC has "dummy_udc" as its driver_name and "dummy_udc.N" + * as its device_name, where N in the index of the Dummy UDC instance. + * At the same time the dwc2 driver that is used on Raspberry Pi Zero, has + * "20980000.usb" as both driver_name and device_name. + */ +struct usb_raw_init { + __u8 driver_name[UDC_NAME_LENGTH_MAX]; + __u8 device_name[UDC_NAME_LENGTH_MAX]; + __u8 speed; +}; + +/* The type of event fetched with the USB_RAW_IOCTL_EVENT_FETCH ioctl. */ +enum usb_raw_event_type { + USB_RAW_EVENT_INVALID = 0, + + /* This event is queued when the driver has bound to a UDC. */ + USB_RAW_EVENT_CONNECT = 1, + + /* This event is queued when a new control request arrived to ep0. */ + USB_RAW_EVENT_CONTROL = 2, + + /* The list might grow in the future. */ +}; + +/* + * struct usb_raw_event - argument for USB_RAW_IOCTL_EVENT_FETCH ioctl. + * @type: The type of the fetched event. + * @length: Length of the data buffer. Updated by the driver and set to the + * actual length of the fetched event data. + * @data: A buffer to store the fetched event data. + * + * Currently the fetched data buffer is empty for USB_RAW_EVENT_CONNECT, + * and contains struct usb_ctrlrequest for USB_RAW_EVENT_CONTROL. + */ +struct usb_raw_event { + __u32 type; + __u32 length; + __u8 data[0]; +}; + +#define USB_RAW_IO_FLAGS_ZERO 0x0001 +#define USB_RAW_IO_FLAGS_MASK 0x0001 + +static int usb_raw_io_flags_valid(__u16 flags) +{ + return (flags & ~USB_RAW_IO_FLAGS_MASK) == 0; +} + +static int usb_raw_io_flags_zero(__u16 flags) +{ + return (flags & USB_RAW_IO_FLAGS_ZERO); +} + +/* + * struct usb_raw_ep_io - argument for USB_RAW_IOCTL_EP0/EP_WRITE/READ ioctls. + * @ep: Endpoint handle as returned by USB_RAW_IOCTL_EP_ENABLE for + * USB_RAW_IOCTL_EP_WRITE/READ. Ignored for USB_RAW_IOCTL_EP0_WRITE/READ. + * @flags: When USB_RAW_IO_FLAGS_ZERO is specified, the zero flag is set on + * the submitted USB request, see include/linux/usb/gadget.h for details. + * @length: Length of data. + * @data: Data to send for USB_RAW_IOCTL_EP0/EP_WRITE. Buffer to store received + * data for USB_RAW_IOCTL_EP0/EP_READ. + */ +struct usb_raw_ep_io { + __u16 ep; + __u16 flags; + __u32 length; + __u8 data[0]; +}; + +/* + * Initializes a Raw Gadget instance. + * Accepts a pointer to the usb_raw_init struct as an argument. + * Returns 0 on success or negative error code on failure. + */ +#define USB_RAW_IOCTL_INIT _IOW('U', 0, struct usb_raw_init) + +/* + * Instructs Raw Gadget to bind to a UDC and start emulating a USB device. + * Returns 0 on success or negative error code on failure. + */ +#define USB_RAW_IOCTL_RUN _IO('U', 1) + +/* + * A blocking ioctl that waits for an event and returns fetched event data to + * the user. + * Accepts a pointer to the usb_raw_event struct. + * Returns 0 on success or negative error code on failure. + */ +#define USB_RAW_IOCTL_EVENT_FETCH _IOR('U', 2, struct usb_raw_event) + +/* + * Queues an IN (OUT for READ) urb as a response to the last control request + * received on endpoint 0, provided that was an IN (OUT for READ) request and + * waits until the urb is completed. Copies received data to user for READ. + * Accepts a pointer to the usb_raw_ep_io struct as an argument. + * Returns length of trasferred data on success or negative error code on + * failure. + */ +#define USB_RAW_IOCTL_EP0_WRITE _IOW('U', 3, struct usb_raw_ep_io) +#define USB_RAW_IOCTL_EP0_READ _IOWR('U', 4, struct usb_raw_ep_io) + +/* + * Finds an endpoint that supports the transfer type specified in the + * descriptor and enables it. + * Accepts a pointer to the usb_endpoint_descriptor struct as an argument. + * Returns enabled endpoint handle on success or negative error code on failure. + */ +#define USB_RAW_IOCTL_EP_ENABLE _IOW('U', 5, struct usb_endpoint_descriptor) + +/* Disables specified endpoint. + * Accepts endpoint handle as an argument. + * Returns 0 on success or negative error code on failure. + */ +#define USB_RAW_IOCTL_EP_DISABLE _IOW('U', 6, __u32) + +/* + * Queues an IN (OUT for READ) urb as a response to the last control request + * received on endpoint usb_raw_ep_io.ep, provided that was an IN (OUT for READ) + * request and waits until the urb is completed. Copies received data to user + * for READ. + * Accepts a pointer to the usb_raw_ep_io struct as an argument. + * Returns length of trasferred data on success or negative error code on + * failure. + */ +#define USB_RAW_IOCTL_EP_WRITE _IOW('U', 7, struct usb_raw_ep_io) +#define USB_RAW_IOCTL_EP_READ _IOWR('U', 8, struct usb_raw_ep_io) + +/* + * Switches the gadget into the configured state. + * Returns 0 on success or negative error code on failure. + */ +#define USB_RAW_IOCTL_CONFIGURE _IO('U', 9) + +/* + * Constrains UDC VBUS power usage. + * Accepts current limit in 2 mA units as an argument. + * Returns 0 on success or negative error code on failure. + */ +#define USB_RAW_IOCTL_VBUS_DRAW _IOW('U', 10, __u32) + +#endif /* _UAPI__LINUX_USB_RAW_GADGET_H */ -- cgit v1.2.3 From 68983a354a655c35d3fb204489d383a2a051fda7 Mon Sep 17 00:00:00 2001 From: Manoj Basapathi Date: Thu, 6 Feb 2020 16:37:29 +0530 Subject: netfilter: xtables: Add snapshot of hardidletimer target This is a snapshot of hardidletimer netfilter target. This patch implements a hardidletimer Xtables target that can be used to identify when interfaces have been idle for a certain period of time. Timers are identified by labels and are created when a rule is set with a new label. The rules also take a timeout value (in seconds) as an option. If more than one rule uses the same timer label, the timer will be restarted whenever any of the rules get a hit. One entry for each timer is created in sysfs. This attribute contains the timer remaining for the timer to expire. The attributes are located under the xt_idletimer class: /sys/class/xt_idletimer/timers/