From 9bc95a95abbe91e9315c1fe27dc124019bd2592c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 27 Aug 2023 08:28:37 -0700 Subject: bpf: Mark BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE deprecated Now 'BPF_MAP_TYPE_CGRP_STORAGE + local percpu ptr' can cover all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE functionality and more. So mark BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE deprecated. Also make changes in selftests/bpf/test_bpftool_synctypes.py and selftest libbpf_str to fix otherwise test errors. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230827152837.2003563-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8790b3962e4b..73b155e52204 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -932,7 +932,14 @@ enum bpf_map_type { */ BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs + * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE + + * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * deprecated. + */ + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, -- cgit v1.2.3 From 5add321c329b1746589b51359259666ca3dbe219 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 29 Aug 2023 12:17:43 +0200 Subject: wifi: cfg80211: remove scan_width support There really isn't any support for scanning at different channel widths than 20 MHz since there's no way to set it. Remove this support for now, if somebody wants to maintain this whole thing later we can revisit how it should work. Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/wmi.c | 2 - .../broadcom/brcm80211/brcmfmac/cfg80211.c | 1 - include/net/cfg80211.h | 63 +------------------ include/uapi/linux/nl80211.h | 2 +- net/mac80211/ibss.c | 30 ++------- net/mac80211/ieee80211_i.h | 3 +- net/mac80211/ocb.c | 5 +- net/mac80211/scan.c | 71 +++++----------------- net/wireless/mesh.c | 5 +- net/wireless/nl80211.c | 1 - net/wireless/scan.c | 23 +------ net/wireless/trace.h | 6 +- net/wireless/util.c | 14 ++--- 13 files changed, 35 insertions(+), 191 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index 6a5976a2944c..6fdb77d4c59e 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -870,7 +870,6 @@ static void wmi_evt_rx_mgmt(struct wil6210_vif *vif, int id, void *d, int len) struct cfg80211_bss *bss; struct cfg80211_inform_bss bss_data = { .chan = channel, - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .signal = signal, .boottime_ns = ktime_to_ns(ktime_get_boottime()), }; @@ -1389,7 +1388,6 @@ wmi_evt_sched_scan_result(struct wil6210_vif *vif, int id, void *d, int len) u32 d_len; struct cfg80211_bss *bss; struct cfg80211_inform_bss bss_data = { - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .boottime_ns = ktime_to_ns(ktime_get_boottime()), }; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 2a90bb24ba77..94b4a7b8793d 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -3367,7 +3367,6 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg, freq = ieee80211_channel_to_frequency(channel, band); bss_data.chan = ieee80211_get_channel(wiphy, freq); - bss_data.scan_width = NL80211_BSS_CHAN_WIDTH_20; bss_data.boottime_ns = ktime_to_ns(ktime_get_boottime()); notify_capability = le16_to_cpu(bi->capability); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 134d9e0b73c9..2d3fa4a29781 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2536,7 +2536,6 @@ struct cfg80211_scan_6ghz_params { * @n_ssids: number of SSIDs * @channels: channels to scan on. * @n_channels: total number of channels to scan - * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets * @duration: how long to listen on each channel, in TUs. If @@ -2566,7 +2565,6 @@ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; - enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u16 duration; @@ -2661,7 +2659,6 @@ struct cfg80211_bss_select_adjust { * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans) * @n_ssids: number of SSIDs * @n_channels: total number of channels to scan - * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets * @flags: control flags from &enum nl80211_scan_flags @@ -2709,7 +2706,6 @@ struct cfg80211_sched_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; - enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u32 flags; @@ -2757,7 +2753,6 @@ enum cfg80211_signal_type { /** * struct cfg80211_inform_bss - BSS inform data * @chan: channel the frame was received on - * @scan_width: scan width that was used * @signal: signal strength value, according to the wiphy's * signal type * @boottime_ns: timestamp (CLOCK_BOOTTIME) when the information was @@ -2777,7 +2772,6 @@ enum cfg80211_signal_type { */ struct cfg80211_inform_bss { struct ieee80211_channel *chan; - enum nl80211_bss_scan_width scan_width; s32 signal; u64 boottime_ns; u64 parent_tsf; @@ -2811,7 +2805,6 @@ struct cfg80211_bss_ies { * for use in scan results and similar. * * @channel: channel this BSS is on - * @scan_width: width of the control channel * @bssid: BSSID of the BSS * @beacon_interval: the beacon interval as from the frame * @capability: the capability field in host byte order @@ -2841,7 +2834,6 @@ struct cfg80211_bss_ies { */ struct cfg80211_bss { struct ieee80211_channel *channel; - enum nl80211_bss_scan_width scan_width; const struct cfg80211_bss_ies __rcu *ies; const struct cfg80211_bss_ies __rcu *beacon_ies; @@ -6321,13 +6313,11 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband, /** * ieee80211_mandatory_rates - get mandatory rates for a given band * @sband: the band to look for rates in - * @scan_width: width of the control channel * * This function returns a bitmap of the mandatory rates for the given * band, bits are set according to the rate position in the bitrates array. */ -u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, - enum nl80211_bss_scan_width scan_width); +u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband); /* * Radiotap parsing functions -- for controlled injection support @@ -6988,22 +6978,6 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, struct ieee80211_mgmt *mgmt, size_t len, gfp_t gfp); -static inline struct cfg80211_bss * __must_check -cfg80211_inform_bss_width_frame(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp) -{ - struct cfg80211_inform_bss data = { - .chan = rx_channel, - .scan_width = scan_width, - .signal = signal, - }; - - return cfg80211_inform_bss_frame_data(wiphy, &data, mgmt, len, gfp); -} - static inline struct cfg80211_bss * __must_check cfg80211_inform_bss_frame(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, @@ -7012,7 +6986,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, { struct cfg80211_inform_bss data = { .chan = rx_channel, - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .signal = signal, }; @@ -7114,26 +7087,6 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, u16 beacon_interval, const u8 *ie, size_t ielen, gfp_t gfp); -static inline struct cfg80211_bss * __must_check -cfg80211_inform_bss_width(struct wiphy *wiphy, - struct ieee80211_channel *rx_channel, - enum nl80211_bss_scan_width scan_width, - enum cfg80211_bss_frame_type ftype, - const u8 *bssid, u64 tsf, u16 capability, - u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp) -{ - struct cfg80211_inform_bss data = { - .chan = rx_channel, - .scan_width = scan_width, - .signal = signal, - }; - - return cfg80211_inform_bss_data(wiphy, &data, ftype, bssid, tsf, - capability, beacon_interval, ie, ielen, - gfp); -} - static inline struct cfg80211_bss * __must_check cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, @@ -7144,7 +7097,6 @@ cfg80211_inform_bss(struct wiphy *wiphy, { struct cfg80211_inform_bss data = { .chan = rx_channel, - .scan_width = NL80211_BSS_CHAN_WIDTH_20, .signal = signal, }; @@ -7229,19 +7181,6 @@ void cfg80211_bss_iter(struct wiphy *wiphy, void *data), void *iter_data); -static inline enum nl80211_bss_scan_width -cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef) -{ - switch (chandef->width) { - case NL80211_CHAN_WIDTH_5: - return NL80211_BSS_CHAN_WIDTH_5; - case NL80211_CHAN_WIDTH_10: - return NL80211_BSS_CHAN_WIDTH_10; - default: - return NL80211_BSS_CHAN_WIDTH_20; - } -} - /** * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame * @dev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 88eb85c63029..b628126e06fa 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5038,7 +5038,7 @@ enum nl80211_bss_scan_width { * elements from a Beacon frame (bin); not present if no Beacon frame has * yet been received * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel - * (u32, enum nl80211_bss_scan_width) + * (u32, enum nl80211_bss_scan_width) - No longer used! * @NL80211_BSS_BEACON_TSF: TSF of the last received beacon (u64) * (not present if no beacon frame has been received yet) * @NL80211_BSS_PRESP_DATA: the data in @NL80211_BSS_INFORMATION_ELEMENTS and diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 9907cea6457c..55ec34602b53 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -377,7 +377,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); bss_meta.chan = chan; - bss_meta.scan_width = cfg80211_chandef_to_scan_width(&chandef); bss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, presp->head_len, GFP_KERNEL); @@ -595,7 +594,6 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; - enum nl80211_bss_scan_width scan_width; int band; /* @@ -624,7 +622,6 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, if (WARN_ON_ONCE(!chanctx_conf)) return NULL; band = chanctx_conf->def.chan->band; - scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def); rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_KERNEL); @@ -636,7 +633,7 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.deflink.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(sband, scan_width); + ieee80211_mandatory_rates(sband); return ieee80211_ibss_finish_sta(sta); } @@ -975,7 +972,6 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, { struct sta_info *sta; enum nl80211_band band = rx_status->band; - enum nl80211_bss_scan_width scan_width; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; bool rates_updated = false; @@ -1001,15 +997,9 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, u32 prev_rates; prev_rates = sta->sta.deflink.supp_rates[band]; - /* make sure mandatory rates are always added */ - scan_width = NL80211_BSS_CHAN_WIDTH_20; - if (rx_status->bw == RATE_INFO_BW_5) - scan_width = NL80211_BSS_CHAN_WIDTH_5; - else if (rx_status->bw == RATE_INFO_BW_10) - scan_width = NL80211_BSS_CHAN_WIDTH_10; sta->sta.deflink.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(sband, scan_width); + ieee80211_mandatory_rates(sband); if (sta->sta.deflink.supp_rates[band] != prev_rates) { ibss_dbg(sdata, "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n", @@ -1196,7 +1186,6 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; - enum nl80211_bss_scan_width scan_width; int band; /* @@ -1222,7 +1211,6 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, return; } band = chanctx_conf->def.chan->band; - scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def); rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); @@ -1232,7 +1220,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.deflink.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(sband, scan_width); + ieee80211_mandatory_rates(sband); spin_lock(&ifibss->incomplete_lock); list_add(&sta->list, &ifibss->incomplete_stations); @@ -1282,7 +1270,6 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - enum nl80211_bss_scan_width scan_width; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -1304,9 +1291,8 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) sdata_info(sdata, "No active IBSS STAs - trying to scan for other IBSS networks with same SSID (merge)\n"); - scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef); ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, - NULL, 0, scan_width); + NULL, 0); } static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) @@ -1424,7 +1410,6 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) struct cfg80211_bss *cbss; struct ieee80211_channel *chan = NULL; const u8 *bssid = NULL; - enum nl80211_bss_scan_width scan_width; int active_ibss; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -1483,8 +1468,6 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) sdata_info(sdata, "Trigger new scan to find an IBSS to join\n"); - scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef); - if (ifibss->fixed_channel) { num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy, &ifibss->chandef, @@ -1492,11 +1475,10 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) ARRAY_SIZE(channels)); ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, channels, - num, scan_width); + num); } else { ieee80211_request_ibss_scan(sdata, ifibss->ssid, - ifibss->ssid_len, NULL, - 0, scan_width); + ifibss->ssid_len, NULL, 0); } } else { int interval = IEEE80211_SCAN_INTERVAL; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e7dc4cdcdcde..e443a8e5e9be 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1911,8 +1911,7 @@ void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, - unsigned int n_channels, - enum nl80211_bss_scan_width scan_width); + unsigned int n_channels); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req); void ieee80211_scan_cancel(struct ieee80211_local *local); diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 6e2965ffb809..449af4e1cca4 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -44,7 +44,6 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; - enum nl80211_bss_scan_width scan_width; struct sta_info *sta; int band; @@ -66,7 +65,6 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, return; } band = chanctx_conf->def.chan->band; - scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def); rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); @@ -75,8 +73,7 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, /* Add only mandatory rates for now */ sband = local->hw.wiphy->bands[band]; - sta->sta.deflink.supp_rates[band] = - ieee80211_mandatory_rates(sband, scan_width); + sta->sta.deflink.supp_rates[band] = ieee80211_mandatory_rates(sband); spin_lock(&ifocb->incomplete_lock); list_add(&sta->list, &ifocb->incomplete_stations); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 58d525e41f6b..24fa06105378 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -187,12 +187,6 @@ ieee80211_bss_info_update(struct ieee80211_local *local, else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; - bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_20; - if (rx_status->bw == RATE_INFO_BW_5) - bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_5; - else if (rx_status->bw == RATE_INFO_BW_10) - bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10; - bss_meta.chan = channel; rcu_read_lock(); @@ -315,22 +309,11 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) ieee80211_rx_bss_put(local, bss); } -static void -ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef, - enum nl80211_bss_scan_width scan_width) +static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef) { memset(chandef, 0, sizeof(*chandef)); - switch (scan_width) { - case NL80211_BSS_CHAN_WIDTH_5: - chandef->width = NL80211_CHAN_WIDTH_5; - break; - case NL80211_BSS_CHAN_WIDTH_10: - chandef->width = NL80211_CHAN_WIDTH_10; - break; - default: - chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - break; - } + + chandef->width = NL80211_CHAN_WIDTH_20_NOHT; } /* return false if no more work */ @@ -378,7 +361,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) } local->hw_scan_req->req.n_channels = n_chans; - ieee80211_prepare_scan_chandef(&chandef, req->scan_width); + ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; @@ -919,7 +902,6 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, { int skip; struct ieee80211_channel *chan; - enum nl80211_bss_scan_width oper_scan_width; struct cfg80211_scan_request *scan_req; scan_req = rcu_dereference_protected(local->scan_req, @@ -933,42 +915,21 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; - /* For scanning on the S1G band, ignore scan_width (which is constant - * across all channels) for now since channel width is specific to each - * channel. Detect the required channel width here and likely revisit - * later. Maybe scan_width could be used to build the channel scan list? + /* For scanning on the S1G band, detect the channel width according to + * the channel being scanned. */ if (chan->band == NL80211_BAND_S1GHZ) { local->scan_chandef.width = ieee80211_s1g_channel_width(chan); goto set_channel; } - switch (scan_req->scan_width) { - case NL80211_BSS_CHAN_WIDTH_5: - local->scan_chandef.width = NL80211_CHAN_WIDTH_5; - break; - case NL80211_BSS_CHAN_WIDTH_10: - local->scan_chandef.width = NL80211_CHAN_WIDTH_10; - break; - default: - case NL80211_BSS_CHAN_WIDTH_20: - /* If scanning on oper channel, use whatever channel-type - * is currently in use. - */ - oper_scan_width = cfg80211_chandef_to_scan_width( - &local->_oper_chandef); - if (chan == local->_oper_chandef.chan && - oper_scan_width == scan_req->scan_width) - local->scan_chandef = local->_oper_chandef; - else - local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; - break; - case NL80211_BSS_CHAN_WIDTH_1: - case NL80211_BSS_CHAN_WIDTH_2: - /* shouldn't get here, S1G handled above */ - WARN_ON(1); - break; - } + /* If scanning on oper channel, use whatever channel-type + * is currently in use. + */ + if (chan == local->_oper_chandef.chan) + local->scan_chandef = local->_oper_chandef; + else + local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL)) @@ -1152,8 +1113,7 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, - unsigned int n_channels, - enum nl80211_bss_scan_width scan_width) + unsigned int n_channels) { struct ieee80211_local *local = sdata->local; int ret = -EBUSY, i, n_ch = 0; @@ -1210,7 +1170,6 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, local->int_scan_req->ssids = &local->scan_ssid; local->int_scan_req->n_ssids = 1; - local->int_scan_req->scan_width = scan_width; memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; @@ -1311,7 +1270,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, goto out; } - ieee80211_prepare_scan_chandef(&chandef, req->scan_width); + ieee80211_prepare_scan_chandef(&chandef); ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index dc75abdb8f2e..83306979fbe2 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -172,7 +172,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, * basic rates */ if (!setup->basic_rates) { - enum nl80211_bss_scan_width scan_width; struct ieee80211_supported_band *sband = rdev->wiphy.bands[setup->chandef.chan->band]; @@ -193,9 +192,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, } } } else { - scan_width = cfg80211_chandef_to_scan_width(&setup->chandef); - setup->basic_rates = ieee80211_mandatory_rates(sband, - scan_width); + setup->basic_rates = ieee80211_mandatory_rates(sband); } } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ab0aea7dca7d..f2dd4c85a10f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10283,7 +10283,6 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET, res->channel->freq_offset) || - nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) goto nla_put_failure; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ae4d000009fe..a5758edf53b8 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1638,8 +1638,6 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, continue; if (bss->pub.channel != new->pub.channel) continue; - if (bss->pub.scan_width != new->pub.scan_width) - continue; if (rcu_access_pointer(bss->pub.beacon_ies)) continue; ies = rcu_access_pointer(bss->pub.ies); @@ -1936,8 +1934,7 @@ EXPORT_SYMBOL(cfg80211_get_ies_channel_number); */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, - struct ieee80211_channel *channel, - enum nl80211_bss_scan_width scan_width) + struct ieee80211_channel *channel) { u32 freq; int channel_number; @@ -1977,16 +1974,6 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, return channel; } - if (scan_width == NL80211_BSS_CHAN_WIDTH_10 || - scan_width == NL80211_BSS_CHAN_WIDTH_5) { - /* - * Ignore channel number in 5 and 10 MHz channels where there - * may not be an n:1 or 1:n mapping between frequencies and - * channel numbers. - */ - return channel; - } - /* * Use the channel determined through the payload channel number * instead of the RX channel reported by the driver. @@ -2046,14 +2033,12 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, channel = data->channel; if (!channel) channel = cfg80211_get_bss_channel(wiphy, data->ie, data->ielen, - drv_data->chan, - drv_data->scan_width); + drv_data->chan); if (!channel) return NULL; memcpy(tmp.pub.bssid, data->bssid, ETH_ALEN); tmp.pub.channel = channel; - tmp.pub.scan_width = drv_data->scan_width; if (data->bss_source != BSS_SOURCE_STA_PROFILE) tmp.pub.signal = drv_data->signal; else @@ -2814,8 +2799,7 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, variable = ext->u.s1g_beacon.variable; } - channel = cfg80211_get_bss_channel(wiphy, variable, - ielen, data->chan, data->scan_width); + channel = cfg80211_get_bss_channel(wiphy, variable, ielen, data->chan); if (!channel) return NULL; @@ -2868,7 +2852,6 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, tmp.pub.beacon_interval = beacon_int; tmp.pub.capability = capability; tmp.pub.channel = channel; - tmp.pub.scan_width = data->scan_width; tmp.pub.signal = data->signal; tmp.ts_boottime = data->boottime_ns; tmp.parent_tsf = data->parent_tsf; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 617c0d0dfa96..126c3a03e43e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3590,7 +3590,6 @@ TRACE_EVENT(cfg80211_inform_bss_frame, TP_STRUCT__entry( WIPHY_ENTRY CHAN_ENTRY - __field(enum nl80211_bss_scan_width, scan_width) __dynamic_array(u8, mgmt, len) __field(s32, signal) __field(u64, ts_boottime) @@ -3600,7 +3599,6 @@ TRACE_EVENT(cfg80211_inform_bss_frame, TP_fast_assign( WIPHY_ASSIGN; CHAN_ASSIGN(data->chan); - __entry->scan_width = data->scan_width; if (mgmt) memcpy(__get_dynamic_array(mgmt), mgmt, len); __entry->signal = data->signal; @@ -3609,8 +3607,8 @@ TRACE_EVENT(cfg80211_inform_bss_frame, MAC_ASSIGN(parent_bssid, data->parent_bssid); ), TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT - "(scan_width: %d) signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM", - WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, + "signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM", + WIPHY_PR_ARG, CHAN_PR_ARG, __entry->signal, (unsigned long long)__entry->ts_boottime, (unsigned long long)__entry->parent_tsf, __entry->parent_bssid) diff --git a/net/wireless/util.c b/net/wireless/util.c index 56cbd9979a3f..213c9405e645 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -43,8 +43,7 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_get_response_rate); -u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, - enum nl80211_bss_scan_width scan_width) +u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband) { struct ieee80211_rate *bitrates; u32 mandatory_rates = 0; @@ -54,15 +53,10 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, if (WARN_ON(!sband)) return 1; - if (sband->band == NL80211_BAND_2GHZ) { - if (scan_width == NL80211_BSS_CHAN_WIDTH_5 || - scan_width == NL80211_BSS_CHAN_WIDTH_10) - mandatory_flag = IEEE80211_RATE_MANDATORY_G; - else - mandatory_flag = IEEE80211_RATE_MANDATORY_B; - } else { + if (sband->band == NL80211_BAND_2GHZ) + mandatory_flag = IEEE80211_RATE_MANDATORY_B; + else mandatory_flag = IEEE80211_RATE_MANDATORY_A; - } bitrates = sband->bitrates; for (i = 0; i < sband->n_bitrates; i++) -- cgit v1.2.3 From 0cfaec25995ad3be316631b945be7ced81daa4e7 Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Thu, 27 Jul 2023 10:40:56 -0700 Subject: wifi: nl80211: fixes to FILS discovery updates Add a new flag 'update' which is set to true during start_ap() if (and only if) one of the following two conditions are met: - Userspace passed an empty nested attribute which indicates that the feature should be disabled and templates deleted. - Userspace passed all the parameters for the nested attribute. Existing configuration will not be changed while the flag remains false. Add similar changes for unsolicited broadcast probe response transmission. Signed-off-by: Aloka Dixit Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20230727174100.11721-2-quic_alokad@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 11 +++++++---- net/wireless/nl80211.c | 16 +++++++++++++++- 3 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2d3fa4a29781..aa8c4538f93d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1338,6 +1338,7 @@ struct cfg80211_acl_data { * struct cfg80211_fils_discovery - FILS discovery parameters from * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. * + * @update: Set to true if the feature configuration should be updated. * @min_interval: Minimum packet interval in TUs (0 - 10000) * @max_interval: Maximum packet interval in TUs (0 - 10000) * @tmpl_len: Template length @@ -1345,6 +1346,7 @@ struct cfg80211_acl_data { * frame headers. */ struct cfg80211_fils_discovery { + bool update; u32 min_interval; u32 max_interval; size_t tmpl_len; @@ -1355,6 +1357,7 @@ struct cfg80211_fils_discovery { * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe * response parameters in 6GHz. * + * @update: Set to true if the feature configuration should be updated. * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned * in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive * scanning @@ -1362,6 +1365,7 @@ struct cfg80211_fils_discovery { * @tmpl: Template data for probe response */ struct cfg80211_unsol_bcast_probe_resp { + bool update; u32 interval; size_t tmpl_len; const u8 *tmpl; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b628126e06fa..f797ab7a6547 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2690,11 +2690,13 @@ enum nl80211_commands { * * @NL80211_ATTR_FILS_DISCOVERY: Optional parameter to configure FILS * discovery. It is a nested attribute, see - * &enum nl80211_fils_discovery_attributes. + * &enum nl80211_fils_discovery_attributes. Userspace should pass an empty + * nested attribute to disable this feature and delete the templates. * * @NL80211_ATTR_UNSOL_BCAST_PROBE_RESP: Optional parameter to configure * unsolicited broadcast probe response. It is a nested attribute, see - * &enum nl80211_unsol_bcast_probe_resp_attributes. + * &enum nl80211_unsol_bcast_probe_resp_attributes. Userspace should pass an empty + * nested attribute to disable this feature and delete the templates. * * @NL80211_ATTR_S1G_CAPABILITY: S1G Capability information element (from * association request when used with NL80211_CMD_NEW_STATION) @@ -7606,7 +7608,7 @@ enum nl80211_iftype_akm_attributes { * @NL80211_FILS_DISCOVERY_ATTR_INT_MIN: Minimum packet interval (u32, TU). * Allowed range: 0..10000 (TU = Time Unit) * @NL80211_FILS_DISCOVERY_ATTR_INT_MAX: Maximum packet interval (u32, TU). - * Allowed range: 0..10000 (TU = Time Unit) + * Allowed range: 0..10000 (TU = Time Unit). If set to 0, the feature is disabled. * @NL80211_FILS_DISCOVERY_ATTR_TMPL: Template data for FILS discovery action * frame including the headers. * @@ -7639,7 +7641,8 @@ enum nl80211_fils_discovery_attributes { * * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT: Maximum packet interval (u32, TU). * Allowed range: 0..20 (TU = Time Unit). IEEE P802.11ax/D6.0 - * 26.17.2.3.2 (AP behavior for fast passive scanning). + * 26.17.2.3.2 (AP behavior for fast passive scanning). If set to 0, the feature is + * disabled. * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL: Unsolicited broadcast probe response * frame template (binary). * diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f2dd4c85a10f..53618a147907 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5647,6 +5647,13 @@ static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, if (ret) return ret; + if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] && + !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] && + !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]) { + fd->update = true; + return 0; + } + if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] || !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] || !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]) @@ -5656,7 +5663,7 @@ static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, fd->tmpl = nla_data(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]); fd->min_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN]); fd->max_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX]); - + fd->update = true; return 0; } @@ -5679,6 +5686,12 @@ nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev, if (ret) return ret; + if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] && + !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]) { + presp->update = true; + return 0; + } + if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] || !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]) return -EINVAL; @@ -5686,6 +5699,7 @@ nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev, presp->tmpl = nla_data(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); presp->tmpl_len = nla_len(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); presp->interval = nla_get_u32(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT]); + presp->update = true; return 0; } -- cgit v1.2.3 From a9c2a608549bb1a2363d289d63907640afcf22af Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 13 Sep 2023 10:13:49 -0700 Subject: bpf: expose information about supported xdp metadata kfunc Add new xdp-rx-metadata-features member to netdev netlink which exports a bitmask of supported kfuncs. Most of the patch is autogenerated (headers), the only relevant part is netdev.yaml and the changes in netdev-genl.c to marshal into netlink. Example output on veth: $ ip link add veth0 type veth peer name veth1 # ifndex == 12 $ ./tools/net/ynl/samples/netdev 12 Select ifc ($ifindex; or 0 = dump; or -2 ntf check): 12 veth1[12] xdp-features (23): basic redirect rx-sg xdp-rx-metadata-features (3): timestamp hash xdp-zc-max-segs=0 Cc: netdev@vger.kernel.org Cc: Willem de Bruijn Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230913171350.369987-3-sdf@google.com Signed-off-by: Martin KaFai Lau --- Documentation/netlink/specs/netdev.yaml | 21 +++++++++++++++++++++ Documentation/networking/xdp-rx-metadata.rst | 7 +++++++ include/net/xdp.h | 5 ++++- include/uapi/linux/netdev.h | 16 ++++++++++++++++ kernel/bpf/offload.c | 2 +- net/core/netdev-genl.c | 12 +++++++++++- net/core/xdp.c | 4 ++-- tools/include/uapi/linux/netdev.h | 16 ++++++++++++++++ 8 files changed, 78 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 1c7284fd535b..c46fcc78fc04 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -42,6 +42,19 @@ definitions: doc: This feature informs if netdev implements non-linear XDP buffer support in ndo_xdp_xmit callback. + - + type: flags + name: xdp-rx-metadata + render-max: true + entries: + - + name: timestamp + doc: + Device is capable of exposing receive HW timestamp via bpf_xdp_metadata_rx_timestamp(). + - + name: hash + doc: + Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash(). attribute-sets: - @@ -68,6 +81,13 @@ attribute-sets: type: u32 checks: min: 1 + - + name: xdp-rx-metadata-features + doc: Bitmask of supported XDP receive metadata features. + See Documentation/networking/xdp-rx-metadata.rst for more details. + type: u64 + enum: xdp-rx-metadata + enum-as-flags: true operations: list: @@ -84,6 +104,7 @@ operations: - ifindex - xdp-features - xdp-zc-max-segs + - xdp-rx-metadata-features dump: reply: *dev-all - diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst index 25ce72af81c2..205696780b78 100644 --- a/Documentation/networking/xdp-rx-metadata.rst +++ b/Documentation/networking/xdp-rx-metadata.rst @@ -105,6 +105,13 @@ bpf_tail_call Adding programs that access metadata kfuncs to the ``BPF_MAP_TYPE_PROG_ARRAY`` is currently not supported. +Supported Devices +================= + +It is possible to query which kfunc the particular netdev implements via +netlink. See ``xdp-rx-metadata-features`` attribute set in +``Documentation/netlink/specs/netdev.yaml``. + Example ======= diff --git a/include/net/xdp.h b/include/net/xdp.h index d59e12f8f311..349c36fb5fd8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -386,19 +386,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum + * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ + NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ + NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ enum xdp_rx_metadata { -#define XDP_METADATA_KFUNC(name, _, __) name, +#define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index c1634b95c223..2943a151d4f1 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -38,11 +38,27 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_MASK = 127, }; +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + + /* private: */ + NETDEV_XDP_RX_METADATA_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6aa6de8d715d..e7a1752b5a09 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -845,7 +845,7 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) if (!ops) goto out; -#define XDP_METADATA_KFUNC(name, _, xmo) \ +#define XDP_METADATA_KFUNC(name, _, __, xmo) \ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index c1aea8b756b6..fe61f85bcf33 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "netdev-genl-gen.h" @@ -12,15 +13,24 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xdp_rx_meta = 0; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; +#define XDP_METADATA_KFUNC(_, flag, __, xmo) \ + if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ + xdp_rx_meta |= flag; +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, - netdev->xdp_features, NETDEV_A_DEV_PAD)) { + netdev->xdp_features, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + xdp_rx_meta, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } diff --git a/net/core/xdp.c b/net/core/xdp.c index bab563b2f812..df4789ab512d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, __diag_pop(); BTF_SET8_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_SET8_END(xdp_metadata_kfunc_ids) @@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { }; BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) -#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str) +#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index c1634b95c223..2943a151d4f1 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -38,11 +38,27 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_MASK = 127, }; +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + + /* private: */ + NETDEV_XDP_RX_METADATA_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) -- cgit v1.2.3 From 3868ab0f192581eff978501a05f3dc2e01541d77 Mon Sep 17 00:00:00 2001 From: Aananth V Date: Thu, 14 Sep 2023 14:36:21 +0000 Subject: tcp: new TCP_INFO stats for RTO events The 2023 SIGCOMM paper "Improving Network Availability with Protective ReRoute" has indicated Linux TCP's RTO-triggered txhash rehashing can effectively reduce application disruption during outages. To better measure the efficacy of this feature, this patch adds three more detailed stats during RTO recovery and exports via TCP_INFO. Applications and monitoring systems can leverage this data to measure the network path diversity and end-to-end repair latency during network outages to improve their network infrastructure. The following counters are added to tcp_sock in order to track RTO events over the lifetime of a TCP socket. 1. u16 total_rto - Counts the total number of RTO timeouts. 2. u16 total_rto_recoveries - Counts the total number of RTO recoveries. 3. u32 total_rto_time - Counts the total time spent (ms) in RTO recoveries. (time spent in CA_Loss and CA_Recovery states) To compute total_rto_time, we add a new u32 rto_stamp field to tcp_sock. rto_stamp records the start timestamp (ms) of the last RTO recovery (CA_Loss). Corresponding fields are also added to the tcp_info struct. Signed-off-by: Aananth V Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 8 ++++++++ include/uapi/linux/tcp.h | 12 ++++++++++++ net/ipv4/tcp.c | 9 +++++++++ net/ipv4/tcp_input.c | 15 +++++++++++++++ net/ipv4/tcp_minisocks.c | 4 ++++ net/ipv4/tcp_timer.c | 17 +++++++++++++++-- 6 files changed, 63 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 44d946161d4a..e15452df9804 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -377,6 +377,14 @@ struct tcp_sock { * Total data bytes retransmitted */ u32 total_retrans; /* Total retransmits for entire connection */ + u32 rto_stamp; /* Start time (ms) of last CA_Loss recovery */ + u16 total_rto; /* Total number of RTO timeouts, including + * SYN/SYN-ACK and recurring timeouts. + */ + u16 total_rto_recoveries; /* Total number of RTO recoveries, + * including any unfinished recovery. + */ + u32 total_rto_time; /* ms spent in (completed) RTO recoveries. */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 879eeb0a084b..d1d08da6331a 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -289,6 +289,18 @@ struct tcp_info { */ __u32 tcpi_rehash; /* PLB or timeout triggered rehash attempts */ + + __u16 tcpi_total_rto; /* Total number of RTO timeouts, including + * SYN/SYN-ACK and recurring timeouts. + */ + __u16 tcpi_total_rto_recoveries; /* Total number of RTO + * recoveries, including any + * unfinished recovery. + */ + __u32 tcpi_total_rto_time; /* Total time spent in RTO recoveries + * in milliseconds, including any + * unfinished recovery. + */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c3040a63ebd..69b8d7073708 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3818,6 +3818,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rcv_wnd = tp->rcv_wnd; info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; + + info->tcpi_total_rto = tp->total_rto; + info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; + info->tcpi_total_rto_time = tp->total_rto_time; + if (tp->rto_stamp) { + info->tcpi_total_rto_time += tcp_time_stamp_raw() - + tp->rto_stamp; + } + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8d2c91703158..584825ddd0a0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2088,6 +2088,10 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->undo_marker = 0; tp->undo_retrans = -1; tp->sacked_out = 0; + tp->rto_stamp = 0; + tp->total_rto = 0; + tp->total_rto_recoveries = 0; + tp->total_rto_time = 0; } static inline void tcp_init_undo(struct tcp_sock *tp) @@ -2825,6 +2829,14 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) tcp_set_ca_state(sk, TCP_CA_Recovery); } +static void tcp_update_rto_time(struct tcp_sock *tp) +{ + if (tp->rto_stamp) { + tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp; + tp->rto_stamp = 0; + } +} + /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ @@ -3029,6 +3041,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); + if (icsk->icsk_ca_state != TCP_CA_Loss) + tcp_update_rto_time(tp); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) @@ -6454,6 +6468,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) tcp_try_undo_recovery(sk); /* Reset rtx states to prevent spurious retransmits_timed_out() */ + tcp_update_rto_time(tp); tp->retrans_stamp = 0; inet_csk(sk)->icsk_retransmits = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b98d476f1594..eee8ab1bfa0e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -565,6 +565,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->undo_marker = treq->snt_isn; newtp->retrans_stamp = div_u64(treq->snt_synack, USEC_PER_SEC / TCP_TS_HZ); + newtp->total_rto = req->num_timeout; + newtp->total_rto_recoveries = 1; + newtp->total_rto_time = tcp_time_stamp_raw() - + newtp->retrans_stamp; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d7d64682b068..3f61c6a70a1f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -415,6 +415,19 @@ abort: tcp_write_err(sk); } } +static void tcp_update_rto_stats(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!icsk->icsk_retransmits) { + tp->total_rto_recoveries++; + tp->rto_stamp = tcp_time_stamp(tp); + } + icsk->icsk_retransmits++; + tp->total_rto++; +} + /* * Timer for Fast Open socket to retransmit SYNACK. Note that the * sk here is the child socket, not the parent (listener) socket. @@ -447,7 +460,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) */ inet_rtx_syn_ack(sk, req); req->num_timeout++; - icsk->icsk_retransmits++; + tcp_update_rto_stats(sk); if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -575,7 +588,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - icsk->icsk_retransmits++; + tcp_update_rto_stats(sk); if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * Let senders fight for local resources conservatively. -- cgit v1.2.3 From 3badff3a25d815e915d89565a0c82dec608a8d2b Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 13 Sep 2023 21:49:36 +0100 Subject: dpll: spec: Add Netlink spec in YAML Add a protocol spec for DPLL. Add code generated from the spec. Signed-off-by: Jakub Kicinski Signed-off-by: Michal Michalik Signed-off-by: Vadim Fedorenko Signed-off-by: Arkadiusz Kubalewski Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/netlink/specs/dpll.yaml | 488 ++++++++++++++++++++++++++++++++++ drivers/dpll/dpll_nl.c | 162 +++++++++++ drivers/dpll/dpll_nl.h | 51 ++++ include/uapi/linux/dpll.h | 201 ++++++++++++++ 4 files changed, 902 insertions(+) create mode 100644 Documentation/netlink/specs/dpll.yaml create mode 100644 drivers/dpll/dpll_nl.c create mode 100644 drivers/dpll/dpll_nl.h create mode 100644 include/uapi/linux/dpll.h (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml new file mode 100644 index 000000000000..8b86b28b47a6 --- /dev/null +++ b/Documentation/netlink/specs/dpll.yaml @@ -0,0 +1,488 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) + +name: dpll + +doc: DPLL subsystem. + +definitions: + - + type: enum + name: mode + doc: | + working modes a dpll can support, differentiates if and how dpll selects + one of its inputs to syntonize with it, valid values for DPLL_A_MODE + attribute + entries: + - + name: manual + doc: input can be only selected by sending a request to dpll + value: 1 + - + name: automatic + doc: highest prio input pin auto selected by dpll + render-max: true + - + type: enum + name: lock-status + doc: | + provides information of dpll device lock status, valid values for + DPLL_A_LOCK_STATUS attribute + entries: + - + name: unlocked + doc: | + dpll was not yet locked to any valid input (or forced by setting + DPLL_A_MODE to DPLL_MODE_DETACHED) + value: 1 + - + name: locked + doc: | + dpll is locked to a valid signal, but no holdover available + - + name: locked-ho-acq + doc: | + dpll is locked and holdover acquired + - + name: holdover + doc: | + dpll is in holdover state - lost a valid lock or was forced + by disconnecting all the pins (latter possible only + when dpll lock-state was already DPLL_LOCK_STATUS_LOCKED_HO_ACQ, + if dpll lock-state was not DPLL_LOCK_STATUS_LOCKED_HO_ACQ, the + dpll's lock-state shall remain DPLL_LOCK_STATUS_UNLOCKED) + render-max: true + - + type: const + name: temp-divider + value: 1000 + doc: | + temperature divider allowing userspace to calculate the + temperature as float with three digit decimal precision. + Value of (DPLL_A_TEMP / DPLL_TEMP_DIVIDER) is integer part of + temperature value. + Value of (DPLL_A_TEMP % DPLL_TEMP_DIVIDER) is fractional part of + temperature value. + - + type: enum + name: type + doc: type of dpll, valid values for DPLL_A_TYPE attribute + entries: + - + name: pps + doc: dpll produces Pulse-Per-Second signal + value: 1 + - + name: eec + doc: dpll drives the Ethernet Equipment Clock + render-max: true + - + type: enum + name: pin-type + doc: | + defines possible types of a pin, valid values for DPLL_A_PIN_TYPE + attribute + entries: + - + name: mux + doc: aggregates another layer of selectable pins + value: 1 + - + name: ext + doc: external input + - + name: synce-eth-port + doc: ethernet port PHY's recovered clock + - + name: int-oscillator + doc: device internal oscillator + - + name: gnss + doc: GNSS recovered clock + render-max: true + - + type: enum + name: pin-direction + doc: | + defines possible direction of a pin, valid values for + DPLL_A_PIN_DIRECTION attribute + entries: + - + name: input + doc: pin used as a input of a signal + value: 1 + - + name: output + doc: pin used to output the signal + render-max: true + - + type: const + name: pin-frequency-1-hz + value: 1 + - + type: const + name: pin-frequency-10-khz + value: 10000 + - + type: const + name: pin-frequency-77_5-khz + value: 77500 + - + type: const + name: pin-frequency-10-mhz + value: 10000000 + - + type: enum + name: pin-state + doc: | + defines possible states of a pin, valid values for + DPLL_A_PIN_STATE attribute + entries: + - + name: connected + doc: pin connected, active input of phase locked loop + value: 1 + - + name: disconnected + doc: pin disconnected, not considered as a valid input + - + name: selectable + doc: pin enabled for automatic input selection + render-max: true + - + type: flags + name: pin-capabilities + doc: | + defines possible capabilities of a pin, valid flags on + DPLL_A_PIN_CAPABILITIES attribute + entries: + - + name: direction-can-change + doc: pin direction can be changed + - + name: priority-can-change + doc: pin priority can be changed + - + name: state-can-change + doc: pin state can be changed + +attribute-sets: + - + name: dpll + enum-name: dpll_a + attributes: + - + name: id + type: u32 + - + name: module-name + type: string + - + name: pad + type: pad + - + name: clock-id + type: u64 + - + name: mode + type: u32 + enum: mode + - + name: mode-supported + type: u32 + enum: mode + multi-attr: true + - + name: lock-status + type: u32 + enum: lock-status + - + name: temp + type: s32 + - + name: type + type: u32 + enum: type + - + name: pin + enum-name: dpll_a_pin + attributes: + - + name: id + type: u32 + - + name: parent-id + type: u32 + - + name: module-name + type: string + - + name: pad + type: pad + - + name: clock-id + type: u64 + - + name: board-label + type: string + - + name: panel-label + type: string + - + name: package-label + type: string + - + name: type + type: u32 + enum: pin-type + - + name: direction + type: u32 + enum: pin-direction + - + name: frequency + type: u64 + - + name: frequency-supported + type: nest + multi-attr: true + nested-attributes: frequency-range + - + name: frequency-min + type: u64 + - + name: frequency-max + type: u64 + - + name: prio + type: u32 + - + name: state + type: u32 + enum: pin-state + - + name: capabilities + type: u32 + - + name: parent-device + type: nest + multi-attr: true + nested-attributes: pin-parent-device + - + name: parent-pin + type: nest + multi-attr: true + nested-attributes: pin-parent-pin + - + name: pin-parent-device + subset-of: pin + attributes: + - + name: parent-id + type: u32 + - + name: direction + type: u32 + - + name: prio + type: u32 + - + name: state + type: u32 + - + name: pin-parent-pin + subset-of: pin + attributes: + - + name: parent-id + type: u32 + - + name: state + type: u32 + - + name: frequency-range + subset-of: pin + attributes: + - + name: frequency-min + type: u64 + - + name: frequency-max + type: u64 + +operations: + enum-name: dpll_cmd + list: + - + name: device-id-get + doc: | + Get id of dpll device that matches given attributes + attribute-set: dpll + flags: [ admin-perm ] + + do: + pre: dpll-lock-doit + post: dpll-unlock-doit + request: + attributes: + - module-name + - clock-id + - type + reply: + attributes: + - id + + - + name: device-get + doc: | + Get list of DPLL devices (dump) or attributes of a single dpll device + attribute-set: dpll + flags: [ admin-perm ] + + do: + pre: dpll-pre-doit + post: dpll-post-doit + request: + attributes: + - id + reply: &dev-attrs + attributes: + - id + - module-name + - mode + - mode-supported + - lock-status + - temp + - clock-id + - type + + dump: + pre: dpll-lock-dumpit + post: dpll-unlock-dumpit + reply: *dev-attrs + + - + name: device-set + doc: Set attributes for a DPLL device + attribute-set: dpll + flags: [ admin-perm ] + + do: + pre: dpll-pre-doit + post: dpll-post-doit + request: + attributes: + - id + - + name: device-create-ntf + doc: Notification about device appearing + notify: device-get + mcgrp: monitor + - + name: device-delete-ntf + doc: Notification about device disappearing + notify: device-get + mcgrp: monitor + - + name: device-change-ntf + doc: Notification about device configuration being changed + notify: device-get + mcgrp: monitor + - + name: pin-id-get + doc: | + Get id of a pin that matches given attributes + attribute-set: pin + flags: [ admin-perm ] + + do: + pre: dpll-lock-doit + post: dpll-unlock-doit + request: + attributes: + - module-name + - clock-id + - board-label + - panel-label + - package-label + - type + reply: + attributes: + - id + + - + name: pin-get + doc: | + Get list of pins and its attributes. + - dump request without any attributes given - list all the pins in the + system + - dump request with target dpll - list all the pins registered with + a given dpll device + - do request with target dpll and target pin - single pin attributes + attribute-set: pin + flags: [ admin-perm ] + + do: + pre: dpll-pin-pre-doit + post: dpll-pin-post-doit + request: + attributes: + - id + reply: &pin-attrs + attributes: + - id + - board-label + - panel-label + - package-label + - type + - frequency + - frequency-supported + - capabilities + - parent-device + - parent-pin + + dump: + pre: dpll-lock-dumpit + post: dpll-unlock-dumpit + request: + attributes: + - id + reply: *pin-attrs + + - + name: pin-set + doc: Set attributes of a target pin + attribute-set: pin + flags: [ admin-perm ] + + do: + pre: dpll-pin-pre-doit + post: dpll-pin-post-doit + request: + attributes: + - id + - frequency + - direction + - prio + - state + - parent-device + - parent-pin + - + name: pin-create-ntf + doc: Notification about pin appearing + notify: pin-get + mcgrp: monitor + - + name: pin-delete-ntf + doc: Notification about pin disappearing + notify: pin-get + mcgrp: monitor + - + name: pin-change-ntf + doc: Notification about pin configuration being changed + notify: pin-get + mcgrp: monitor + +mcast-groups: + list: + - + name: monitor diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c new file mode 100644 index 000000000000..14064c8c783b --- /dev/null +++ b/drivers/dpll/dpll_nl.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/dpll.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "dpll_nl.h" + +#include + +/* Common nested types */ +const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1] = { + [DPLL_A_PIN_PARENT_ID] = { .type = NLA_U32, }, + [DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2), + [DPLL_A_PIN_PRIO] = { .type = NLA_U32, }, + [DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3), +}; + +const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1] = { + [DPLL_A_PIN_PARENT_ID] = { .type = NLA_U32, }, + [DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3), +}; + +/* DPLL_CMD_DEVICE_ID_GET - do */ +static const struct nla_policy dpll_device_id_get_nl_policy[DPLL_A_TYPE + 1] = { + [DPLL_A_MODULE_NAME] = { .type = NLA_NUL_STRING, }, + [DPLL_A_CLOCK_ID] = { .type = NLA_U64, }, + [DPLL_A_TYPE] = NLA_POLICY_RANGE(NLA_U32, 1, 2), +}; + +/* DPLL_CMD_DEVICE_GET - do */ +static const struct nla_policy dpll_device_get_nl_policy[DPLL_A_ID + 1] = { + [DPLL_A_ID] = { .type = NLA_U32, }, +}; + +/* DPLL_CMD_DEVICE_SET - do */ +static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_ID + 1] = { + [DPLL_A_ID] = { .type = NLA_U32, }, +}; + +/* DPLL_CMD_PIN_ID_GET - do */ +static const struct nla_policy dpll_pin_id_get_nl_policy[DPLL_A_PIN_TYPE + 1] = { + [DPLL_A_PIN_MODULE_NAME] = { .type = NLA_NUL_STRING, }, + [DPLL_A_PIN_CLOCK_ID] = { .type = NLA_U64, }, + [DPLL_A_PIN_BOARD_LABEL] = { .type = NLA_NUL_STRING, }, + [DPLL_A_PIN_PANEL_LABEL] = { .type = NLA_NUL_STRING, }, + [DPLL_A_PIN_PACKAGE_LABEL] = { .type = NLA_NUL_STRING, }, + [DPLL_A_PIN_TYPE] = NLA_POLICY_RANGE(NLA_U32, 1, 5), +}; + +/* DPLL_CMD_PIN_GET - do */ +static const struct nla_policy dpll_pin_get_do_nl_policy[DPLL_A_PIN_ID + 1] = { + [DPLL_A_PIN_ID] = { .type = NLA_U32, }, +}; + +/* DPLL_CMD_PIN_GET - dump */ +static const struct nla_policy dpll_pin_get_dump_nl_policy[DPLL_A_PIN_ID + 1] = { + [DPLL_A_PIN_ID] = { .type = NLA_U32, }, +}; + +/* DPLL_CMD_PIN_SET - do */ +static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PARENT_PIN + 1] = { + [DPLL_A_PIN_ID] = { .type = NLA_U32, }, + [DPLL_A_PIN_FREQUENCY] = { .type = NLA_U64, }, + [DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2), + [DPLL_A_PIN_PRIO] = { .type = NLA_U32, }, + [DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3), + [DPLL_A_PIN_PARENT_DEVICE] = NLA_POLICY_NESTED(dpll_pin_parent_device_nl_policy), + [DPLL_A_PIN_PARENT_PIN] = NLA_POLICY_NESTED(dpll_pin_parent_pin_nl_policy), +}; + +/* Ops table for dpll */ +static const struct genl_split_ops dpll_nl_ops[] = { + { + .cmd = DPLL_CMD_DEVICE_ID_GET, + .pre_doit = dpll_lock_doit, + .doit = dpll_nl_device_id_get_doit, + .post_doit = dpll_unlock_doit, + .policy = dpll_device_id_get_nl_policy, + .maxattr = DPLL_A_TYPE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = DPLL_CMD_DEVICE_GET, + .pre_doit = dpll_pre_doit, + .doit = dpll_nl_device_get_doit, + .post_doit = dpll_post_doit, + .policy = dpll_device_get_nl_policy, + .maxattr = DPLL_A_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = DPLL_CMD_DEVICE_GET, + .start = dpll_lock_dumpit, + .dumpit = dpll_nl_device_get_dumpit, + .done = dpll_unlock_dumpit, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, + }, + { + .cmd = DPLL_CMD_DEVICE_SET, + .pre_doit = dpll_pre_doit, + .doit = dpll_nl_device_set_doit, + .post_doit = dpll_post_doit, + .policy = dpll_device_set_nl_policy, + .maxattr = DPLL_A_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = DPLL_CMD_PIN_ID_GET, + .pre_doit = dpll_lock_doit, + .doit = dpll_nl_pin_id_get_doit, + .post_doit = dpll_unlock_doit, + .policy = dpll_pin_id_get_nl_policy, + .maxattr = DPLL_A_PIN_TYPE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = DPLL_CMD_PIN_GET, + .pre_doit = dpll_pin_pre_doit, + .doit = dpll_nl_pin_get_doit, + .post_doit = dpll_pin_post_doit, + .policy = dpll_pin_get_do_nl_policy, + .maxattr = DPLL_A_PIN_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = DPLL_CMD_PIN_GET, + .start = dpll_lock_dumpit, + .dumpit = dpll_nl_pin_get_dumpit, + .done = dpll_unlock_dumpit, + .policy = dpll_pin_get_dump_nl_policy, + .maxattr = DPLL_A_PIN_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, + }, + { + .cmd = DPLL_CMD_PIN_SET, + .pre_doit = dpll_pin_pre_doit, + .doit = dpll_nl_pin_set_doit, + .post_doit = dpll_pin_post_doit, + .policy = dpll_pin_set_nl_policy, + .maxattr = DPLL_A_PIN_PARENT_PIN, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group dpll_nl_mcgrps[] = { + [DPLL_NLGRP_MONITOR] = { "monitor", }, +}; + +struct genl_family dpll_nl_family __ro_after_init = { + .name = DPLL_FAMILY_NAME, + .version = DPLL_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = dpll_nl_ops, + .n_split_ops = ARRAY_SIZE(dpll_nl_ops), + .mcgrps = dpll_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(dpll_nl_mcgrps), +}; diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h new file mode 100644 index 000000000000..1f67aaed4742 --- /dev/null +++ b/drivers/dpll/dpll_nl.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/dpll.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_DPLL_GEN_H +#define _LINUX_DPLL_GEN_H + +#include +#include + +#include + +/* Common nested types */ +extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1]; +extern const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1]; + +int dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +int dpll_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +dpll_unlock_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +dpll_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +dpll_pin_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +int dpll_lock_dumpit(struct netlink_callback *cb); +int dpll_unlock_dumpit(struct netlink_callback *cb); + +int dpll_nl_device_id_get_doit(struct sk_buff *skb, struct genl_info *info); +int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info); +int dpll_nl_device_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int dpll_nl_device_set_doit(struct sk_buff *skb, struct genl_info *info); +int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info); +int dpll_nl_pin_get_doit(struct sk_buff *skb, struct genl_info *info); +int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int dpll_nl_pin_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + DPLL_NLGRP_MONITOR, +}; + +extern struct genl_family dpll_nl_family; + +#endif /* _LINUX_DPLL_GEN_H */ diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h new file mode 100644 index 000000000000..20ef0718f8dc --- /dev/null +++ b/include/uapi/linux/dpll.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/dpll.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_DPLL_H +#define _UAPI_LINUX_DPLL_H + +#define DPLL_FAMILY_NAME "dpll" +#define DPLL_FAMILY_VERSION 1 + +/** + * enum dpll_mode - working modes a dpll can support, differentiates if and how + * dpll selects one of its inputs to syntonize with it, valid values for + * DPLL_A_MODE attribute + * @DPLL_MODE_MANUAL: input can be only selected by sending a request to dpll + * @DPLL_MODE_AUTOMATIC: highest prio input pin auto selected by dpll + */ +enum dpll_mode { + DPLL_MODE_MANUAL = 1, + DPLL_MODE_AUTOMATIC, + + /* private: */ + __DPLL_MODE_MAX, + DPLL_MODE_MAX = (__DPLL_MODE_MAX - 1) +}; + +/** + * enum dpll_lock_status - provides information of dpll device lock status, + * valid values for DPLL_A_LOCK_STATUS attribute + * @DPLL_LOCK_STATUS_UNLOCKED: dpll was not yet locked to any valid input (or + * forced by setting DPLL_A_MODE to DPLL_MODE_DETACHED) + * @DPLL_LOCK_STATUS_LOCKED: dpll is locked to a valid signal, but no holdover + * available + * @DPLL_LOCK_STATUS_LOCKED_HO_ACQ: dpll is locked and holdover acquired + * @DPLL_LOCK_STATUS_HOLDOVER: dpll is in holdover state - lost a valid lock or + * was forced by disconnecting all the pins (latter possible only when dpll + * lock-state was already DPLL_LOCK_STATUS_LOCKED_HO_ACQ, if dpll lock-state + * was not DPLL_LOCK_STATUS_LOCKED_HO_ACQ, the dpll's lock-state shall remain + * DPLL_LOCK_STATUS_UNLOCKED) + */ +enum dpll_lock_status { + DPLL_LOCK_STATUS_UNLOCKED = 1, + DPLL_LOCK_STATUS_LOCKED, + DPLL_LOCK_STATUS_LOCKED_HO_ACQ, + DPLL_LOCK_STATUS_HOLDOVER, + + /* private: */ + __DPLL_LOCK_STATUS_MAX, + DPLL_LOCK_STATUS_MAX = (__DPLL_LOCK_STATUS_MAX - 1) +}; + +#define DPLL_TEMP_DIVIDER 1000 + +/** + * enum dpll_type - type of dpll, valid values for DPLL_A_TYPE attribute + * @DPLL_TYPE_PPS: dpll produces Pulse-Per-Second signal + * @DPLL_TYPE_EEC: dpll drives the Ethernet Equipment Clock + */ +enum dpll_type { + DPLL_TYPE_PPS = 1, + DPLL_TYPE_EEC, + + /* private: */ + __DPLL_TYPE_MAX, + DPLL_TYPE_MAX = (__DPLL_TYPE_MAX - 1) +}; + +/** + * enum dpll_pin_type - defines possible types of a pin, valid values for + * DPLL_A_PIN_TYPE attribute + * @DPLL_PIN_TYPE_MUX: aggregates another layer of selectable pins + * @DPLL_PIN_TYPE_EXT: external input + * @DPLL_PIN_TYPE_SYNCE_ETH_PORT: ethernet port PHY's recovered clock + * @DPLL_PIN_TYPE_INT_OSCILLATOR: device internal oscillator + * @DPLL_PIN_TYPE_GNSS: GNSS recovered clock + */ +enum dpll_pin_type { + DPLL_PIN_TYPE_MUX = 1, + DPLL_PIN_TYPE_EXT, + DPLL_PIN_TYPE_SYNCE_ETH_PORT, + DPLL_PIN_TYPE_INT_OSCILLATOR, + DPLL_PIN_TYPE_GNSS, + + /* private: */ + __DPLL_PIN_TYPE_MAX, + DPLL_PIN_TYPE_MAX = (__DPLL_PIN_TYPE_MAX - 1) +}; + +/** + * enum dpll_pin_direction - defines possible direction of a pin, valid values + * for DPLL_A_PIN_DIRECTION attribute + * @DPLL_PIN_DIRECTION_INPUT: pin used as a input of a signal + * @DPLL_PIN_DIRECTION_OUTPUT: pin used to output the signal + */ +enum dpll_pin_direction { + DPLL_PIN_DIRECTION_INPUT = 1, + DPLL_PIN_DIRECTION_OUTPUT, + + /* private: */ + __DPLL_PIN_DIRECTION_MAX, + DPLL_PIN_DIRECTION_MAX = (__DPLL_PIN_DIRECTION_MAX - 1) +}; + +#define DPLL_PIN_FREQUENCY_1_HZ 1 +#define DPLL_PIN_FREQUENCY_10_KHZ 10000 +#define DPLL_PIN_FREQUENCY_77_5_KHZ 77500 +#define DPLL_PIN_FREQUENCY_10_MHZ 10000000 + +/** + * enum dpll_pin_state - defines possible states of a pin, valid values for + * DPLL_A_PIN_STATE attribute + * @DPLL_PIN_STATE_CONNECTED: pin connected, active input of phase locked loop + * @DPLL_PIN_STATE_DISCONNECTED: pin disconnected, not considered as a valid + * input + * @DPLL_PIN_STATE_SELECTABLE: pin enabled for automatic input selection + */ +enum dpll_pin_state { + DPLL_PIN_STATE_CONNECTED = 1, + DPLL_PIN_STATE_DISCONNECTED, + DPLL_PIN_STATE_SELECTABLE, + + /* private: */ + __DPLL_PIN_STATE_MAX, + DPLL_PIN_STATE_MAX = (__DPLL_PIN_STATE_MAX - 1) +}; + +/** + * enum dpll_pin_capabilities - defines possible capabilities of a pin, valid + * flags on DPLL_A_PIN_CAPABILITIES attribute + * @DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE: pin direction can be changed + * @DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE: pin priority can be changed + * @DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE: pin state can be changed + */ +enum dpll_pin_capabilities { + DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE = 1, + DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE = 2, + DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE = 4, +}; + +enum dpll_a { + DPLL_A_ID = 1, + DPLL_A_MODULE_NAME, + DPLL_A_PAD, + DPLL_A_CLOCK_ID, + DPLL_A_MODE, + DPLL_A_MODE_SUPPORTED, + DPLL_A_LOCK_STATUS, + DPLL_A_TEMP, + DPLL_A_TYPE, + + __DPLL_A_MAX, + DPLL_A_MAX = (__DPLL_A_MAX - 1) +}; + +enum dpll_a_pin { + DPLL_A_PIN_ID = 1, + DPLL_A_PIN_PARENT_ID, + DPLL_A_PIN_MODULE_NAME, + DPLL_A_PIN_PAD, + DPLL_A_PIN_CLOCK_ID, + DPLL_A_PIN_BOARD_LABEL, + DPLL_A_PIN_PANEL_LABEL, + DPLL_A_PIN_PACKAGE_LABEL, + DPLL_A_PIN_TYPE, + DPLL_A_PIN_DIRECTION, + DPLL_A_PIN_FREQUENCY, + DPLL_A_PIN_FREQUENCY_SUPPORTED, + DPLL_A_PIN_FREQUENCY_MIN, + DPLL_A_PIN_FREQUENCY_MAX, + DPLL_A_PIN_PRIO, + DPLL_A_PIN_STATE, + DPLL_A_PIN_CAPABILITIES, + DPLL_A_PIN_PARENT_DEVICE, + DPLL_A_PIN_PARENT_PIN, + + __DPLL_A_PIN_MAX, + DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) +}; + +enum dpll_cmd { + DPLL_CMD_DEVICE_ID_GET = 1, + DPLL_CMD_DEVICE_GET, + DPLL_CMD_DEVICE_SET, + DPLL_CMD_DEVICE_CREATE_NTF, + DPLL_CMD_DEVICE_DELETE_NTF, + DPLL_CMD_DEVICE_CHANGE_NTF, + DPLL_CMD_PIN_ID_GET, + DPLL_CMD_PIN_GET, + DPLL_CMD_PIN_SET, + DPLL_CMD_PIN_CREATE_NTF, + DPLL_CMD_PIN_DELETE_NTF, + DPLL_CMD_PIN_CHANGE_NTF, + + __DPLL_CMD_MAX, + DPLL_CMD_MAX = (__DPLL_CMD_MAX - 1) +}; + +#define DPLL_MCGRP_MONITOR "monitor" + +#endif /* _UAPI_LINUX_DPLL_H */ -- cgit v1.2.3 From 5f18426928800c59fb0f9bc8fb0c182bb6f5ee24 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 13 Sep 2023 21:49:39 +0100 Subject: netdev: expose DPLL pin handle for netdevice In case netdevice represents a SyncE port, the user needs to understand the connection between netdevice and associated DPLL pin. There might me multiple netdevices pointing to the same pin, in case of VF/SF implementation. Add a IFLA Netlink attribute to nest the DPLL pin handle, similar to how it is implemented for devlink port. Add a struct dpll_pin pointer to netdev and protect access to it by RTNL. Expose netdev_dpll_pin_set() and netdev_dpll_pin_clear() helpers to the drivers so they can set/clear the DPLL pin relationship to netdev. Note that during the lifetime of struct dpll_pin the pin handle does not change. Therefore it is save to access it lockless. It is drivers responsibility to call netdev_dpll_pin_clear() before dpll_pin_put(). Signed-off-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- drivers/dpll/dpll_netlink.c | 16 ++++++++++++++-- include/linux/dpll.h | 15 +++++++++++++++ include/linux/netdevice.h | 21 +++++++++++++++++++++ include/uapi/linux/if_link.h | 2 +- net/core/dev.c | 22 ++++++++++++++++++++++ net/core/rtnetlink.c | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 109 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 9464a6865977..764437a0661b 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -47,6 +47,18 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id) return 0; } +/** + * dpll_msg_pin_handle_size - get size of pin handle attribute for given pin + * @pin: pin pointer + * + * Return: byte size of pin handle attribute for given pin. + */ +size_t dpll_msg_pin_handle_size(struct dpll_pin *pin) +{ + return pin ? nla_total_size(4) : 0; /* DPLL_A_PIN_ID */ +} +EXPORT_SYMBOL_GPL(dpll_msg_pin_handle_size); + /** * dpll_msg_add_pin_handle - attach pin handle attribute to a given message * @msg: pointer to sk_buff message to attach a pin handle @@ -56,8 +68,7 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id) * * 0 - success * * -EMSGSIZE - no space in message to attach pin handle */ -static int -dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) +int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) { if (!pin) return 0; @@ -65,6 +76,7 @@ dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) return -EMSGSIZE; return 0; } +EXPORT_SYMBOL_GPL(dpll_msg_add_pin_handle); static int dpll_msg_add_mode(struct sk_buff *msg, struct dpll_device *dpll, diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 2202310c10cd..bbc480cd2932 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -101,6 +101,21 @@ struct dpll_pin_properties { struct dpll_pin_frequency *freq_supported; }; +#if IS_ENABLED(CONFIG_DPLL) +size_t dpll_msg_pin_handle_size(struct dpll_pin *pin); +int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin); +#else +static inline size_t dpll_msg_pin_handle_size(struct dpll_pin *pin) +{ + return 0; +} + +static inline int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) +{ + return 0; +} +#endif + struct dpll_device * dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0896aaa91dd7..db3d8429d50d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -79,6 +79,8 @@ struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; +/* DPLL specific */ +struct dpll_pin; typedef u32 xdp_features_t; @@ -2049,6 +2051,9 @@ enum netdev_ml_priv_type { * SET_NETDEV_DEVLINK_PORT macro. This pointer is static * during the time netdevice is registered. * + * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, + * where the clock is recovered. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2405,6 +2410,10 @@ struct net_device { struct rtnl_hw_stats64 *offload_xstats_l3; struct devlink_port *devlink_port; + +#if IS_ENABLED(CONFIG_DPLL) + struct dpll_pin *dpll_pin; +#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -3940,6 +3949,18 @@ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); +void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); +void netdev_dpll_pin_clear(struct net_device *dev); + +static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_DPLL) + return dev->dpll_pin; +#else + return NULL; +#endif +} + struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ce3117df9cec..fac351a93aed 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -376,7 +376,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, - + IFLA_DPLL_PIN, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index ccff2b6ef958..cc03a5758d2d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9023,6 +9023,28 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); +static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) +{ +#if IS_ENABLED(CONFIG_DPLL) + rtnl_lock(); + dev->dpll_pin = dpll_pin; + rtnl_unlock(); +#endif +} + +void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) +{ + WARN_ON(!dpll_pin); + netdev_dpll_pin_assign(dev, dpll_pin); +} +EXPORT_SYMBOL(netdev_dpll_pin_set); + +void netdev_dpll_pin_clear(struct net_device *dev) +{ + netdev_dpll_pin_assign(dev, NULL); +} +EXPORT_SYMBOL(netdev_dpll_pin_clear); + /** * dev_change_proto_down - set carrier according to proto_down. * diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4a2ec33bfb51..7452a6d190c5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -57,6 +57,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif +#include #include "dev.h" @@ -1055,6 +1056,15 @@ static size_t rtnl_devlink_port_size(const struct net_device *dev) return size; } +static size_t rtnl_dpll_pin_size(const struct net_device *dev) +{ + size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ + + size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev)); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1111,6 +1121,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + + rtnl_dpll_pin_size(dev) + 0; } @@ -1774,6 +1785,28 @@ nest_cancel: return ret; } +static int rtnl_fill_dpll_pin(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *dpll_pin_nest; + int ret; + + dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); + if (!dpll_pin_nest) + return -EMSGSIZE; + + ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev)); + if (ret < 0) + goto nest_cancel; + + nla_nest_end(skb, dpll_pin_nest); + return 0; + +nest_cancel: + nla_nest_cancel(skb, dpll_pin_nest); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1916,6 +1949,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_fill_devlink_port(skb, dev)) goto nla_put_failure; + if (rtnl_fill_dpll_pin(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 0b7a2721e36c11313f8b0f251a508d25a872cd28 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 13 Sep 2023 09:12:39 +0200 Subject: devlink: expose peer SF devlink instance Introduce a new helper devl_port_fn_devlink_set() to be used by driver assigning a devlink instance to the peer devlink port function. Expose this to user over new netlink attribute nested under port function nest to expose devlink handle related to the port function. This is particularly helpful for user to understand the relationship between devlink instances created for SFs and the port functions they belong to. Note that caller of devlink_port_notify() needs to hold devlink instance lock, put the assertion to devl_port_fn_devlink_set() to make this requirement explicit. Also note the limitations that only allow to make this assignment for registered objects. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +++ include/uapi/linux/devlink.h | 1 + net/devlink/port.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 29fd1b4ee654..2655ab6101ec 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -150,6 +150,7 @@ struct devlink_port { struct devlink_rate *devlink_rate; struct devlink_linecard *linecard; + u32 rel_index; }; struct devlink_port_new_attrs { @@ -1697,6 +1698,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); +int devl_port_fn_devlink_set(struct devlink_port *devlink_port, + struct devlink *fn_devlink); struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 03875e078be8..cd4b82458d1b 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -680,6 +680,7 @@ enum devlink_port_function_attr { DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ DEVLINK_PORT_FN_ATTR_CAPS, /* bitfield32 */ + DEVLINK_PORT_FN_ATTR_DEVLINK, /* nested */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 diff --git a/net/devlink/port.c b/net/devlink/port.c index 7b300a322ed9..4e9003242448 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -428,6 +428,13 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por if (err) goto out; err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); + if (err) + goto out; + err = devlink_rel_devlink_handle_put(msg, port->devlink, + port->rel_index, + DEVLINK_PORT_FN_ATTR_DEVLINK, + &msg_updated); + out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); @@ -1392,6 +1399,50 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); +static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index) +{ + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) + return; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} + +static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index, + u32 rel_index) +{ + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (devlink_port && devlink_port->rel_index == rel_index) + devlink_port->rel_index = 0; +} + +/** + * devl_port_fn_devlink_set - Attach peer devlink + * instance to port function. + * @devlink_port: devlink port + * @fn_devlink: devlink instance to attach + */ +int devl_port_fn_devlink_set(struct devlink_port *devlink_port, + struct devlink *fn_devlink) +{ + ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); + + if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF || + devlink_port->attrs.pci_sf.external)) + return -EINVAL; + + return devlink_rel_nested_in_add(&devlink_port->rel_index, + devlink_port->devlink->index, + devlink_port->index, + devlink_port_rel_notify_cb, + devlink_port_rel_cleanup_cb, + fn_devlink); +} +EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set); + /** * devlink_port_linecard_set - Link port with a linecard * -- cgit v1.2.3 From ddd7f45c899f7524bdbe6a32fe4906cde8b07b9b Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Thu, 14 Sep 2023 04:20:26 -0400 Subject: wifi: cfg80211: save power spectral density(psd) of regulatory rule 6 GHz regulatory domains introduces Power Spectral Density (PSD). The PSD value of the regulatory rule should be taken into effect for the ieee80211_channels falling into that particular regulatory rule. Save the values in the channel which has PSD value and add nl80211 attributes accordingly to handle it. Co-developed-by: Aditya Kumar Singh Signed-off-by: Aditya Kumar Singh Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20230914082026.3709-1-quic_wgong@quicinc.com [use hole in chan flags, reword docs] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 +++++- include/net/regulatory.h | 1 + include/uapi/linux/nl80211.h | 9 +++++++++ net/wireless/nl80211.c | 9 +++++++++ net/wireless/reg.c | 17 +++++++++++++++++ 5 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8fcfe1869424..9af714431b22 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -76,6 +76,8 @@ struct wiphy; * @IEEE80211_CHAN_DISABLED: This channel is disabled. * @IEEE80211_CHAN_NO_IR: do not initiate radiation, this includes * sending probe requests or beaconing. + * @IEEE80211_CHAN_PSD: Power spectral density (in dBm) is set for this + * channel. * @IEEE80211_CHAN_RADAR: Radar detection is required on this channel. * @IEEE80211_CHAN_NO_HT40PLUS: extension channel above this channel * is not permitted. @@ -119,7 +121,7 @@ struct wiphy; enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, IEEE80211_CHAN_NO_IR = 1<<1, - /* hole at 1<<2 */ + IEEE80211_CHAN_PSD = 1<<2, IEEE80211_CHAN_RADAR = 1<<3, IEEE80211_CHAN_NO_HT40PLUS = 1<<4, IEEE80211_CHAN_NO_HT40MINUS = 1<<5, @@ -171,6 +173,7 @@ enum ieee80211_channel_flags { * on this channel. * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered. * @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels. + * @psd: power spectral density (in dBm) */ struct ieee80211_channel { enum nl80211_band band; @@ -187,6 +190,7 @@ struct ieee80211_channel { enum nl80211_dfs_state dfs_state; unsigned long dfs_state_entered; unsigned int dfs_cac_ms; + s8 psd; }; /** diff --git a/include/net/regulatory.h b/include/net/regulatory.h index b2cb4a9eb04d..ebf9e028d1ef 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -213,6 +213,7 @@ struct ieee80211_reg_rule { u32 flags; u32 dfs_cac_ms; bool has_wmm; + s8 psd; }; struct ieee80211_regdomain { diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f797ab7a6547..367e5fbc8930 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4215,6 +4215,8 @@ enum nl80211_wmm_rule { * as the primary or any of the secondary channels isn't possible * @NL80211_FREQUENCY_ATTR_NO_EHT: EHT operation is not allowed on this channel * in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_PSD: Power spectral density (in dBm) that + * is allowed on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4253,6 +4255,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_16MHZ, NL80211_FREQUENCY_ATTR_NO_320MHZ, NL80211_FREQUENCY_ATTR_NO_EHT, + NL80211_FREQUENCY_ATTR_PSD, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4353,6 +4356,8 @@ enum nl80211_reg_type { * a given frequency range. The value is in mBm (100 * dBm). * @NL80211_ATTR_DFS_CAC_TIME: DFS CAC time in milliseconds. * If not present or 0 default CAC time will be used. + * @NL80211_ATTR_POWER_RULE_PSD: power spectral density (in dBm). + * This could be negative. * @NL80211_REG_RULE_ATTR_MAX: highest regulatory rule attribute number * currently defined * @__NL80211_REG_RULE_ATTR_AFTER_LAST: internal use @@ -4370,6 +4375,8 @@ enum nl80211_reg_rule_attr { NL80211_ATTR_DFS_CAC_TIME, + NL80211_ATTR_POWER_RULE_PSD, + /* keep last */ __NL80211_REG_RULE_ATTR_AFTER_LAST, NL80211_REG_RULE_ATTR_MAX = __NL80211_REG_RULE_ATTR_AFTER_LAST - 1 @@ -4453,6 +4460,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_HE: HE operation not allowed * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed * @NL80211_RRF_NO_EHT: EHT operation not allowed + * @NL80211_RRF_PSD: Ruleset has power spectral density value */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -4473,6 +4481,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_HE = 1<<17, NL80211_RRF_NO_320MHZ = 1<<18, NL80211_RRF_NO_EHT = 1<<19, + NL80211_RRF_PSD = 1<<20, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 218093607b29..e64bf2a58b36 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1115,6 +1115,10 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET, chan->freq_offset)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_PSD) && + nla_put_s8(msg, NL80211_FREQUENCY_ATTR_PSD, chan->psd)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_DISABLED) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED)) goto nla_put_failure; @@ -8529,6 +8533,11 @@ static int nl80211_put_regdom(const struct ieee80211_regdomain *regdom, reg_rule->dfs_cac_ms)) goto nla_put_failure; + if ((reg_rule->flags & NL80211_RRF_PSD) && + nla_put_s8(msg, NL80211_ATTR_POWER_RULE_PSD, + reg_rule->psd)) + goto nla_put_failure; + nla_nest_end(msg, nl_reg_rule); } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 33e2570f2bd6..eb2fa97457b4 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1589,6 +1589,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_320MHZ; if (rd_flags & NL80211_RRF_NO_EHT) channel_flags |= IEEE80211_CHAN_NO_EHT; + if (rd_flags & NL80211_RRF_PSD) + channel_flags |= IEEE80211_CHAN_PSD; return channel_flags; } @@ -1795,6 +1797,9 @@ static void handle_channel_single_rule(struct wiphy *wiphy, chan->dfs_cac_ms = reg_rule->dfs_cac_ms; } + if (chan->flags & IEEE80211_CHAN_PSD) + chan->psd = reg_rule->psd; + return; } @@ -1815,6 +1820,9 @@ static void handle_channel_single_rule(struct wiphy *wiphy, chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } + if (chan->flags & IEEE80211_CHAN_PSD) + chan->psd = reg_rule->psd; + if (chan->orig_mpwr) { /* * Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER @@ -1884,6 +1892,12 @@ static void handle_channel_adjacent_rules(struct wiphy *wiphy, rrule2->dfs_cac_ms); } + if ((rrule1->flags & NL80211_RRF_PSD) && + (rrule2->flags & NL80211_RRF_PSD)) + chan->psd = min_t(s8, rrule1->psd, rrule2->psd); + else + chan->flags &= ~NL80211_RRF_PSD; + return; } @@ -2570,6 +2584,9 @@ static void handle_channel_custom(struct wiphy *wiphy, chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; } + if (chan->flags & IEEE80211_CHAN_PSD) + chan->psd = reg_rule->psd; + chan->max_power = chan->max_reg_power; } -- cgit v1.2.3 From 5482c0a28b2634e7a7d8ddaca7feac183e74b528 Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Fri, 22 Sep 2023 14:55:51 +0530 Subject: wifi: cfg80211: OWE DH IE handling offload Introduce new feature flags for OWE offload that driver can advertise to indicate kernel/application space to avoid DH IE handling. When this flag is advertised, the driver/device will take care of DH IE inclusion and processing of peer DH IE to generate PMK. Signed-off-by: Vinayak Yadawad Link: https://lore.kernel.org/r/f891cce4b52c939dfc6b71bb2f73e560e8cad287.1695374530.git.vinayak.yadawad@broadcom.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 367e5fbc8930..0c64994b118e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -334,6 +334,15 @@ * use %NL80211_CMD_START_AP or similar functions. */ +/** + * DOC: OWE DH IE handling offload + * + * By setting @NL80211_EXT_FEATURE_OWE_OFFLOAD flag, drivers can indicate + * kernel/application space to avoid DH IE handling. When this flag is + * advertised, the driver/device will take care of DH IE inclusion and + * processing of peer DH IE to generate PMK. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -6411,6 +6420,12 @@ enum nl80211_feature_flags { * in authentication and deauthentication frames sent to unassociated peer * using @NL80211_CMD_FRAME. * + * @NL80211_EXT_FEATURE_OWE_OFFLOAD: Driver/Device wants to do OWE DH IE + * handling in station mode. + * + * @NL80211_EXT_FEATURE_OWE_OFFLOAD_AP: Driver/Device wants to do OWE DH IE + * handling in AP mode. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6482,6 +6497,8 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_PUNCT, NL80211_EXT_FEATURE_SECURE_NAN, NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA, + NL80211_EXT_FEATURE_OWE_OFFLOAD, + NL80211_EXT_FEATURE_OWE_OFFLOAD_AP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From e2b2cd592adbd303bcc02451d32fedd511000fb0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Sep 2023 23:31:38 +0200 Subject: bpf: Add missed value to kprobe_multi link info Add missed value to kprobe_multi link info to hold the stats of missed kprobe_multi probe. The missed counter gets incremented when fprobe fails the recursion check or there's no rethook available for return probe. In either case the attached bpf program is not executed. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Tested-by: Song Liu Reviewed-by: Song Liu Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20230920213145.1941596-3-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5f13db15a3c7..1da5b1bcce71 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6532,6 +6532,7 @@ struct bpf_link_info { __aligned_u64 addrs; __u32 count; /* in/out: kprobe_multi function count */ __u32 flags; + __u64 missed; } kprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 54827d04c9a6..6aec6e7d612a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2614,6 +2614,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5f13db15a3c7..1da5b1bcce71 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6532,6 +6532,7 @@ struct bpf_link_info { __aligned_u64 addrs; __u32 count; /* in/out: kprobe_multi function count */ __u32 flags; + __u64 missed; } kprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ -- cgit v1.2.3 From 3acf8ace68230e9558cf916847f1cc9f208abdf1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Sep 2023 23:31:39 +0200 Subject: bpf: Add missed value to kprobe perf link info Add missed value to kprobe attached through perf link info to hold the stats of missed kprobe handler execution. The kprobe's missed counter gets incremented when kprobe handler is not executed due to another kprobe running on the same cpu. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230920213145.1941596-4-jolsa@kernel.org --- include/linux/trace_events.h | 6 ++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 14 ++++++++------ kernel/trace/bpf_trace.c | 5 +++-- kernel/trace/trace_kprobe.c | 14 +++++++++++--- tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 28 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 21ae37e49319..5eb88a66eb68 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -761,7 +761,8 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, - u64 *probe_offset, u64 *probe_addr); + u64 *probe_offset, u64 *probe_addr, + unsigned long *missed); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else @@ -801,7 +802,7 @@ static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, - u64 *probe_addr) + u64 *probe_addr, unsigned long *missed) { return -EOPNOTSUPP; } @@ -877,6 +878,7 @@ extern void perf_kprobe_destroy(struct perf_event *event); extern int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, u64 *probe_addr, + unsigned long *missed, bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1da5b1bcce71..70bfa997e896 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6548,6 +6548,7 @@ struct bpf_link_info { __u32 name_len; __u32 offset; /* offset from func_name */ __u64 addr; + __u64 missed; } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ struct { __aligned_u64 tp_name; /* in/out */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85c1d908f70f..6b5280f14a53 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3374,7 +3374,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) static int bpf_perf_link_fill_common(const struct perf_event *event, char __user *uname, u32 ulen, u64 *probe_offset, u64 *probe_addr, - u32 *fd_type) + u32 *fd_type, unsigned long *missed) { const char *buf; u32 prog_id; @@ -3385,7 +3385,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, return -EINVAL; err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, - probe_offset, probe_addr); + probe_offset, probe_addr, missed); if (err) return err; if (!uname) @@ -3408,6 +3408,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, static int bpf_perf_link_fill_kprobe(const struct perf_event *event, struct bpf_link_info *info) { + unsigned long missed; char __user *uname; u64 addr, offset; u32 ulen, type; @@ -3416,7 +3417,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, - &type); + &type, &missed); if (err) return err; if (type == BPF_FD_TYPE_KRETPROBE) @@ -3425,6 +3426,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KPROBE; info->perf_event.kprobe.offset = offset; + info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) addr = 0; info->perf_event.kprobe.addr = addr; @@ -3444,7 +3446,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, - &type); + &type, NULL); if (err) return err; @@ -3480,7 +3482,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL); + return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -4813,7 +4815,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf, &probe_offset, - &probe_addr); + &probe_addr, NULL); if (!err) err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type, buf, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6aec6e7d612a..f6a7d2524949 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2384,7 +2384,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, - u64 *probe_offset, u64 *probe_addr) + u64 *probe_offset, u64 *probe_addr, + unsigned long *missed) { bool is_tracepoint, is_syscall_tp; struct bpf_prog *prog; @@ -2419,7 +2420,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, #ifdef CONFIG_KPROBE_EVENTS if (flags & TRACE_EVENT_FL_KPROBE) err = bpf_get_kprobe_info(event, fd_type, buf, - probe_offset, probe_addr, + probe_offset, probe_addr, missed, event->attr.type == PERF_TYPE_TRACEPOINT); #endif #ifdef CONFIG_UPROBE_EVENTS diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..961a78ffd6d2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1189,6 +1189,12 @@ static const struct file_operations kprobe_events_ops = { .write = probes_write, }; +static unsigned long trace_kprobe_missed(struct trace_kprobe *tk) +{ + return trace_kprobe_is_return(tk) ? + tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; +} + /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { @@ -1200,8 +1206,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) return 0; tk = to_trace_kprobe(ev); - nmissed = trace_kprobe_is_return(tk) ? - tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; + nmissed = trace_kprobe_missed(tk); seq_printf(m, " %-44s %15lu %15lu\n", trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), @@ -1547,7 +1552,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func); int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, - u64 *probe_addr, bool perf_type_tracepoint) + u64 *probe_addr, unsigned long *missed, + bool perf_type_tracepoint) { const char *pevent = trace_event_name(event->tp_event); const char *group = event->tp_event->class->system; @@ -1566,6 +1572,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, *probe_addr = kallsyms_show_value(current_cred()) ? (unsigned long)tk->rp.kp.addr : 0; *symbol = tk->symbol; + if (missed) + *missed = trace_kprobe_missed(tk); return 0; } #endif /* CONFIG_PERF_EVENTS */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1da5b1bcce71..70bfa997e896 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6548,6 +6548,7 @@ struct bpf_link_info { __u32 name_len; __u32 offset; /* offset from func_name */ __u64 addr; + __u64 missed; } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ struct { __aligned_u64 tp_name; /* in/out */ -- cgit v1.2.3 From 076433bd78d719b34d465c1e69eef512036b534c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Sep 2023 20:17:14 +0000 Subject: net_sched: sch_fq: add fast path for mostly idle qdisc TCQ_F_CAN_BYPASS can be used by few qdiscs. Idea is that if we queue a packet to an empty qdisc, following dequeue() would pick it immediately. FQ can not use the generic TCQ_F_CAN_BYPASS code, because some additional checks need to be performed. This patch adds a similar fast path to FQ. Most of the time, qdisc is not throttled, and many packets can avoid bringing/touching at least four cache lines, and consuming 128bytes of memory to store the state of a flow. After this patch, netperf can send UDP packets about 13 % faster, and pktgen goes 30 % faster (when FQ is in the way), on a fast NIC. TCP traffic is also improved, thanks to a reduction of cache line misses. I have measured a 5 % increase of throughput on a tcp_rr intensive workload. tc -s -d qd sh dev eth1 ... qdisc fq 8004: parent 1:2 limit 10000p flow_limit 100p buckets 1024 orphan_mask 1023 quantum 3028b initial_quantum 15140b low_rate_threshold 550Kbit refill_delay 40ms timer_slack 10us horizon 10s horizon_drop Sent 5646784384 bytes 1985161 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 flows 122 (inactive 122 throttled 0) gc 0 highprio 0 fastpath 659990 throttled 27762 latency 8.57us Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_fq.c | 128 +++++++++++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 37 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 3f85ae578056..579f641846b8 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -962,6 +962,7 @@ struct tc_fq_qd_stats { __u64 ce_mark; /* packets above ce_threshold */ __u64 horizon_drops; __u64 horizon_caps; + __u64 fastpath_packets; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4af43a401dbb..5cf3b50a24d5 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -2,7 +2,7 @@ /* * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) * - * Copyright (C) 2013-2015 Eric Dumazet + * Copyright (C) 2013-2023 Eric Dumazet * * Meant to be mostly used for locally generated traffic : * Fast classification depends on skb->sk being set before reaching us. @@ -73,7 +73,13 @@ struct fq_flow { struct sk_buff *tail; /* last skb in the list */ unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ }; - struct rb_node fq_node; /* anchor in fq_root[] trees */ + union { + struct rb_node fq_node; /* anchor in fq_root[] trees */ + /* Following field is only used for q->internal, + * because q->internal is not hashed in fq_root[] + */ + u64 stat_fastpath_packets; + }; struct sock *sk; u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ @@ -134,7 +140,7 @@ struct fq_sched_data { /* Seldom used fields. */ - u64 stat_internal_packets; + u64 stat_internal_packets; /* aka highprio */ u64 stat_ce_mark; u64 stat_horizon_drops; u64 stat_horizon_caps; @@ -266,17 +272,64 @@ static void fq_gc(struct fq_sched_data *q, kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } -static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +/* Fast path can be used if : + * 1) Packet tstamp is in the past. + * 2) FQ qlen == 0 OR + * (no flow is currently eligible for transmit, + * AND fast path queue has less than 8 packets) + * 3) No SO_MAX_PACING_RATE on the socket (if any). + * 4) No @maxrate attribute on this qdisc, + * + * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure. + */ +static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb) +{ + const struct fq_sched_data *q = qdisc_priv(sch); + const struct sock *sk; + + if (fq_skb_cb(skb)->time_to_send > q->ktime_cache) + return false; + + if (sch->q.qlen != 0) { + /* Even if some packets are stored in this qdisc, + * we can still enable fast path if all of them are + * scheduled in the future (ie no flows are eligible) + * or in the fast path queue. + */ + if (q->flows != q->inactive_flows + q->throttled_flows) + return false; + + /* Do not allow fast path queue to explode, we want Fair Queue mode + * under pressure. + */ + if (q->internal.qlen >= 8) + return false; + } + + sk = skb->sk; + if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) && + sk->sk_max_pacing_rate != ~0UL) + return false; + + if (q->flow_max_rate != ~0UL) + return false; + + return true; +} + +static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb) { + struct fq_sched_data *q = qdisc_priv(sch); struct rb_node **p, *parent; struct sock *sk = skb->sk; struct rb_root *root; struct fq_flow *f; /* warning: no starvation prevention... */ - if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) + if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) { + q->stat_internal_packets++; /* highprio packet */ return &q->internal; - + } /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket * or a listener (SYNCOOKIE mode) * 1) request sockets are not full blown, @@ -307,6 +360,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) sk = (struct sock *)((hash << 1) | 1UL); } + if (fq_fastpath_check(sch, skb)) { + q->internal.stat_fastpath_packets++; + return &q->internal; + } + root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; if (q->flows >= (2U << q->fq_trees_log) && @@ -402,12 +460,8 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, struct sk_buff *skb) { - struct fq_sched_data *q = qdisc_priv(sch); - fq_erase_head(sch, flow, skb); skb_mark_not_on_list(skb); - if (--flow->qlen == 0) - q->inactive_flows++; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } @@ -459,49 +513,45 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); + q->ktime_cache = ktime_get_ns(); if (!skb->tstamp) { - fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + fq_skb_cb(skb)->time_to_send = q->ktime_cache; } else { - /* Check if packet timestamp is too far in the future. - * Try first if our cached value, to avoid ktime_get_ns() - * cost in most cases. - */ + /* Check if packet timestamp is too far in the future. */ if (fq_packet_beyond_horizon(skb, q)) { - /* Refresh our cache and check another time */ - q->ktime_cache = ktime_get_ns(); - if (fq_packet_beyond_horizon(skb, q)) { - if (q->horizon_drop) { + if (q->horizon_drop) { q->stat_horizon_drops++; return qdisc_drop(skb, sch, to_free); - } - q->stat_horizon_caps++; - skb->tstamp = q->ktime_cache + q->horizon; } + q->stat_horizon_caps++; + skb->tstamp = q->ktime_cache + q->horizon; } fq_skb_cb(skb)->time_to_send = skb->tstamp; } - f = fq_classify(skb, q); - if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { - q->stat_flows_plimit++; - return qdisc_drop(skb, sch, to_free); - } + f = fq_classify(sch, skb); - if (f->qlen++ == 0) - q->inactive_flows--; - qdisc_qstats_backlog_inc(sch, skb); - if (fq_flow_is_detached(f)) { - fq_flow_add_tail(&q->new_flows, f); - if (time_after(jiffies, f->age + q->flow_refill_delay)) - f->credit = max_t(u32, f->credit, q->quantum); + if (f != &q->internal) { + if (unlikely(f->qlen >= q->flow_plimit)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch, to_free); + } + + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); + } + + if (f->qlen == 0) + q->inactive_flows--; } + f->qlen++; /* Note: this overwrites f->age */ flow_queue_add(f, skb); - if (unlikely(f == &q->internal)) { - q->stat_internal_packets++; - } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -549,6 +599,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) skb = fq_peek(&q->internal); if (unlikely(skb)) { + q->internal.qlen--; fq_dequeue_skb(sch, &q->internal, skb); goto out; } @@ -592,6 +643,8 @@ begin: INET_ECN_set_ce(skb); q->stat_ce_mark++; } + if (--f->qlen == 0) + q->inactive_flows++; fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; @@ -1024,6 +1077,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.gc_flows = q->stat_gc_flows; st.highprio_packets = q->stat_internal_packets; + st.fastpath_packets = q->internal.stat_fastpath_packets; st.tcp_retrans = 0; st.throttled = q->stat_throttled; st.flows_plimit = q->stat_flows_plimit; -- cgit v1.2.3 From 29f834aa326e659ed354c406056e94ea3d29706a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Oct 2023 13:17:37 +0000 Subject: net_sched: sch_fq: add 3 bands and WRR scheduling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before Google adopted FQ for its production servers, we had to ensure AF4 packets would get a higher share than BE1 ones. As discussed this week in Netconf 2023 in Paris, it is time to upstream this for public use. After this patch FQ can replace pfifo_fast, with the following differences : - FQ uses WRR instead of strict prio, to avoid starvation of low priority packets. - We make sure each band/prio tracks its own usage against sch->limit. This was done to make sure flood of low priority packets would not prevent AF4 packets to be queued. Contributed by Willem. - priomap can be changed, if needed (default value are the ones coming from pfifo_fast). In this patch, we set default band weights so that : - high prio (band=0) packets get 90% of the bandwidth if they compete with low prio (band=2) packets. - high prio packets get 75% of the bandwidth if they compete with medium prio (band=1) packets. Following patch in this series adds the possibility to tune the per-band weights. As we added many fields in 'struct fq_sched_data', we had to make sure to have the first cache line read-mostly, and avoid wasting precious cache lines. More optimizations are possible but will be sent separately. Signed-off-by: Eric Dumazet Acked-by: Dave Taht Reviewed-by: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Paolo Abeni --- include/uapi/linux/pkt_sched.h | 11 ++- net/sched/sch_fq.c | 204 ++++++++++++++++++++++++++++++++--------- 2 files changed, 171 insertions(+), 44 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 579f641846b8..ec5ab44d41a2 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -941,15 +941,19 @@ enum { TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */ + TCA_FQ_PRIOMAP, /* prio2band */ + __TCA_FQ_MAX }; #define TCA_FQ_MAX (__TCA_FQ_MAX - 1) +#define FQ_BANDS 3 + struct tc_fq_qd_stats { __u64 gc_flows; - __u64 highprio_packets; - __u64 tcp_retrans; + __u64 highprio_packets; /* obsolete */ + __u64 tcp_retrans; /* obsolete */ __u64 throttled; __u64 flows_plimit; __u64 pkts_too_long; @@ -963,6 +967,9 @@ struct tc_fq_qd_stats { __u64 horizon_drops; __u64 horizon_caps; __u64 fastpath_packets; + __u64 band_drops[FQ_BANDS]; + __u32 band_pkt_count[FQ_BANDS]; + __u32 pad; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 818ac786379d..081105801fa6 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -51,7 +51,8 @@ #include struct fq_skb_cb { - u64 time_to_send; + u64 time_to_send; + u8 band; }; static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) @@ -84,32 +85,28 @@ struct fq_flow { u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ -/* Second cache line, used in fq_dequeue() */ +/* Second cache line */ int credit; - /* 32bit hole on 64bit arches */ - + int band; struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; -} ____cacheline_aligned_in_smp; +}; struct fq_flow_head { struct fq_flow *first; struct fq_flow *last; }; -struct fq_sched_data { +struct fq_perband_flows { struct fq_flow_head new_flows; - struct fq_flow_head old_flows; + int credit; + int quantum; /* based on band nr : 576KB, 192KB, 64KB */ +}; - struct rb_root delayed; /* for rate limited flows */ - u64 time_next_delayed_flow; - unsigned long unthrottle_latency_ns; - - struct fq_flow internal; /* for non classified or high prio packets */ - +struct fq_sched_data { /* Read mostly cache line */ u32 quantum; @@ -125,10 +122,21 @@ struct fq_sched_data { u8 rate_enable; u8 fq_trees_log; u8 horizon_drop; + u8 prio2band[(TC_PRIO_MAX + 1) >> 2]; u32 timer_slack; /* hrtimer slack in ns */ /* Read/Write fields. */ + unsigned int band_nr; /* band being serviced in fq_dequeue() */ + + struct fq_perband_flows band_flows[FQ_BANDS]; + + struct fq_flow internal; /* fastpath queue. */ + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; + + u32 band_pkt_count[FQ_BANDS]; u32 flows; u32 inactive_flows; /* Flows with no packet to send. */ u32 throttled_flows; @@ -139,7 +147,7 @@ struct fq_sched_data { /* Seldom used fields. */ - u64 stat_internal_packets; /* aka highprio */ + u64 stat_band_drops[FQ_BANDS]; u64 stat_ce_mark; u64 stat_horizon_drops; u64 stat_horizon_caps; @@ -148,6 +156,12 @@ struct fq_sched_data { u64 stat_allocation_errors; }; +/* return the i-th 2-bit value ("crumb") */ +static u8 fq_prio2band(const u8 *prio2band, unsigned int prio) +{ + return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3; +} + /* * f->tail and f->age share the same location. * We can use the low order bit to differentiate if this location points @@ -172,8 +186,19 @@ static bool fq_flow_is_throttled(const struct fq_flow *f) return f->next == &throttled; } -static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +enum new_flow { + NEW_FLOW, + OLD_FLOW +}; + +static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow, + enum new_flow list_sel) { + struct fq_perband_flows *pband = &q->band_flows[flow->band]; + struct fq_flow_head *head = (list_sel == NEW_FLOW) ? + &pband->new_flows : + &pband->old_flows; + if (head->first) head->last->next = flow; else @@ -186,7 +211,7 @@ static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f) { rb_erase(&f->rate_node, &q->delayed); q->throttled_flows--; - fq_flow_add_tail(&q->old_flows, f); + fq_flow_add_tail(q, f, OLD_FLOW); } static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) @@ -326,11 +351,6 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb, struct rb_root *root; struct fq_flow *f; - /* warning: no starvation prevention... */ - if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) { - q->stat_internal_packets++; /* highprio packet */ - return &q->internal; - } /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket * or a listener (SYNCOOKIE mode) * 1) request sockets are not full blown, @@ -509,9 +529,13 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct fq_sched_data *q = qdisc_priv(sch); struct fq_flow *f; u64 now; + u8 band; - if (unlikely(sch->q.qlen >= sch->limit)) + band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); + if (unlikely(q->band_pkt_count[band] >= sch->limit)) { + q->stat_band_drops[band]++; return qdisc_drop(skb, sch, to_free); + } now = ktime_get_ns(); if (!skb->tstamp) { @@ -538,11 +562,14 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (fq_flow_is_detached(f)) { - fq_flow_add_tail(&q->new_flows, f); + fq_flow_add_tail(q, f, NEW_FLOW); if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); } + f->band = band; + q->band_pkt_count[band]++; + fq_skb_cb(skb)->band = band; if (f->qlen == 0) q->inactive_flows--; } @@ -584,13 +611,26 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) } } +static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband) +{ + if (pband->credit <= 0) + return NULL; + + if (pband->new_flows.first) + return &pband->new_flows; + + return pband->old_flows.first ? &pband->old_flows : NULL; +} + static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); + struct fq_perband_flows *pband; struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; unsigned long rate; + int retry; u32 plen; u64 now; @@ -606,24 +646,31 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) now = ktime_get_ns(); fq_check_throttled(q, now); + retry = 0; + pband = &q->band_flows[q->band_nr]; begin: - head = &q->new_flows; - if (!head->first) { - head = &q->old_flows; - if (!head->first) { - if (q->time_next_delayed_flow != ~0ULL) - qdisc_watchdog_schedule_range_ns(&q->watchdog, + head = fq_pband_head_select(pband); + if (!head) { + while (++retry < FQ_BANDS) { + if (++q->band_nr == FQ_BANDS) + q->band_nr = 0; + pband = &q->band_flows[q->band_nr]; + pband->credit = min(pband->credit + pband->quantum, + pband->quantum); + goto begin; + } + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_range_ns(&q->watchdog, q->time_next_delayed_flow, q->timer_slack); - return NULL; - } + return NULL; } f = head->first; - + retry = 0; if (f->credit <= 0) { f->credit += q->quantum; head->first = f->next; - fq_flow_add_tail(&q->old_flows, f); + fq_flow_add_tail(q, f, OLD_FLOW); goto begin; } @@ -645,12 +692,13 @@ begin: } if (--f->qlen == 0) q->inactive_flows++; + q->band_pkt_count[fq_skb_cb(skb)->band]--; fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ - if ((head == &q->new_flows) && q->old_flows.first) { - fq_flow_add_tail(&q->old_flows, f); + if (head == &pband->new_flows) { + fq_flow_add_tail(q, f, OLD_FLOW); } else { fq_flow_set_detached(f); } @@ -658,6 +706,7 @@ begin: } plen = qdisc_pkt_len(skb); f->credit -= plen; + pband->credit -= plen; if (!q->rate_enable) goto out; @@ -749,8 +798,10 @@ static void fq_reset(struct Qdisc *sch) kmem_cache_free(fq_flow_cachep, f); } } - q->new_flows.first = NULL; - q->old_flows.first = NULL; + for (idx = 0; idx < FQ_BANDS; idx++) { + q->band_flows[idx].new_flows.first = NULL; + q->band_flows[idx].old_flows.first = NULL; + } q->delayed = RB_ROOT; q->flows = 0; q->inactive_flows = 0; @@ -864,8 +915,54 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, [TCA_FQ_HORIZON] = { .type = NLA_U32 }, [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, + [TCA_FQ_PRIOMAP] = { + .type = NLA_BINARY, + .len = sizeof(struct tc_prio_qopt), + }, }; +/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ +static void fq_prio2band_compress_crumb(const u8 *in, u8 *out) +{ + const int num_elems = TC_PRIO_MAX + 1; + int i; + + memset(out, 0, num_elems / 4); + for (i = 0; i < num_elems; i++) + out[i / 4] |= in[i] << (2 * (i & 0x3)); +} + +static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out) +{ + const int num_elems = TC_PRIO_MAX + 1; + int i; + + for (i = 0; i < num_elems; i++) + out[i] = fq_prio2band(in, i); +} + +static int fq_load_priomap(struct fq_sched_data *q, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const struct tc_prio_qopt *map = nla_data(attr); + int i; + + if (map->bands != FQ_BANDS) { + NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands"); + return -EINVAL; + } + for (i = 0; i < TC_PRIO_MAX + 1; i++) { + if (map->priomap[i] >= FQ_BANDS) { + NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d", + i, map->priomap[i]); + return -EINVAL; + } + } + fq_prio2band_compress_crumb(map->priomap, q->prio2band); + return 0; +} + static int fq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -940,6 +1037,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, q->flow_refill_delay = usecs_to_jiffies(usecs_delay); } + if (!err && tb[TCA_FQ_PRIOMAP]) + err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack); + if (tb[TCA_FQ_ORPHAN_MASK]) q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); @@ -991,7 +1091,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct fq_sched_data *q = qdisc_priv(sch); - int err; + int i, err; sch->limit = 10000; q->flow_plimit = 100; @@ -1001,8 +1101,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->flow_max_rate = ~0UL; q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; - q->new_flows.first = NULL; - q->old_flows.first = NULL; + for (i = 0; i < FQ_BANDS; i++) { + q->band_flows[i].new_flows.first = NULL; + q->band_flows[i].old_flows.first = NULL; + } + q->band_flows[0].quantum = 9 << 16; + q->band_flows[1].quantum = 3 << 16; + q->band_flows[2].quantum = 1 << 16; q->delayed = RB_ROOT; q->fq_root = NULL; q->fq_trees_log = ilog2(1024); @@ -1017,6 +1122,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; + fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band); qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); if (opt) @@ -1031,6 +1137,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); u64 ce_threshold = q->ce_threshold; + struct tc_prio_qopt prio = { + .bands = FQ_BANDS, + }; u64 horizon = q->horizon; struct nlattr *opts; @@ -1062,6 +1171,10 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) goto nla_put_failure; + fq_prio2band_decompress_crumb(q->prio2band, prio.priomap); + if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio)) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -1072,11 +1185,14 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); struct tc_fq_qd_stats st; + int i; + + st.pad = 0; sch_tree_lock(sch); st.gc_flows = q->stat_gc_flows; - st.highprio_packets = q->stat_internal_packets; + st.highprio_packets = 0; st.fastpath_packets = q->internal.stat_fastpath_packets; st.tcp_retrans = 0; st.throttled = q->stat_throttled; @@ -1093,6 +1209,10 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.ce_mark = q->stat_ce_mark; st.horizon_drops = q->stat_horizon_drops; st.horizon_caps = q->stat_horizon_caps; + for (i = 0; i < FQ_BANDS; i++) { + st.band_drops[i] = q->stat_band_drops[i]; + st.band_pkt_count[i] = q->band_pkt_count[i]; + } sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); @@ -1120,7 +1240,7 @@ static int __init fq_module_init(void) fq_flow_cachep = kmem_cache_create("fq_flow_cache", sizeof(struct fq_flow), - 0, 0, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!fq_flow_cachep) return -ENOMEM; -- cgit v1.2.3 From 49e7265fd098fdade2bbdd9331e6b914cda7fa83 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Oct 2023 13:17:38 +0000 Subject: net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This attribute can be used to tune the per band weight and report them in "tc qdisc show" output: qdisc fq 802f: parent 1:9 limit 100000p flow_limit 500p buckets 1024 orphan_mask 1023 quantum 8364b initial_quantum 41820b low_rate_threshold 550Kbit refill_delay 40ms timer_slack 10us horizon 10s horizon_drop bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 weights 589824 196608 65536 Sent 236460814 bytes 792991 pkt (dropped 0, overlimits 0 requeues 0) rate 25816bit 10pps backlog 0b 0p requeues 0 flows 4 (inactive 4 throttled 0) gc 0 throttled 19 latency 17.6us fastpath 773882 Signed-off-by: Eric Dumazet Acked-by: Dave Taht Reviewed-by: Willem de Bruijn Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Paolo Abeni --- include/uapi/linux/pkt_sched.h | 3 +++ net/sched/sch_fq.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index ec5ab44d41a2..f762a10bfb78 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -943,12 +943,15 @@ enum { TCA_FQ_PRIOMAP, /* prio2band */ + TCA_FQ_WEIGHTS, /* Weights for each band */ + __TCA_FQ_MAX }; #define TCA_FQ_MAX (__TCA_FQ_MAX - 1) #define FQ_BANDS 3 +#define FQ_MIN_WEIGHT 16384 struct tc_fq_qd_stats { __u64 gc_flows; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 081105801fa6..8eacdb54e72f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -919,6 +919,10 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { .type = NLA_BINARY, .len = sizeof(struct tc_prio_qopt), }, + [TCA_FQ_WEIGHTS] = { + .type = NLA_BINARY, + .len = FQ_BANDS * sizeof(s32), + }, }; /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ @@ -941,6 +945,25 @@ static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out) out[i] = fq_prio2band(in, i); } +static int fq_load_weights(struct fq_sched_data *q, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + s32 *weights = nla_data(attr); + int i; + + for (i = 0; i < FQ_BANDS; i++) { + if (weights[i] < FQ_MIN_WEIGHT) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less that minimum allowed %d", + weights[i], FQ_MIN_WEIGHT); + return -EINVAL; + } + } + for (i = 0; i < FQ_BANDS; i++) + q->band_flows[i].quantum = weights[i]; + return 0; +} + static int fq_load_priomap(struct fq_sched_data *q, const struct nlattr *attr, struct netlink_ext_ack *extack) @@ -1040,6 +1063,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (!err && tb[TCA_FQ_PRIOMAP]) err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack); + if (!err && tb[TCA_FQ_WEIGHTS]) + err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack); + if (tb[TCA_FQ_ORPHAN_MASK]) q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); @@ -1142,6 +1168,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) }; u64 horizon = q->horizon; struct nlattr *opts; + s32 weights[3]; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) @@ -1175,6 +1202,12 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio)) goto nla_put_failure; + weights[0] = q->band_flows[0].quantum; + weights[1] = q->band_flows[1].quantum; + weights[2] = q->band_flows[2].quantum; + if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights)) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From d6247ecb6c1e17d7a33317090627f5bfe563cbb2 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 4 Oct 2023 11:23:38 -0500 Subject: bpf: Add ability to pin bpf timer to calling CPU BPF supports creating high resolution timers using bpf_timer_* helper functions. Currently, only the BPF_F_TIMER_ABS flag is supported, which specifies that the timeout should be interpreted as absolute time. It would also be useful to be able to pin that timer to a core. For example, if you wanted to make a subset of cores run without timer interrupts, and only have the timer be invoked on a single core. This patch adds support for this with a new BPF_F_TIMER_CPU_PIN flag. When specified, the HRTIMER_MODE_PINNED flag is passed to hrtimer_start(). A subsequent patch will update selftests to validate. Signed-off-by: David Vernet Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20231004162339.200702-2-void@manifault.com --- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/helpers.c | 5 ++++- tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 70bfa997e896..a7d4a1a69f21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5096,6 +5096,8 @@ union bpf_attr { * **BPF_F_TIMER_ABS** * Start the timer in absolute expire value instead of the * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. * * Return * 0 on success. @@ -7309,9 +7311,11 @@ struct bpf_core_relo { * Flags to control bpf_timer_start() behaviour. * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is * relative to current time. + * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller. */ enum { BPF_F_TIMER_ABS = (1ULL << 0), + BPF_F_TIMER_CPU_PIN = (1ULL << 1), }; /* BPF numbers iterator state */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dd1c69ee3375..d2840dd5b00d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1272,7 +1272,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla if (in_nmi()) return -EOPNOTSUPP; - if (flags > BPF_F_TIMER_ABS) + if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; @@ -1286,6 +1286,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla else mode = HRTIMER_MODE_REL_SOFT; + if (flags & BPF_F_TIMER_CPU_PIN) + mode |= HRTIMER_MODE_PINNED; + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 70bfa997e896..a7d4a1a69f21 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5096,6 +5096,8 @@ union bpf_attr { * **BPF_F_TIMER_ABS** * Start the timer in absolute expire value instead of the * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. * * Return * 0 on success. @@ -7309,9 +7311,11 @@ struct bpf_core_relo { * Flags to control bpf_timer_start() behaviour. * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is * relative to current time. + * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller. */ enum { BPF_F_TIMER_ABS = (1ULL << 0), + BPF_F_TIMER_CPU_PIN = (1ULL << 1), }; /* BPF numbers iterator state */ -- cgit v1.2.3 From dab4e1f06cabb6834de14264394ccab197007302 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Sat, 7 Oct 2023 10:14:14 +0200 Subject: bpf: Derive source IP addr via bpf_*_fib_lookup() Extend the bpf_fib_lookup() helper by making it to return the source IPv4/IPv6 address if the BPF_FIB_LOOKUP_SRC flag is set. For example, the following snippet can be used to derive the desired source IP address: struct bpf_fib_lookup p = { .ipv4_dst = ip4->daddr }; ret = bpf_skb_fib_lookup(skb, p, sizeof(p), BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH); if (ret != BPF_FIB_LKUP_RET_SUCCESS) return TC_ACT_SHOT; /* the p.ipv4_src now contains the source address */ The inability to derive the proper source address may cause malfunctions in BPF-based dataplanes for hosts containing netdevs with more than one routable IP address or for multi-homed hosts. For example, Cilium implements packet masquerading in BPF. If an egressing netdev to which the Cilium's BPF prog is attached has multiple IP addresses, then only one [hardcoded] IP address can be used for masquerading. This breaks connectivity if any other IP address should have been selected instead, for example, when a public and private addresses are attached to the same egress interface. The change was tested with Cilium [1]. Nikolay Aleksandrov helped to figure out the IPv6 addr selection. [1]: https://github.com/cilium/cilium/pull/28283 Signed-off-by: Martynas Pumputis Link: https://lore.kernel.org/r/20231007081415.33502-2-m@lambda.lt Signed-off-by: Martin KaFai Lau --- include/net/ipv6_stubs.h | 5 +++++ include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 18 +++++++++++++++++- net/ipv6/af_inet6.c | 1 + tools/include/uapi/linux/bpf.h | 10 ++++++++++ 5 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index c48186bf4737..21da31e1dff5 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -85,6 +85,11 @@ struct ipv6_bpf_stub { sockptr_t optval, unsigned int optlen); int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); + int (*ipv6_dev_get_saddr)(struct net *net, + const struct net_device *dst_dev, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a7d4a1a69f21..e0aa457f94a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3264,6 +3264,11 @@ union bpf_attr { * and *params*->smac will not be set as output. A common * use case is to call **bpf_redirect_neigh**\ () after * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -6964,6 +6969,7 @@ enum { BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), }; enum { @@ -6976,6 +6982,7 @@ enum { BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ }; struct bpf_fib_lookup { @@ -7010,6 +7017,9 @@ struct bpf_fib_lookup { __u32 rt_metric; }; + /* input: source address to consider for lookup + * output: source address result from lookup + */ union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ diff --git a/net/core/filter.c b/net/core/filter.c index a094694899c9..3880bf0b740d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5850,6 +5850,9 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) + params->ipv4_src = fib_result_prefsrc(net, &res); + /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ @@ -5992,6 +5995,18 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) { + if (res.f6i->fib6_prefsrc.plen) { + *src = res.f6i->fib6_prefsrc.addr; + } else { + err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, + &fl6.daddr, 0, + src); + if (err) + return BPF_FIB_LKUP_RET_NO_SRC_ADDR; + } + } + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; @@ -6010,7 +6025,8 @@ set_fwd_params: #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ - BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID) + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ + BPF_FIB_LOOKUP_SRC) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c6ad0d6e99b5..6337fb4504fd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1061,6 +1061,7 @@ static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .udp6_lib_lookup = __udp6_lib_lookup, .ipv6_setsockopt = do_ipv6_setsockopt, .ipv6_getsockopt = do_ipv6_getsockopt, + .ipv6_dev_get_saddr = ipv6_dev_get_saddr, }; static int __init inet6_init(void) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a7d4a1a69f21..e0aa457f94a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3264,6 +3264,11 @@ union bpf_attr { * and *params*->smac will not be set as output. A common * use case is to call **bpf_redirect_neigh**\ () after * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -6964,6 +6969,7 @@ enum { BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), }; enum { @@ -6976,6 +6982,7 @@ enum { BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ }; struct bpf_fib_lookup { @@ -7010,6 +7017,9 @@ struct bpf_fib_lookup { __u32 rt_metric; }; + /* input: source address to consider for lookup + * output: source address result from lookup + */ union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ -- cgit v1.2.3 From 859051dd165ec6cc915f0f2114699021144fd249 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Wed, 11 Oct 2023 20:51:06 +0200 Subject: bpf: Implement cgroup sockaddr hooks for unix sockets These hooks allows intercepting connect(), getsockname(), getpeername(), sendmsg() and recvmsg() for unix sockets. The unix socket hooks get write access to the address length because the address length is not fixed when dealing with unix sockets and needs to be modified when a unix socket address is modified by the hook. Because abstract socket unix addresses start with a NUL byte, we cannot recalculate the socket address in kernelspace after running the hook by calculating the length of the unix socket path using strlen(). These hooks can be used when users want to multiplex syscall to a single unix socket to multiple different processes behind the scenes by redirecting the connect() and other syscalls to process specific sockets. We do not implement support for intercepting bind() because when using bind() with unix sockets with a pathname address, this creates an inode in the filesystem which must be cleaned up. If we rewrite the address, the user might try to clean up the wrong file, leaking the socket in the filesystem where it is never cleaned up. Until we figure out a solution for this (and a use case for intercepting bind()), we opt to not allow rewriting the sockaddr in bind() calls. We also implement recvmsg() support for connected streams so that after a connect() that is modified by a sockaddr hook, any corresponding recmvsg() on the connected socket can also be modified to make the connected program think it is connected to the "intended" remote. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Daan De Meyer Link: https://lore.kernel.org/r/20231011185113.140426-5-daan.j.demeyer@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf-cgroup-defs.h | 5 +++++ include/linux/bpf-cgroup.h | 17 +++++++++++++++++ include/uapi/linux/bpf.h | 13 +++++++++---- kernel/bpf/cgroup.c | 11 +++++++++-- kernel/bpf/syscall.c | 15 +++++++++++++++ kernel/bpf/verifier.c | 5 ++++- net/core/filter.c | 14 ++++++++++++-- net/unix/af_unix.c | 35 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 13 +++++++++---- 9 files changed, 114 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index 7b121bd780eb..0985221d5478 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -28,19 +28,24 @@ enum cgroup_bpf_attach_type { CGROUP_INET6_BIND, CGROUP_INET4_CONNECT, CGROUP_INET6_CONNECT, + CGROUP_UNIX_CONNECT, CGROUP_INET4_POST_BIND, CGROUP_INET6_POST_BIND, CGROUP_UDP4_SENDMSG, CGROUP_UDP6_SENDMSG, + CGROUP_UNIX_SENDMSG, CGROUP_SYSCTL, CGROUP_UDP4_RECVMSG, CGROUP_UDP6_RECVMSG, + CGROUP_UNIX_RECVMSG, CGROUP_GETSOCKOPT, CGROUP_SETSOCKOPT, CGROUP_INET4_GETPEERNAME, CGROUP_INET6_GETPEERNAME, + CGROUP_UNIX_GETPEERNAME, CGROUP_INET4_GETSOCKNAME, CGROUP_INET6_GETSOCKNAME, + CGROUP_UNIX_GETSOCKNAME, CGROUP_INET_SOCK_RELEASE, CGROUP_LSM_START, CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1, diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 31561e789715..98b8cea904fe 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -48,19 +48,24 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) CGROUP_ATYPE(CGROUP_INET6_BIND); CGROUP_ATYPE(CGROUP_INET4_CONNECT); CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_UNIX_CONNECT); CGROUP_ATYPE(CGROUP_INET4_POST_BIND); CGROUP_ATYPE(CGROUP_INET6_POST_BIND); CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_UNIX_SENDMSG); CGROUP_ATYPE(CGROUP_SYSCTL); CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_UNIX_RECVMSG); CGROUP_ATYPE(CGROUP_GETSOCKOPT); CGROUP_ATYPE(CGROUP_SETSOCKOPT); CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); default: return CGROUP_BPF_ATTACH_TYPE_INVALID; @@ -289,18 +294,27 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL) +#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL) + #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx) +#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx) + #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL) +#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL) + /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by * sk_to_full_sk(). @@ -492,10 +506,13 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e0aa457f94a9..7ba61b75bc0e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1047,6 +1047,11 @@ enum bpf_attach_type { BPF_TCX_INGRESS, BPF_TCX_EGRESS, BPF_TRACE_UPROBE_MULTI, + BPF_CGROUP_UNIX_CONNECT, + BPF_CGROUP_UNIX_SENDMSG, + BPF_CGROUP_UNIX_RECVMSG, + BPF_CGROUP_UNIX_GETPEERNAME, + BPF_CGROUP_UNIX_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; @@ -2704,8 +2709,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: @@ -2943,8 +2948,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. * It supports the same set of *optname*\ s that is supported by diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ac37bd53aee0..74ad2215e1ba 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). * - * socket is expected to be of type INET or INET6. + * socket is expected to be of type INET, INET6 or UNIX. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases, 0 is returned. @@ -1482,7 +1482,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && + sk->sk_family != AF_UNIX) return 0; if (!ctx.uaddr) { @@ -2533,10 +2534,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_get_retval_proto; @@ -2548,10 +2552,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_set_retval_proto; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6b5280f14a53..8677837f3deb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2446,14 +2446,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return 0; default: return -EINVAL; @@ -3678,14 +3683,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: return BPF_PROG_TYPE_SOCK_OPS; @@ -3942,14 +3952,19 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eed7350e15f4..e777f50401b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14797,10 +14797,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) range = tnum_range(1, 1); if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) diff --git a/net/core/filter.c b/net/core/filter.c index ff0bd9f20b95..cc2e4babc85f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7875,14 +7875,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; @@ -7893,14 +7898,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; @@ -8948,8 +8958,8 @@ static bool sock_addr_is_valid_access(int off, int size, if (off % size != 0) return false; - /* Disallow access to IPv6 fields from IPv4 contex and vise - * versa. + /* Disallow access to fields not belonging to the attach type's address + * family. */ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3e8a04a13668..e10d07c76044 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -116,6 +116,7 @@ #include #include #include +#include #include "scm.h" @@ -1381,6 +1382,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; + err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); + if (err) + goto out; + if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !unix_sk(sk)->addr) { @@ -1490,6 +1495,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; + err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); + if (err) + goto out; + if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); @@ -1770,6 +1779,13 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); + + if (peer) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, + CGROUP_UNIX_GETPEERNAME); + else + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, + CGROUP_UNIX_GETSOCKNAME); } sock_put(sk); out: @@ -1922,6 +1938,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = unix_validate_addr(sunaddr, msg->msg_namelen); if (err) goto out; + + err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, + msg->msg_name, + &msg->msg_namelen, + NULL); + if (err) + goto out; } else { sunaddr = NULL; err = -ENOTCONN; @@ -2390,9 +2413,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - if (msg->msg_name) + if (msg->msg_name) { unix_copy_addr(msg, skb->sk); + BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, + msg->msg_name, + &msg->msg_namelen); + } + if (size > skb->len - skip) size = skb->len - skip; else if (size < skb->len - skip) @@ -2744,6 +2772,11 @@ unlock: DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, state->msg->msg_name); unix_copy_addr(state->msg, skb->sk); + + BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, + state->msg->msg_name, + &state->msg->msg_namelen); + sunaddr = NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e0aa457f94a9..7ba61b75bc0e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1047,6 +1047,11 @@ enum bpf_attach_type { BPF_TCX_INGRESS, BPF_TCX_EGRESS, BPF_TRACE_UPROBE_MULTI, + BPF_CGROUP_UNIX_CONNECT, + BPF_CGROUP_UNIX_SENDMSG, + BPF_CGROUP_UNIX_RECVMSG, + BPF_CGROUP_UNIX_GETPEERNAME, + BPF_CGROUP_UNIX_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; @@ -2704,8 +2709,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: @@ -2943,8 +2948,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. * It supports the same set of *optname*\ s that is supported by -- cgit v1.2.3 From 49dbe25adac42d3e06f65d1420946bec65896222 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Tue, 10 Oct 2023 22:15:14 +0300 Subject: vsock: read from socket's error queue This adds handling of MSG_ERRQUEUE input flag in receive call. This flag is used to read socket's error queue instead of data queue. Possible scenario of error queue usage is receiving completions for transmission with MSG_ZEROCOPY flag. This patch also adds new defines: 'SOL_VSOCK' and 'VSOCK_RECVERR'. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/socket.h | 1 + include/uapi/linux/vm_sockets.h | 17 +++++++++++++++++ net/vmw_vsock/af_vsock.c | 6 ++++++ 3 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 39b74d83c7c4..cfcb7e2c3813 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -383,6 +383,7 @@ struct ucred { #define SOL_MPTCP 284 #define SOL_MCTP 285 #define SOL_SMC 286 +#define SOL_VSOCK 287 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index c60ca33eac59..ed07181d4eff 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -191,4 +191,21 @@ struct sockaddr_vm { #define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9) +/* MSG_ZEROCOPY notifications are encoded in the standard error format, + * sock_extended_err. See Documentation/networking/msg_zerocopy.rst in + * kernel source tree for more details. + */ + +/* 'cmsg_level' field value of 'struct cmsghdr' for notification parsing + * when MSG_ZEROCOPY flag is used on transmissions. + */ + +#define SOL_VSOCK 287 + +/* 'cmsg_type' field value of 'struct cmsghdr' for notification parsing + * when MSG_ZEROCOPY flag is used on transmissions. + */ + +#define VSOCK_RECVERR 1 + #endif /* _UAPI_VM_SOCKETS_H */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index d841f4de33b0..38486efd3d05 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include @@ -110,6 +111,7 @@ #include #include #include +#include static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); @@ -2137,6 +2139,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int err; sk = sock->sk; + + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR); + vsk = vsock_sk(sk); err = 0; -- cgit v1.2.3 From c3c6ab95c397134bf5948f18743b3ba8008e7c47 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Wed, 11 Oct 2023 12:12:33 +0200 Subject: dpll: spec: add support for pin-dpll signal phase offset/adjust Add attributes for providing the user with: - measurement of signals phase offset between pin and dpll - ability to adjust the phase of pin signal Signed-off-by: Arkadiusz Kubalewski Signed-off-by: David S. Miller --- Documentation/netlink/specs/dpll.yaml | 30 ++++++++++++++++++++++++++++++ drivers/dpll/dpll_nl.c | 8 +++++--- drivers/dpll/dpll_nl.h | 2 +- include/uapi/linux/dpll.h | 6 ++++++ 4 files changed, 42 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 1c1b53136c7b..cf8abe1c0550 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -164,6 +164,18 @@ definitions: - name: state-can-change doc: pin state can be changed + - + type: const + name: phase-offset-divider + value: 1000 + doc: | + phase offset divider allows userspace to calculate a value of + measured signal phase difference between a pin and dpll device + as a fractional value with three digit decimal precision. + Value of (DPLL_A_PHASE_OFFSET / DPLL_PHASE_OFFSET_DIVIDER) is an + integer part of a measured phase offset value. + Value of (DPLL_A_PHASE_OFFSET % DPLL_PHASE_OFFSET_DIVIDER) is a + fractional part of a measured phase offset value. attribute-sets: - @@ -272,6 +284,18 @@ attribute-sets: type: nest multi-attr: true nested-attributes: pin-parent-pin + - + name: phase-adjust-min + type: s32 + - + name: phase-adjust-max + type: s32 + - + name: phase-adjust + type: s32 + - + name: phase-offset + type: s64 - name: pin-parent-device subset-of: pin @@ -284,6 +308,8 @@ attribute-sets: name: prio - name: state + - + name: phase-offset - name: pin-parent-pin subset-of: pin @@ -431,6 +457,9 @@ operations: - capabilities - parent-device - parent-pin + - phase-adjust-min + - phase-adjust-max + - phase-adjust dump: pre: dpll-lock-dumpit @@ -458,6 +487,7 @@ operations: - state - parent-device - parent-pin + - phase-adjust - name: pin-create-ntf doc: Notification about pin appearing diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c index 14064c8c783b..eaee5be7aa64 100644 --- a/drivers/dpll/dpll_nl.c +++ b/drivers/dpll/dpll_nl.c @@ -11,11 +11,12 @@ #include /* Common nested types */ -const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1] = { +const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_PHASE_OFFSET + 1] = { [DPLL_A_PIN_PARENT_ID] = { .type = NLA_U32, }, [DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2), [DPLL_A_PIN_PRIO] = { .type = NLA_U32, }, [DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3), + [DPLL_A_PIN_PHASE_OFFSET] = { .type = NLA_S64, }, }; const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1] = { @@ -61,7 +62,7 @@ static const struct nla_policy dpll_pin_get_dump_nl_policy[DPLL_A_PIN_ID + 1] = }; /* DPLL_CMD_PIN_SET - do */ -static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PARENT_PIN + 1] = { +static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PHASE_ADJUST + 1] = { [DPLL_A_PIN_ID] = { .type = NLA_U32, }, [DPLL_A_PIN_FREQUENCY] = { .type = NLA_U64, }, [DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2), @@ -69,6 +70,7 @@ static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_PARENT_PIN + 1] [DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3), [DPLL_A_PIN_PARENT_DEVICE] = NLA_POLICY_NESTED(dpll_pin_parent_device_nl_policy), [DPLL_A_PIN_PARENT_PIN] = NLA_POLICY_NESTED(dpll_pin_parent_pin_nl_policy), + [DPLL_A_PIN_PHASE_ADJUST] = { .type = NLA_S32, }, }; /* Ops table for dpll */ @@ -140,7 +142,7 @@ static const struct genl_split_ops dpll_nl_ops[] = { .doit = dpll_nl_pin_set_doit, .post_doit = dpll_pin_post_doit, .policy = dpll_pin_set_nl_policy, - .maxattr = DPLL_A_PIN_PARENT_PIN, + .maxattr = DPLL_A_PIN_PHASE_ADJUST, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, }; diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h index 1f67aaed4742..92d4c9c4f788 100644 --- a/drivers/dpll/dpll_nl.h +++ b/drivers/dpll/dpll_nl.h @@ -12,7 +12,7 @@ #include /* Common nested types */ -extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_STATE + 1]; +extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_PHASE_OFFSET + 1]; extern const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1]; int dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb, diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 20ef0718f8dc..715a491d2727 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -138,6 +138,8 @@ enum dpll_pin_capabilities { DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE = 4, }; +#define DPLL_PHASE_OFFSET_DIVIDER 1000 + enum dpll_a { DPLL_A_ID = 1, DPLL_A_MODULE_NAME, @@ -173,6 +175,10 @@ enum dpll_a_pin { DPLL_A_PIN_CAPABILITIES, DPLL_A_PIN_PARENT_DEVICE, DPLL_A_PIN_PARENT_PIN, + DPLL_A_PIN_PHASE_ADJUST_MIN, + DPLL_A_PIN_PHASE_ADJUST_MAX, + DPLL_A_PIN_PHASE_ADJUST, + DPLL_A_PIN_PHASE_OFFSET, __DPLL_A_PIN_MAX, DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) -- cgit v1.2.3 From c5a445b1e9347b14752b01f1a304bd7a2f260acc Mon Sep 17 00:00:00 2001 From: Xabier Marquiegui Date: Thu, 12 Oct 2023 00:39:56 +0200 Subject: ptp: support event queue reader channel masks On systems with multiple timestamp event channels, some readers might want to receive only a subset of those channels. Add the necessary modifications to support timestamp event channel filtering, including two IOCTL operations: - Clear all channels - Enable one channel The mask modification operations will be applied exclusively on the event queue assigned to the file descriptor used on the IOCTL operation, so the typical procedure to have a reader receiving only a subset of the enabled channels would be: - Open device file - ioctl: clear all channels - ioctl: enable one channel - start reading Calling the enable one channel ioctl more than once will result in multiple enabled channels. Signed-off-by: Xabier Marquiegui Suggested-by: Richard Cochran Suggested-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 26 ++++++++++++++++++++++++++ drivers/ptp/ptp_clock.c | 12 ++++++++++-- drivers/ptp/ptp_private.h | 3 +++ include/uapi/linux/ptp_clock.h | 2 ++ 4 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index abe94bb80cf6..ac2f2b5ea0b7 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -110,6 +110,12 @@ int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode) queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return -EINVAL; + queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL); + if (!queue->mask) { + kfree(queue); + return -EINVAL; + } + bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS); spin_lock_init(&queue->lock); list_add_tail(&queue->qlist, &ptp->tsevqs); pccontext->private_clkdata = queue; @@ -126,6 +132,7 @@ int ptp_release(struct posix_clock_context *pccontext) spin_lock_irqsave(&queue->lock, flags); list_del(&queue->qlist); spin_unlock_irqrestore(&queue->lock, flags); + bitmap_free(queue->mask); kfree(queue); } return 0; @@ -141,6 +148,7 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, struct system_device_crosststamp xtstamp; struct ptp_clock_info *ops = ptp->info; struct ptp_sys_offset *sysoff = NULL; + struct timestamp_event_queue *tsevq; struct ptp_system_timestamp sts; struct ptp_clock_request req; struct ptp_clock_caps caps; @@ -150,6 +158,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, struct timespec64 ts; int enable, err = 0; + tsevq = pccontext->private_clkdata; + switch (cmd) { case PTP_CLOCK_GETCAPS: @@ -448,6 +458,22 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, mutex_unlock(&ptp->pincfg_mux); break; + case PTP_MASK_CLEAR_ALL: + bitmap_clear(tsevq->mask, 0, PTP_MAX_CHANNELS); + break; + + case PTP_MASK_EN_SINGLE: + if (copy_from_user(&i, (void __user *)arg, sizeof(i))) { + err = -EFAULT; + break; + } + if (i >= PTP_MAX_CHANNELS) { + err = -EFAULT; + break; + } + set_bit(i, tsevq->mask); + break; + default: err = -ENOTTY; break; diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 74f1ce2dbccb..ed16d9787ce9 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -183,6 +183,7 @@ static void ptp_clock_release(struct device *dev) spin_lock_irqsave(&tsevq->lock, flags); list_del(&tsevq->qlist); spin_unlock_irqrestore(&tsevq->lock, flags); + bitmap_free(tsevq->mask); kfree(tsevq); ida_free(&ptp_clocks_map, ptp->index); kfree(ptp); @@ -243,6 +244,10 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (!queue) goto no_memory_queue; list_add_tail(&queue->qlist, &ptp->tsevqs); + queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL); + if (!queue->mask) + goto no_memory_bitmap; + bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS); spin_lock_init(&queue->lock); mutex_init(&ptp->pincfg_mux); mutex_init(&ptp->n_vclocks_mux); @@ -346,6 +351,8 @@ no_mem_for_vclocks: kworker_err: mutex_destroy(&ptp->pincfg_mux); mutex_destroy(&ptp->n_vclocks_mux); + bitmap_free(queue->mask); +no_memory_bitmap: list_del(&queue->qlist); kfree(queue); no_memory_queue: @@ -400,9 +407,10 @@ void ptp_clock_event(struct ptp_clock *ptp, struct ptp_clock_event *event) break; case PTP_CLOCK_EXTTS: - /* Enqueue timestamp on all queues */ + /* Enqueue timestamp on selected queues */ list_for_each_entry(tsevq, &ptp->tsevqs, qlist) { - enqueue_external_timestamp(tsevq, event); + if (test_bit((unsigned int)event->index, tsevq->mask)) + enqueue_external_timestamp(tsevq, event); } wake_up_interruptible(&ptp->tsev_wq); break; diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 9d5f3d95058e..ad4ce1b25c86 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -16,10 +16,12 @@ #include #include #include +#include #define PTP_MAX_TIMESTAMPS 128 #define PTP_BUF_TIMESTAMPS 30 #define PTP_DEFAULT_MAX_VCLOCKS 20 +#define PTP_MAX_CHANNELS 2048 struct timestamp_event_queue { struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS]; @@ -27,6 +29,7 @@ struct timestamp_event_queue { int tail; spinlock_t lock; struct list_head qlist; + unsigned long *mask; }; struct ptp_clock { diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 05cc35fc94ac..da700999cad4 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -224,6 +224,8 @@ struct ptp_pin_desc { _IOWR(PTP_CLK_MAGIC, 17, struct ptp_sys_offset_precise) #define PTP_SYS_OFFSET_EXTENDED2 \ _IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended) +#define PTP_MASK_CLEAR_ALL _IO(PTP_CLK_MAGIC, 19) +#define PTP_MASK_EN_SINGLE _IOW(PTP_CLK_MAGIC, 20, unsigned int) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occured. */ -- cgit v1.2.3 From ddd1ad68826d8ff61a2e47733959570aa4d39a16 Mon Sep 17 00:00:00 2001 From: Johannes Nixdorf Date: Mon, 16 Oct 2023 15:27:22 +0200 Subject: net: bridge: Add netlink knobs for number / max learned FDB entries The previous patch added accounting and a limit for the number of dynamically learned FDB entries per bridge. However it did not provide means to actually configure those bounds or read back the count. This patch does that. Two new netlink attributes are added for the accounting and limit of dynamically learned FDB entries: - IFLA_BR_FDB_N_LEARNED (RO) for the number of entries accounted for a single bridge. - IFLA_BR_FDB_MAX_LEARNED (RW) for the configured limit of entries for the bridge. The new attributes are used like this: # ip link add name br up type bridge fdb_max_learned 256 # ip link add name v1 up master br type veth peer v2 # ip link set up dev v2 # mausezahn -a rand -c 1024 v2 0.01 seconds (90877 packets per second # bridge fdb | grep -v permanent | wc -l 256 # ip -d link show dev br 13: br: mtu 1500 [...] [...] fdb_n_learned 256 fdb_max_learned 256 Signed-off-by: Johannes Nixdorf Acked-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/20231016-fdb_limit-v5-3-32cddff87758@avm.de Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 2 ++ net/bridge/br_netlink.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index fac351a93aed..9f8a3da0f14f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -510,6 +510,8 @@ enum { IFLA_BR_VLAN_STATS_PER_PORT, IFLA_BR_MULTI_BOOLOPT, IFLA_BR_MCAST_QUERIER_STATE, + IFLA_BR_FDB_N_LEARNED, + IFLA_BR_FDB_MAX_LEARNED, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 10f0d33d8ccf..0c3cf6e6dea2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1265,6 +1265,8 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, [IFLA_BR_MULTI_BOOLOPT] = NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)), + [IFLA_BR_FDB_N_LEARNED] = { .type = NLA_REJECT }, + [IFLA_BR_FDB_MAX_LEARNED] = { .type = NLA_U32 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1539,6 +1541,12 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } + if (data[IFLA_BR_FDB_MAX_LEARNED]) { + u32 val = nla_get_u32(data[IFLA_BR_FDB_MAX_LEARNED]); + + WRITE_ONCE(br->fdb_max_learned, val); + } + return 0; } @@ -1593,6 +1601,8 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_N_LEARNED */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_MAX_LEARNED */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */ @@ -1668,7 +1678,10 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED, br->topology_change_detected) || nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) || - nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm)) + nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm) || + nla_put_u32(skb, IFLA_BR_FDB_N_LEARNED, + atomic_read(&br->fdb_n_learned)) || + nla_put_u32(skb, IFLA_BR_FDB_MAX_LEARNED, br->fdb_max_learned)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_VLAN_FILTERING -- cgit v1.2.3 From 374d345d9b5e13380c66d7042f9533a6ac6d1195 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Oct 2023 14:39:20 -0700 Subject: netlink: add variable-length / auto integers We currently push everyone to use padding to align 64b values in netlink. Un-padded nla_put_u64() doesn't even exist any more. The story behind this possibly start with this thread: https://lore.kernel.org/netdev/20121204.130914.1457976839967676240.davem@davemloft.net/ where DaveM was concerned about the alignment of a structure containing 64b stats. If user space tries to access such struct directly: struct some_stats *stats = nla_data(attr); printf("A: %llu", stats->a); lack of alignment may become problematic for some architectures. These days we most often put every single member in a separate attribute, meaning that the code above would use a helper like nla_get_u64(), which can deal with alignment internally. Even for arches which don't have good unaligned access - access aligned to 4B should be pretty efficient. Kernel and well known libraries deal with unaligned input already. Padded 64b is quite space-inefficient (64b + pad means at worst 16B per attr vs 32b which takes 8B). It is also more typing: if (nla_put_u64_pad(rsp, NETDEV_A_SOMETHING_SOMETHING, value, NETDEV_A_SOMETHING_PAD)) Create a new attribute type which will use 32 bits at netlink level if value is small enough (probably most of the time?), and (4B-aligned) 64 bits otherwise. Kernel API is just: if (nla_put_uint(rsp, NETDEV_A_SOMETHING_SOMETHING, value)) Calling this new type "just" sint / uint with no specific size will hopefully also make people more comfortable with using it. Currently telling people "don't use u8, you may need the bits, and netlink will round up to 4B, anyway" is the #1 comment we give to newcomers. In terms of netlink layout it looks like this: 0 4 8 12 16 32b: [nlattr][ u32 ] 64b: [ pad ][nlattr][ u64 ] uint(32) [nlattr][ u32 ] uint(64) [nlattr][ u64 ] Signed-off-by: Jakub Kicinski Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- Documentation/userspace-api/netlink/specs.rst | 18 ++++++- include/net/netlink.h | 69 ++++++++++++++++++++++++++- include/uapi/linux/netlink.h | 5 ++ lib/nlattr.c | 22 +++++++++ net/netlink/policy.c | 14 ++++-- 5 files changed, 121 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/netlink/specs.rst b/Documentation/userspace-api/netlink/specs.rst index 40dd7442d2c3..c1b951649113 100644 --- a/Documentation/userspace-api/netlink/specs.rst +++ b/Documentation/userspace-api/netlink/specs.rst @@ -403,10 +403,21 @@ This section describes the attribute types supported by the ``genetlink`` compatibility level. Refer to documentation of different levels for additional attribute types. -Scalar integer types +Common integer types -------------------- -Fixed-width integer types: +``sint`` and ``uint`` represent signed and unsigned 64 bit integers. +If the value can fit on 32 bits only 32 bits are carried in netlink +messages, otherwise full 64 bits are carried. Note that the payload +is only aligned to 4B, so the full 64 bit value may be unaligned! + +Common integer types should be preferred over fix-width types in majority +of cases. + +Fix-width integer types +----------------------- + +Fixed-width integer types include: ``u8``, ``u16``, ``u32``, ``u64``, ``s8``, ``s16``, ``s32``, ``s64``. Note that types smaller than 32 bit should be avoided as using them @@ -416,6 +427,9 @@ See :ref:`pad_type` for padding of 64 bit attributes. The payload of the attribute is the integer in host order unless ``byte-order`` specifies otherwise. +64 bit values are usually aligned by the kernel but it is recommended +that the user space is able to deal with unaligned values. + .. _pad_type: pad diff --git a/include/net/netlink.h b/include/net/netlink.h index 8a7cd1170e1f..aba2b162a226 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -128,6 +128,8 @@ * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: + * nla_get_uint(nla) get payload for a uint attribute + * nla_get_sint(nla) get payload for a sint attribute * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute @@ -183,6 +185,8 @@ enum { NLA_REJECT, NLA_BE16, NLA_BE32, + NLA_SINT, + NLA_UINT, __NLA_TYPE_MAX, }; @@ -229,6 +233,7 @@ enum nla_policy_validation { * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. + * NLA_SINT, NLA_UINT, * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, @@ -260,12 +265,14 @@ enum nla_policy_validation { * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. + * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, * NLA_BE16, * NLA_BE32, + * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, @@ -280,6 +287,7 @@ enum nla_policy_validation { * or NLA_POLICY_FULL_RANGE_SIGNED() macros instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. + * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, @@ -288,6 +296,7 @@ enum nla_policy_validation { * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). + * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, @@ -377,9 +386,11 @@ struct nla_policy { #define __NLA_IS_UINT_TYPE(tp) \ (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ - tp == NLA_U64 || tp == NLA_BE16 || tp == NLA_BE32) + tp == NLA_U64 || tp == NLA_UINT || \ + tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ - (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) + (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \ + tp == NLA_SINT) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ @@ -1357,6 +1368,22 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) return nla_put(skb, attrtype, sizeof(u32), &tmp); } +/** + * nla_put_uint - Add a variable-size unsigned int to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value) +{ + u64 tmp64 = value; + u32 tmp32 = value; + + if (tmp64 == tmp32) + return nla_put_u32(skb, attrtype, tmp32); + return nla_put(skb, attrtype, sizeof(u64), &tmp64); +} + /** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to @@ -1511,6 +1538,22 @@ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } +/** + * nla_put_sint - Add a variable-size signed int to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value) +{ + s64 tmp64 = value; + s32 tmp32 = value; + + if (tmp64 == tmp32) + return nla_put_s32(skb, attrtype, tmp32); + return nla_put(skb, attrtype, sizeof(s64), &tmp64); +} + /** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to @@ -1667,6 +1710,17 @@ static inline u64 nla_get_u64(const struct nlattr *nla) return tmp; } +/** + * nla_get_uint - return payload of uint attribute + * @nla: uint netlink attribute + */ +static inline u64 nla_get_uint(const struct nlattr *nla) +{ + if (nla_len(nla) == sizeof(u32)) + return nla_get_u32(nla); + return nla_get_u64(nla); +} + /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute @@ -1729,6 +1783,17 @@ static inline s64 nla_get_s64(const struct nlattr *nla) return tmp; } +/** + * nla_get_sint - return payload of uint attribute + * @nla: uint netlink attribute + */ +static inline s64 nla_get_sint(const struct nlattr *nla) +{ + if (nla_len(nla) == sizeof(s32)) + return nla_get_s32(nla); + return nla_get_s64(nla); +} + /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e2ae82e3f9f7..f87aaf28a649 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -298,6 +298,8 @@ struct nla_bitfield32 { * entry has attributes again, the policy for those inner ones * and the corresponding maxtype may be specified. * @NL_ATTR_TYPE_BITFIELD32: &struct nla_bitfield32 attribute + * @NL_ATTR_TYPE_SINT: 32-bit or 64-bit signed attribute, aligned to 4B + * @NL_ATTR_TYPE_UINT: 32-bit or 64-bit unsigned attribute, aligned to 4B */ enum netlink_attribute_type { NL_ATTR_TYPE_INVALID, @@ -322,6 +324,9 @@ enum netlink_attribute_type { NL_ATTR_TYPE_NESTED_ARRAY, NL_ATTR_TYPE_BITFIELD32, + + NL_ATTR_TYPE_SINT, + NL_ATTR_TYPE_UINT, }; /** diff --git a/lib/nlattr.c b/lib/nlattr.c index 7a2b6c38fd59..dc15e7888fc1 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -134,6 +134,7 @@ void nla_get_range_unsigned(const struct nla_policy *pt, range->max = U32_MAX; break; case NLA_U64: + case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; @@ -183,6 +184,9 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, case NLA_U64: value = nla_get_u64(nla); break; + case NLA_UINT: + value = nla_get_uint(nla); + break; case NLA_MSECS: value = nla_get_u64(nla); break; @@ -248,6 +252,7 @@ void nla_get_range_signed(const struct nla_policy *pt, range->max = S32_MAX; break; case NLA_S64: + case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; @@ -295,6 +300,9 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt, case NLA_S64: value = nla_get_s64(nla); break; + case NLA_SINT: + value = nla_get_sint(nla); + break; default: return -EINVAL; } @@ -320,6 +328,7 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_U16: case NLA_U32: case NLA_U64: + case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: @@ -329,6 +338,7 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_S16: case NLA_S32: case NLA_S64: + case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); @@ -355,6 +365,9 @@ static int nla_validate_mask(const struct nla_policy *pt, case NLA_U64: value = nla_get_u64(nla); break; + case NLA_UINT: + value = nla_get_uint(nla); + break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; @@ -433,6 +446,15 @@ static int validate_nla(const struct nlattr *nla, int maxtype, goto out_err; break; + case NLA_SINT: + case NLA_UINT: + if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "invalid attribute length"); + return -EINVAL; + } + break; + case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; diff --git a/net/netlink/policy.c b/net/netlink/policy.c index e2f111edf66c..1f8909c16f14 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -230,6 +230,8 @@ int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt) case NLA_S16: case NLA_S32: case NLA_S64: + case NLA_SINT: + case NLA_UINT: /* maximum is common, u64 min/max with padding */ return common + 2 * (nla_attr_size(0) + nla_attr_size(sizeof(u64))); @@ -288,6 +290,7 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, case NLA_U16: case NLA_U32: case NLA_U64: + case NLA_UINT: case NLA_MSECS: { struct netlink_range_validation range; @@ -297,8 +300,10 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, type = NL_ATTR_TYPE_U16; else if (pt->type == NLA_U32) type = NL_ATTR_TYPE_U32; - else + else if (pt->type == NLA_U64) type = NL_ATTR_TYPE_U64; + else + type = NL_ATTR_TYPE_UINT; if (pt->validation_type == NLA_VALIDATE_MASK) { if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MASK, @@ -320,7 +325,8 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, case NLA_S8: case NLA_S16: case NLA_S32: - case NLA_S64: { + case NLA_S64: + case NLA_SINT: { struct netlink_range_validation_signed range; if (pt->type == NLA_S8) @@ -329,8 +335,10 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, type = NL_ATTR_TYPE_S16; else if (pt->type == NLA_S32) type = NL_ATTR_TYPE_S32; - else + else if (pt->type == NLA_S64) type = NL_ATTR_TYPE_S64; + else + type = NL_ATTR_TYPE_SINT; nla_get_range_signed(pt, &range); -- cgit v1.2.3 From b4a11b2033b7d3dfdd46592f7036a775b18cecd1 Mon Sep 17 00:00:00 2001 From: Heng Guo Date: Thu, 19 Oct 2023 09:20:53 +0800 Subject: net: fix IPSTATS_MIB_OUTPKGS increment in OutForwDatagrams. Reproduce environment: network with 3 VM linuxs is connected as below: VM1<---->VM2(latest kernel 6.5.0-rc7)<---->VM3 VM1: eth0 ip: 192.168.122.207 MTU 1500 VM2: eth0 ip: 192.168.122.208, eth1 ip: 192.168.123.224 MTU 1500 VM3: eth0 ip: 192.168.123.240 MTU 1500 Reproduce: VM1 send 1400 bytes UDP data to VM3 using tools scapy with flags=0. scapy command: send(IP(dst="192.168.123.240",flags=0)/UDP()/str('0'*1400),count=1, inter=1.000000) Result: Before IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 1 64 11 0 3 4 0 0 4 7 0 0 0 0 0 0 0 0 0 ...... ---------------------------------------------------------------------- After IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 1 64 12 0 3 5 0 0 4 8 0 0 0 0 0 0 0 0 0 ...... ---------------------------------------------------------------------- "ForwDatagrams" increase from 4 to 5 and "OutRequests" also increase from 7 to 8. Issue description and patch: IPSTATS_MIB_OUTPKTS("OutRequests") is counted with IPSTATS_MIB_OUTOCTETS ("OutOctets") in ip_finish_output2(). According to RFC 4293, it is "OutOctets" counted with "OutTransmits" but not "OutRequests". "OutRequests" does not include any datagrams counted in "ForwDatagrams". ipSystemStatsOutOctets OBJECT-TYPE DESCRIPTION "The total number of octets in IP datagrams delivered to the lower layers for transmission. Octets from datagrams counted in ipIfStatsOutTransmits MUST be counted here. ipSystemStatsOutRequests OBJECT-TYPE DESCRIPTION "The total number of IP datagrams that local IP user- protocols (including ICMP) supplied to IP in requests for transmission. Note that this counter does not include any datagrams counted in ipSystemStatsOutForwDatagrams. So do patch to define IPSTATS_MIB_OUTPKTS to "OutTransmits" and add IPSTATS_MIB_OUTREQUESTS for "OutRequests". Add IPSTATS_MIB_OUTREQUESTS counter in __ip_local_out() for ipv4 and add IPSTATS_MIB_OUT counter in ip6_finish_output2() for ipv6. Test result with patch: Before IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates OutTransmits Ip: 1 64 9 0 5 1 0 0 3 3 0 0 0 0 0 0 0 0 0 4 ...... root@qemux86-64:~# cat /proc/net/netstat ...... IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts OutBcastPkts InOctets OutOctets InMcastOctets OutMcastOctets InBcastOctets OutBcastOctets InCsumErrors InNoECTPkts InECT1Pkts InECT0Pkts InCEPkts ReasmOverlaps IpExt: 0 0 0 0 0 0 2976 1896 0 0 0 0 0 9 0 0 0 0 ---------------------------------------------------------------------- After IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates OutTransmits Ip: 1 64 10 0 5 2 0 0 3 3 0 0 0 0 0 0 0 0 0 5 ...... root@qemux86-64:~# cat /proc/net/netstat ...... IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts OutBcastPkts InOctets OutOctets InMcastOctets OutMcastOctets InBcastOctets OutBcastOctets InCsumErrors InNoECTPkts InECT1Pkts InECT0Pkts InCEPkts ReasmOverlaps IpExt: 0 0 0 0 0 0 4404 3324 0 0 0 0 0 10 0 0 0 0 ---------------------------------------------------------------------- "ForwDatagrams" increase from 1 to 2 and "OutRequests" is keeping 3. "OutTransmits" increase from 4 to 5 and "OutOctets" increase 1428. Signed-off-by: Heng Guo Reviewed-by: Kun Song Reviewed-by: Filip Pudak Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 3 ++- net/ipv4/ip_output.c | 2 ++ net/ipv4/proc.c | 3 ++- net/ipv6/ip6_output.c | 6 ++++-- net/ipv6/mcast.c | 5 ++--- net/ipv6/ndisc.c | 2 +- net/ipv6/proc.c | 3 ++- net/ipv6/raw.c | 2 +- 8 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 26f33a4c253d..b2b72886cb6d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -24,7 +24,7 @@ enum IPSTATS_MIB_INOCTETS, /* InOctets */ IPSTATS_MIB_INDELIVERS, /* InDelivers */ IPSTATS_MIB_OUTFORWDATAGRAMS, /* OutForwDatagrams */ - IPSTATS_MIB_OUTPKTS, /* OutRequests */ + IPSTATS_MIB_OUTREQUESTS, /* OutRequests */ IPSTATS_MIB_OUTOCTETS, /* OutOctets */ /* other fields */ IPSTATS_MIB_INHDRERRORS, /* InHdrErrors */ @@ -57,6 +57,7 @@ enum IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */ IPSTATS_MIB_CEPKTS, /* InCEPkts */ IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */ + IPSTATS_MIB_OUTPKTS, /* OutTransmits */ __IPSTATS_MIB_MAX }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 89e62ed08dad..b06f678b03a1 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,6 +101,8 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); + IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS); + iph_set_totlen(iph, skb->len); ip_send_check(iph); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index eaf1d3113b62..a85b0aba3646 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -83,7 +83,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), - SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS), + SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), @@ -93,6 +93,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), + SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a471c7e91761..571c10fb00b1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -117,6 +117,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return res; } + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + rcu_read_lock(); nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); @@ -328,7 +330,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -1987,7 +1989,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, skb->tstamp = cork->base.transmit_time; ip6_cork_steal_dst(skb, cork); - IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 99e28b444a4c..b75d3c9d41bb 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1789,7 +1789,7 @@ static void mld_sendpack(struct sk_buff *skb) rcu_read_lock(); idev = __in6_dev_get(skb->dev); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - sizeof(*pip6); @@ -2147,8 +2147,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) full_len = sizeof(struct ipv6hdr) + payload_len; rcu_read_lock(); - IP6_UPD_PO_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUT, full_len); + IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); rcu_read_unlock(); skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 679443d7ecb5..a19999b30bc0 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -504,7 +504,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, rcu_read_lock(); idev = __in6_dev_get(dst->dev); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index e20b3705c2d2..6d1d9221649d 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -61,7 +61,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), - SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS), + SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), @@ -84,6 +84,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), + SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a2aa54a2baae..dd0a4e73e602 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -651,7 +651,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, * have been queued for deletion. */ rcu_read_lock(); - IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) -- cgit v1.2.3 From 3d44de9a10ea2b1658dfaed8ea6d3d7b6e0defbb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:45 +0000 Subject: tcp: add RTAX_FEATURE_TCP_USEC_TS This new dst feature flag will be used to allow TCP to use usec based timestamps instead of msec ones. ip route .... feature tcp_usec_ts Also document that RTAX_FEATURE_SACK and RTAX_FEATURE_TIMESTAMP are unused. RTAX_FEATURE_ALLFRAG is also going away soon. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 +++++ include/uapi/linux/rtnetlink.h | 18 +++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e15452df9804..04a0e647ef74 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -576,4 +576,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); int tcp_sock_set_user_timeout(struct sock *sk, int val); +static inline bool dst_tcp_usec_ts(const struct dst_entry *dst) +{ + return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS); +} + #endif /* _LINUX_TCP_H */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 51c13cf9c5ae..aa2482a0614a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -502,13 +502,17 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) -#define RTAX_FEATURE_ECN (1 << 0) -#define RTAX_FEATURE_SACK (1 << 1) -#define RTAX_FEATURE_TIMESTAMP (1 << 2) -#define RTAX_FEATURE_ALLFRAG (1 << 3) - -#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ - RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) /* unused */ +#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ +#define RTAX_FEATURE_ALLFRAG (1 << 3) +#define RTAX_FEATURE_TCP_USEC_TS (1 << 4) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ + RTAX_FEATURE_TCP_USEC_TS) struct rta_session { __u8 proto; -- cgit v1.2.3 From a77a0f5c7f23a8a4981a2a3ff47baa91ceaf1f53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:48 +0000 Subject: tcp: add TCPI_OPT_USEC_TS Add the ability to report in tcp_info.tcpi_options if a flow is using usec resolution in TCP TS val. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index d1d08da6331a..8aa3916e14f6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ +#define TCPI_OPT_USEC_TS 64 /* usec timestamps */ /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b961364b4961..a86d8200a1e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3760,6 +3760,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) + info->tcpi_options |= TCPI_OPT_USEC_TS; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, -- cgit v1.2.3 From 8c90b8b4e8eb671929dc684b96e059cd862c067f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Oct 2023 12:16:33 -0700 Subject: wifi: nl80211: fix doc typos Correct some typos. Signed-off-by: Randy Dunlap Cc: Johannes Berg Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231001191633.19090-3-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0c64994b118e..dced2c49daec 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -167,7 +167,7 @@ * following events occur. * a) Expiration of hardware timer whose expiration time is set to maximum * coalescing delay of matching coalesce rule. - * b) Coalescing buffer in hardware reaches it's limit. + * b) Coalescing buffer in hardware reaches its limit. * c) Packet doesn't match any of the configured coalesce rules. * * User needs to configure following parameters for creating a coalesce @@ -326,7 +326,7 @@ /** * DOC: Multi-Link Operation * - * In Multi-Link Operation, a connection between to MLDs utilizes multiple + * In Multi-Link Operation, a connection between two MLDs utilizes multiple * links. To use this in nl80211, various commands and responses now need * to or will include the new %NL80211_ATTR_MLO_LINKS attribute. * Additionally, various commands that need to operate on a specific link -- cgit v1.2.3 From e3570f040836b2f6332ce5cb4db0fd070448aeb5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 21 Oct 2023 13:27:07 +0200 Subject: devlink: make devlink_flash_overwrite enum named one Since this enum is going to be used in generated userspace file, name it properly. Signed-off-by: Jiri Pirko Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20231021112711.660606-7-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/uapi/linux/devlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cd4b82458d1b..b3c8383d342d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -265,7 +265,7 @@ enum { * Documentation/networking/devlink/devlink-flash.rst * */ -enum { +enum devlink_flash_overwrite { DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT, DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT, -- cgit v1.2.3 From 1d0507f46843b14b0cb051fe50ebc7e6432111ab Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 23 Oct 2023 11:17:07 -0700 Subject: net: mptcp: convert netlink from small_ops to ops in the current MPTCP control plane, all operations use a netlink attribute of the same type "MPTCP_PM_ATTR". However, add/del/get/flush operations only parse the first element in the message _ the one that describes MPTCP endpoints (that was named MPTCP_PM_ATTR_ADDR and mostly used in ADD_ADDR operations _ probably the similarity of "attr", "addr" and "add" might cause some confusion to human readers). Convert MPTCP from 'small_ops' to 'ops', thus allowing different attributes for each single operation, hopefully makes all this clearer to human readers. - use a separate attribute set for add/del/get/flush address operation, binary compatible with the existing one, to store the endpoint address. MPTCP_PM_ENDPOINT_ADDR is added to the uAPI (with the same value as MPTCP_PM_ATTR_ADDR) for these operations. - convert mptcp_pm_ops[] and add policy files accordingly. this prepares MPTCP control plane to be described as YAML spec. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/340 Acked-by: Paolo Abeni Signed-off-by: Davide Caratti Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-1-v2-3-16b1f701f900@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 8 ++ net/mptcp/pm_netlink.c | 191 ++++++++++++++++++++++++++++++--------------- 2 files changed, 135 insertions(+), 64 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index ee9c49f949a2..0e62937ab17c 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -65,6 +65,14 @@ enum { #define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1) +enum { + MPTCP_PM_ENDPOINT_ADDR = 1, + + __MPTCP_PM_ENDPOINT_MAX +}; + +#define MPTCP_PM_ENDPOINT_MAX (__MPTCP_PM_ENDPOINT_MAX - 1) + enum { MPTCP_PM_ADDR_ATTR_UNSPEC, diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 9661f3812682..fd4e843505e5 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -48,6 +48,60 @@ struct pm_nl_pernet { #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 +static +const struct nla_policy mptcp_pm_address_nl_policy[MPTCP_PM_ADDR_ATTR_IF_IDX + 1] = { + [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, + [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, + [MPTCP_PM_ADDR_ATTR_ADDR6] = NLA_POLICY_EXACT_LEN(16), + [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16, }, + [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32, }, + [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32, }, +}; + +/* MPTCP_PM_CMD_ADD_ADDR / DEL / GET / FLUSH - do */ +static +const struct nla_policy mptcp_pm_endpoint_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] = { + [MPTCP_PM_ENDPOINT_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), +}; + +/* MPTCP_PM_CMD_SET_LIMITS - do */ +static +const struct nla_policy mptcp_pm_set_limits_nl_policy[MPTCP_PM_ATTR_SUBFLOWS + 1] = { + [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, +}; + +/* MPTCP_PM_CMD_SET_FLAGS - do */ +static +const struct nla_policy mptcp_pm_set_flags_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1] = { + [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), +}; + +/* MPTCP_PM_CMD_ANNOUNCE - do */ +static +const struct nla_policy mptcp_pm_announce_nl_policy[MPTCP_PM_ATTR_TOKEN + 1] = { + [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, +}; + +/* MPTCP_PM_CMD_REMOVE - do */ +static +const struct nla_policy mptcp_pm_remove_nl_policy[MPTCP_PM_ATTR_LOC_ID + 1] = { + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, +}; + +/* MPTCP_PM_CMD_SUBFLOW_CREATE / DESTROY - do */ +static +const struct nla_policy mptcp_pm_subflow_create_nl_policy[MPTCP_PM_ATTR_ADDR_REMOTE + 1] = { + [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), +}; + static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) { return net_generic(net, pm_nl_pernet_id); @@ -1104,29 +1158,6 @@ static const struct genl_multicast_group mptcp_pm_mcgrps[] = { }, }; -static const struct nla_policy -mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = { - [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, - [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, - [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, - [MPTCP_PM_ADDR_ATTR_ADDR6] = - NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), - [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 }, - [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 }, - [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 }, -}; - -static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { - [MPTCP_PM_ATTR_ADDR] = - NLA_POLICY_NESTED(mptcp_pm_addr_policy), - [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, - [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, - [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, - [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, - [MPTCP_PM_ATTR_ADDR_REMOTE] = - NLA_POLICY_NESTED(mptcp_pm_addr_policy), -}; - void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); @@ -1188,7 +1219,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], /* no validation needed - was already done via nested policy */ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, - mptcp_pm_addr_policy, info->extack); + mptcp_pm_address_nl_policy, info->extack); if (err) return err; @@ -1305,7 +1336,7 @@ next: static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; int ret; @@ -1486,7 +1517,7 @@ next: static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; unsigned int addr_max; @@ -1677,7 +1708,7 @@ nla_put_failure: static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; struct sk_buff *msg; @@ -2283,72 +2314,104 @@ nla_put_failure: nlmsg_free(skb); } -static const struct genl_small_ops mptcp_pm_ops[] = { +static const struct genl_ops mptcp_pm_ops[] = { { - .cmd = MPTCP_PM_CMD_ADD_ADDR, - .doit = mptcp_nl_cmd_add_addr, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_ADD_ADDR, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_add_addr, + .policy = mptcp_pm_endpoint_nl_policy, + .maxattr = MPTCP_PM_ENDPOINT_ADDR, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_DEL_ADDR, - .doit = mptcp_nl_cmd_del_addr, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_DEL_ADDR, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_del_addr, + .policy = mptcp_pm_endpoint_nl_policy, + .maxattr = MPTCP_PM_ENDPOINT_ADDR, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, - .doit = mptcp_nl_cmd_flush_addrs, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_GET_ADDR, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_get_addr, + .dumpit = mptcp_nl_cmd_dump_addrs, + .policy = mptcp_pm_endpoint_nl_policy, + .maxattr = MPTCP_PM_ENDPOINT_ADDR, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_GET_ADDR, - .doit = mptcp_nl_cmd_get_addr, - .dumpit = mptcp_nl_cmd_dump_addrs, + .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_flush_addrs, + .policy = mptcp_pm_endpoint_nl_policy, + .maxattr = MPTCP_PM_ENDPOINT_ADDR, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_SET_LIMITS, - .doit = mptcp_nl_cmd_set_limits, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_SET_LIMITS, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_set_limits, + .policy = mptcp_pm_set_limits_nl_policy, + .maxattr = MPTCP_PM_ATTR_SUBFLOWS, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_GET_LIMITS, - .doit = mptcp_nl_cmd_get_limits, + .cmd = MPTCP_PM_CMD_GET_LIMITS, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_get_limits, + .policy = mptcp_pm_set_limits_nl_policy, + .maxattr = MPTCP_PM_ATTR_SUBFLOWS, }, { - .cmd = MPTCP_PM_CMD_SET_FLAGS, - .doit = mptcp_nl_cmd_set_flags, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_SET_FLAGS, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_set_flags, + .policy = mptcp_pm_set_flags_nl_policy, + .maxattr = MPTCP_PM_ATTR_ADDR_REMOTE, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_ANNOUNCE, - .doit = mptcp_nl_cmd_announce, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_ANNOUNCE, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_announce, + .policy = mptcp_pm_announce_nl_policy, + .maxattr = MPTCP_PM_ATTR_TOKEN, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_REMOVE, - .doit = mptcp_nl_cmd_remove, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_REMOVE, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_remove, + .policy = mptcp_pm_remove_nl_policy, + .maxattr = MPTCP_PM_ATTR_LOC_ID, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, - .doit = mptcp_nl_cmd_sf_create, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_sf_create, + .policy = mptcp_pm_subflow_create_nl_policy, + .maxattr = MPTCP_PM_ATTR_ADDR_REMOTE, + .flags = GENL_UNS_ADMIN_PERM, }, { - .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, - .doit = mptcp_nl_cmd_sf_destroy, - .flags = GENL_UNS_ADMIN_PERM, + .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, + .validate = GENL_DONT_VALIDATE_STRICT, + .doit = mptcp_nl_cmd_sf_destroy, + .policy = mptcp_pm_subflow_create_nl_policy, + .maxattr = MPTCP_PM_ATTR_ADDR_REMOTE, + .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family mptcp_genl_family __ro_after_init = { .name = MPTCP_PM_NAME, .version = MPTCP_PM_VER, - .maxattr = MPTCP_PM_ATTR_MAX, - .policy = mptcp_pm_policy, .netnsok = true, .module = THIS_MODULE, - .small_ops = mptcp_pm_ops, - .n_small_ops = ARRAY_SIZE(mptcp_pm_ops), + .ops = mptcp_pm_ops, + .n_ops = ARRAY_SIZE(mptcp_pm_ops), .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1, .mcgrps = mptcp_pm_mcgrps, .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), -- cgit v1.2.3 From 9d1ed17f93ce873cda16698267bdee096a768839 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 23 Oct 2023 11:17:09 -0700 Subject: uapi: mptcp: use header file generated from YAML spec generated with: $ ./tools/net/ynl/ynl-gen-c.py --mode uapi \ > --spec Documentation/netlink/specs/mptcp.yaml \ > --header -o include/uapi/linux/mptcp_pm.h Link: https://github.com/multipath-tcp/mptcp_net-next/issues/340 Acked-by: Paolo Abeni Signed-off-by: Davide Caratti Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-1-v2-5-16b1f701f900@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- include/uapi/linux/mptcp.h | 182 +++--------------------------------------- include/uapi/linux/mptcp_pm.h | 150 ++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 173 deletions(-) create mode 100644 include/uapi/linux/mptcp_pm.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 977de4624fe0..b2f53d5cae06 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14964,7 +14964,7 @@ F: Documentation/netlink/specs/mptcp.yaml F: Documentation/networking/mptcp-sysctl.rst F: include/net/mptcp.h F: include/trace/events/mptcp.h -F: include/uapi/linux/mptcp.h +F: include/uapi/linux/mptcp*.h F: net/mptcp/ F: tools/testing/selftests/bpf/*/*mptcp*.c F: tools/testing/selftests/net/mptcp/ diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 0e62937ab17c..64ecc8a3f9f2 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -23,99 +23,24 @@ #define MPTCP_SUBFLOW_FLAG_CONNECTED _BITUL(7) #define MPTCP_SUBFLOW_FLAG_MAPVALID _BITUL(8) -enum { - MPTCP_SUBFLOW_ATTR_UNSPEC, - MPTCP_SUBFLOW_ATTR_TOKEN_REM, - MPTCP_SUBFLOW_ATTR_TOKEN_LOC, - MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, - MPTCP_SUBFLOW_ATTR_MAP_SEQ, - MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, - MPTCP_SUBFLOW_ATTR_SSN_OFFSET, - MPTCP_SUBFLOW_ATTR_MAP_DATALEN, - MPTCP_SUBFLOW_ATTR_FLAGS, - MPTCP_SUBFLOW_ATTR_ID_REM, - MPTCP_SUBFLOW_ATTR_ID_LOC, - MPTCP_SUBFLOW_ATTR_PAD, - __MPTCP_SUBFLOW_ATTR_MAX -}; - -#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1) - -/* netlink interface */ -#define MPTCP_PM_NAME "mptcp_pm" #define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds" #define MPTCP_PM_EV_GRP_NAME "mptcp_pm_events" -#define MPTCP_PM_VER 0x1 - -/* - * ATTR types defined for MPTCP - */ -enum { - MPTCP_PM_ATTR_UNSPEC, - - MPTCP_PM_ATTR_ADDR, /* nested address */ - MPTCP_PM_ATTR_RCV_ADD_ADDRS, /* u32 */ - MPTCP_PM_ATTR_SUBFLOWS, /* u32 */ - MPTCP_PM_ATTR_TOKEN, /* u32 */ - MPTCP_PM_ATTR_LOC_ID, /* u8 */ - MPTCP_PM_ATTR_ADDR_REMOTE, /* nested address */ - - __MPTCP_PM_ATTR_MAX -}; - -#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1) - -enum { - MPTCP_PM_ENDPOINT_ADDR = 1, - - __MPTCP_PM_ENDPOINT_MAX -}; - -#define MPTCP_PM_ENDPOINT_MAX (__MPTCP_PM_ENDPOINT_MAX - 1) - -enum { - MPTCP_PM_ADDR_ATTR_UNSPEC, - - MPTCP_PM_ADDR_ATTR_FAMILY, /* u16 */ - MPTCP_PM_ADDR_ATTR_ID, /* u8 */ - MPTCP_PM_ADDR_ATTR_ADDR4, /* struct in_addr */ - MPTCP_PM_ADDR_ATTR_ADDR6, /* struct in6_addr */ - MPTCP_PM_ADDR_ATTR_PORT, /* u16 */ - MPTCP_PM_ADDR_ATTR_FLAGS, /* u32 */ - MPTCP_PM_ADDR_ATTR_IF_IDX, /* s32 */ - - __MPTCP_PM_ADDR_ATTR_MAX -}; -#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1) +#include -#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) -#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) -#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) -#define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) -#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) - -enum { - MPTCP_PM_CMD_UNSPEC, - - MPTCP_PM_CMD_ADD_ADDR, - MPTCP_PM_CMD_DEL_ADDR, - MPTCP_PM_CMD_GET_ADDR, - MPTCP_PM_CMD_FLUSH_ADDRS, - MPTCP_PM_CMD_SET_LIMITS, - MPTCP_PM_CMD_GET_LIMITS, - MPTCP_PM_CMD_SET_FLAGS, - MPTCP_PM_CMD_ANNOUNCE, - MPTCP_PM_CMD_REMOVE, - MPTCP_PM_CMD_SUBFLOW_CREATE, - MPTCP_PM_CMD_SUBFLOW_DESTROY, - - __MPTCP_PM_CMD_AFTER_LAST -}; +/* for backward compatibility */ +#define __MPTCP_PM_CMD_AFTER_LAST __MPTCP_PM_CMD_MAX +#define __MPTCP_ATTR_AFTER_LAST __MPTCP_ATTR_MAX #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) +#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) +#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) +#define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) +#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) + struct mptcp_info { __u8 mptcpi_subflows; __u8 mptcpi_add_addr_signal; @@ -138,93 +63,6 @@ struct mptcp_info { __u64 mptcpi_bytes_acked; }; -/* - * MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport - * A new MPTCP connection has been created. It is the good time to allocate - * memory and send ADD_ADDR if needed. Depending on the traffic-patterns - * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. - * - * MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport - * A MPTCP connection is established (can start new subflows). - * - * MPTCP_EVENT_CLOSED: token - * A MPTCP connection has stopped. - * - * MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] - * A new address has been announced by the peer. - * - * MPTCP_EVENT_REMOVED: token, rem_id - * An address has been lost by the peer. - * - * MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, - * saddr4 | saddr6, daddr4 | daddr6, sport, - * dport, backup, if_idx [, error] - * A new subflow has been established. 'error' should not be set. - * - * MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx - * [, error] - * A subflow has been closed. An error (copy of sk_err) could be set if an - * error has been detected for this subflow. - * - * MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx - * [, error] - * The priority of a subflow has changed. 'error' should not be set. - * - * MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 - * A new PM listener is created. - * - * MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 - * A PM listener is closed. - */ -enum mptcp_event_type { - MPTCP_EVENT_UNSPEC = 0, - MPTCP_EVENT_CREATED = 1, - MPTCP_EVENT_ESTABLISHED = 2, - MPTCP_EVENT_CLOSED = 3, - - MPTCP_EVENT_ANNOUNCED = 6, - MPTCP_EVENT_REMOVED = 7, - - MPTCP_EVENT_SUB_ESTABLISHED = 10, - MPTCP_EVENT_SUB_CLOSED = 11, - - MPTCP_EVENT_SUB_PRIORITY = 13, - - MPTCP_EVENT_LISTENER_CREATED = 15, - MPTCP_EVENT_LISTENER_CLOSED = 16, -}; - -enum mptcp_event_attr { - MPTCP_ATTR_UNSPEC = 0, - - MPTCP_ATTR_TOKEN, /* u32 */ - MPTCP_ATTR_FAMILY, /* u16 */ - MPTCP_ATTR_LOC_ID, /* u8 */ - MPTCP_ATTR_REM_ID, /* u8 */ - MPTCP_ATTR_SADDR4, /* be32 */ - MPTCP_ATTR_SADDR6, /* struct in6_addr */ - MPTCP_ATTR_DADDR4, /* be32 */ - MPTCP_ATTR_DADDR6, /* struct in6_addr */ - MPTCP_ATTR_SPORT, /* be16 */ - MPTCP_ATTR_DPORT, /* be16 */ - MPTCP_ATTR_BACKUP, /* u8 */ - MPTCP_ATTR_ERROR, /* u8 */ - MPTCP_ATTR_FLAGS, /* u16 */ - MPTCP_ATTR_TIMEOUT, /* u32 */ - MPTCP_ATTR_IF_IDX, /* s32 */ - MPTCP_ATTR_RESET_REASON,/* u32 */ - MPTCP_ATTR_RESET_FLAGS, /* u32 */ - MPTCP_ATTR_SERVER_SIDE, /* u8 */ - - __MPTCP_ATTR_AFTER_LAST -}; - -#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) - /* MPTCP Reset reason codes, rfc8684 */ #define MPTCP_RST_EUNSPEC 0 #define MPTCP_RST_EMPTCP 1 diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h new file mode 100644 index 000000000000..0ad598fe940b --- /dev/null +++ b/include/uapi/linux/mptcp_pm.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/mptcp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_MPTCP_PM_H +#define _UAPI_LINUX_MPTCP_PM_H + +#define MPTCP_PM_NAME "mptcp_pm" +#define MPTCP_PM_VER 1 + +/** + * enum mptcp_event_type + * @MPTCP_EVENT_UNSPEC: unused event + * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport A new MPTCP connection has been created. It is the good time + * to allocate memory and send ADD_ADDR if needed. Depending on the + * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED + * is sent. + * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport A MPTCP connection is established (can start new subflows). + * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. + * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A + * new address has been announced by the peer. + * @MPTCP_EVENT_REMOVED: token, rem_id An address has been lost by the peer. + * @MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error] A new + * subflow has been established. 'error' should not be set. + * @MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, + * daddr4 | daddr6, sport, dport, backup, if_idx [, error] A subflow has been + * closed. An error (copy of sk_err) could be set if an error has been + * detected for this subflow. + * @MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, + * daddr4 | daddr6, sport, dport, backup, if_idx [, error] The priority of a + * subflow has changed. 'error' should not be set. + * @MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 A new PM + * listener is created. + * @MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 A PM listener + * is closed. + */ +enum mptcp_event_type { + MPTCP_EVENT_UNSPEC, + MPTCP_EVENT_CREATED, + MPTCP_EVENT_ESTABLISHED, + MPTCP_EVENT_CLOSED, + MPTCP_EVENT_ANNOUNCED = 6, + MPTCP_EVENT_REMOVED, + MPTCP_EVENT_SUB_ESTABLISHED = 10, + MPTCP_EVENT_SUB_CLOSED, + MPTCP_EVENT_SUB_PRIORITY = 13, + MPTCP_EVENT_LISTENER_CREATED = 15, + MPTCP_EVENT_LISTENER_CLOSED, +}; + +enum { + MPTCP_PM_ADDR_ATTR_UNSPEC, + MPTCP_PM_ADDR_ATTR_FAMILY, + MPTCP_PM_ADDR_ATTR_ID, + MPTCP_PM_ADDR_ATTR_ADDR4, + MPTCP_PM_ADDR_ATTR_ADDR6, + MPTCP_PM_ADDR_ATTR_PORT, + MPTCP_PM_ADDR_ATTR_FLAGS, + MPTCP_PM_ADDR_ATTR_IF_IDX, + + __MPTCP_PM_ADDR_ATTR_MAX +}; +#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1) + +enum { + MPTCP_SUBFLOW_ATTR_UNSPEC, + MPTCP_SUBFLOW_ATTR_TOKEN_REM, + MPTCP_SUBFLOW_ATTR_TOKEN_LOC, + MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + MPTCP_SUBFLOW_ATTR_SSN_OFFSET, + MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + MPTCP_SUBFLOW_ATTR_FLAGS, + MPTCP_SUBFLOW_ATTR_ID_REM, + MPTCP_SUBFLOW_ATTR_ID_LOC, + MPTCP_SUBFLOW_ATTR_PAD, + + __MPTCP_SUBFLOW_ATTR_MAX +}; +#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1) + +enum { + MPTCP_PM_ENDPOINT_ADDR = 1, + + __MPTCP_PM_ENDPOINT_MAX +}; +#define MPTCP_PM_ENDPOINT_MAX (__MPTCP_PM_ENDPOINT_MAX - 1) + +enum { + MPTCP_PM_ATTR_UNSPEC, + MPTCP_PM_ATTR_ADDR, + MPTCP_PM_ATTR_RCV_ADD_ADDRS, + MPTCP_PM_ATTR_SUBFLOWS, + MPTCP_PM_ATTR_TOKEN, + MPTCP_PM_ATTR_LOC_ID, + MPTCP_PM_ATTR_ADDR_REMOTE, + + __MPTCP_PM_ATTR_MAX +}; +#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1) + +enum mptcp_event_attr { + MPTCP_ATTR_UNSPEC, + MPTCP_ATTR_TOKEN, + MPTCP_ATTR_FAMILY, + MPTCP_ATTR_LOC_ID, + MPTCP_ATTR_REM_ID, + MPTCP_ATTR_SADDR4, + MPTCP_ATTR_SADDR6, + MPTCP_ATTR_DADDR4, + MPTCP_ATTR_DADDR6, + MPTCP_ATTR_SPORT, + MPTCP_ATTR_DPORT, + MPTCP_ATTR_BACKUP, + MPTCP_ATTR_ERROR, + MPTCP_ATTR_FLAGS, + MPTCP_ATTR_TIMEOUT, + MPTCP_ATTR_IF_IDX, + MPTCP_ATTR_RESET_REASON, + MPTCP_ATTR_RESET_FLAGS, + MPTCP_ATTR_SERVER_SIDE, + + __MPTCP_ATTR_MAX +}; +#define MPTCP_ATTR_MAX (__MPTCP_ATTR_MAX - 1) + +enum { + MPTCP_PM_CMD_UNSPEC, + MPTCP_PM_CMD_ADD_ADDR, + MPTCP_PM_CMD_DEL_ADDR, + MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_CMD_FLUSH_ADDRS, + MPTCP_PM_CMD_SET_LIMITS, + MPTCP_PM_CMD_GET_LIMITS, + MPTCP_PM_CMD_SET_FLAGS, + MPTCP_PM_CMD_ANNOUNCE, + MPTCP_PM_CMD_REMOVE, + MPTCP_PM_CMD_SUBFLOW_CREATE, + MPTCP_PM_CMD_SUBFLOW_DESTROY, + + __MPTCP_PM_CMD_MAX +}; +#define MPTCP_PM_CMD_MAX (__MPTCP_PM_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_MPTCP_PM_H */ -- cgit v1.2.3 From 87cd83714f30ef2f19f0390e98beb8d78e173f0f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 23 Oct 2023 11:17:29 -0700 Subject: net: dsa: Rename IFLA_DSA_MASTER to IFLA_DSA_CONDUIT This preserves the existing IFLA_DSA_MASTER which is part of the uAPI and creates an alias named IFLA_DSA_CONDUIT. Reviewed-by: Andrew Lunn Reviewed-by: Vladimir Oltean Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20231023181729.1191071-3-florian.fainelli@broadcom.com Signed-off-by: Jakub Kicinski --- Documentation/networking/dsa/configuration.rst | 4 ++-- include/uapi/linux/if_link.h | 4 +++- net/dsa/netlink.c | 10 +++++----- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst index e6c9719874b0..6cc4ded3cc23 100644 --- a/Documentation/networking/dsa/configuration.rst +++ b/Documentation/networking/dsa/configuration.rst @@ -393,7 +393,7 @@ description which has an ``ethernet`` property. It is up to the user to configure the system for the switch to use other conduits. DSA uses the ``rtnl_link_ops`` mechanism (with a "dsa" ``kind``) to allow -changing the DSA conduit of a user port. The ``IFLA_DSA_MASTER`` u32 netlink +changing the DSA conduit of a user port. The ``IFLA_DSA_CONDUIT`` u32 netlink attribute contains the ifindex of the conduit device that handles each user device. The DSA conduit must be a valid candidate based on firmware node information, or a LAG interface which contains only slaves which are valid @@ -435,7 +435,7 @@ Using iproute2, the following manipulations are possible: dsa master bond0 Notice that in the case of CPU ports under a LAG, the use of the -``IFLA_DSA_MASTER`` netlink attribute is not strictly needed, but rather, DSA +``IFLA_DSA_CONDUIT`` netlink attribute is not strictly needed, but rather, DSA reacts to the ``IFLA_MASTER`` attribute change of its present conduit (``eth0``) and migrates all user ports to the new upper of ``eth0``, ``bond0``. Similarly, when ``bond0`` is destroyed using ``RTM_DELLINK``, DSA migrates the user ports diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 9f8a3da0f14f..f4191be137a4 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1394,7 +1394,9 @@ enum { enum { IFLA_DSA_UNSPEC, - IFLA_DSA_MASTER, + IFLA_DSA_CONDUIT, + /* Deprecated, use IFLA_DSA_CONDUIT instead */ + IFLA_DSA_MASTER = IFLA_DSA_CONDUIT, __IFLA_DSA_MAX, }; diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c index f56f90a25b99..1332e56349e5 100644 --- a/net/dsa/netlink.c +++ b/net/dsa/netlink.c @@ -8,7 +8,7 @@ #include "user.h" static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = { - [IFLA_DSA_MASTER] = { .type = NLA_U32 }, + [IFLA_DSA_CONDUIT] = { .type = NLA_U32 }, }; static int dsa_changelink(struct net_device *dev, struct nlattr *tb[], @@ -20,8 +20,8 @@ static int dsa_changelink(struct net_device *dev, struct nlattr *tb[], if (!data) return 0; - if (data[IFLA_DSA_MASTER]) { - u32 ifindex = nla_get_u32(data[IFLA_DSA_MASTER]); + if (data[IFLA_DSA_CONDUIT]) { + u32 ifindex = nla_get_u32(data[IFLA_DSA_CONDUIT]); struct net_device *conduit; conduit = __dev_get_by_index(dev_net(dev), ifindex); @@ -38,7 +38,7 @@ static int dsa_changelink(struct net_device *dev, struct nlattr *tb[], static size_t dsa_get_size(const struct net_device *dev) { - return nla_total_size(sizeof(u32)) + /* IFLA_DSA_MASTER */ + return nla_total_size(sizeof(u32)) + /* IFLA_DSA_CONDUIT */ 0; } @@ -46,7 +46,7 @@ static int dsa_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); - if (nla_put_u32(skb, IFLA_DSA_MASTER, conduit->ifindex)) + if (nla_put_u32(skb, IFLA_DSA_CONDUIT, conduit->ifindex)) return -EMSGSIZE; return 0; -- cgit v1.2.3 From 35dfaad7188cdc043fde31709c796f5a692ba2bd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Oct 2023 23:48:58 +0200 Subject: netkit, bpf: Add bpf programmable net device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work adds a new, minimal BPF-programmable device called "netkit" (former PoC code-name "meta") we recently presented at LSF/MM/BPF. The core idea is that BPF programs are executed within the drivers xmit routine and therefore e.g. in case of containers/Pods moving BPF processing closer to the source. One of the goals was that in case of Pod egress traffic, this allows to move BPF programs from hostns tcx ingress into the device itself, providing earlier drop or forward mechanisms, for example, if the BPF program determines that the skb must be sent out of the node, then a redirect to the physical device can take place directly without going through per-CPU backlog queue. This helps to shift processing for such traffic from softirq to process context, leading to better scheduling decisions/performance (see measurements in the slides). In this initial version, the netkit device ships as a pair, but we plan to extend this further so it can also operate in single device mode. The pair comes with a primary and a peer device. Only the primary device, typically residing in hostns, can manage BPF programs for itself and its peer. The peer device is designated for containers/Pods and cannot attach/detach BPF programs. Upon the device creation, the user can set the default policy to 'pass' or 'drop' for the case when no BPF program is attached. Additionally, the device can be operated in L3 (default) or L2 mode. The management of BPF programs is done via bpf_mprog, so that multi-attach is supported right from the beginning with similar API and dependency controls as tcx. For details on the latter see commit 053c8e1f235d ("bpf: Add generic attach/detach/query API for multi-progs"). tc BPF compatibility is provided, so that existing programs can be easily migrated. Going forward, we plan to use netkit devices in Cilium as the main device type for connecting Pods. They will be operated in L3 mode in order to simplify a Pod's neighbor management and the peer will operate in default drop mode, so that no traffic is leaving between the time when a Pod is brought up by the CNI plugin and programs attached by the agent. Additionally, the programs we attach via tcx on the physical devices are using bpf_redirect_peer() for inbound traffic into netkit device, hence the latter is also supporting the ndo_get_peer_dev callback. Similarly, we use bpf_redirect_neigh() for the way out, pushing from netkit peer to phys device directly. Also, BIG TCP is supported on netkit device. For the follow-up work in single device mode, we plan to convert Cilium's cilium_host/_net devices into a single one. An extensive test suite for checking device operations and the BPF program and link management API comes as BPF selftests in this series. Co-developed-by: Nikolay Aleksandrov Signed-off-by: Nikolay Aleksandrov Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://github.com/borkmann/iproute2/tree/pr/netkit Link: http://vger.kernel.org/bpfconf2023_material/tcx_meta_netdev_borkmann.pdf (24ff.) Link: https://lore.kernel.org/r/20231024214904.29825-2-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- MAINTAINERS | 9 + drivers/net/Kconfig | 9 + drivers/net/Makefile | 1 + drivers/net/netkit.c | 940 +++++++++++++++++++++++++++++++++++++++++ include/net/netkit.h | 38 ++ include/uapi/linux/bpf.h | 14 + include/uapi/linux/if_link.h | 24 ++ kernel/bpf/syscall.c | 30 +- tools/include/uapi/linux/bpf.h | 14 + 9 files changed, 1074 insertions(+), 5 deletions(-) create mode 100644 drivers/net/netkit.c create mode 100644 include/net/netkit.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index ed33b9df8b3d..43be6197e655 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3795,6 +3795,15 @@ L: bpf@vger.kernel.org S: Odd Fixes K: (?:\b|_)bpf(?:\b|_) +BPF [NETKIT] (BPF-programmable network device) +M: Daniel Borkmann +M: Nikolay Aleksandrov +L: bpf@vger.kernel.org +L: netdev@vger.kernel.org +S: Supported +F: drivers/net/netkit.c +F: include/net/netkit.h + BPF [NETWORKING] (struct_ops, reuseport) M: Martin KaFai Lau L: bpf@vger.kernel.org diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 44eeb5d61ba9..af0da4bb429b 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -448,6 +448,15 @@ config NLMON diagnostics, etc. This is mostly intended for developers or support to debug netlink issues. If unsure, say N. +config NETKIT + bool "BPF-programmable network device" + depends on BPF_SYSCALL + help + The netkit device is a virtual networking device where BPF programs + can be attached to the device(s) transmission routine in order to + implement the driver's internal logic. The device can be configured + to operate in L3 or L2 mode. If unsure, say N. + config NET_VRF tristate "Virtual Routing and Forwarding (Lite)" depends on IP_MULTIPLE_TABLES diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 8a83db32509d..7cab36f94782 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_MDIO) += mdio.o obj-$(CONFIG_NET) += loopback.o obj-$(CONFIG_NETDEV_LEGACY_INIT) += Space.o obj-$(CONFIG_NETCONSOLE) += netconsole.o +obj-$(CONFIG_NETKIT) += netkit.o obj-y += phy/ obj-y += pse-pd/ obj-y += mdio/ diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c new file mode 100644 index 000000000000..7e484f9fd3ae --- /dev/null +++ b/drivers/net/netkit.c @@ -0,0 +1,940 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2023 Isovalent */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DRV_NAME "netkit" + +struct netkit { + /* Needed in fast-path */ + struct net_device __rcu *peer; + struct bpf_mprog_entry __rcu *active; + enum netkit_action policy; + struct bpf_mprog_bundle bundle; + + /* Needed in slow-path */ + enum netkit_mode mode; + bool primary; + u32 headroom; +}; + +struct netkit_link { + struct bpf_link link; + struct net_device *dev; + u32 location; +}; + +static __always_inline int +netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + enum netkit_action ret) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != NETKIT_NEXT) + break; + } + return ret; +} + +static void netkit_prep_forward(struct sk_buff *skb, bool xnet) +{ + skb_scrub_packet(skb, xnet); + skb->priority = 0; + nf_skip_egress(skb, true); +} + +static struct netkit *netkit_priv(const struct net_device *dev) +{ + return netdev_priv(dev); +} + +static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + enum netkit_action ret = READ_ONCE(nk->policy); + netdev_tx_t ret_dev = NET_XMIT_SUCCESS; + const struct bpf_mprog_entry *entry; + struct net_device *peer; + + rcu_read_lock(); + peer = rcu_dereference(nk->peer); + if (unlikely(!peer || !(peer->flags & IFF_UP) || + !pskb_may_pull(skb, ETH_HLEN) || + skb_orphan_frags(skb, GFP_ATOMIC))) + goto drop; + netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + skb->dev = peer; + entry = rcu_dereference(nk->active); + if (entry) + ret = netkit_run(entry, skb, ret); + switch (ret) { + case NETKIT_NEXT: + case NETKIT_PASS: + skb->protocol = eth_type_trans(skb, skb->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + __netif_rx(skb); + break; + case NETKIT_REDIRECT: + skb_do_redirect(skb); + break; + case NETKIT_DROP: + default: +drop: + kfree_skb(skb); + dev_core_stats_tx_dropped_inc(dev); + ret_dev = NET_XMIT_DROP; + break; + } + rcu_read_unlock(); + return ret_dev; +} + +static int netkit_open(struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + if (!peer) + return -ENOTCONN; + if (peer->flags & IFF_UP) { + netif_carrier_on(dev); + netif_carrier_on(peer); + } + return 0; +} + +static int netkit_close(struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + netif_carrier_off(dev); + if (peer) + netif_carrier_off(peer); + return 0; +} + +static int netkit_get_iflink(const struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer; + int iflink = 0; + + rcu_read_lock(); + peer = rcu_dereference(nk->peer); + if (peer) + iflink = peer->ifindex; + rcu_read_unlock(); + return iflink; +} + +static void netkit_set_multicast(struct net_device *dev) +{ + /* Nothing to do, we receive whatever gets pushed to us! */ +} + +static void netkit_set_headroom(struct net_device *dev, int headroom) +{ + struct netkit *nk = netkit_priv(dev), *nk2; + struct net_device *peer; + + if (headroom < 0) + headroom = NET_SKB_PAD; + + rcu_read_lock(); + peer = rcu_dereference(nk->peer); + if (unlikely(!peer)) + goto out; + + nk2 = netkit_priv(peer); + nk->headroom = headroom; + headroom = max(nk->headroom, nk2->headroom); + + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; +out: + rcu_read_unlock(); +} + +static struct net_device *netkit_peer_dev(struct net_device *dev) +{ + return rcu_dereference(netkit_priv(dev)->peer); +} + +static void netkit_uninit(struct net_device *dev); + +static const struct net_device_ops netkit_netdev_ops = { + .ndo_open = netkit_open, + .ndo_stop = netkit_close, + .ndo_start_xmit = netkit_xmit, + .ndo_set_rx_mode = netkit_set_multicast, + .ndo_set_rx_headroom = netkit_set_headroom, + .ndo_get_iflink = netkit_get_iflink, + .ndo_get_peer_dev = netkit_peer_dev, + .ndo_uninit = netkit_uninit, + .ndo_features_check = passthru_features_check, +}; + +static void netkit_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); +} + +static const struct ethtool_ops netkit_ethtool_ops = { + .get_drvinfo = netkit_get_drvinfo, +}; + +static void netkit_setup(struct net_device *dev) +{ + static const netdev_features_t netkit_features_hw_vlan = + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_TX | + NETIF_F_HW_VLAN_STAG_RX; + static const netdev_features_t netkit_features = + netkit_features_hw_vlan | + NETIF_F_SG | + NETIF_F_FRAGLIST | + NETIF_F_HW_CSUM | + NETIF_F_RXCSUM | + NETIF_F_SCTP_CRC | + NETIF_F_HIGHDMA | + NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL; + + ether_setup(dev); + dev->max_mtu = ETH_MAX_MTU; + + dev->flags |= IFF_NOARP; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->priv_flags |= IFF_PHONY_HEADROOM; + dev->priv_flags |= IFF_NO_QUEUE; + + dev->ethtool_ops = &netkit_ethtool_ops; + dev->netdev_ops = &netkit_netdev_ops; + + dev->features |= netkit_features | NETIF_F_LLTX; + dev->hw_features = netkit_features; + dev->hw_enc_features = netkit_features; + dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; + dev->vlan_features = dev->features & ~netkit_features_hw_vlan; + + dev->needs_free_netdev = true; + + netif_set_tso_max_size(dev, GSO_MAX_SIZE); +} + +static struct net *netkit_get_link_net(const struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + return peer ? dev_net(peer) : dev_net(dev); +} + +static int netkit_check_policy(int policy, struct nlattr *tb, + struct netlink_ext_ack *extack) +{ + switch (policy) { + case NETKIT_PASS: + case NETKIT_DROP: + return 0; + default: + NL_SET_ERR_MSG_ATTR(extack, tb, + "Provided default xmit policy not supported"); + return -EINVAL; + } +} + +static int netkit_check_mode(int mode, struct nlattr *tb, + struct netlink_ext_ack *extack) +{ + switch (mode) { + case NETKIT_L2: + case NETKIT_L3: + return 0; + default: + NL_SET_ERR_MSG_ATTR(extack, tb, + "Provided device mode can only be L2 or L3"); + return -EINVAL; + } +} + +static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *attr = tb[IFLA_ADDRESS]; + + if (!attr) + return 0; + NL_SET_ERR_MSG_ATTR(extack, attr, + "Setting Ethernet address is not supported"); + return -EOPNOTSUPP; +} + +static struct rtnl_link_ops netkit_link_ops; + +static int netkit_new_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr; + enum netkit_action default_prim = NETKIT_PASS; + enum netkit_action default_peer = NETKIT_PASS; + enum netkit_mode mode = NETKIT_L3; + unsigned char ifname_assign_type; + struct ifinfomsg *ifmp = NULL; + struct net_device *peer; + char ifname[IFNAMSIZ]; + struct netkit *nk; + struct net *net; + int err; + + if (data) { + if (data[IFLA_NETKIT_MODE]) { + attr = data[IFLA_NETKIT_MODE]; + mode = nla_get_u32(attr); + err = netkit_check_mode(mode, attr, extack); + if (err < 0) + return err; + } + if (data[IFLA_NETKIT_PEER_INFO]) { + attr = data[IFLA_NETKIT_PEER_INFO]; + ifmp = nla_data(attr); + err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack); + if (err < 0) + return err; + err = netkit_validate(peer_tb, NULL, extack); + if (err < 0) + return err; + tbp = peer_tb; + } + if (data[IFLA_NETKIT_POLICY]) { + attr = data[IFLA_NETKIT_POLICY]; + default_prim = nla_get_u32(attr); + err = netkit_check_policy(default_prim, attr, extack); + if (err < 0) + return err; + } + if (data[IFLA_NETKIT_PEER_POLICY]) { + attr = data[IFLA_NETKIT_PEER_POLICY]; + default_peer = nla_get_u32(attr); + err = netkit_check_policy(default_peer, attr, extack); + if (err < 0) + return err; + } + } + + if (ifmp && tbp[IFLA_IFNAME]) { + nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + ifname_assign_type = NET_NAME_USER; + } else { + strscpy(ifname, "nk%d", IFNAMSIZ); + ifname_assign_type = NET_NAME_ENUM; + } + + net = rtnl_link_get_net(src_net, tbp); + if (IS_ERR(net)) + return PTR_ERR(net); + + peer = rtnl_create_link(net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) { + put_net(net); + return PTR_ERR(peer); + } + + netif_inherit_tso_max(peer, dev); + + if (mode == NETKIT_L2) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; + + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = default_peer; + nk->mode = mode; + bpf_mprog_bundle_init(&nk->bundle); + RCU_INIT_POINTER(nk->active, NULL); + RCU_INIT_POINTER(nk->peer, NULL); + + err = register_netdevice(peer); + put_net(net); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; + + if (mode == NETKIT_L2) + eth_hw_addr_random(dev); + if (tb[IFLA_IFNAME]) + nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + else + strscpy(dev->name, "nk%d", IFNAMSIZ); + + nk = netkit_priv(dev); + nk->primary = true; + nk->policy = default_prim; + nk->mode = mode; + bpf_mprog_bundle_init(&nk->bundle); + RCU_INIT_POINTER(nk->active, NULL); + RCU_INIT_POINTER(nk->peer, NULL); + + err = register_netdevice(dev); + if (err < 0) + goto err_configure_peer; + netif_carrier_off(dev); + if (mode == NETKIT_L2) + dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); + + rcu_assign_pointer(netkit_priv(dev)->peer, peer); + rcu_assign_pointer(netkit_priv(peer)->peer, dev); + return 0; +err_configure_peer: + unregister_netdevice(peer); + return err; +err_register_peer: + free_netdev(peer); + return err; +} + +static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev, + bool bundle_fallback) +{ + struct netkit *nk = netkit_priv(dev); + struct bpf_mprog_entry *entry; + + ASSERT_RTNL(); + entry = rcu_dereference_rtnl(nk->active); + if (entry) + return entry; + if (bundle_fallback) + return &nk->bundle.a; + return NULL; +} + +static void netkit_entry_update(struct net_device *dev, + struct bpf_mprog_entry *entry) +{ + struct netkit *nk = netkit_priv(dev); + + ASSERT_RTNL(); + rcu_assign_pointer(nk->active, entry); +} + +static void netkit_entry_sync(void) +{ + synchronize_rcu(); +} + +static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which) +{ + struct net_device *dev; + struct netkit *nk; + + ASSERT_RTNL(); + + switch (which) { + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + break; + default: + return ERR_PTR(-EINVAL); + } + + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return ERR_PTR(-ENODEV); + if (dev->netdev_ops != &netkit_netdev_ops) + return ERR_PTR(-ENXIO); + + nk = netkit_priv(dev); + if (!nk->primary) + return ERR_PTR(-EACCES); + if (which == BPF_NETKIT_PEER) { + dev = rcu_dereference_rtnl(nk->peer); + if (!dev) + return ERR_PTR(-ENODEV); + } + return dev; +} + +int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_mprog_entry *entry, *entry_new; + struct bpf_prog *replace_prog = NULL; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex, + attr->attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + entry = netkit_entry_fetch(dev, true); + if (attr->attach_flags & BPF_F_REPLACE) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, + prog->type); + if (IS_ERR(replace_prog)) { + ret = PTR_ERR(replace_prog); + replace_prog = NULL; + goto out; + } + } + ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog, + attr->attach_flags, attr->relative_fd, + attr->expected_revision); + if (!ret) { + if (entry != entry_new) { + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + } + bpf_mprog_commit(entry); + } +out: + if (replace_prog) + bpf_prog_put(replace_prog); + rtnl_unlock(); + return ret; +} + +int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex, + attr->attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + entry = netkit_entry_fetch(dev, false); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags, + attr->relative_fd, attr->expected_revision); + if (!ret) { + if (!bpf_mprog_total(entry_new)) + entry_new = NULL; + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + bpf_mprog_commit(entry); + } +out: + rtnl_unlock(); + return ret; +} + +int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) +{ + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, + attr->query.target_ifindex, + attr->query.attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false)); +out: + rtnl_unlock(); + return ret; +} + +static struct netkit_link *netkit_link(const struct bpf_link *link) +{ + return container_of(link, struct netkit_link, link); +} + +static int netkit_link_prog_attach(struct bpf_link *link, u32 flags, + u32 id_or_fd, u64 revision) +{ + struct netkit_link *nkl = netkit_link(link); + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev = nkl->dev; + int ret; + + ASSERT_RTNL(); + entry = netkit_entry_fetch(dev, true); + ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags, + id_or_fd, revision); + if (!ret) { + if (entry != entry_new) { + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + } + bpf_mprog_commit(entry); + } + return ret; +} + +static void netkit_link_release(struct bpf_link *link) +{ + struct netkit_link *nkl = netkit_link(link); + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = nkl->dev; + if (!dev) + goto out; + entry = netkit_entry_fetch(dev, false); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0); + if (!ret) { + if (!bpf_mprog_total(entry_new)) + entry_new = NULL; + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + bpf_mprog_commit(entry); + nkl->dev = NULL; + } +out: + WARN_ON_ONCE(ret); + rtnl_unlock(); +} + +static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog, + struct bpf_prog *oprog) +{ + struct netkit_link *nkl = netkit_link(link); + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = nkl->dev; + if (!dev) { + ret = -ENOLINK; + goto out; + } + if (oprog && link->prog != oprog) { + ret = -EPERM; + goto out; + } + oprog = link->prog; + if (oprog == nprog) { + bpf_prog_put(nprog); + goto out; + } + entry = netkit_entry_fetch(dev, false); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog, + BPF_F_REPLACE | BPF_F_ID, + link->prog->aux->id, 0); + if (!ret) { + WARN_ON_ONCE(entry != entry_new); + oprog = xchg(&link->prog, nprog); + bpf_prog_put(oprog); + bpf_mprog_commit(entry); + } +out: + rtnl_unlock(); + return ret; +} + +static void netkit_link_dealloc(struct bpf_link *link) +{ + kfree(netkit_link(link)); +} + +static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) +{ + const struct netkit_link *nkl = netkit_link(link); + u32 ifindex = 0; + + rtnl_lock(); + if (nkl->dev) + ifindex = nkl->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); + seq_printf(seq, "attach_type:\t%u (%s)\n", + nkl->location, + nkl->location == BPF_NETKIT_PRIMARY ? "primary" : "peer"); +} + +static int netkit_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct netkit_link *nkl = netkit_link(link); + u32 ifindex = 0; + + rtnl_lock(); + if (nkl->dev) + ifindex = nkl->dev->ifindex; + rtnl_unlock(); + + info->netkit.ifindex = ifindex; + info->netkit.attach_type = nkl->location; + return 0; +} + +static int netkit_link_detach(struct bpf_link *link) +{ + netkit_link_release(link); + return 0; +} + +static const struct bpf_link_ops netkit_link_lops = { + .release = netkit_link_release, + .detach = netkit_link_detach, + .dealloc = netkit_link_dealloc, + .update_prog = netkit_link_update, + .show_fdinfo = netkit_link_fdinfo, + .fill_link_info = netkit_link_fill_info, +}; + +static int netkit_link_init(struct netkit_link *nkl, + struct bpf_link_primer *link_primer, + const union bpf_attr *attr, + struct net_device *dev, + struct bpf_prog *prog) +{ + bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT, + &netkit_link_lops, prog); + nkl->location = attr->link_create.attach_type; + nkl->dev = dev; + return bpf_link_prime(&nkl->link, link_primer); +} + +int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct netkit_link *nkl; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, + attr->link_create.target_ifindex, + attr->link_create.attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + nkl = kzalloc(sizeof(*nkl), GFP_KERNEL_ACCOUNT); + if (!nkl) { + ret = -ENOMEM; + goto out; + } + ret = netkit_link_init(nkl, &link_primer, attr, dev, prog); + if (ret) { + kfree(nkl); + goto out; + } + ret = netkit_link_prog_attach(&nkl->link, + attr->link_create.flags, + attr->link_create.netkit.relative_fd, + attr->link_create.netkit.expected_revision); + if (ret) { + nkl->dev = NULL; + bpf_link_cleanup(&link_primer); + goto out; + } + ret = bpf_link_settle(&link_primer); +out: + rtnl_unlock(); + return ret; +} + +static void netkit_release_all(struct net_device *dev) +{ + struct bpf_mprog_entry *entry; + struct bpf_tuple tuple = {}; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + + entry = netkit_entry_fetch(dev, false); + if (!entry) + return; + netkit_entry_update(dev, NULL); + netkit_entry_sync(); + bpf_mprog_foreach_tuple(entry, fp, cp, tuple) { + if (tuple.link) + netkit_link(tuple.link)->dev = NULL; + else + bpf_prog_put(tuple.prog); + } +} + +static void netkit_uninit(struct net_device *dev) +{ + netkit_release_all(dev); +} + +static void netkit_del_link(struct net_device *dev, struct list_head *head) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + RCU_INIT_POINTER(nk->peer, NULL); + unregister_netdevice_queue(dev, head); + if (peer) { + nk = netkit_priv(peer); + RCU_INIT_POINTER(nk->peer, NULL); + unregister_netdevice_queue(peer, head); + } +} + +static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + enum netkit_action policy; + struct nlattr *attr; + int err; + + if (!nk->primary) { + NL_SET_ERR_MSG(extack, + "netkit link settings can be changed only through the primary device"); + return -EACCES; + } + + if (data[IFLA_NETKIT_MODE]) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE], + "netkit link operating mode cannot be changed after device creation"); + return -EACCES; + } + + if (data[IFLA_NETKIT_POLICY]) { + attr = data[IFLA_NETKIT_POLICY]; + policy = nla_get_u32(attr); + err = netkit_check_policy(policy, attr, extack); + if (err) + return err; + WRITE_ONCE(nk->policy, policy); + } + + if (data[IFLA_NETKIT_PEER_POLICY]) { + err = -EOPNOTSUPP; + attr = data[IFLA_NETKIT_PEER_POLICY]; + policy = nla_get_u32(attr); + if (peer) + err = netkit_check_policy(policy, attr, extack); + if (err) + return err; + nk = netkit_priv(peer); + WRITE_ONCE(nk->policy, policy); + } + + return 0; +} + +static size_t netkit_get_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */ + nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */ + 0; +} + +static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary)) + return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy)) + return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode)) + return -EMSGSIZE; + + if (peer) { + nk = netkit_priv(peer); + if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy)) + return -EMSGSIZE; + } + + return 0; +} + +static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { + [IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) }, + [IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_MODE] = { .type = NLA_U32 }, + [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, + .reject_message = "Primary attribute is read-only" }, +}; + +static struct rtnl_link_ops netkit_link_ops = { + .kind = DRV_NAME, + .priv_size = sizeof(struct netkit), + .setup = netkit_setup, + .newlink = netkit_new_link, + .dellink = netkit_del_link, + .changelink = netkit_change_link, + .get_link_net = netkit_get_link_net, + .get_size = netkit_get_size, + .fill_info = netkit_fill_info, + .policy = netkit_policy, + .validate = netkit_validate, + .maxtype = IFLA_NETKIT_MAX, +}; + +static __init int netkit_init(void) +{ + BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || + (int)NETKIT_PASS != (int)TCX_PASS || + (int)NETKIT_DROP != (int)TCX_DROP || + (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); + + return rtnl_link_register(&netkit_link_ops); +} + +static __exit void netkit_exit(void) +{ + rtnl_link_unregister(&netkit_link_ops); +} + +module_init(netkit_init); +module_exit(netkit_exit); + +MODULE_DESCRIPTION("BPF-programmable network device"); +MODULE_AUTHOR("Daniel Borkmann "); +MODULE_AUTHOR("Nikolay Aleksandrov "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK(DRV_NAME); diff --git a/include/net/netkit.h b/include/net/netkit.h new file mode 100644 index 000000000000..0ba2e6b847ca --- /dev/null +++ b/include/net/netkit.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_NETKIT_H +#define __NET_NETKIT_H + +#include + +#ifdef CONFIG_NETKIT +int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +#else +static inline int netkit_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} +#endif /* CONFIG_NETKIT */ +#endif /* __NET_NETKIT_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ba61b75bc0e..0f6cdf52b1da 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1052,6 +1052,8 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_RECVMSG, BPF_CGROUP_UNIX_GETPEERNAME, BPF_CGROUP_UNIX_GETSOCKNAME, + BPF_NETKIT_PRIMARY, + BPF_NETKIT_PEER, __MAX_BPF_ATTACH_TYPE }; @@ -1071,6 +1073,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETFILTER = 10, BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, + BPF_LINK_TYPE_NETKIT = 13, MAX_BPF_LINK_TYPE, }; @@ -1656,6 +1659,13 @@ union bpf_attr { __u32 flags; __u32 pid; } uprobe_multi; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } netkit; }; } link_create; @@ -6576,6 +6586,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } tcx; + struct { + __u32 ifindex; + __u32 attach_type; + } netkit; }; } __attribute__((aligned(8))); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index fac351a93aed..a0aa05a28cf2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -756,6 +756,30 @@ struct tunnel_msg { __u32 ifindex; }; +/* netkit section */ +enum netkit_action { + NETKIT_NEXT = -1, + NETKIT_PASS = 0, + NETKIT_DROP = 2, + NETKIT_REDIRECT = 7, +}; + +enum netkit_mode { + NETKIT_L2, + NETKIT_L3, +}; + +enum { + IFLA_NETKIT_UNSPEC, + IFLA_NETKIT_PEER_INFO, + IFLA_NETKIT_PRIMARY, + IFLA_NETKIT_POLICY, + IFLA_NETKIT_PEER_POLICY, + IFLA_NETKIT_MODE, + __IFLA_NETKIT_MAX, +}; +#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) + /* VXLAN section */ /* include statistics in the dump */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a9b2cb500bf7..0ed286b8a0f0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,8 +35,9 @@ #include #include #include -#include +#include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -3730,6 +3731,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_LSM; case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; @@ -3781,7 +3784,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return 0; case BPF_PROG_TYPE_SCHED_CLS: if (attach_type != BPF_TCX_INGRESS && - attach_type != BPF_TCX_EGRESS) + attach_type != BPF_TCX_EGRESS && + attach_type != BPF_NETKIT_PRIMARY && + attach_type != BPF_NETKIT_PEER) return -EINVAL; return 0; default: @@ -3864,7 +3869,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_attach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_attach(attr, prog); + else + ret = netkit_prog_attach(attr, prog); break; default: ret = -EINVAL; @@ -3925,7 +3934,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_detach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_detach(attr, prog); + else + ret = netkit_prog_detach(attr, prog); break; default: ret = -EINVAL; @@ -3992,6 +4005,9 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return tcx_prog_query(attr, uattr); + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + return netkit_prog_query(attr, uattr); default: return -EINVAL; } @@ -4973,7 +4989,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_link_attach(attr, prog); + if (attr->link_create.attach_type == BPF_TCX_INGRESS || + attr->link_create.attach_type == BPF_TCX_EGRESS) + ret = tcx_link_attach(attr, prog); + else + ret = netkit_link_attach(attr, prog); break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7ba61b75bc0e..0f6cdf52b1da 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1052,6 +1052,8 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_RECVMSG, BPF_CGROUP_UNIX_GETPEERNAME, BPF_CGROUP_UNIX_GETSOCKNAME, + BPF_NETKIT_PRIMARY, + BPF_NETKIT_PEER, __MAX_BPF_ATTACH_TYPE }; @@ -1071,6 +1073,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETFILTER = 10, BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, + BPF_LINK_TYPE_NETKIT = 13, MAX_BPF_LINK_TYPE, }; @@ -1656,6 +1659,13 @@ union bpf_attr { __u32 flags; __u32 pid; } uprobe_multi; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } netkit; }; } link_create; @@ -6576,6 +6586,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } tcx; + struct { + __u32 ifindex; + __u32 attach_type; + } netkit; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From e57a34478586fe3562560ccebd655b707a5b4a56 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Tue, 24 Oct 2023 07:26:33 -0700 Subject: ipv6: drop feature RTAX_FEATURE_ALLFRAG RTAX_FEATURE_ALLFRAG was added before the first git commit: https://www.mail-archive.com/bk-commits-head@vger.kernel.org/msg03399.html The feature would send packets to the fragmentation path if a box receives a PMTU value with less than 1280 byte. However, since commit 9d289715eb5c ("ipv6: stop sending PTB packets for MTU < 1280"), such message would be simply discarded. The feature flag is neither supported in iproute2 utility. In theory one can still manipulate it with direct netlink message, but it is not ideal because it was based on obsoleted guidance of RFC-2460 (replaced by RFC-8200). The feature would always test false at the moment, so remove related code or mark them as unused. Signed-off-by: Yan Zhai Reviewed-by: Florian Westphal Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/d78e44dcd9968a252143ffe78460446476a472a1.1698156966.git.yan@cloudflare.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 7 ------- include/net/inet_connection_sock.h | 1 - include/net/inet_sock.h | 1 - include/uapi/linux/rtnetlink.h | 2 +- net/ipv4/tcp_output.c | 20 +------------------- net/ipv6/ip6_output.c | 15 ++------------- net/ipv6/tcp_ipv6.c | 1 - net/ipv6/xfrm6_output.c | 2 +- net/mptcp/subflow.c | 1 - 9 files changed, 5 insertions(+), 45 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/dst.h b/include/net/dst.h index f8b8599a0600..f5dfc8fb7b37 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -222,13 +222,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr return msecs_to_jiffies(dst_metric(dst, metric)); } -static inline u32 -dst_allfrag(const struct dst_entry *dst) -{ - int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG); - return ret; -} - static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 086d1193c9ef..d0a2f827d5f2 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -44,7 +44,6 @@ struct inet_connection_sock_af_ops { struct request_sock *req_unhash, bool *own_req); u16 net_header_len; - u16 net_frag_header_len; u16 sockaddr_len; int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 98e11958cdff..74db6d97cae1 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -244,7 +244,6 @@ struct inet_sock { }; #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ -#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ enum { INET_FLAGS_PKTINFO = 0, diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index aa2482a0614a..3b687d20c9ed 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -505,7 +505,7 @@ enum { #define RTAX_FEATURE_ECN (1 << 0) #define RTAX_FEATURE_SACK (1 << 1) /* unused */ #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ -#define RTAX_FEATURE_ALLFRAG (1 << 3) +#define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2866ccbccde0..ca4d7594efd4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1698,14 +1698,6 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); - /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ - if (icsk->icsk_af_ops->net_frag_header_len) { - const struct dst_entry *dst = __sk_dst_get(sk); - - if (dst && dst_allfrag(dst)) - mss_now -= icsk->icsk_af_ops->net_frag_header_len; - } - /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; @@ -1733,21 +1725,11 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - int mtu; - mtu = mss + + return mss + tp->tcp_header_len + icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; - - /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ - if (icsk->icsk_af_ops->net_frag_header_len) { - const struct dst_entry *dst = __sk_dst_get(sk); - - if (dst && dst_allfrag(dst)) - mtu += icsk->icsk_af_ops->net_frag_header_len; - } - return mtu; } EXPORT_SYMBOL(tcp_mss_to_mtu); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3c7de89d6755..86efd901ee5a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -191,7 +191,6 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); if ((skb->len > mtu && !skb_is_gso(skb)) || - dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); else @@ -1017,9 +1016,6 @@ slow_path: return err; fail_toobig: - if (skb->sk && dst_allfrag(skb_dst(skb))) - sk_gso_disable(skb->sk); - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); err = -EMSGSIZE; @@ -1384,10 +1380,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.mark = ipc6->sockc.mark; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); - if (dst_allfrag(xfrm_dst_path(&rt->dst))) - cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; - cork->base.transmit_time = ipc6->sockc.transmit_time; return 0; @@ -1444,8 +1437,6 @@ static int __ip6_append_data(struct sock *sk, headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + - (dst_allfrag(&rt->dst) ? - sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; if (mtu <= fragheaderlen || @@ -1555,7 +1546,7 @@ emsgsize: while (length > 0) { /* Check if the remaining data fits into current packet. */ - copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; @@ -1586,7 +1577,7 @@ alloc_new_skb: */ datalen = length + fraggap; - if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; fraglen = datalen + fragheaderlen; pagedlen = 0; @@ -1835,7 +1826,6 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) struct dst_entry *dst = cork->base.dst; cork->base.dst = NULL; - cork->base.flags &= ~IPCORK_ALLFRAG; skb_dst_set(skb, dst); } @@ -1856,7 +1846,6 @@ static void ip6_cork_release(struct inet_cork_full *cork, if (cork->base.dst) { dst_release(cork->base.dst); cork->base.dst = NULL; - cork->base.flags &= ~IPCORK_ALLFRAG; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0c8a14ba104f..dc27988512a6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1895,7 +1895,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), - .net_frag_header_len = sizeof(struct frag_hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index ad07904642ca..5f7b1fdbffe6 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -95,7 +95,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) return -EMSGSIZE; } - if (toobig || dst_allfrag(skb_dst(skb))) + if (toobig) return ip6_fragment(net, sk, skb, __xfrm6_output_finish); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 2b43577f952e..e120e9616454 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2051,7 +2051,6 @@ void __init mptcp_subflow_init(void) subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; - subflow_v6m_specific.net_frag_header_len = 0; subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; -- cgit v1.2.3 From c845f5f3590ef4669fe5464f8a42be6442cd174b Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:21:54 +0100 Subject: net/tcp: Add TCP-AO config and structures Introduce new kernel config option and common structures as well as helpers to be used by TCP-AO code. Co-developed-by: Francesco Ruggeri Signed-off-by: Francesco Ruggeri Co-developed-by: Salam Noureddine Signed-off-by: Salam Noureddine Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/linux/tcp.h | 9 +++-- include/net/tcp.h | 8 ++--- include/net/tcp_ao.h | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/Kconfig | 13 +++++++ 5 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 include/net/tcp_ao.h (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6df715b6e51d..64e7b560fa79 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -447,13 +447,18 @@ struct tcp_sock { bool syn_smc; /* SYN includes SMC */ #endif -#ifdef CONFIG_TCP_MD5SIG -/* TCP AF-Specific parts; only used by MD5 Signature support so far */ +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) +/* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; +#ifdef CONFIG_TCP_MD5SIG /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif +#ifdef CONFIG_TCP_AO + struct tcp_ao_info __rcu *ao_info; +#endif +#endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; diff --git a/include/net/tcp.h b/include/net/tcp.h index f6e2db5292b5..3153712faa3d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1688,12 +1689,7 @@ static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) tp->retransmit_skb_hint = NULL; } -union tcp_md5_addr { - struct in_addr a4; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr a6; -#endif -}; +#define tcp_md5_addr tcp_ao_addr /* - key database */ struct tcp_md5sig_key { diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h new file mode 100644 index 000000000000..af76e1c47bea --- /dev/null +++ b/include/net/tcp_ao.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_AO_H +#define _TCP_AO_H + +#define TCP_AO_KEY_ALIGN 1 +#define __tcp_ao_key_align __aligned(TCP_AO_KEY_ALIGN) + +union tcp_ao_addr { + struct in_addr a4; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr a6; +#endif +}; + +struct tcp_ao_hdr { + u8 kind; + u8 length; + u8 keyid; + u8 rnext_keyid; +}; + +struct tcp_ao_key { + struct hlist_node node; + union tcp_ao_addr addr; + u8 key[TCP_AO_MAXKEYLEN] __tcp_ao_key_align; + unsigned int tcp_sigpool_id; + unsigned int digest_size; + u8 prefixlen; + u8 family; + u8 keylen; + u8 keyflags; + u8 sndid; + u8 rcvid; + u8 maclen; + struct rcu_head rcu; + u8 traffic_keys[]; +}; + +static inline u8 *rcv_other_key(struct tcp_ao_key *key) +{ + return key->traffic_keys; +} + +static inline u8 *snd_other_key(struct tcp_ao_key *key) +{ + return key->traffic_keys + key->digest_size; +} + +static inline int tcp_ao_maclen(const struct tcp_ao_key *key) +{ + return key->maclen; +} + +static inline int tcp_ao_len(const struct tcp_ao_key *key) +{ + return tcp_ao_maclen(key) + sizeof(struct tcp_ao_hdr); +} + +static inline unsigned int tcp_ao_digest_size(struct tcp_ao_key *key) +{ + return key->digest_size; +} + +static inline int tcp_ao_sizeof_key(const struct tcp_ao_key *key) +{ + return sizeof(struct tcp_ao_key) + (key->digest_size << 1); +} + +struct tcp_ao_info { + /* List of tcp_ao_key's */ + struct hlist_head head; + /* current_key and rnext_key aren't maintained on listen sockets. + * Their purpose is to cache keys on established connections, + * saving needless lookups. Never dereference any of them from + * listen sockets. + * ::current_key may change in RX to the key that was requested by + * the peer, please use READ_ONCE()/WRITE_ONCE() in order to avoid + * load/store tearing. + * Do the same for ::rnext_key, if you don't hold socket lock + * (it's changed only by userspace request in setsockopt()). + */ + struct tcp_ao_key *current_key; + struct tcp_ao_key *rnext_key; + u32 flags; + __be32 lisn; + __be32 risn; + struct rcu_head rcu; +}; + +#endif /* _TCP_AO_H */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8aa3916e14f6..74a1267baac5 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -361,6 +361,8 @@ struct tcp_diag_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; }; +#define TCP_AO_MAXKEYLEN 80 + /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 89e2ab023272..8e94ed7c56a0 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -744,6 +744,19 @@ config DEFAULT_TCP_CONG config TCP_SIGPOOL tristate +config TCP_AO + bool "TCP: Authentication Option (RFC5925)" + select CRYPTO + select TCP_SIGPOOL + depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64) + help + TCP-AO specifies the use of stronger Message Authentication Codes (MACs), + protects against replays for long-lived TCP connections, and + provides more details on the association of security with TCP + connections than TCP MD5 (See RFC5925) + + If unsure, say N. + config TCP_MD5SIG bool "TCP: MD5 Signature Option support (RFC2385)" select CRYPTO -- cgit v1.2.3 From 4954f17ddefc51d218625dcdfaf422a253dad3fa Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:21:55 +0100 Subject: net/tcp: Introduce TCP_AO setsockopt()s Add 3 setsockopt()s: 1. TCP_AO_ADD_KEY to add a new Master Key Tuple (MKT) on a socket 2. TCP_AO_DEL_KEY to delete present MKT from a socket 3. TCP_AO_INFO to change flags, Current_key/RNext_key on a TCP-AO sk Userspace has to introduce keys on every socket it wants to use TCP-AO option on, similarly to TCP_MD5SIG/TCP_MD5SIG_EXT. RFC5925 prohibits definition of MKTs that would match the same peer, so do sanity checks on the data provided by userspace. Be as conservative as possible, including refusal of defining MKT on an established connection with no AO, removing the key in-use and etc. (1) and (2) are to be used by userspace key manager to add/remove keys. (3) main purpose is to set RNext_key, which (as prescribed by RFC5925) is the KeyID that will be requested in TCP-AO header from the peer to sign their segments with. At this moment the life of ao_info ends in tcp_v4_destroy_sock(). Co-developed-by: Francesco Ruggeri Signed-off-by: Francesco Ruggeri Co-developed-by: Salam Noureddine Signed-off-by: Salam Noureddine Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/linux/sockptr.h | 23 ++ include/net/tcp.h | 3 + include/net/tcp_ao.h | 17 +- include/uapi/linux/tcp.h | 46 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp.c | 17 + net/ipv4/tcp_ao.c | 794 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 10 +- net/ipv6/Makefile | 1 + net/ipv6/tcp_ao.c | 19 ++ net/ipv6/tcp_ipv6.c | 39 ++- 11 files changed, 952 insertions(+), 18 deletions(-) create mode 100644 net/ipv4/tcp_ao.c create mode 100644 net/ipv6/tcp_ao.c (limited to 'include/uapi/linux') diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..307961b41541 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -55,6 +55,29 @@ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) return copy_from_sockptr_offset(dst, src, 0, size); } +static inline int copy_struct_from_sockptr(void *dst, size_t ksize, + sockptr_t src, size_t usize) +{ + size_t size = min(ksize, usize); + size_t rest = max(ksize, usize) - size; + + if (!sockptr_is_kernel(src)) + return copy_struct_from_user(dst, ksize, src.user, size); + + if (usize < ksize) { + memset(dst + size, 0, rest); + } else if (usize > ksize) { + char *p = src.kernel; + + while (rest--) { + if (*p++) + return -E2BIG; + } + } + memcpy(dst, src.kernel, size); + return 0; +} + static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 3153712faa3d..ff204471d451 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2175,6 +2175,9 @@ struct tcp_sock_af_ops { sockptr_t optval, int optlen); #endif +#ifdef CONFIG_TCP_AO + int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); +#endif }; struct tcp_request_sock_ops { diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index af76e1c47bea..a81e40fd255a 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -81,10 +81,25 @@ struct tcp_ao_info { */ struct tcp_ao_key *current_key; struct tcp_ao_key *rnext_key; - u32 flags; + u32 ao_required :1, + __unused :31; __be32 lisn; __be32 risn; struct rcu_head rcu; }; +#ifdef CONFIG_TCP_AO +int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, + sockptr_t optval, int optlen); +void tcp_ao_destroy_sock(struct sock *sk); +/* ipv4 specific functions */ +int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); +/* ipv6 specific functions */ +int tcp_v6_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen); +#else +static inline void tcp_ao_destroy_sock(struct sock *sk) +{ +} +#endif + #endif /* _TCP_AO_H */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 74a1267baac5..fa49f03e62fe 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -129,6 +129,9 @@ enum { #define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ +#define TCP_AO_ADD_KEY 38 /* Add/Set MKT */ +#define TCP_AO_DEL_KEY 39 /* Delete MKT */ +#define TCP_AO_INFO 40 /* Modify TCP-AO per-socket options */ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 @@ -363,6 +366,49 @@ struct tcp_diag_md5sig { #define TCP_AO_MAXKEYLEN 80 +#define TCP_AO_KEYF_IFINDEX (1 << 0) /* L3 ifindex for VRF */ + +struct tcp_ao_add { /* setsockopt(TCP_AO_ADD_KEY) */ + struct __kernel_sockaddr_storage addr; /* peer's address for the key */ + char alg_name[64]; /* crypto hash algorithm to use */ + __s32 ifindex; /* L3 dev index for VRF */ + __u32 set_current :1, /* set key as Current_key at once */ + set_rnext :1, /* request it from peer with RNext_key */ + reserved :30; /* must be 0 */ + __u16 reserved2; /* padding, must be 0 */ + __u8 prefix; /* peer's address prefix */ + __u8 sndid; /* SendID for outgoing segments */ + __u8 rcvid; /* RecvID to match for incoming seg */ + __u8 maclen; /* length of authentication code (hash) */ + __u8 keyflags; /* see TCP_AO_KEYF_ */ + __u8 keylen; /* length of ::key */ + __u8 key[TCP_AO_MAXKEYLEN]; +} __attribute__((aligned(8))); + +struct tcp_ao_del { /* setsockopt(TCP_AO_DEL_KEY) */ + struct __kernel_sockaddr_storage addr; /* peer's address for the key */ + __s32 ifindex; /* L3 dev index for VRF */ + __u32 set_current :1, /* corresponding ::current_key */ + set_rnext :1, /* corresponding ::rnext */ + reserved :30; /* must be 0 */ + __u16 reserved2; /* padding, must be 0 */ + __u8 prefix; /* peer's address prefix */ + __u8 sndid; /* SendID for outgoing segments */ + __u8 rcvid; /* RecvID to match for incoming seg */ + __u8 current_key; /* KeyID to set as Current_key */ + __u8 rnext; /* KeyID to set as Rnext_key */ + __u8 keyflags; /* see TCP_AO_KEYF_ */ +} __attribute__((aligned(8))); + +struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */ + __u32 set_current :1, /* corresponding ::current_key */ + set_rnext :1, /* corresponding ::rnext */ + ao_required :1, /* don't accept non-AO connects */ + reserved :29; /* must be 0 */ + __u8 current_key; /* KeyID to set as Current_key */ + __u8 rnext; /* KeyID to set as Rnext_key */ +} __attribute__((aligned(8))); + /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index cd760793cfcb..e144a02a6a61 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o +obj-$(CONFIG_TCP_AO) += tcp_ao.o ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dca9ca2f1081..b6faee8a1e67 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3593,6 +3593,23 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, __tcp_sock_set_quickack(sk, val); break; +#ifdef CONFIG_TCP_AO + case TCP_AO_ADD_KEY: + case TCP_AO_DEL_KEY: + case TCP_AO_INFO: { + /* If this is the first TCP-AO setsockopt() on the socket, + * sk_state has to be LISTEN or CLOSE + */ + if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) || + rcu_dereference_protected(tcp_sk(sk)->ao_info, + lockdep_sock_is_held(sk))) + err = tp->af_specific->ao_parse(sk, optname, optval, + optlen); + else + err = -EISCONN; + break; + } +#endif #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c new file mode 100644 index 000000000000..3c2d005a37ce --- /dev/null +++ b/net/ipv4/tcp_ao.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * INET An implementation of the TCP Authentication Option (TCP-AO). + * See RFC5925. + * + * Authors: Dmitry Safonov + * Francesco Ruggeri + * Salam Noureddine + */ +#define pr_fmt(fmt) "TCP: " fmt + +#include +#include +#include + +#include +#include + +/* Optimized version of tcp_ao_do_lookup(): only for sockets for which + * it's known that the keys in ao_info are matching peer's + * family/address/VRF/etc. + */ +static struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, + int sndid, int rcvid) +{ + struct tcp_ao_key *key; + + hlist_for_each_entry_rcu(key, &ao->head, node) { + if ((sndid >= 0 && key->sndid != sndid) || + (rcvid >= 0 && key->rcvid != rcvid)) + continue; + return key; + } + + return NULL; +} + +static int ipv4_prefix_cmp(const struct in_addr *addr1, + const struct in_addr *addr2, + unsigned int prefixlen) +{ + __be32 mask = inet_make_mask(prefixlen); + __be32 a1 = addr1->s_addr & mask; + __be32 a2 = addr2->s_addr & mask; + + if (a1 == a2) + return 0; + return memcmp(&a1, &a2, sizeof(a1)); +} + +static int __tcp_ao_key_cmp(const struct tcp_ao_key *key, + const union tcp_ao_addr *addr, u8 prefixlen, + int family, int sndid, int rcvid) +{ + if (sndid >= 0 && key->sndid != sndid) + return (key->sndid > sndid) ? 1 : -1; + if (rcvid >= 0 && key->rcvid != rcvid) + return (key->rcvid > rcvid) ? 1 : -1; + + if (family == AF_UNSPEC) + return 0; + if (key->family != family) + return (key->family > family) ? 1 : -1; + + if (family == AF_INET) { + if (ntohl(key->addr.a4.s_addr) == INADDR_ANY) + return 0; + if (ntohl(addr->a4.s_addr) == INADDR_ANY) + return 0; + return ipv4_prefix_cmp(&key->addr.a4, &addr->a4, prefixlen); +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (ipv6_addr_any(&key->addr.a6) || ipv6_addr_any(&addr->a6)) + return 0; + if (ipv6_prefix_equal(&key->addr.a6, &addr->a6, prefixlen)) + return 0; + return memcmp(&key->addr.a6, &addr->a6, sizeof(addr->a6)); +#endif + } + return -1; +} + +static int tcp_ao_key_cmp(const struct tcp_ao_key *key, + const union tcp_ao_addr *addr, u8 prefixlen, + int family, int sndid, int rcvid) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && ipv6_addr_v4mapped(&addr->a6)) { + __be32 addr4 = addr->a6.s6_addr32[3]; + + return __tcp_ao_key_cmp(key, (union tcp_ao_addr *)&addr4, + prefixlen, AF_INET, sndid, rcvid); + } +#endif + return __tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid); +} + +static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk, + const union tcp_ao_addr *addr, int family, u8 prefix, + int sndid, int rcvid) +{ + struct tcp_ao_key *key; + struct tcp_ao_info *ao; + + ao = rcu_dereference_check(tcp_sk(sk)->ao_info, + lockdep_sock_is_held(sk)); + if (!ao) + return NULL; + + hlist_for_each_entry_rcu(key, &ao->head, node) { + u8 prefixlen = min(prefix, key->prefixlen); + + if (!tcp_ao_key_cmp(key, addr, prefixlen, family, sndid, rcvid)) + return key; + } + return NULL; +} + +static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags) +{ + struct tcp_ao_info *ao; + + ao = kzalloc(sizeof(*ao), flags); + if (!ao) + return NULL; + INIT_HLIST_HEAD(&ao->head); + + return ao; +} + +static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt) +{ + hlist_add_head_rcu(&mkt->node, &ao->head); +} + +static void tcp_ao_key_free_rcu(struct rcu_head *head) +{ + struct tcp_ao_key *key = container_of(head, struct tcp_ao_key, rcu); + + tcp_sigpool_release(key->tcp_sigpool_id); + kfree_sensitive(key); +} + +void tcp_ao_destroy_sock(struct sock *sk) +{ + struct tcp_ao_info *ao; + struct tcp_ao_key *key; + struct hlist_node *n; + + ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1); + tcp_sk(sk)->ao_info = NULL; + + if (!ao) + return; + + hlist_for_each_entry_safe(key, n, &ao->head, node) { + hlist_del_rcu(&key->node); + atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc); + call_rcu(&key->rcu, tcp_ao_key_free_rcu); + } + + kfree_rcu(ao, rcu); +} + +static bool tcp_ao_can_set_current_rnext(struct sock *sk) +{ + /* There aren't current/rnext keys on TCP_LISTEN sockets */ + if (sk->sk_state == TCP_LISTEN) + return false; + return true; +} + +static int tcp_ao_verify_ipv4(struct sock *sk, struct tcp_ao_add *cmd, + union tcp_ao_addr **addr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)&cmd->addr; + struct inet_sock *inet = inet_sk(sk); + + if (sin->sin_family != AF_INET) + return -EINVAL; + + /* Currently matching is not performed on port (or port ranges) */ + if (sin->sin_port != 0) + return -EINVAL; + + /* Check prefix and trailing 0's in addr */ + if (cmd->prefix != 0) { + __be32 mask; + + if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY) + return -EINVAL; + if (cmd->prefix > 32) + return -EINVAL; + + mask = inet_make_mask(cmd->prefix); + if (sin->sin_addr.s_addr & ~mask) + return -EINVAL; + + /* Check that MKT address is consistent with socket */ + if (ntohl(inet->inet_daddr) != INADDR_ANY && + (inet->inet_daddr & mask) != sin->sin_addr.s_addr) + return -EINVAL; + } else { + if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY) + return -EINVAL; + } + + *addr = (union tcp_ao_addr *)&sin->sin_addr; + return 0; +} + +static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key) +{ + unsigned int syn_tcp_option_space; + bool is_kdf_aes_128_cmac = false; + struct crypto_ahash *tfm; + struct tcp_sigpool hp; + void *tmp_key = NULL; + int err; + + /* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */ + if (!strcmp("cmac(aes128)", cmd->alg_name)) { + strscpy(cmd->alg_name, "cmac(aes)", sizeof(cmd->alg_name)); + is_kdf_aes_128_cmac = (cmd->keylen != 16); + tmp_key = kmalloc(cmd->keylen, GFP_KERNEL); + if (!tmp_key) + return -ENOMEM; + } + + key->maclen = cmd->maclen ?: 12; /* 12 is the default in RFC5925 */ + + /* Check: maclen + tcp-ao header <= (MAX_TCP_OPTION_SPACE - mss + * - tstamp - wscale - sackperm), + * see tcp_syn_options(), tcp_synack_options(), commit 33ad798c924b. + * + * In order to allow D-SACK with TCP-AO, the header size should be: + * (MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED + * - TCPOLEN_SACK_BASE_ALIGNED + * - 2 * TCPOLEN_SACK_PERBLOCK) = 8 (maclen = 4), + * see tcp_established_options(). + * + * RFC5925, 2.2: + * Typical MACs are 96-128 bits (12-16 bytes), but any length + * that fits in the header of the segment being authenticated + * is allowed. + * + * RFC5925, 7.6: + * TCP-AO continues to consume 16 bytes in non-SYN segments, + * leaving a total of 24 bytes for other options, of which + * the timestamp consumes 10. This leaves 14 bytes, of which 10 + * are used for a single SACK block. When two SACK blocks are used, + * such as to handle D-SACK, a smaller TCP-AO MAC would be required + * to make room for the additional SACK block (i.e., to leave 18 + * bytes for the D-SACK variant of the SACK option) [RFC2883]. + * Note that D-SACK is not supportable in TCP MD5 in the presence + * of timestamps, because TCP MD5’s MAC length is fixed and too + * large to leave sufficient option space. + */ + syn_tcp_option_space = MAX_TCP_OPTION_SPACE; + syn_tcp_option_space -= TCPOLEN_TSTAMP_ALIGNED; + syn_tcp_option_space -= TCPOLEN_WSCALE_ALIGNED; + syn_tcp_option_space -= TCPOLEN_SACKPERM_ALIGNED; + if (tcp_ao_len(key) > syn_tcp_option_space) { + err = -EMSGSIZE; + goto err_kfree; + } + + key->keylen = cmd->keylen; + memcpy(key->key, cmd->key, cmd->keylen); + + err = tcp_sigpool_start(key->tcp_sigpool_id, &hp); + if (err) + goto err_kfree; + + tfm = crypto_ahash_reqtfm(hp.req); + if (is_kdf_aes_128_cmac) { + void *scratch = hp.scratch; + struct scatterlist sg; + + memcpy(tmp_key, cmd->key, cmd->keylen); + sg_init_one(&sg, tmp_key, cmd->keylen); + + /* Using zero-key of 16 bytes as described in RFC5926 */ + memset(scratch, 0, 16); + err = crypto_ahash_setkey(tfm, scratch, 16); + if (err) + goto err_pool_end; + + err = crypto_ahash_init(hp.req); + if (err) + goto err_pool_end; + + ahash_request_set_crypt(hp.req, &sg, key->key, cmd->keylen); + err = crypto_ahash_update(hp.req); + if (err) + goto err_pool_end; + + err |= crypto_ahash_final(hp.req); + if (err) + goto err_pool_end; + key->keylen = 16; + } + + err = crypto_ahash_setkey(tfm, key->key, key->keylen); + if (err) + goto err_pool_end; + + tcp_sigpool_end(&hp); + kfree_sensitive(tmp_key); + + if (tcp_ao_maclen(key) > key->digest_size) + return -EINVAL; + + return 0; + +err_pool_end: + tcp_sigpool_end(&hp); +err_kfree: + kfree_sensitive(tmp_key); + return err; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd, + union tcp_ao_addr **paddr, + unsigned short int *family) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd->addr; + struct in6_addr *addr = &sin6->sin6_addr; + u8 prefix = cmd->prefix; + + if (sin6->sin6_family != AF_INET6) + return -EINVAL; + + /* Currently matching is not performed on port (or port ranges) */ + if (sin6->sin6_port != 0) + return -EINVAL; + + /* Check prefix and trailing 0's in addr */ + if (cmd->prefix != 0 && ipv6_addr_v4mapped(addr)) { + __be32 addr4 = addr->s6_addr32[3]; + __be32 mask; + + if (prefix > 32 || ntohl(addr4) == INADDR_ANY) + return -EINVAL; + + mask = inet_make_mask(prefix); + if (addr4 & ~mask) + return -EINVAL; + + /* Check that MKT address is consistent with socket */ + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + __be32 daddr4 = sk->sk_v6_daddr.s6_addr32[3]; + + if (!ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + return -EINVAL; + if ((daddr4 & mask) != addr4) + return -EINVAL; + } + + *paddr = (union tcp_ao_addr *)&addr->s6_addr32[3]; + *family = AF_INET; + return 0; + } else if (cmd->prefix != 0) { + struct in6_addr pfx; + + if (ipv6_addr_any(addr) || prefix > 128) + return -EINVAL; + + ipv6_addr_prefix(&pfx, addr, prefix); + if (ipv6_addr_cmp(&pfx, addr)) + return -EINVAL; + + /* Check that MKT address is consistent with socket */ + if (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_prefix_equal(&sk->sk_v6_daddr, addr, prefix)) + + return -EINVAL; + } else { + if (!ipv6_addr_any(addr)) + return -EINVAL; + } + + *paddr = (union tcp_ao_addr *)addr; + return 0; +} +#else +static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd, + union tcp_ao_addr **paddr, + unsigned short int *family) +{ + return -EOPNOTSUPP; +} +#endif + +static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk) +{ + if (sk_fullsock(sk)) { + return rcu_dereference_protected(tcp_sk(sk)->ao_info, + lockdep_sock_is_held(sk)); + } + return ERR_PTR(-ESOCKTNOSUPPORT); +} + +#define TCP_AO_KEYF_ALL (0) + +static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk, + struct tcp_ao_add *cmd) +{ + const char *algo = cmd->alg_name; + unsigned int digest_size; + struct crypto_ahash *tfm; + struct tcp_ao_key *key; + struct tcp_sigpool hp; + int err, pool_id; + size_t size; + + /* Force null-termination of alg_name */ + cmd->alg_name[ARRAY_SIZE(cmd->alg_name) - 1] = '\0'; + + /* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */ + if (!strcmp("cmac(aes128)", algo)) + algo = "cmac(aes)"; + + /* Full TCP header (th->doff << 2) should fit into scratch area, + * see tcp_ao_hash_header(). + */ + pool_id = tcp_sigpool_alloc_ahash(algo, 60); + if (pool_id < 0) + return ERR_PTR(pool_id); + + err = tcp_sigpool_start(pool_id, &hp); + if (err) + goto err_free_pool; + + tfm = crypto_ahash_reqtfm(hp.req); + if (crypto_ahash_alignmask(tfm) > TCP_AO_KEY_ALIGN) { + err = -EOPNOTSUPP; + goto err_pool_end; + } + digest_size = crypto_ahash_digestsize(tfm); + tcp_sigpool_end(&hp); + + size = sizeof(struct tcp_ao_key) + (digest_size << 1); + key = sock_kmalloc(sk, size, GFP_KERNEL); + if (!key) { + err = -ENOMEM; + goto err_free_pool; + } + + key->tcp_sigpool_id = pool_id; + key->digest_size = digest_size; + return key; + +err_pool_end: + tcp_sigpool_end(&hp); +err_free_pool: + tcp_sigpool_release(pool_id); + return ERR_PTR(err); +} + +static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family, + sockptr_t optval, int optlen) +{ + struct tcp_ao_info *ao_info; + union tcp_ao_addr *addr; + struct tcp_ao_key *key; + struct tcp_ao_add cmd; + bool first = false; + int ret; + + if (optlen < sizeof(cmd)) + return -EINVAL; + + ret = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen); + if (ret) + return ret; + + if (cmd.keylen > TCP_AO_MAXKEYLEN) + return -EINVAL; + + if (cmd.reserved != 0 || cmd.reserved2 != 0) + return -EINVAL; + + if (family == AF_INET) + ret = tcp_ao_verify_ipv4(sk, &cmd, &addr); + else + ret = tcp_ao_verify_ipv6(sk, &cmd, &addr, &family); + if (ret) + return ret; + + if (cmd.keyflags & ~TCP_AO_KEYF_ALL) + return -EINVAL; + + if (cmd.set_current || cmd.set_rnext) { + if (!tcp_ao_can_set_current_rnext(sk)) + return -EINVAL; + } + + ao_info = setsockopt_ao_info(sk); + if (IS_ERR(ao_info)) + return PTR_ERR(ao_info); + + if (!ao_info) { + ao_info = tcp_ao_alloc_info(GFP_KERNEL); + if (!ao_info) + return -ENOMEM; + first = true; + } else { + /* Check that neither RecvID nor SendID match any + * existing key for the peer, RFC5925 3.1: + * > The IDs of MKTs MUST NOT overlap where their + * > TCP connection identifiers overlap. + */ + if (__tcp_ao_do_lookup(sk, addr, family, + cmd.prefix, -1, cmd.rcvid)) + return -EEXIST; + if (__tcp_ao_do_lookup(sk, addr, family, + cmd.prefix, cmd.sndid, -1)) + return -EEXIST; + } + + key = tcp_ao_key_alloc(sk, &cmd); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto err_free_ao; + } + + INIT_HLIST_NODE(&key->node); + memcpy(&key->addr, addr, (family == AF_INET) ? sizeof(struct in_addr) : + sizeof(struct in6_addr)); + key->prefixlen = cmd.prefix; + key->family = family; + key->keyflags = cmd.keyflags; + key->sndid = cmd.sndid; + key->rcvid = cmd.rcvid; + + ret = tcp_ao_parse_crypto(&cmd, key); + if (ret < 0) + goto err_free_sock; + + tcp_ao_link_mkt(ao_info, key); + if (first) { + sk_gso_disable(sk); + rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info); + } + + if (cmd.set_current) + WRITE_ONCE(ao_info->current_key, key); + if (cmd.set_rnext) + WRITE_ONCE(ao_info->rnext_key, key); + return 0; + +err_free_sock: + atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc); + tcp_sigpool_release(key->tcp_sigpool_id); + kfree_sensitive(key); +err_free_ao: + if (first) + kfree(ao_info); + return ret; +} + +static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info, + struct tcp_ao_key *key, + struct tcp_ao_key *new_current, + struct tcp_ao_key *new_rnext) +{ + int err; + + hlist_del_rcu(&key->node); + + /* At this moment another CPU could have looked this key up + * while it was unlinked from the list. Wait for RCU grace period, + * after which the key is off-list and can't be looked up again; + * the rx path [just before RCU came] might have used it and set it + * as current_key (very unlikely). + */ + synchronize_rcu(); + if (new_current) + WRITE_ONCE(ao_info->current_key, new_current); + if (new_rnext) + WRITE_ONCE(ao_info->rnext_key, new_rnext); + + if (unlikely(READ_ONCE(ao_info->current_key) == key || + READ_ONCE(ao_info->rnext_key) == key)) { + err = -EBUSY; + goto add_key; + } + + atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc); + call_rcu(&key->rcu, tcp_ao_key_free_rcu); + + return 0; +add_key: + hlist_add_head_rcu(&key->node, &ao_info->head); + return err; +} + +static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family, + sockptr_t optval, int optlen) +{ + struct tcp_ao_key *key, *new_current = NULL, *new_rnext = NULL; + struct tcp_ao_info *ao_info; + union tcp_ao_addr *addr; + struct tcp_ao_del cmd; + int addr_len; + __u8 prefix; + u16 port; + int err; + + if (optlen < sizeof(cmd)) + return -EINVAL; + + err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen); + if (err) + return err; + + if (cmd.reserved != 0 || cmd.reserved2 != 0) + return -EINVAL; + + if (cmd.set_current || cmd.set_rnext) { + if (!tcp_ao_can_set_current_rnext(sk)) + return -EINVAL; + } + + ao_info = setsockopt_ao_info(sk); + if (IS_ERR(ao_info)) + return PTR_ERR(ao_info); + if (!ao_info) + return -ENOENT; + + /* For sockets in TCP_CLOSED it's possible set keys that aren't + * matching the future peer (address/VRF/etc), + * tcp_ao_connect_init() will choose a correct matching MKT + * if there's any. + */ + if (cmd.set_current) { + new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1); + if (!new_current) + return -ENOENT; + } + if (cmd.set_rnext) { + new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext); + if (!new_rnext) + return -ENOENT; + } + + if (family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.addr; + + addr = (union tcp_ao_addr *)&sin->sin_addr; + addr_len = sizeof(struct in_addr); + port = ntohs(sin->sin_port); + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.addr; + struct in6_addr *addr6 = &sin6->sin6_addr; + + if (ipv6_addr_v4mapped(addr6)) { + addr = (union tcp_ao_addr *)&addr6->s6_addr32[3]; + addr_len = sizeof(struct in_addr); + family = AF_INET; + } else { + addr = (union tcp_ao_addr *)addr6; + addr_len = sizeof(struct in6_addr); + } + port = ntohs(sin6->sin6_port); + } + prefix = cmd.prefix; + + /* Currently matching is not performed on port (or port ranges) */ + if (port != 0) + return -EINVAL; + + /* We could choose random present key here for current/rnext + * but that's less predictable. Let's be strict and don't + * allow removing a key that's in use. RFC5925 doesn't + * specify how-to coordinate key removal, but says: + * "It is presumed that an MKT affecting a particular + * connection cannot be destroyed during an active connection" + */ + hlist_for_each_entry_rcu(key, &ao_info->head, node) { + if (cmd.sndid != key->sndid || + cmd.rcvid != key->rcvid) + continue; + + if (family != key->family || + prefix != key->prefixlen || + memcmp(addr, &key->addr, addr_len)) + continue; + + if (key == new_current || key == new_rnext) + continue; + + return tcp_ao_delete_key(sk, ao_info, key, + new_current, new_rnext); + } + return -ENOENT; +} + +static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, + sockptr_t optval, int optlen) +{ + struct tcp_ao_key *new_current = NULL, *new_rnext = NULL; + struct tcp_ao_info *ao_info; + struct tcp_ao_info_opt cmd; + bool first = false; + int err; + + if (optlen < sizeof(cmd)) + return -EINVAL; + + err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen); + if (err) + return err; + + if (cmd.set_current || cmd.set_rnext) { + if (!tcp_ao_can_set_current_rnext(sk)) + return -EINVAL; + } + + if (cmd.reserved != 0) + return -EINVAL; + + ao_info = setsockopt_ao_info(sk); + if (IS_ERR(ao_info)) + return PTR_ERR(ao_info); + if (!ao_info) { + ao_info = tcp_ao_alloc_info(GFP_KERNEL); + if (!ao_info) + return -ENOMEM; + first = true; + } + + /* For sockets in TCP_CLOSED it's possible set keys that aren't + * matching the future peer (address/port/VRF/etc), + * tcp_ao_connect_init() will choose a correct matching MKT + * if there's any. + */ + if (cmd.set_current) { + new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1); + if (!new_current) { + err = -ENOENT; + goto out; + } + } + if (cmd.set_rnext) { + new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext); + if (!new_rnext) { + err = -ENOENT; + goto out; + } + } + + ao_info->ao_required = cmd.ao_required; + if (new_current) + WRITE_ONCE(ao_info->current_key, new_current); + if (new_rnext) + WRITE_ONCE(ao_info->rnext_key, new_rnext); + if (first) { + sk_gso_disable(sk); + rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info); + } + return 0; +out: + if (first) + kfree(ao_info); + return err; +} + +int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, + sockptr_t optval, int optlen) +{ + if (WARN_ON_ONCE(family != AF_INET && family != AF_INET6)) + return -EAFNOSUPPORT; + + switch (cmd) { + case TCP_AO_ADD_KEY: + return tcp_ao_add_cmd(sk, family, optval, optlen); + case TCP_AO_DEL_KEY: + return tcp_ao_del_cmd(sk, family, optval, optlen); + case TCP_AO_INFO: + return tcp_ao_info_cmd(sk, family, optval, optlen); + default: + WARN_ON_ONCE(1); + return -EINVAL; + } +} + +int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen) +{ + return tcp_parse_ao(sk, cmd, AF_INET, optval, optlen); +} + diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7d81e90b6f5c..a4746c27f38a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2271,11 +2271,16 @@ const struct inet_connection_sock_af_ops ipv4_specific = { }; EXPORT_SYMBOL(ipv4_specific); -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v4_parse_md5_keys, +#endif +#ifdef CONFIG_TCP_AO + .ao_parse = tcp_v4_parse_ao, +#endif }; #endif @@ -2290,7 +2295,7 @@ static int tcp_v4_init_sock(struct sock *sk) icsk->icsk_af_ops = &ipv4_specific; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; #endif @@ -2341,6 +2346,7 @@ void tcp_v4_destroy_sock(struct sock *sk) rcu_assign_pointer(tp->md5sig_info, NULL); } #endif + tcp_ao_destroy_sock(sk); /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 3036a45e8a1e..d283c59df4c1 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -52,4 +52,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o obj-y += mcast_snoop.o +obj-$(CONFIG_TCP_AO) += tcp_ao.o endif diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c new file mode 100644 index 000000000000..049ddbabe049 --- /dev/null +++ b/net/ipv6/tcp_ao.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * INET An implementation of the TCP Authentication Option (TCP-AO). + * See RFC5925. + * + * Authors: Dmitry Safonov + * Francesco Ruggeri + * Salam Noureddine + */ +#include + +#include +#include + +int tcp_v6_parse_ao(struct sock *sk, int cmd, + sockptr_t optval, int optlen) +{ + return tcp_parse_ao(sk, cmd, AF_INET6, optval, optlen); +} diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ee53dad20a59..30bd17d03239 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -76,16 +76,9 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; -#else -static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, - const struct in6_addr *addr, - int l3index) -{ - return NULL; -} #endif /* Helper returning the inet6 address from a given tcp socket. @@ -239,7 +232,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif @@ -252,7 +245,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; @@ -769,7 +762,13 @@ clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } - +#else /* CONFIG_TCP_MD5SIG */ +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, + const struct in6_addr *addr, + int l3index) +{ + return NULL; +} #endif static void tcp_v6_init_req(struct request_sock *req, @@ -1228,7 +1227,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif @@ -1896,11 +1895,16 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .mtu_reduced = tcp_v6_mtu_reduced, }; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, +#endif +#ifdef CONFIG_TCP_AO + .ao_parse = tcp_v6_parse_ao, +#endif }; #endif @@ -1922,11 +1926,16 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .mtu_reduced = tcp_v4_mtu_reduced, }; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, +#endif +#ifdef CONFIG_TCP_AO + .ao_parse = tcp_v6_parse_ao, +#endif }; #endif @@ -1941,7 +1950,7 @@ static int tcp_v6_init_sock(struct sock *sk) icsk->icsk_af_ops = &ipv6_specific; -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif -- cgit v1.2.3 From af09a341dcf63b34ce742295ad1ce876246c5de2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:22:05 +0100 Subject: net/tcp: Add TCP-AO segments counters Introduce segment counters that are useful for troubleshooting/debugging as well as for writing tests. Now there are global snmp counters as well as per-socket and per-key. Co-developed-by: Francesco Ruggeri Signed-off-by: Francesco Ruggeri Co-developed-by: Salam Noureddine Signed-off-by: Salam Noureddine Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/dropreason-core.h | 15 +++++++++++---- include/net/tcp.h | 15 +++++++++++---- include/net/tcp_ao.h | 10 ++++++++++ include/uapi/linux/snmp.h | 4 ++++ include/uapi/linux/tcp.h | 8 +++++++- net/ipv4/proc.c | 4 ++++ net/ipv4/tcp_ao.c | 30 +++++++++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 9 files changed, 77 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 7637137ae33e..3c70ad53a49c 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -168,17 +168,24 @@ enum skb_drop_reason { */ SKB_DROP_REASON_TCP_MD5FAILURE, /** - * @SKB_DROP_REASON_TCP_AONOTFOUND: no TCP-AO hash and one was expected + * @SKB_DROP_REASON_TCP_AONOTFOUND: no TCP-AO hash and one was expected, + * corresponding to LINUX_MIB_TCPAOREQUIRED */ SKB_DROP_REASON_TCP_AONOTFOUND, /** * @SKB_DROP_REASON_TCP_AOUNEXPECTED: TCP-AO hash is present and it - * was not expected. + * was not expected, corresponding to LINUX_MIB_TCPAOKEYNOTFOUND */ SKB_DROP_REASON_TCP_AOUNEXPECTED, - /** @SKB_DROP_REASON_TCP_AOKEYNOTFOUND: TCP-AO key is unknown */ + /** + * @SKB_DROP_REASON_TCP_AOKEYNOTFOUND: TCP-AO key is unknown, + * corresponding to LINUX_MIB_TCPAOKEYNOTFOUND + */ SKB_DROP_REASON_TCP_AOKEYNOTFOUND, - /** @SKB_DROP_REASON_TCP_AOFAILURE: TCP-AO hash is wrong */ + /** + * @SKB_DROP_REASON_TCP_AOFAILURE: TCP-AO hash is wrong, + * corresponding to LINUX_MIB_TCPAOBAD + */ SKB_DROP_REASON_TCP_AOFAILURE, /** * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog ( diff --git a/include/net/tcp.h b/include/net/tcp.h index 8e1f835bad22..50ae1ed244e5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2712,7 +2712,7 @@ static inline int tcp_parse_auth_options(const struct tcphdr *th, } static inline bool tcp_ao_required(struct sock *sk, const void *saddr, - int family) + int family, bool stat_inc) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -2724,8 +2724,13 @@ static inline bool tcp_ao_required(struct sock *sk, const void *saddr, return false; ao_key = tcp_ao_do_lookup(sk, saddr, family, -1, -1); - if (ao_info->ao_required || ao_key) + if (ao_info->ao_required || ao_key) { + if (stat_inc) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED); + atomic64_inc(&ao_info->counters.ao_required); + } return true; + } #endif return false; } @@ -2747,8 +2752,10 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req, return SKB_DROP_REASON_TCP_AUTH_HDR; if (req) { - if (tcp_rsk_used_ao(req) != !!aoh) + if (tcp_rsk_used_ao(req) != !!aoh) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); return SKB_DROP_REASON_TCP_AOFAILURE; + } } /* sdif set, means packet ingressed via a device @@ -2763,7 +2770,7 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req, * the last key is impossible to remove, so there's * always at least one current_key. */ - if (tcp_ao_required(sk, saddr, family)) + if (tcp_ao_required(sk, saddr, family, true)) return SKB_DROP_REASON_TCP_AONOTFOUND; if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index 1c7c0a5d1877..cfb55bd9411b 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -19,6 +19,13 @@ struct tcp_ao_hdr { u8 rnext_keyid; }; +struct tcp_ao_counters { + atomic64_t pkt_good; + atomic64_t pkt_bad; + atomic64_t key_not_found; + atomic64_t ao_required; +}; + struct tcp_ao_key { struct hlist_node node; union tcp_ao_addr addr; @@ -33,6 +40,8 @@ struct tcp_ao_key { u8 rcvid; u8 maclen; struct rcu_head rcu; + atomic64_t pkt_good; + atomic64_t pkt_bad; u8 traffic_keys[]; }; @@ -81,6 +90,7 @@ struct tcp_ao_info { */ struct tcp_ao_key *current_key; struct tcp_ao_key *rnext_key; + struct tcp_ao_counters counters; u32 ao_required :1, __unused :31; __be32 lisn; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b2b72886cb6d..3d5ea841bffe 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -297,6 +297,10 @@ enum LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ LINUX_MIB_TCPPLBREHASH, /* TCPPLBRehash */ + LINUX_MIB_TCPAOREQUIRED, /* TCPAORequired */ + LINUX_MIB_TCPAOBAD, /* TCPAOBad */ + LINUX_MIB_TCPAOKEYNOTFOUND, /* TCPAOKeyNotFound */ + LINUX_MIB_TCPAOGOOD, /* TCPAOGood */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index fa49f03e62fe..9c48964849d1 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -404,9 +404,15 @@ struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */ __u32 set_current :1, /* corresponding ::current_key */ set_rnext :1, /* corresponding ::rnext */ ao_required :1, /* don't accept non-AO connects */ - reserved :29; /* must be 0 */ + set_counters :1, /* set/clear ::pkt_* counters */ + reserved :28; /* must be 0 */ + __u16 reserved2; /* padding, must be 0 */ __u8 current_key; /* KeyID to set as Current_key */ __u8 rnext; /* KeyID to set as Rnext_key */ + __u64 pkt_good; /* verified segments */ + __u64 pkt_bad; /* failed verification */ + __u64 pkt_key_not_found; /* could not find a key to verify */ + __u64 pkt_ao_required; /* segments missing TCP-AO sign */ } __attribute__((aligned(8))); /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a85b0aba3646..f5b37ebc18c0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -299,6 +299,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), + SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED), + SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD), + SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), + SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 6c5815713b73..1097e99a9ad6 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -182,6 +182,8 @@ static struct tcp_ao_key *tcp_ao_copy_key(struct sock *sk, *new_key = *key; INIT_HLIST_NODE(&new_key->node); tcp_sigpool_get(new_key->tcp_sigpool_id); + atomic64_set(&new_key->pkt_good, 0); + atomic64_set(&new_key->pkt_bad, 0); return new_key; } @@ -771,8 +773,12 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, const struct tcphdr *th = tcp_hdr(skb); void *hash_buf = NULL; - if (maclen != tcp_ao_maclen(key)) + if (maclen != tcp_ao_maclen(key)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); + atomic64_inc(&info->counters.pkt_bad); + atomic64_inc(&key->pkt_bad); return SKB_DROP_REASON_TCP_AOFAILURE; + } hash_buf = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC); if (!hash_buf) @@ -782,9 +788,15 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key, (phash - (u8 *)th), sne); if (memcmp(phash, hash_buf, maclen)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); + atomic64_inc(&info->counters.pkt_bad); + atomic64_inc(&key->pkt_bad); kfree(hash_buf); return SKB_DROP_REASON_TCP_AOFAILURE; } + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOGOOD); + atomic64_inc(&info->counters.pkt_good); + atomic64_inc(&key->pkt_good); kfree(hash_buf); return SKB_NOT_DROPPED_YET; } @@ -804,8 +816,10 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, u32 sne = 0; info = rcu_dereference(tcp_sk(sk)->ao_info); - if (!info) + if (!info) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND); return SKB_DROP_REASON_TCP_AOUNEXPECTED; + } if (unlikely(th->syn)) { sisn = th->seq; @@ -900,6 +914,8 @@ verify_hash: return ret; key_not_found: + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND); + atomic64_inc(&info->counters.key_not_found); return SKB_DROP_REASON_TCP_AOKEYNOTFOUND; } @@ -1483,6 +1499,8 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family, key->keyflags = cmd.keyflags; key->sndid = cmd.sndid; key->rcvid = cmd.rcvid; + atomic64_set(&key->pkt_good, 0); + atomic64_set(&key->pkt_bad, 0); ret = tcp_ao_parse_crypto(&cmd, key); if (ret < 0) @@ -1699,7 +1717,7 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, return -EINVAL; } - if (cmd.reserved != 0) + if (cmd.reserved != 0 || cmd.reserved2 != 0) return -EINVAL; ao_info = setsockopt_ao_info(sk); @@ -1734,6 +1752,12 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, goto out; } } + if (cmd.set_counters) { + atomic64_set(&ao_info->counters.pkt_good, cmd.pkt_good); + atomic64_set(&ao_info->counters.pkt_bad, cmd.pkt_bad); + atomic64_set(&ao_info->counters.key_not_found, cmd.pkt_key_not_found); + atomic64_set(&ao_info->counters.ao_required, cmd.pkt_ao_required); + } ao_info->ao_required = cmd.ao_required; if (new_current) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f39ccefa78dc..ece95d5138e1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1531,7 +1531,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ - if (tcp_ao_required(sk, addr, AF_INET)) + if (tcp_ao_required(sk, addr, AF_INET, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d2724383d263..cc899caf348e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -661,7 +661,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ - if (tcp_ao_required(sk, addr, AF_INET)) + if (tcp_ao_required(sk, addr, AF_INET, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, @@ -673,7 +673,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ - if (tcp_ao_required(sk, addr, AF_INET6)) + if (tcp_ao_required(sk, addr, AF_INET6, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, -- cgit v1.2.3 From 953af8e3acb68d2db11937cec3bc5da31de5c12e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:22:08 +0100 Subject: net/tcp: Ignore specific ICMPs for TCP-AO connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similarly to IPsec, RFC5925 prescribes: ">> A TCP-AO implementation MUST default to ignore incoming ICMPv4 messages of Type 3 (destination unreachable), Codes 2-4 (protocol unreachable, port unreachable, and fragmentation needed -- ’hard errors’), and ICMPv6 Type 1 (destination unreachable), Code 1 (administratively prohibited) and Code 4 (port unreachable) intended for connections in synchronized states (ESTABLISHED, FIN-WAIT-1, FIN- WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT) that match MKTs." A selftest (later in patch series) verifies that this attack is not possible in this TCP-AO implementation. Co-developed-by: Francesco Ruggeri Signed-off-by: Francesco Ruggeri Co-developed-by: Salam Noureddine Signed-off-by: Salam Noureddine Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp_ao.h | 11 ++++++++- include/uapi/linux/snmp.h | 1 + include/uapi/linux/tcp.h | 4 +++- net/ipv4/proc.c | 1 + net/ipv4/tcp_ao.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 7 ++++++ net/ipv6/tcp_ipv6.c | 7 ++++++ 7 files changed, 87 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index 4da6e3657913..a9d38b9e8bcb 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -24,6 +24,7 @@ struct tcp_ao_counters { atomic64_t pkt_bad; atomic64_t key_not_found; atomic64_t ao_required; + atomic64_t dropped_icmp; }; struct tcp_ao_key { @@ -92,7 +93,8 @@ struct tcp_ao_info { struct tcp_ao_key *rnext_key; struct tcp_ao_counters counters; u32 ao_required :1, - __unused :31; + accept_icmps :1, + __unused :30; __be32 lisn; __be32 risn; /* Sequence Number Extension (SNE) are upper 4 bytes for SEQ, @@ -191,6 +193,7 @@ int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, unsigned int len, struct tcp_sigpool *hp); void tcp_ao_destroy_sock(struct sock *sk, bool twsk); void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); +bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, @@ -274,6 +277,12 @@ static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, { } +static inline bool tcp_ao_ignore_icmp(const struct sock *sk, int family, + int type, int code) +{ + return false; +} + static inline enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, const struct tcp_ao_hdr *aoh) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 3d5ea841bffe..a0819c6a5988 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -301,6 +301,7 @@ enum LINUX_MIB_TCPAOBAD, /* TCPAOBad */ LINUX_MIB_TCPAOKEYNOTFOUND, /* TCPAOKeyNotFound */ LINUX_MIB_TCPAOGOOD, /* TCPAOGood */ + LINUX_MIB_TCPAODROPPEDICMPS, /* TCPAODroppedIcmps */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 9c48964849d1..d8b2ea23f12a 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -405,7 +405,8 @@ struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */ set_rnext :1, /* corresponding ::rnext */ ao_required :1, /* don't accept non-AO connects */ set_counters :1, /* set/clear ::pkt_* counters */ - reserved :28; /* must be 0 */ + accept_icmps :1, /* accept incoming ICMPs */ + reserved :27; /* must be 0 */ __u16 reserved2; /* padding, must be 0 */ __u8 current_key; /* KeyID to set as Current_key */ __u8 rnext; /* KeyID to set as Rnext_key */ @@ -413,6 +414,7 @@ struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */ __u64 pkt_bad; /* failed verification */ __u64 pkt_key_not_found; /* could not find a key to verify */ __u64 pkt_ao_required; /* segments missing TCP-AO sign */ + __u64 pkt_dropped_icmp; /* ICMPs that were ignored */ } __attribute__((aligned(8))); /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f5b37ebc18c0..5f4654ebff48 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -303,6 +303,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD), SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), + SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index f76fcb93499d..223af5c9eaf3 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -15,6 +15,7 @@ #include #include +#include int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, unsigned int len, struct tcp_sigpool *hp) @@ -44,6 +45,60 @@ clear_hash: return 1; } +bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code) +{ + bool ignore_icmp = false; + struct tcp_ao_info *ao; + + /* RFC5925, 7.8: + * >> A TCP-AO implementation MUST default to ignore incoming ICMPv4 + * messages of Type 3 (destination unreachable), Codes 2-4 (protocol + * unreachable, port unreachable, and fragmentation needed -- ’hard + * errors’), and ICMPv6 Type 1 (destination unreachable), Code 1 + * (administratively prohibited) and Code 4 (port unreachable) intended + * for connections in synchronized states (ESTABLISHED, FIN-WAIT-1, FIN- + * WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT) that match MKTs. + */ + if (family == AF_INET) { + if (type != ICMP_DEST_UNREACH) + return false; + if (code < ICMP_PROT_UNREACH || code > ICMP_FRAG_NEEDED) + return false; + } else { + if (type != ICMPV6_DEST_UNREACH) + return false; + if (code != ICMPV6_ADM_PROHIBITED && code != ICMPV6_PORT_UNREACH) + return false; + } + + rcu_read_lock(); + switch (sk->sk_state) { + case TCP_TIME_WAIT: + ao = rcu_dereference(tcp_twsk(sk)->ao_info); + break; + case TCP_SYN_SENT: + case TCP_SYN_RECV: + case TCP_LISTEN: + case TCP_NEW_SYN_RECV: + /* RFC5925 specifies to ignore ICMPs *only* on connections + * in synchronized states. + */ + rcu_read_unlock(); + return false; + default: + ao = rcu_dereference(tcp_sk(sk)->ao_info); + } + + if (ao && !ao->accept_icmps) { + ignore_icmp = true; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAODROPPEDICMPS); + atomic64_inc(&ao->counters.dropped_icmp); + } + rcu_read_unlock(); + + return ignore_icmp; +} + /* Optimized version of tcp_ao_do_lookup(): only for sockets for which * it's known that the keys in ao_info are matching peer's * family/address/VRF/etc. @@ -1086,6 +1141,7 @@ int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, new_ao->lisn = htonl(tcp_rsk(req)->snt_isn); new_ao->risn = htonl(tcp_rsk(req)->rcv_isn); new_ao->ao_required = ao->ao_required; + new_ao->accept_icmps = ao->accept_icmps; if (family == AF_INET) { addr = (union tcp_ao_addr *)&newsk->sk_daddr; @@ -1792,9 +1848,11 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, atomic64_set(&ao_info->counters.pkt_bad, cmd.pkt_bad); atomic64_set(&ao_info->counters.key_not_found, cmd.pkt_key_not_found); atomic64_set(&ao_info->counters.ao_required, cmd.pkt_ao_required); + atomic64_set(&ao_info->counters.dropped_icmp, cmd.pkt_dropped_icmp); } ao_info->ao_required = cmd.ao_required; + ao_info->accept_icmps = cmd.accept_icmps; if (new_current) WRITE_ONCE(ao_info->current_key, new_current); if (new_rnext) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bdec99707028..8f98c58e2689 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -494,6 +494,8 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { + /* To increase the counter of ignored icmps for TCP-AO */ + tcp_ao_ignore_icmp(sk, AF_INET, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } @@ -507,6 +509,11 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) return 0; } + if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) { + sock_put(sk); + return 0; + } + bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4dcbc13e9ec8..fa7050579e9a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -396,6 +396,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (sk->sk_state == TCP_TIME_WAIT) { + /* To increase the counter of ignored icmps for TCP-AO */ + tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } @@ -406,6 +408,11 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } + if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { + sock_put(sk); + return 0; + } + bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); -- cgit v1.2.3 From 7753c2f0a857bfa6501e67deee03988dd0bcaae7 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:22:09 +0100 Subject: net/tcp: Add option for TCP-AO to (not) hash header Provide setsockopt() key flag that makes TCP-AO exclude hashing TCP header for peers that match the key. This is needed for interraction with middleboxes that may change TCP options, see RFC5925 (9.2). Co-developed-by: Francesco Ruggeri Signed-off-by: Francesco Ruggeri Co-developed-by: Salam Noureddine Signed-off-by: Salam Noureddine Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 5 +++++ net/ipv4/tcp_ao.c | 8 +++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index d8b2ea23f12a..320aab010f9a 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -367,6 +367,11 @@ struct tcp_diag_md5sig { #define TCP_AO_MAXKEYLEN 80 #define TCP_AO_KEYF_IFINDEX (1 << 0) /* L3 ifindex for VRF */ +#define TCP_AO_KEYF_EXCLUDE_OPT (1 << 1) /* "Indicates whether TCP + * options other than TCP-AO + * are included in the MAC + * calculation" + */ struct tcp_ao_add { /* setsockopt(TCP_AO_ADD_KEY) */ struct __kernel_sockaddr_storage addr; /* peer's address for the key */ diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 223af5c9eaf3..10cc6be4d537 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -562,7 +562,8 @@ int tcp_ao_hash_hdr(unsigned short int family, char *ao_hash, WARN_ON_ONCE(1); goto clear_hash; } - if (tcp_ao_hash_header(&hp, th, false, + if (tcp_ao_hash_header(&hp, th, + !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT), ao_hash, hash_offset, tcp_ao_maclen(key))) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, hash_buf, 0); @@ -610,7 +611,8 @@ int tcp_ao_hash_skb(unsigned short int family, goto clear_hash; if (tcp_ao_hash_pseudoheader(family, sk, skb, &hp, skb->len)) goto clear_hash; - if (tcp_ao_hash_header(&hp, th, false, + if (tcp_ao_hash_header(&hp, th, + !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT), ao_hash, hash_offset, tcp_ao_maclen(key))) goto clear_hash; if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) @@ -1454,7 +1456,7 @@ static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk) return ERR_PTR(-ESOCKTNOSUPPORT); } -#define TCP_AO_KEYF_ALL (0) +#define TCP_AO_KEYF_ALL (TCP_AO_KEYF_EXCLUDE_OPT) static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk, struct tcp_ao_add *cmd) -- cgit v1.2.3 From ef84703a911f4ee52ca585e8308b7084093941f4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:22:10 +0100 Subject: net/tcp: Add TCP-AO getsockopt()s Introduce getsockopt(TCP_AO_GET_KEYS) that lets a user get TCP-AO keys and their properties from a socket. The user can provide a filter to match the specific key to be dumped or ::get_all = 1 may be used to dump all keys in one syscall. Add another getsockopt(TCP_AO_INFO) for providing per-socket/per-ao_info stats: packet counters, Current_key/RNext_key and flags like ::ao_required and ::accept_icmps. Co-developed-by: Francesco Ruggeri Signed-off-by: Francesco Ruggeri Co-developed-by: Salam Noureddine Signed-off-by: Salam Noureddine Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp_ao.h | 12 ++ include/uapi/linux/tcp.h | 63 +++++++--- net/ipv4/tcp.c | 13 +++ net/ipv4/tcp_ao.c | 295 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 369 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index a9d38b9e8bcb..061c358a3c8a 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -194,6 +194,8 @@ int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx, void tcp_ao_destroy_sock(struct sock *sk, bool twsk); void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); +int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, @@ -316,6 +318,16 @@ static inline void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, static inline void tcp_ao_connect_init(struct sock *sk) { } + +static inline int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} + +static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 320aab010f9a..201b3cbd6020 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -131,7 +131,8 @@ enum { #define TCP_AO_ADD_KEY 38 /* Add/Set MKT */ #define TCP_AO_DEL_KEY 39 /* Delete MKT */ -#define TCP_AO_INFO 40 /* Modify TCP-AO per-socket options */ +#define TCP_AO_INFO 40 /* Set/list TCP-AO per-socket options */ +#define TCP_AO_GET_KEYS 41 /* List MKT(s) */ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 @@ -405,21 +406,55 @@ struct tcp_ao_del { /* setsockopt(TCP_AO_DEL_KEY) */ __u8 keyflags; /* see TCP_AO_KEYF_ */ } __attribute__((aligned(8))); -struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO) */ - __u32 set_current :1, /* corresponding ::current_key */ - set_rnext :1, /* corresponding ::rnext */ - ao_required :1, /* don't accept non-AO connects */ - set_counters :1, /* set/clear ::pkt_* counters */ - accept_icmps :1, /* accept incoming ICMPs */ +struct tcp_ao_info_opt { /* setsockopt(TCP_AO_INFO), getsockopt(TCP_AO_INFO) */ + /* Here 'in' is for setsockopt(), 'out' is for getsockopt() */ + __u32 set_current :1, /* in/out: corresponding ::current_key */ + set_rnext :1, /* in/out: corresponding ::rnext */ + ao_required :1, /* in/out: don't accept non-AO connects */ + set_counters :1, /* in: set/clear ::pkt_* counters */ + accept_icmps :1, /* in/out: accept incoming ICMPs */ reserved :27; /* must be 0 */ __u16 reserved2; /* padding, must be 0 */ - __u8 current_key; /* KeyID to set as Current_key */ - __u8 rnext; /* KeyID to set as Rnext_key */ - __u64 pkt_good; /* verified segments */ - __u64 pkt_bad; /* failed verification */ - __u64 pkt_key_not_found; /* could not find a key to verify */ - __u64 pkt_ao_required; /* segments missing TCP-AO sign */ - __u64 pkt_dropped_icmp; /* ICMPs that were ignored */ + __u8 current_key; /* in/out: KeyID of Current_key */ + __u8 rnext; /* in/out: keyid of RNext_key */ + __u64 pkt_good; /* in/out: verified segments */ + __u64 pkt_bad; /* in/out: failed verification */ + __u64 pkt_key_not_found; /* in/out: could not find a key to verify */ + __u64 pkt_ao_required; /* in/out: segments missing TCP-AO sign */ + __u64 pkt_dropped_icmp; /* in/out: ICMPs that were ignored */ +} __attribute__((aligned(8))); + +struct tcp_ao_getsockopt { /* getsockopt(TCP_AO_GET_KEYS) */ + struct __kernel_sockaddr_storage addr; /* in/out: dump keys for peer + * with this address/prefix + */ + char alg_name[64]; /* out: crypto hash algorithm */ + __u8 key[TCP_AO_MAXKEYLEN]; + __u32 nkeys; /* in: size of the userspace buffer + * @optval, measured in @optlen - the + * sizeof(struct tcp_ao_getsockopt) + * out: number of keys that matched + */ + __u16 is_current :1, /* in: match and dump Current_key, + * out: the dumped key is Current_key + */ + + is_rnext :1, /* in: match and dump RNext_key, + * out: the dumped key is RNext_key + */ + get_all :1, /* in: dump all keys */ + reserved :13; /* padding, must be 0 */ + __u8 sndid; /* in/out: dump keys with SendID */ + __u8 rcvid; /* in/out: dump keys with RecvID */ + __u8 prefix; /* in/out: dump keys with address/prefix */ + __u8 maclen; /* out: key's length of authentication + * code (hash) + */ + __u8 keyflags; /* in/out: see TCP_AO_KEYF_ */ + __u8 keylen; /* out: length of ::key */ + __s32 ifindex; /* in/out: L3 dev index for VRF */ + __u64 pkt_good; /* out: verified segments */ + __u64 pkt_bad; /* out: segments that failed verification */ } __attribute__((aligned(8))); /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1be6467a059a..ce33eee9d0f2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4284,6 +4284,19 @@ zerocopy_rcv_out: return err; } #endif + case TCP_AO_GET_KEYS: + case TCP_AO_INFO: { + int err; + + sockopt_lock_sock(sk); + if (optname == TCP_AO_GET_KEYS) + err = tcp_ao_get_mkts(sk, optval, optlen); + else + err = tcp_ao_get_sock_info(sk, optval, optlen); + sockopt_release_sock(sk); + + return err; + } default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 10cc6be4d537..cbc1ee0f5b9a 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1894,3 +1894,298 @@ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen) return tcp_parse_ao(sk, cmd, AF_INET, optval, optlen); } +/* tcp_ao_copy_mkts_to_user(ao_info, optval, optlen) + * + * @ao_info: struct tcp_ao_info on the socket that + * socket getsockopt(TCP_AO_GET_KEYS) is executed on + * @optval: pointer to array of tcp_ao_getsockopt structures in user space. + * Must be != NULL. + * @optlen: pointer to size of tcp_ao_getsockopt structure. + * Must be != NULL. + * + * Return value: 0 on success, a negative error number otherwise. + * + * optval points to an array of tcp_ao_getsockopt structures in user space. + * optval[0] is used as both input and output to getsockopt. It determines + * which keys are returned by the kernel. + * optval[0].nkeys is the size of the array in user space. On return it contains + * the number of keys matching the search criteria. + * If tcp_ao_getsockopt::get_all is set, then all keys in the socket are + * returned, otherwise only keys matching + * in optval[0] are returned. + * optlen is also used as both input and output. The user provides the size + * of struct tcp_ao_getsockopt in user space, and the kernel returns the size + * of the structure in kernel space. + * The size of struct tcp_ao_getsockopt may differ between user and kernel. + * There are three cases to consider: + * * If usize == ksize, then keys are copied verbatim. + * * If usize < ksize, then the userspace has passed an old struct to a + * newer kernel. The rest of the trailing bytes in optval[0] + * (ksize - usize) are interpreted as 0 by the kernel. + * * If usize > ksize, then the userspace has passed a new struct to an + * older kernel. The trailing bytes unknown to the kernel (usize - ksize) + * are checked to ensure they are zeroed, otherwise -E2BIG is returned. + * On return the kernel fills in min(usize, ksize) in each entry of the array. + * The layout of the fields in the user and kernel structures is expected to + * be the same (including in the 32bit vs 64bit case). + */ +static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info, + sockptr_t optval, sockptr_t optlen) +{ + struct tcp_ao_getsockopt opt_in, opt_out; + struct tcp_ao_key *key, *current_key; + bool do_address_matching = true; + union tcp_ao_addr *addr = NULL; + unsigned int max_keys; /* maximum number of keys to copy to user */ + size_t out_offset = 0; + size_t bytes_to_write; /* number of bytes to write to user level */ + int err, user_len; + u32 matched_keys; /* keys from ao_info matched so far */ + int optlen_out; + __be16 port = 0; + + if (copy_from_sockptr(&user_len, optlen, sizeof(int))) + return -EFAULT; + + if (user_len <= 0) + return -EINVAL; + + memset(&opt_in, 0, sizeof(struct tcp_ao_getsockopt)); + err = copy_struct_from_sockptr(&opt_in, sizeof(opt_in), + optval, user_len); + if (err < 0) + return err; + + if (opt_in.pkt_good || opt_in.pkt_bad) + return -EINVAL; + + if (opt_in.reserved != 0) + return -EINVAL; + + max_keys = opt_in.nkeys; + + if (opt_in.get_all || opt_in.is_current || opt_in.is_rnext) { + if (opt_in.get_all && (opt_in.is_current || opt_in.is_rnext)) + return -EINVAL; + do_address_matching = false; + } + + switch (opt_in.addr.ss_family) { + case AF_INET: { + struct sockaddr_in *sin; + __be32 mask; + + sin = (struct sockaddr_in *)&opt_in.addr; + port = sin->sin_port; + addr = (union tcp_ao_addr *)&sin->sin_addr; + + if (opt_in.prefix > 32) + return -EINVAL; + + if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY && + opt_in.prefix != 0) + return -EINVAL; + + mask = inet_make_mask(opt_in.prefix); + if (sin->sin_addr.s_addr & ~mask) + return -EINVAL; + + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin6; + struct in6_addr *addr6; + + sin6 = (struct sockaddr_in6 *)&opt_in.addr; + addr = (union tcp_ao_addr *)&sin6->sin6_addr; + addr6 = &sin6->sin6_addr; + port = sin6->sin6_port; + + /* We don't have to change family and @addr here if + * ipv6_addr_v4mapped() like in key adding: + * tcp_ao_key_cmp() does it. Do the sanity checks though. + */ + if (opt_in.prefix != 0) { + if (ipv6_addr_v4mapped(addr6)) { + __be32 mask, addr4 = addr6->s6_addr32[3]; + + if (opt_in.prefix > 32 || + ntohl(addr4) == INADDR_ANY) + return -EINVAL; + mask = inet_make_mask(opt_in.prefix); + if (addr4 & ~mask) + return -EINVAL; + } else { + struct in6_addr pfx; + + if (ipv6_addr_any(addr6) || + opt_in.prefix > 128) + return -EINVAL; + + ipv6_addr_prefix(&pfx, addr6, opt_in.prefix); + if (ipv6_addr_cmp(&pfx, addr6)) + return -EINVAL; + } + } else if (!ipv6_addr_any(addr6)) { + return -EINVAL; + } + break; + } + case 0: + if (!do_address_matching) + break; + fallthrough; + default: + return -EAFNOSUPPORT; + } + + if (!do_address_matching) { + /* We could just ignore those, but let's do stricter checks */ + if (addr || port) + return -EINVAL; + if (opt_in.prefix || opt_in.sndid || opt_in.rcvid) + return -EINVAL; + } + + bytes_to_write = min_t(int, user_len, sizeof(struct tcp_ao_getsockopt)); + matched_keys = 0; + /* May change in RX, while we're dumping, pre-fetch it */ + current_key = READ_ONCE(ao_info->current_key); + + hlist_for_each_entry_rcu(key, &ao_info->head, node) { + if (opt_in.get_all) + goto match; + + if (opt_in.is_current || opt_in.is_rnext) { + if (opt_in.is_current && key == current_key) + goto match; + if (opt_in.is_rnext && key == ao_info->rnext_key) + goto match; + continue; + } + + if (tcp_ao_key_cmp(key, addr, opt_in.prefix, + opt_in.addr.ss_family, + opt_in.sndid, opt_in.rcvid) != 0) + continue; +match: + matched_keys++; + if (matched_keys > max_keys) + continue; + + memset(&opt_out, 0, sizeof(struct tcp_ao_getsockopt)); + + if (key->family == AF_INET) { + struct sockaddr_in *sin_out = (struct sockaddr_in *)&opt_out.addr; + + sin_out->sin_family = key->family; + sin_out->sin_port = 0; + memcpy(&sin_out->sin_addr, &key->addr, sizeof(struct in_addr)); + } else { + struct sockaddr_in6 *sin6_out = (struct sockaddr_in6 *)&opt_out.addr; + + sin6_out->sin6_family = key->family; + sin6_out->sin6_port = 0; + memcpy(&sin6_out->sin6_addr, &key->addr, sizeof(struct in6_addr)); + } + opt_out.sndid = key->sndid; + opt_out.rcvid = key->rcvid; + opt_out.prefix = key->prefixlen; + opt_out.keyflags = key->keyflags; + opt_out.is_current = (key == current_key); + opt_out.is_rnext = (key == ao_info->rnext_key); + opt_out.nkeys = 0; + opt_out.maclen = key->maclen; + opt_out.keylen = key->keylen; + opt_out.pkt_good = atomic64_read(&key->pkt_good); + opt_out.pkt_bad = atomic64_read(&key->pkt_bad); + memcpy(&opt_out.key, key->key, key->keylen); + tcp_sigpool_algo(key->tcp_sigpool_id, opt_out.alg_name, 64); + + /* Copy key to user */ + if (copy_to_sockptr_offset(optval, out_offset, + &opt_out, bytes_to_write)) + return -EFAULT; + out_offset += user_len; + } + + optlen_out = (int)sizeof(struct tcp_ao_getsockopt); + if (copy_to_sockptr(optlen, &optlen_out, sizeof(int))) + return -EFAULT; + + out_offset = offsetof(struct tcp_ao_getsockopt, nkeys); + if (copy_to_sockptr_offset(optval, out_offset, + &matched_keys, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + struct tcp_ao_info *ao_info; + + ao_info = setsockopt_ao_info(sk); + if (IS_ERR(ao_info)) + return PTR_ERR(ao_info); + if (!ao_info) + return -ENOENT; + + return tcp_ao_copy_mkts_to_user(ao_info, optval, optlen); +} + +int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + struct tcp_ao_info_opt out, in = {}; + struct tcp_ao_key *current_key; + struct tcp_ao_info *ao; + int err, len; + + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + + if (len <= 0) + return -EINVAL; + + /* Copying this "in" only to check ::reserved, ::reserved2, + * that may be needed to extend (struct tcp_ao_info_opt) and + * what getsockopt() provides in future. + */ + err = copy_struct_from_sockptr(&in, sizeof(in), optval, len); + if (err) + return err; + + if (in.reserved != 0 || in.reserved2 != 0) + return -EINVAL; + + ao = setsockopt_ao_info(sk); + if (IS_ERR(ao)) + return PTR_ERR(ao); + if (!ao) + return -ENOENT; + + memset(&out, 0, sizeof(out)); + out.ao_required = ao->ao_required; + out.accept_icmps = ao->accept_icmps; + out.pkt_good = atomic64_read(&ao->counters.pkt_good); + out.pkt_bad = atomic64_read(&ao->counters.pkt_bad); + out.pkt_key_not_found = atomic64_read(&ao->counters.key_not_found); + out.pkt_ao_required = atomic64_read(&ao->counters.ao_required); + out.pkt_dropped_icmp = atomic64_read(&ao->counters.dropped_icmp); + + current_key = READ_ONCE(ao->current_key); + if (current_key) { + out.set_current = 1; + out.current_key = current_key->sndid; + } + if (ao->rnext_key) { + out.set_rnext = 1; + out.rnext = ao->rnext_key->rcvid; + } + + if (copy_to_sockptr(optval, &out, min_t(int, len, sizeof(out)))) + return -EFAULT; + + return 0; +} + -- cgit v1.2.3 From d6732b95b6fbbc6d5bb9d2f809e275763640c4a2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:22:11 +0100 Subject: net/tcp: Allow asynchronous delete for TCP-AO keys (MKTs) Delete becomes very, very fast - almost free, but after setsockopt() syscall returns, the key is still alive until next RCU grace period. Which is fine for listen sockets as userspace needs to be aware of setsockopt(TCP_AO) and accept() race and resolve it with verification by getsockopt() after TCP connection was accepted. The benchmark results (on non-loaded box, worse with more RCU work pending): > ok 33 Worst case delete 16384 keys: min=5ms max=10ms mean=6.93904ms stddev=0.263421 > ok 34 Add a new key 16384 keys: min=1ms max=4ms mean=2.17751ms stddev=0.147564 > ok 35 Remove random-search 16384 keys: min=5ms max=10ms mean=6.50243ms stddev=0.254999 > ok 36 Remove async 16384 keys: min=0ms max=0ms mean=0.0296107ms stddev=0.0172078 Co-developed-by: Francesco Ruggeri Signed-off-by: Francesco Ruggeri Co-developed-by: Salam Noureddine Signed-off-by: Salam Noureddine Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 3 ++- net/ipv4/tcp_ao.c | 21 ++++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 201b3cbd6020..be34d7c5c531 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -396,7 +396,8 @@ struct tcp_ao_del { /* setsockopt(TCP_AO_DEL_KEY) */ __s32 ifindex; /* L3 dev index for VRF */ __u32 set_current :1, /* corresponding ::current_key */ set_rnext :1, /* corresponding ::rnext */ - reserved :30; /* must be 0 */ + del_async :1, /* only valid for listen sockets */ + reserved :29; /* must be 0 */ __u16 reserved2; /* padding, must be 0 */ __u8 prefix; /* peer's address prefix */ __u8 sndid; /* SendID for outgoing segments */ diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index cbc1ee0f5b9a..acbeb635fe29 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1628,7 +1628,7 @@ err_free_ao: } static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info, - struct tcp_ao_key *key, + bool del_async, struct tcp_ao_key *key, struct tcp_ao_key *new_current, struct tcp_ao_key *new_rnext) { @@ -1636,11 +1636,24 @@ static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info, hlist_del_rcu(&key->node); + /* Support for async delete on listening sockets: as they don't + * need current_key/rnext_key maintaining, we don't need to check + * them and we can just free all resources in RCU fashion. + */ + if (del_async) { + atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc); + call_rcu(&key->rcu, tcp_ao_key_free_rcu); + return 0; + } + /* At this moment another CPU could have looked this key up * while it was unlinked from the list. Wait for RCU grace period, * after which the key is off-list and can't be looked up again; * the rx path [just before RCU came] might have used it and set it * as current_key (very unlikely). + * Free the key with next RCU grace period (in case it was + * current_key before tcp_ao_current_rnext() might have + * changed it in forced-delete). */ synchronize_rcu(); if (new_current) @@ -1711,6 +1724,8 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family, if (!new_rnext) return -ENOENT; } + if (cmd.del_async && sk->sk_state != TCP_LISTEN) + return -EINVAL; if (family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.addr; @@ -1758,8 +1773,8 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family, if (key == new_current || key == new_rnext) continue; - return tcp_ao_delete_key(sk, ao_info, key, - new_current, new_rnext); + return tcp_ao_delete_key(sk, ao_info, cmd.del_async, key, + new_current, new_rnext); } return -ENOENT; } -- cgit v1.2.3 From faadfaba5e018ca0f9595f17115ff48416b7b85e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 23 Oct 2023 20:22:14 +0100 Subject: net/tcp: Add TCP_AO_REPAIR Add TCP_AO_REPAIR setsockopt(), getsockopt(). They let a user to repair TCP-AO ISNs/SNEs. Also let the user hack around when (tp->repair) is on and add ao_info on a socket in any supported state. As SNEs now can be read/written at any moment, use WRITE_ONCE()/READ_ONCE() to set/read them. Signed-off-by: Dmitry Safonov Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp_ao.h | 14 ++++++++ include/uapi/linux/tcp.h | 8 +++++ net/ipv4/tcp.c | 24 +++++++++---- net/ipv4/tcp_ao.c | 90 +++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 125 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index edd6748b2cfa..a375a171ef3c 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -199,6 +199,8 @@ void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp); bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code); int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen); int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen); +int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen); enum skb_drop_reason tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, unsigned short int family, const struct request_sock *req, int l3index, @@ -330,6 +332,18 @@ static inline int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockpt { return -ENOPROTOOPT; } + +static inline int tcp_ao_get_repair(struct sock *sk, + sockptr_t optval, sockptr_t optlen) +{ + return -ENOPROTOOPT; +} + +static inline int tcp_ao_set_repair(struct sock *sk, + sockptr_t optval, unsigned int optlen) +{ + return -ENOPROTOOPT; +} #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index be34d7c5c531..c07e9f90c084 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -133,6 +133,7 @@ enum { #define TCP_AO_DEL_KEY 39 /* Delete MKT */ #define TCP_AO_INFO 40 /* Set/list TCP-AO per-socket options */ #define TCP_AO_GET_KEYS 41 /* List MKT(s) */ +#define TCP_AO_REPAIR 42 /* Get/Set SNEs and ISNs */ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 @@ -458,6 +459,13 @@ struct tcp_ao_getsockopt { /* getsockopt(TCP_AO_GET_KEYS) */ __u64 pkt_bad; /* out: segments that failed verification */ } __attribute__((aligned(8))); +struct tcp_ao_repair { /* {s,g}etsockopt(TCP_AO_REPAIR) */ + __be32 snt_isn; + __be32 rcv_isn; + __u32 snd_sne; + __u32 rcv_sne; +} __attribute__((aligned(8))); + /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ #define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ce33eee9d0f2..53bcc17c91e4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3593,20 +3593,28 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, __tcp_sock_set_quickack(sk, val); break; + case TCP_AO_REPAIR: + err = tcp_ao_set_repair(sk, optval, optlen); + break; #ifdef CONFIG_TCP_AO case TCP_AO_ADD_KEY: case TCP_AO_DEL_KEY: case TCP_AO_INFO: { /* If this is the first TCP-AO setsockopt() on the socket, - * sk_state has to be LISTEN or CLOSE + * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR + * in any state. */ - if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) || - rcu_dereference_protected(tcp_sk(sk)->ao_info, + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto ao_parse; + if (rcu_dereference_protected(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk))) - err = tp->af_specific->ao_parse(sk, optname, optval, - optlen); - else - err = -EISCONN; + goto ao_parse; + if (tp->repair) + goto ao_parse; + err = -EISCONN; + break; +ao_parse: + err = tp->af_specific->ao_parse(sk, optname, optval, optlen); break; } #endif @@ -4284,6 +4292,8 @@ zerocopy_rcv_out: return err; } #endif + case TCP_AO_REPAIR: + return tcp_ao_get_repair(sk, optval, optlen); case TCP_AO_GET_KEYS: case TCP_AO_INFO: { int err; diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index b5ac3e73e1da..6a845e906a1d 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1490,6 +1490,16 @@ static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk) return ERR_PTR(-ESOCKTNOSUPPORT); } +static struct tcp_ao_info *getsockopt_ao_info(struct sock *sk) +{ + if (sk_fullsock(sk)) + return rcu_dereference(tcp_sk(sk)->ao_info); + else if (sk->sk_state == TCP_TIME_WAIT) + return rcu_dereference(tcp_twsk(sk)->ao_info); + + return ERR_PTR(-ESOCKTNOSUPPORT); +} + #define TCP_AO_KEYF_ALL (TCP_AO_KEYF_IFINDEX | TCP_AO_KEYF_EXCLUDE_OPT) #define TCP_AO_GET_KEYF_VALID (TCP_AO_KEYF_IFINDEX) @@ -1671,11 +1681,13 @@ static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family, if (ret < 0) goto err_free_sock; - /* Change this condition if we allow adding keys in states - * like close_wait, syn_sent or fin_wait... - */ - if (sk->sk_state == TCP_ESTABLISHED) + if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) { tcp_ao_cache_traffic_keys(sk, ao_info, key); + if (first) { + ao_info->current_key = key; + ao_info->rnext_key = key; + } + } tcp_ao_link_mkt(ao_info, key); if (first) { @@ -1926,6 +1938,8 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, if (IS_ERR(ao_info)) return PTR_ERR(ao_info); if (!ao_info) { + if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) + return -EINVAL; ao_info = tcp_ao_alloc_info(GFP_KERNEL); if (!ao_info) return -ENOMEM; @@ -2308,3 +2322,71 @@ int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) return 0; } +int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_ao_repair cmd; + struct tcp_ao_key *key; + struct tcp_ao_info *ao; + int err; + + if (optlen < sizeof(cmd)) + return -EINVAL; + + err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen); + if (err) + return err; + + if (!tp->repair) + return -EPERM; + + ao = setsockopt_ao_info(sk); + if (IS_ERR(ao)) + return PTR_ERR(ao); + if (!ao) + return -ENOENT; + + WRITE_ONCE(ao->lisn, cmd.snt_isn); + WRITE_ONCE(ao->risn, cmd.rcv_isn); + WRITE_ONCE(ao->snd_sne, cmd.snd_sne); + WRITE_ONCE(ao->rcv_sne, cmd.rcv_sne); + + hlist_for_each_entry_rcu(key, &ao->head, node) + tcp_ao_cache_traffic_keys(sk, ao, key); + + return 0; +} + +int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_ao_repair opt; + struct tcp_ao_info *ao; + int len; + + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + + if (len <= 0) + return -EINVAL; + + if (!tp->repair) + return -EPERM; + + rcu_read_lock(); + ao = getsockopt_ao_info(sk); + if (IS_ERR_OR_NULL(ao)) { + rcu_read_unlock(); + return ao ? PTR_ERR(ao) : -ENOENT; + } + + opt.snt_isn = ao->lisn; + opt.rcv_isn = ao->risn; + opt.snd_sne = READ_ONCE(ao->snd_sne); + opt.rcv_sne = READ_ONCE(ao->rcv_sne); + rcu_read_unlock(); + + if (copy_to_sockptr(optval, &opt, min_t(int, len, sizeof(opt)))) + return -EFAULT; + return 0; +} -- cgit v1.2.3 From 83c1bbeb864f2a197603b91b3e0f748cca64543d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 25 Oct 2023 15:30:14 +0300 Subject: bridge: add MDB get uAPI attributes Add MDB get attributes that correspond to the MDB set attributes used in RTM_NEWMDB messages. Specifically, add 'MDBA_GET_ENTRY' which will hold a 'struct br_mdb_entry' and 'MDBA_GET_ENTRY_ATTRS' which will hold 'MDBE_ATTR_*' attributes that are used as indexes (source IP and source VNI). An example request will look as follows: [ struct nlmsghdr ] [ struct br_port_msg ] [ MDBA_GET_ENTRY ] struct br_mdb_entry [ MDBA_GET_ENTRY_ATTRS ] [ MDBE_ATTR_SOURCE ] struct in_addr / struct in6_addr [ MDBE_ATTR_SRC_VNI ] u32 Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f95326fce6bb..2e23f99dc0f1 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -723,6 +723,24 @@ enum { }; #define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1) +/* [MDBA_GET_ENTRY] = { + * struct br_mdb_entry + * [MDBA_GET_ENTRY_ATTRS] = { + * [MDBE_ATTR_SOURCE] + * struct in_addr / struct in6_addr + * [MDBE_ATTR_SRC_VNI] + * u32 + * } + * } + */ +enum { + MDBA_GET_ENTRY_UNSPEC, + MDBA_GET_ENTRY, + MDBA_GET_ENTRY_ATTRS, + __MDBA_GET_ENTRY_MAX, +}; +#define MDBA_GET_ENTRY_MAX (__MDBA_GET_ENTRY_MAX - 1) + /* [MDBA_SET_ENTRY_ATTRS] = { * [MDBE_ATTR_xxx] * ... -- cgit v1.2.3 From 6479c975b20a7fe6ecacec3a60dc6f838ecee9d6 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 27 Oct 2023 16:04:54 +0200 Subject: doc/netlink: Update schema to support cmd-cnt-name and cmd-max-name allow specifying cmd-cnt-name and cmd-max-name in netlink specs, in accordance with Documentation/userspace-api/netlink/c-code-gen.rst. Use cmd-cnt-name and attr-cnt-name in the mptcp yaml spec and in the corresponding uAPI headers, to preserve the #defines we had in the past and avoid adding new ones. v2: - squash modification in mptcp.yaml and MPTCP uAPI headers Suggested-by: Jakub Kicinski Signed-off-by: Davide Caratti Link: https://lore.kernel.org/r/12d4ed0116d8883cf4b533b856f3125a34e56749.1698415310.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/genetlink-c.yaml | 6 ++++++ Documentation/netlink/genetlink-legacy.yaml | 6 ++++++ Documentation/netlink/netlink-raw.yaml | 6 ++++++ Documentation/netlink/specs/mptcp.yaml | 2 ++ include/uapi/linux/mptcp.h | 4 ---- include/uapi/linux/mptcp_pm.h | 8 ++++---- 6 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 9d13bbb7ae47..c58f7153fcf8 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -47,6 +47,12 @@ properties: max-by-define: description: Makes the number of attributes and commands be specified by a define, not an enum value. type: boolean + cmd-max-name: + description: Name of the define for the last operation in the list. + type: string + cmd-cnt-name: + description: The explicit name for constant holding the count of operations (last operation + 1). + type: string # End genetlink-c definitions: diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 0daf40402a29..938703088306 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -47,6 +47,12 @@ properties: max-by-define: description: Makes the number of attributes and commands be specified by a define, not an enum value. type: boolean + cmd-max-name: + description: Name of the define for the last operation in the list. + type: string + cmd-cnt-name: + description: The explicit name for constant holding the count of operations (last operation + 1). + type: string # End genetlink-c # Start genetlink-legacy kernel-policy: diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index 48db31f1d059..775cce8c548a 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -47,6 +47,12 @@ properties: max-by-define: description: Makes the number of attributes and commands be specified by a define, not an enum value. type: boolean + cmd-max-name: + description: Name of the define for the last operation in the list. + type: string + cmd-cnt-name: + description: The explicit name for constant holding the count of operations (last operation + 1). + type: string # End genetlink-c # Start genetlink-legacy kernel-policy: diff --git a/Documentation/netlink/specs/mptcp.yaml b/Documentation/netlink/specs/mptcp.yaml index ec5c454a87ea..49f90cfb4698 100644 --- a/Documentation/netlink/specs/mptcp.yaml +++ b/Documentation/netlink/specs/mptcp.yaml @@ -8,6 +8,7 @@ c-family-name: mptcp-pm-name c-version-name: mptcp-pm-ver max-by-define: true kernel-policy: per-op +cmd-cnt-name: --mptcp-pm-cmd-after-last definitions: - @@ -167,6 +168,7 @@ attribute-sets: - name: attr name-prefix: mptcp-pm-attr- + attr-cnt-name: --mptcp-attr-after-last attributes: - name: unspec diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 64ecc8a3f9f2..a6451561f3f8 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -28,10 +28,6 @@ #include -/* for backward compatibility */ -#define __MPTCP_PM_CMD_AFTER_LAST __MPTCP_PM_CMD_MAX -#define __MPTCP_ATTR_AFTER_LAST __MPTCP_ATTR_MAX - #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 0ad598fe940b..b5d11aece408 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -100,9 +100,9 @@ enum { MPTCP_PM_ATTR_LOC_ID, MPTCP_PM_ATTR_ADDR_REMOTE, - __MPTCP_PM_ATTR_MAX + __MPTCP_ATTR_AFTER_LAST }; -#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1) +#define MPTCP_PM_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) enum mptcp_event_attr { MPTCP_ATTR_UNSPEC, @@ -143,8 +143,8 @@ enum { MPTCP_PM_CMD_SUBFLOW_CREATE, MPTCP_PM_CMD_SUBFLOW_DESTROY, - __MPTCP_PM_CMD_MAX + __MPTCP_PM_CMD_AFTER_LAST }; -#define MPTCP_PM_CMD_MAX (__MPTCP_PM_CMD_MAX - 1) +#define MPTCP_PM_CMD_MAX (__MPTCP_PM_CMD_AFTER_LAST - 1) #endif /* _UAPI_LINUX_MPTCP_PM_H */ -- cgit v1.2.3