diff options
Diffstat (limited to 'include/net')
86 files changed, 1848 insertions, 764 deletions
diff --git a/include/net/Space.h b/include/net/Space.h index 08ca9cef0213..c29f3d51c078 100644 --- a/include/net/Space.h +++ b/include/net/Space.h @@ -3,18 +3,11 @@ * ethernet adaptor have the name "eth[0123...]". */ -struct net_device *hp100_probe(int unit); struct net_device *ultra_probe(int unit); struct net_device *wd_probe(int unit); struct net_device *ne_probe(int unit); -struct net_device *fmv18x_probe(int unit); -struct net_device *ni65_probe(int unit); -struct net_device *sonic_probe(int unit); struct net_device *smc_init(int unit); struct net_device *cs89x0_probe(int unit); struct net_device *tc515_probe(int unit); struct net_device *lance_probe(int unit); struct net_device *cops_probe(int unit); - -/* Fibre Channel adapters */ -int iph5526_probe(struct net_device *dev); diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 0e7504a42925..b01cf9ac2437 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -201,7 +201,6 @@ static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) return !list_empty(&vsk->connected_table); } -void vsock_release_pending(struct sock *pending); void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected); @@ -225,7 +224,6 @@ struct vsock_tap { struct list_head list; }; -int vsock_init_tap(void); int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index af729859385e..aa90adc3b2a4 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -386,6 +386,7 @@ struct bt_sock { enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, + BT_SK_PKT_STATUS }; struct bt_sock_list { @@ -400,6 +401,8 @@ int bt_sock_register(int proto, const struct net_proto_family *ops); void bt_sock_unregister(int proto); void bt_sock_link(struct bt_sock_list *l, struct sock *s); void bt_sock_unlink(struct bt_sock_list *l, struct sock *s); +struct sock *bt_sock_alloc(struct net *net, struct socket *sock, + struct proto *prot, int proto, gfp_t prio, int kern); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, @@ -430,10 +433,6 @@ struct l2cap_ctrl { struct l2cap_chan *chan; }; -struct sco_ctrl { - u8 pkt_status; -}; - struct hci_dev; typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); @@ -464,16 +463,18 @@ struct bt_skb_cb { u8 force_active; u16 expect; u8 incoming:1; + u8 pkt_status:2; union { struct l2cap_ctrl l2cap; - struct sco_ctrl sco; struct hci_ctrl hci; struct mgmt_ctrl mgmt; + struct scm_creds creds; }; }; #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type +#define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode #define hci_skb_event(skb) bt_cb((skb))->hci.req_event diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 872dcb91a540..87d92accc26e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -309,6 +309,26 @@ enum { * to support it. */ HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, + + /* When this quirk is set, MSFT extension monitor tracking by + * address filter is supported. Since tracking quantity of each + * pattern is limited, this feature supports tracking multiple + * devices concurrently if controller supports multiple + * address filters. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, + + /* + * When this quirk is set, LE Coded PHY shall not be used. This is + * required for some Intel controllers which erroneously claim to + * support it but it causes problems with extended scanning. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_LE_CODED, }; /* HCI device flags */ @@ -577,6 +597,7 @@ enum { #define HCI_LE_CIS_CENTRAL 0x10 #define HCI_LE_CIS_PERIPHERAL 0x20 #define HCI_LE_ISO_BROADCASTER 0x40 +#define HCI_LE_ISO_SYNC_RECEIVER 0x80 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 @@ -2760,6 +2781,17 @@ struct hci_ev_le_enh_conn_complete { __u8 clk_accurancy; } __packed; +#define HCI_EV_LE_PER_ADV_REPORT 0x0f +struct hci_ev_le_per_adv_report { + __le16 sync_handle; + __u8 tx_power; + __u8 rssi; + __u8 cte_type; + __u8 data_status; + __u8 length; + __u8 data[]; +} __packed; + #define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 struct hci_evt_le_ext_adv_set_term { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index e01d52cb668c..e6359f7346f1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -83,7 +83,7 @@ struct discovery_state { u8 last_adv_addr_type; s8 last_adv_rssi; u32 last_adv_flags; - u8 last_adv_data[HCI_MAX_AD_LENGTH]; + u8 last_adv_data[HCI_MAX_EXT_AD_LENGTH]; u8 last_adv_data_len; bool report_invalid_rssi; bool result_filtering; @@ -290,7 +290,7 @@ struct adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[HCI_MAX_AD_LENGTH]; + __u8 value[HCI_MAX_EXT_AD_LENGTH]; }; struct adv_rssi_thresholds { @@ -321,8 +321,8 @@ struct adv_monitor { #define HCI_MAX_SHORT_NAME_LENGTH 10 -#define HCI_CONN_HANDLE_UNSET 0xffff #define HCI_CONN_HANDLE_MAX 0x0eff +#define HCI_CONN_HANDLE_UNSET(_handle) (_handle > HCI_CONN_HANDLE_MAX) /* Min encryption key size to match with SMP */ #define HCI_MIN_ENC_KEY_SIZE 7 @@ -726,7 +726,7 @@ struct hci_conn { __u16 le_conn_interval; __u16 le_conn_latency; __u16 le_supv_timeout; - __u8 le_adv_data[HCI_MAX_AD_LENGTH]; + __u8 le_adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 le_adv_data_len; __u8 le_per_adv_data[HCI_MAX_PER_AD_LENGTH]; __u8 le_per_adv_data_len; @@ -739,6 +739,7 @@ struct hci_conn { unsigned long flags; enum conn_reasons conn_reason; + __u8 abort_reason; __u32 clock; __u16 clock_accuracy; @@ -758,7 +759,6 @@ struct hci_conn { struct delayed_work auto_accept_work; struct delayed_work idle_work; struct delayed_work le_conn_timeout; - struct work_struct le_scan_cleanup; struct device dev; struct dentry *debugfs; @@ -974,6 +974,12 @@ enum { HCI_CONN_SCANNING, HCI_CONN_AUTH_FAILURE, HCI_CONN_PER_ADV, + HCI_CONN_BIG_CREATED, + HCI_CONN_CREATE_CIS, + HCI_CONN_BIG_SYNC, + HCI_CONN_BIG_SYNC_FAILED, + HCI_CONN_PA_SYNC, + HCI_CONN_PA_SYNC_FAILED, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) @@ -1093,8 +1099,7 @@ static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle) } static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, - bdaddr_t *ba, - __u8 big, __u8 bis) + bdaddr_t *ba, __u8 bis) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1105,7 +1110,33 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, if (bacmp(&c->dst, ba) || c->type != ISO_LINK) continue; - if (c->iso_qos.bcast.big == big && c->iso_qos.bcast.bis == bis) { + if (c->iso_qos.bcast.bis == bis) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, + bdaddr_t *ba, + __u8 big, __u8 bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (bacmp(&c->dst, ba) || c->type != ISO_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) + continue; + + if (c->iso_qos.bcast.big == big && + c->iso_qos.bcast.bis == bis) { rcu_read_unlock(); return c; } @@ -1190,7 +1221,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) continue; /* Match CIG ID if set */ @@ -1222,7 +1253,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) continue; if (handle == c->iso_qos.ucast.cig) { @@ -1259,6 +1290,52 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev *hdev, + __u8 handle) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK) + continue; + + if (handle != BT_ISO_QOS_BIG_UNSET && handle == c->iso_qos.bcast.big) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct hci_conn * +hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + continue; + + if (c->iso_qos.bcast.big == big) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, __u8 type, __u16 state) { @@ -1320,11 +1397,33 @@ static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) return NULL; } +/* Returns true if an le connection is in the scanning state */ +static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == LE_LINK && c->state == BT_CONNECT && + test_bit(HCI_CONN_SCANNING, &c->flags)) { + rcu_read_unlock(); + return true; + } + } + + rcu_read_unlock(); + + return false; +} + int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); -int hci_le_create_cis(struct hci_conn *conn); +int hci_le_create_cis_pending(struct hci_dev *hdev); +int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role); @@ -1351,6 +1450,9 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, + struct bt_iso_qos *qos, + __u8 base_len, __u8 *base); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, @@ -1358,7 +1460,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 data_len, __u8 *data); int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); -int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos, +int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); @@ -1369,6 +1472,7 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role); void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); void hci_conn_failed(struct hci_conn *conn, u8 status); +u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle); /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an @@ -1713,7 +1817,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) -#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED)) +#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ + !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ + &(dev)->quirks)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) @@ -1745,6 +1851,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* Maximum advertising length */ +#define max_adv_len(dev) \ + (ext_adv_capable(dev) ? HCI_MAX_EXT_AD_LENGTH : HCI_MAX_AD_LENGTH) + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: * * C24: Mandatory if the LE Controller supports Connection State and either @@ -1765,6 +1875,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) +#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 2495be4d8b82..57eeb07aeb25 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -5,6 +5,9 @@ * Copyright (C) 2021 Intel Corporation */ +#define UINT_PTR(_handle) ((void *)((uintptr_t)_handle)) +#define PTR_UINT(_ptr) ((uintptr_t)((void *)_ptr)) + typedef int (*hci_cmd_sync_work_func_t)(struct hci_dev *hdev, void *data); typedef void (*hci_cmd_sync_work_destroy_t)(struct hci_dev *hdev, void *data, int err); @@ -124,7 +127,7 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason); int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn); -int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_le_create_cis_sync(struct hci_dev *hdev); int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 5e68b3dd4422..d382679efd2b 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -111,6 +111,8 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_WIDEBAND_SPEECH BIT(17) #define MGMT_SETTING_CIS_CENTRAL BIT(18) #define MGMT_SETTING_CIS_PERIPHERAL BIT(19) +#define MGMT_SETTING_ISO_BROADCASTER BIT(20) +#define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21) #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 diff --git a/include/net/bluetooth/sco.h b/include/net/bluetooth/sco.h index 1aa2e14b6c94..f40ddb4264fc 100644 --- a/include/net/bluetooth/sco.h +++ b/include/net/bluetooth/sco.h @@ -46,6 +46,4 @@ struct sco_conninfo { __u8 dev_class[3]; }; -#define SCO_CMSG_PKT_STATUS 0x01 - #endif /* __SCO_H */ diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index a016f275cb01..c5e57c6bd873 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -301,7 +301,6 @@ int __bond_3ad_get_active_agg_info(struct bonding *bond, int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); -void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); diff --git a/include/net/bonding.h b/include/net/bonding.h index 30ac427cf0c6..5b8b1b644a2d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -722,23 +722,14 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond, } /* Caller must hold rcu_read_lock() for read */ -static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac) +static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; - struct netdev_hw_addr *ha; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; - - if (netdev_uc_empty(bond->dev)) - return false; - - netdev_for_each_uc_addr(ha, bond->dev) - if (ether_addr_equal_64bits(mac, ha->addr)) - return true; - return false; } diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index f90f0021f5f2..4dabeb6c76d3 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -16,6 +16,7 @@ #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> +#include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h index bd5440977f7f..5ee7b322e18b 100644 --- a/include/net/caif/cfsrvl.h +++ b/include/net/caif/cfsrvl.h @@ -33,9 +33,6 @@ struct cflayer *cfrfml_create(u8 linkid, struct dev_info *dev_info, int mtu_size); struct cflayer *cfdbgl_create(u8 linkid, struct dev_info *dev_info); -void cfsrvl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid); - bool cfsrvl_phyid_match(struct cflayer *layer, int phyid); void cfsrvl_init(struct cfsrvl *service, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7c7d03aa9d06..3a4b684f89bf 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -263,7 +263,7 @@ enum ieee80211_privacy { * are only for driver use when pointers to this structure are * passed around. * - * @flags: rate-specific flags + * @flags: rate-specific flags from &enum ieee80211_rate_flags * @bitrate: bitrate in units of 100 Kbps * @hw_value: driver/hardware value for this rate * @hw_value_short: driver/hardware value for this rate when @@ -562,6 +562,9 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) return NULL; + if (iftype == NL80211_IFTYPE_AP_VLAN) + iftype = NL80211_IFTYPE_AP; + for (i = 0; i < sband->n_iftype_data; i++) { const struct ieee80211_sband_iftype_data *data = &sband->iftype_data[i]; @@ -808,7 +811,7 @@ struct cfg80211_tid_cfg { struct cfg80211_tid_config { const u8 *peer; u32 n_tid_conf; - struct cfg80211_tid_cfg tid_conf[]; + struct cfg80211_tid_cfg tid_conf[] __counted_by(n_tid_conf); }; /** @@ -1184,7 +1187,7 @@ struct cfg80211_mbssid_elems { struct { const u8 *data; size_t len; - } elem[]; + } elem[] __counted_by(cnt); }; /** @@ -1201,7 +1204,7 @@ struct cfg80211_rnr_elems { struct { const u8 *data; size_t len; - } elem[]; + } elem[] __counted_by(cnt); }; /** @@ -1279,7 +1282,7 @@ struct cfg80211_acl_data { int n_acl_entries; /* Keep it last */ - struct mac_address mac_addrs[]; + struct mac_address mac_addrs[] __counted_by(n_acl_entries); }; /** @@ -1350,7 +1353,7 @@ struct cfg80211_unsol_bcast_probe_resp { * @twt_responder: Enable Target Wait Time * @he_required: stations must support HE * @sae_h2e_required: stations must support direct H2E technique in SAE - * @flags: flags, as defined in enum cfg80211_ap_settings_flags + * @flags: flags, as defined in &enum nl80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) * @fils_discovery: FILS discovery transmission parameters @@ -1479,7 +1482,6 @@ struct iface_combination_params { * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp) * @STATION_PARAM_APPLY_CAPABILITY: apply new capability * @STATION_PARAM_APPLY_PLINK_STATE: apply new plink state - * @STATION_PARAM_APPLY_STA_TXPOWER: apply tx power for STA * * Not all station parameters have in-band "no change" signalling, * for those that don't these flags will are used. @@ -2153,7 +2155,7 @@ enum mpath_info_flags { * @sn: target sequence number * @metric: metric (cost) of this mesh path * @exptime: expiration time for the mesh path from now, in msecs - * @flags: mesh path flags + * @flags: mesh path flags from &enum mesh_path_flags * @discovery_timeout: total mesh path discovery timeout, in msecs * @discovery_retries: mesh path discovery retries * @generation: generation number for nl80211 dumps. @@ -2493,7 +2495,7 @@ struct cfg80211_scan_6ghz_params { * the actual dwell time may be shorter. * @duration_mandatory: if set, the scan duration must be as specified by the * %duration field. - * @flags: bit field of flags controlling operation + * @flags: control flags from &enum nl80211_scan_flags * @rates: bitmap of rates to advertise for each band * @wiphy: the wiphy this was for * @scan_start: time (in jiffies) when the scan started @@ -2541,7 +2543,7 @@ struct cfg80211_scan_request { struct cfg80211_scan_6ghz_params *scan_6ghz_params; /* keep last */ - struct ieee80211_channel *channels[]; + struct ieee80211_channel *channels[] __counted_by(n_channels); }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2613,7 +2615,7 @@ struct cfg80211_bss_select_adjust { * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets - * @flags: bit field of flags controlling operation + * @flags: control flags from &enum nl80211_scan_flags * @match_sets: sets of parameters to be matched for a scan result * entry to be considered valid and to be passed to the host * (others are filtered out). @@ -3945,7 +3947,7 @@ struct cfg80211_pmsr_request { struct list_head list; - struct cfg80211_pmsr_request_peer peers[]; + struct cfg80211_pmsr_request_peer peers[] __counted_by(n_peers); }; /** @@ -8115,7 +8117,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * @link_id: the ID of the link the frame was received on * @buf: Management frame (header + body) * @len: length of the frame data - * @flags: flags, as defined in enum nl80211_rxmgmt_flags + * @flags: flags, as defined in &enum nl80211_rxmgmt_flags * @rx_tstamp: Hardware timestamp of frame RX in nanoseconds * @ack_tstamp: Hardware timestamp of ack TX in nanoseconds */ diff --git a/include/net/datalink.h b/include/net/datalink.h index c837ffc7ebf8..6c529a40e00d 100644 --- a/include/net/datalink.h +++ b/include/net/datalink.h @@ -23,6 +23,4 @@ struct datalink_proto { struct list_head node; }; -struct datalink_proto *make_EII_client(void); -void destroy_EII_client(struct datalink_proto *dl); #endif diff --git a/include/net/devlink.h b/include/net/devlink.h index 0cdb4b16e5b5..29fd1b4ee654 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1583,6 +1583,24 @@ void devlink_free(struct devlink *devlink); * Should be used by device drivers set * the admin state of a function managed * by the devlink port. + * @port_fn_ipsec_crypto_get: Callback used to get port function's ipsec_crypto + * capability. Should be used by device drivers + * to report the current state of ipsec_crypto + * capability of a function managed by the devlink + * port. + * @port_fn_ipsec_crypto_set: Callback used to set port function's ipsec_crypto + * capability. Should be used by device drivers to + * enable/disable ipsec_crypto capability of a + * function managed by the devlink port. + * @port_fn_ipsec_packet_get: Callback used to get port function's ipsec_packet + * capability. Should be used by device drivers + * to report the current state of ipsec_packet + * capability of a function managed by the devlink + * port. + * @port_fn_ipsec_packet_set: Callback used to set port function's ipsec_packet + * capability. Should be used by device drivers to + * enable/disable ipsec_packet capability of a + * function managed by the devlink port. * * Note: Driver should return -EOPNOTSUPP if it doesn't support * port function (@port_fn_*) handling for a particular port. @@ -1620,6 +1638,18 @@ struct devlink_port_ops { int (*port_fn_state_set)(struct devlink_port *port, enum devlink_port_fn_state state, struct netlink_ext_ack *extack); + int (*port_fn_ipsec_crypto_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_crypto_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_packet_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_packet_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); }; void devlink_port_init(struct devlink *devlink, @@ -1743,9 +1773,6 @@ int devl_resource_size_get(struct devlink *devlink, int devl_dpipe_table_resource_set(struct devlink *devlink, const char *table_name, u64 resource_id, u64 resource_units); -int devlink_dpipe_table_resource_set(struct devlink *devlink, - const char *table_name, u64 resource_id, - u64 resource_units); void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, @@ -1790,8 +1817,6 @@ devlink_port_region_create(struct devlink_port *port, u32 region_max_snapshots, u64 region_size); void devl_region_destroy(struct devlink_region *region); void devlink_region_destroy(struct devlink_region *region); -void devlink_port_region_destroy(struct devlink_region *region); - int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id); void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id); int devlink_region_snapshot_create(struct devlink_region *region, diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a2b953b57689..a587e83fc169 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -30,6 +30,7 @@ FN(TCP_OVERWINDOW) \ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ + FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ @@ -78,6 +79,7 @@ FN(IPV6_NDISC_BAD_CODE) \ FN(IPV6_NDISC_BAD_OPTIONS) \ FN(IPV6_NDISC_NS_OTHERHOST) \ + FN(QUEUE_PURGE) \ FNe(MAX) /** @@ -188,6 +190,8 @@ enum skb_drop_reason { * LINUX_MIB_PAWSESTABREJECTED */ SKB_DROP_REASON_TCP_RFC7323_PAWS, + /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ + SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */ @@ -339,6 +343,8 @@ enum skb_drop_reason { * for another host. */ SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, + /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */ + SKB_DROP_REASON_QUEUE_PURGE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 685fb37df8e8..56cb7be92244 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -23,6 +23,12 @@ enum skb_drop_reason_subsys { */ SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR, + /** + * @SKB_DROP_REASON_SUBSYS_OPENVSWITCH: openvswitch drop reasons, + * see net/openvswitch/drop.h + */ + SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + /** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */ SKB_DROP_REASON_SUBSYS_NUM }; diff --git a/include/net/dsa.h b/include/net/dsa.h index d309ee7ed04b..0b9c6aa27047 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -873,8 +873,6 @@ struct dsa_switch_ops { struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, int port, phy_interface_t iface); - int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, - struct phylink_link_state *state); int (*phylink_mac_prepare)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); @@ -884,7 +882,6 @@ struct dsa_switch_ops { int (*phylink_mac_finish)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); - void (*phylink_mac_an_restart)(struct dsa_switch *ds, int port); void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 632086b2f644..6d1c8541183d 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -23,7 +23,7 @@ struct dst_ops { u32 * (*cow_metrics)(struct dst_entry *, unsigned long); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, - struct net_device *dev, int how); + struct net_device *dev); struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 8664ed4fbbdf..1a7131d6cb0e 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -302,6 +302,14 @@ struct flow_dissector_key_l2tpv3 { }; /** + * struct flow_dissector_key_ipsec: + * @spi: identifier for a ipsec connection + */ +struct flow_dissector_key_ipsec { + __be32 spi; +}; + +/** * struct flow_dissector_key_cfm * @mdl_ver: maintenance domain level (mdl) and cfm protocol version * @opcode: code specifying a type of cfm protocol packet @@ -354,6 +362,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */ FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */ FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */ + FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */ FLOW_DISSECTOR_KEY_MAX, }; @@ -370,7 +379,8 @@ struct flow_dissector_key { }; struct flow_dissector { - unsigned int used_keys; /* each bit repesents presence of one key id */ + unsigned long long used_keys; + /* each bit represents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; @@ -430,7 +440,7 @@ void skb_flow_get_icmp_tci(const struct sk_buff *skb, static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - return flow_dissector->used_keys & (1 << key_id); + return flow_dissector->used_keys & (1ULL << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 118082eae48c..9efa9a59e81f 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -64,6 +64,10 @@ struct flow_match_tcp { struct flow_dissector_key_tcp *key, *mask; }; +struct flow_match_ipsec { + struct flow_dissector_key_ipsec *key, *mask; +}; + struct flow_match_mpls { struct flow_dissector_key_mpls *key, *mask; }; @@ -116,6 +120,8 @@ void flow_rule_match_ports_range(const struct flow_rule *rule, struct flow_match_ports_range *out); void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out); void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out); void flow_rule_match_mpls(const struct flow_rule *rule, diff --git a/include/net/fq.h b/include/net/fq.h index 07b5aff6ec58..99fbe4127b95 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -98,9 +98,4 @@ typedef bool fq_skb_filter_t(struct fq *, struct sk_buff *, void *); -typedef struct fq_flow *fq_flow_get_default_t(struct fq *, - struct fq_tin *, - int idx, - struct sk_buff *); - #endif diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ed4622dd4828..e18a4c0d69ee 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -93,9 +93,9 @@ struct genl_family { * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender + * @family: generic netlink family * @nlhdr: netlink message header * @genlhdr: generic netlink message header - * @userhdr: user specific header * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers @@ -104,16 +104,16 @@ struct genl_family { struct genl_info { u32 snd_seq; u32 snd_portid; - struct nlmsghdr * nlhdr; + const struct genl_family *family; + const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; - void * userhdr; struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; struct netlink_ext_ack *extack; }; -static inline struct net *genl_info_net(struct genl_info *info) +static inline struct net *genl_info_net(const struct genl_info *info) { return read_pnet(&info->_net); } @@ -123,6 +123,11 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) write_pnet(&info->_net, net); } +static inline void *genl_info_userhdr(const struct genl_info *info) +{ + return (u8 *)info->genlhdr + GENL_HDRLEN; +} + #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) #define GENL_SET_ERR_MSG_FMT(info, msg, args...) \ @@ -244,14 +249,13 @@ struct genl_split_ops { /** * struct genl_dumpit_info - info that is available during dumpit op call - * @family: generic netlink family - for internal genl code usage * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes + * @info: struct genl_info describing the request */ struct genl_dumpit_info { - const struct genl_family *family; struct genl_split_ops op; - struct nlattr **attrs; + struct genl_info info; }; static inline const struct genl_dumpit_info * @@ -260,6 +264,38 @@ genl_dumpit_info(struct netlink_callback *cb) return cb->data; } +static inline const struct genl_info * +genl_info_dump(struct netlink_callback *cb) +{ + return &genl_dumpit_info(cb)->info; +} + +/** + * genl_info_init_ntf() - initialize genl_info for notifications + * @info: genl_info struct to set up + * @family: pointer to the genetlink family + * @cmd: command to be used in the notification + * + * Initialize a locally declared struct genl_info to pass to various APIs. + * Intended to be used when creating notifications. + */ +static inline void +genl_info_init_ntf(struct genl_info *info, const struct genl_family *family, + u8 cmd) +{ + struct genlmsghdr *hdr = (void *) &info->user_ptr[0]; + + memset(info, 0, sizeof(*info)); + info->family = family; + info->genlhdr = hdr; + hdr->cmd = cmd; +} + +static inline bool genl_info_is_ntf(const struct genl_info *info) +{ + return !info->nlhdr; +} + int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, @@ -268,6 +304,32 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd); +static inline void * +__genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) +{ + return genlmsg_put(skb, info->snd_portid, info->snd_seq, info->family, + flags, info->genlhdr->cmd); +} + +/** + * genlmsg_iput - start genetlink message based on genl_info + * @skb: skb in which message header will be placed + * @info: genl_info as provided to do/dump handlers + * + * Convenience wrapper which starts a genetlink message based on + * information in user request. @info should be either the struct passed + * by genetlink core to do/dump handlers (when constructing replies to + * such requests) or a struct initialized by genl_info_init_ntf() + * when constructing notifications. + * + * Returns pointer to new genetlink header. + */ +static inline void * +genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) +{ + return __genlmsg_iput(skb, info, 0); +} + /** * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() diff --git a/include/net/gro.h b/include/net/gro.h index 75efa6fb8441..88644b3ca660 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -452,6 +452,49 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, gro_normal_list(napi); } +/* This function is the alternative of 'inet_iif' and 'inet_sdif' + * functions in case we can not rely on fields of IPCB. + * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock. + */ +static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + *iif = inet_iif(skb) ?: skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + +/* This function is the alternative of 'inet6_iif' and 'inet6_sdif' + * functions in case we can not rely on fields of IP6CB. + * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock. + */ +static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ + *iif = skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + extern struct list_head offload_base; #endif /* _NET_IPV6_GRO_H */ diff --git a/include/net/handshake.h b/include/net/handshake.h index 2e26e436e85f..8ebd4f9ed26e 100644 --- a/include/net/handshake.h +++ b/include/net/handshake.h @@ -40,5 +40,10 @@ int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags); int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags); bool tls_handshake_cancel(struct sock *sk); +void tls_handshake_close(struct socket *sock); + +u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *msg); +void tls_alert_recv(const struct sock *sk, const struct msghdr *msg, + u8 *level, u8 *description); #endif /* _NET_HANDSHAKE_H */ diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index c4722a9963de..2338f8d2a8b3 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -21,7 +21,7 @@ #include <asm/unaligned.h> /** - * struct ieee82011_radiotap_header - base radiotap header + * struct ieee80211_radiotap_header - base radiotap header */ struct ieee80211_radiotap_header { /** @@ -575,6 +575,7 @@ enum ieee80211_radiotap_eht_usig_tb { /** * ieee80211_get_radiotap_len - get radiotap header length + * @data: pointer to the header */ static inline u16 ieee80211_get_radiotap_len(const char *data) { diff --git a/include/net/ila.h b/include/net/ila.h deleted file mode 100644 index 73ebe5eab272..000000000000 --- a/include/net/ila.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ILA kernel interface - * - * Copyright (c) 2015 Tom Herbert <tom@herbertland.com> - */ - -#ifndef _NET_ILA_H -#define _NET_ILA_H - -struct sk_buff; - -int ila_xlat_outgoing(struct sk_buff *skb); -int ila_xlat_incoming(struct sk_buff *skb); - -#endif /* _NET_ILA_H */ diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 56f1286583d3..533a7337865a 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -48,6 +48,22 @@ struct sock *__inet6_lookup_established(struct net *net, const u16 hnum, const int dif, const int sdif); +typedef u32 (inet6_ehashfn_t)(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport); + +inet6_ehashfn_t inet6_ehashfn; + +INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); + +struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum, + inet6_ehashfn_t *ehashfn); + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -57,6 +73,15 @@ struct sock *inet6_lookup_listener(struct net *net, const unsigned short hnum, const int dif, const int sdif); +struct sock *inet6_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, const int dif, + inet6_ehashfn_t *ehashfn); + static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -78,6 +103,46 @@ static inline struct sock *__inet6_lookup(struct net *net, daddr, hnum, dif, sdif); } +static inline +struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, + const struct in6_addr *daddr, const __be16 dport, + bool *refcounted, inet6_ehashfn_t *ehashfn) +{ + struct sock *sk, *reuse_sk; + bool prefetched; + + sk = skb_steal_sock(skb, refcounted, &prefetched); + if (!sk) + return NULL; + + if (!prefetched || !sk_fullsock(sk)) + return sk; + + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + return sk; + } else if (sk->sk_protocol == IPPROTO_UDP) { + if (sk->sk_state != TCP_CLOSE) + return sk; + } else { + return sk; + } + + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, ntohs(dport), + ehashfn); + if (!reuse_sk) + return sk; + + /* We've chosen a new reuseport sock which is never refcounted. This + * implies that sk also isn't refcounted. + */ + WARN_ON_ONCE(*refcounted); + + return reuse_sk; +} + static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, @@ -85,14 +150,20 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb, refcounted); - + struct net *net = dev_net(skb_dst(skb)->dev); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct sock *sk; + + sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport, + refcounted, inet6_ehashfn); + if (IS_ERR(sk)) + return NULL; if (sk) return sk; - return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, - doff, &ipv6_hdr(skb)->saddr, sport, - &ipv6_hdr(skb)->daddr, ntohs(dport), + return __inet6_lookup(net, hashinfo, skb, + doff, &ip6h->saddr, sport, + &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } diff --git a/include/net/inet_common.h b/include/net/inet_common.h index b86b8e21de7f..f50a644d87a9 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -40,8 +40,10 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); +int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c2b15f7e5516..5d2fcc137b88 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -164,7 +164,8 @@ enum inet_csk_ack_state_t { ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ + ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ + ICSK_ACK_NOMEM = 32, }; void inet_csk_init_xmit_timers(struct sock *sk, @@ -341,9 +342,9 @@ static inline bool inet_csk_in_pingpong_mode(struct sock *sk) return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH; } -static inline bool inet_csk_has_ulp(struct sock *sk) +static inline bool inet_csk_has_ulp(const struct sock *sk) { - return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops; + return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 99bd823e97f6..3ecfeadbfa06 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -177,7 +177,7 @@ struct inet_hashinfo { struct inet_listen_hashbucket *lhash2; bool pernet; -}; +} ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { @@ -379,6 +379,27 @@ struct sock *__inet_lookup_established(struct net *net, const __be32 daddr, const u16 hnum, const int dif, const int sdif); +typedef u32 (inet_ehashfn_t)(const struct net *net, + const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); + +inet_ehashfn_t inet_ehashfn; + +INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); + +struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum, + inet_ehashfn_t *ehashfn); + +struct sock *inet_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum, const int dif, + inet_ehashfn_t *ehashfn); + static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, @@ -428,6 +449,46 @@ static inline struct sock *inet_lookup(struct net *net, return sk; } +static inline +struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + bool *refcounted, inet_ehashfn_t *ehashfn) +{ + struct sock *sk, *reuse_sk; + bool prefetched; + + sk = skb_steal_sock(skb, refcounted, &prefetched); + if (!sk) + return NULL; + + if (!prefetched || !sk_fullsock(sk)) + return sk; + + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + return sk; + } else if (sk->sk_protocol == IPPROTO_UDP) { + if (sk->sk_state != TCP_CLOSE) + return sk; + } else { + return sk; + } + + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, ntohs(dport), + ehashfn); + if (!reuse_sk) + return sk; + + /* We've chosen a new reuseport sock which is never refcounted. This + * implies that sk also isn't refcounted. + */ + WARN_ON_ONCE(*refcounted); + + return reuse_sk; +} + static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -436,22 +497,23 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb, refcounted); + struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); + struct sock *sk; + sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, + refcounted, inet_ehashfn); + if (IS_ERR(sk)) + return NULL; if (sk) return sk; - return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } -u32 inet6_ehashfn(const struct net *net, - const struct in6_addr *laddr, const u16 lport, - const struct in6_addr *faddr, const __be16 fport); - static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index caa20a905531..2de0e4d4a027 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -107,11 +107,12 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_mark && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) + u32 mark = READ_ONCE(sk->sk_mark); + + if (!mark && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; - return sk->sk_mark; + return mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, @@ -193,13 +194,13 @@ struct rtable; * @inet_rcv_saddr - Bound local IPv4 addr * @inet_dport - Destination port * @inet_num - Local port + * @inet_flags - various atomic flags * @inet_saddr - Sending source * @uc_ttl - Unicast TTL * @inet_sport - Source port * @inet_id - ID counter for DF pkts * @tos - TOS * @mc_ttl - Multicasting TTL - * @is_icsk - is this an inet_connection_sock? * @uc_index - Unicast outgoing device index * @mc_index - Multicast device index * @mc_list - Group array @@ -217,57 +218,88 @@ struct inet_sock { #define inet_dport sk.__sk_common.skc_dport #define inet_num sk.__sk_common.skc_num + unsigned long inet_flags; __be32 inet_saddr; __s16 uc_ttl; - __u16 cmsg_flags; - struct ip_options_rcu __rcu *inet_opt; __be16 inet_sport; - __u16 inet_id; + struct ip_options_rcu __rcu *inet_opt; + atomic_t inet_id; __u8 tos; __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; - __u8 recverr:1, - is_icsk:1, - freebind:1, - hdrincl:1, - mc_loop:1, - transparent:1, - mc_all:1, - nodefrag:1; - __u8 bind_address_no_port:1, - recverr_rfc4884:1, - defer_connect:1; /* Indicates that fastopen_connect is set - * and cookie exists so we defer connect - * until first data frame is written - */ __u8 rcv_tos; __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; - struct ip_mc_socklist __rcu *mc_list; - struct inet_cork_full cork; struct { __u16 lo; __u16 hi; } local_port_range; + + struct ip_mc_socklist __rcu *mc_list; + struct inet_cork_full cork; }; #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ +enum { + INET_FLAGS_PKTINFO = 0, + INET_FLAGS_TTL = 1, + INET_FLAGS_TOS = 2, + INET_FLAGS_RECVOPTS = 3, + INET_FLAGS_RETOPTS = 4, + INET_FLAGS_PASSSEC = 5, + INET_FLAGS_ORIGDSTADDR = 6, + INET_FLAGS_CHECKSUM = 7, + INET_FLAGS_RECVFRAGSIZE = 8, + + INET_FLAGS_RECVERR = 9, + INET_FLAGS_RECVERR_RFC4884 = 10, + INET_FLAGS_FREEBIND = 11, + INET_FLAGS_HDRINCL = 12, + INET_FLAGS_MC_LOOP = 13, + INET_FLAGS_MC_ALL = 14, + INET_FLAGS_TRANSPARENT = 15, + INET_FLAGS_IS_ICSK = 16, + INET_FLAGS_NODEFRAG = 17, + INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, + INET_FLAGS_DEFER_CONNECT = 19, +}; + /* cmsg flags for inet */ -#define IP_CMSG_PKTINFO BIT(0) -#define IP_CMSG_TTL BIT(1) -#define IP_CMSG_TOS BIT(2) -#define IP_CMSG_RECVOPTS BIT(3) -#define IP_CMSG_RETOPTS BIT(4) -#define IP_CMSG_PASSSEC BIT(5) -#define IP_CMSG_ORIGDSTADDR BIT(6) -#define IP_CMSG_CHECKSUM BIT(7) -#define IP_CMSG_RECVFRAGSIZE BIT(8) +#define IP_CMSG_PKTINFO BIT(INET_FLAGS_PKTINFO) +#define IP_CMSG_TTL BIT(INET_FLAGS_TTL) +#define IP_CMSG_TOS BIT(INET_FLAGS_TOS) +#define IP_CMSG_RECVOPTS BIT(INET_FLAGS_RECVOPTS) +#define IP_CMSG_RETOPTS BIT(INET_FLAGS_RETOPTS) +#define IP_CMSG_PASSSEC BIT(INET_FLAGS_PASSSEC) +#define IP_CMSG_ORIGDSTADDR BIT(INET_FLAGS_ORIGDSTADDR) +#define IP_CMSG_CHECKSUM BIT(INET_FLAGS_CHECKSUM) +#define IP_CMSG_RECVFRAGSIZE BIT(INET_FLAGS_RECVFRAGSIZE) + +#define IP_CMSG_ALL (IP_CMSG_PKTINFO | IP_CMSG_TTL | \ + IP_CMSG_TOS | IP_CMSG_RECVOPTS | \ + IP_CMSG_RETOPTS | IP_CMSG_PASSSEC | \ + IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM | \ + IP_CMSG_RECVFRAGSIZE) + +static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) +{ + return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; +} + +#define inet_test_bit(nr, sk) \ + test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_set_bit(nr, sk) \ + set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_clear_bit(nr, sk) \ + clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_assign_bit(nr, sk, val) \ + assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) static inline bool sk_is_inet(struct sock *sk) { @@ -362,7 +394,7 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; - if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) + if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk)) flags |= FLOWI_FLAG_ANYSRC; return flags; } @@ -388,7 +420,8 @@ static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || - inet->freebind || inet->transparent; + test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || + test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } static inline bool inet_addr_valid_or_nonlocal(struct net *net, diff --git a/include/net/ip.h b/include/net/ip.h index 50d435855ae2..19adacd5ece0 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -93,7 +93,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); - ipcm->sockc.mark = inet->sk.sk_mark; + ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; @@ -538,8 +538,19 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, * generator as much as we can. */ if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; + int val; + + /* avoid atomic operations for TCP, + * as we hold socket lock at this point. + */ + if (sk_is_tcp(sk)) { + sock_owned_by_me(sk); + val = atomic_read(&inet_sk(sk)->inet_id); + atomic_set(&inet_sk(sk)->inet_id, val + segs); + } else { + val = atomic_add_return(segs, &inet_sk(sk)->inet_id); + } + iph->id = htons(val); return; } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 05e6f756feaf..c9ff23cf313e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -179,6 +179,9 @@ struct fib6_info { refcount_t fib6_ref; unsigned long expires; + + struct hlist_node gc_link; + struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] @@ -247,19 +250,6 @@ static inline bool fib6_requires_src(const struct fib6_info *rt) return rt->fib6_src.plen > 0; } -static inline void fib6_clean_expires(struct fib6_info *f6i) -{ - f6i->fib6_flags &= ~RTF_EXPIRES; - f6i->expires = 0; -} - -static inline void fib6_set_expires(struct fib6_info *f6i, - unsigned long expires) -{ - f6i->expires = expires; - f6i->fib6_flags |= RTF_EXPIRES; -} - static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) @@ -267,6 +257,11 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } +static inline bool fib6_has_expires(const struct fib6_info *f6i) +{ + return f6i->fib6_flags & RTF_EXPIRES; +} + /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely @@ -388,6 +383,7 @@ struct fib6_table { struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; + struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -504,6 +500,48 @@ void fib6_gc_cleanup(void); int fib6_init(void); +/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be + * NULL. + */ +static inline void fib6_set_expires_locked(struct fib6_info *f6i, + unsigned long expires) +{ + struct fib6_table *tb6; + + tb6 = f6i->fib6_table; + f6i->expires = expires; + if (tb6 && !fib6_has_expires(f6i)) + hlist_add_head(&f6i->gc_link, &tb6->tb6_gc_hlist); + f6i->fib6_flags |= RTF_EXPIRES; +} + +/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be + * NULL. If fib6_table is NULL, the fib6_info will no be inserted into the + * list of GC candidates until it is inserted into a table. + */ +static inline void fib6_set_expires(struct fib6_info *f6i, + unsigned long expires) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_set_expires_locked(f6i, expires); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + +static inline void fib6_clean_expires_locked(struct fib6_info *f6i) +{ + if (fib6_has_expires(f6i)) + hlist_del_init(&f6i->gc_link); + f6i->fib6_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_clean_expires(struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_clean_expires_locked(f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3556595ce59a..b32539bb0fb0 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -156,7 +156,7 @@ void fib6_force_start_gc(struct net *net); struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, - gfp_t gfp_flags); + gfp_t gfp_flags, struct netlink_ext_ack *extack); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index ed4b6ad3fcac..e8750b4ef7e1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -52,6 +52,7 @@ struct ip_tunnel_key { u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be32 label; /* Flow Label for IPv6 */ + u32 nhid; __be16 tp_src; __be16 tp_dst; __u8 flow_flags; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2acc4c808d45..d40d8238d4c2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -937,7 +937,8 @@ static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || - inet->freebind || inet->transparent; + test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || + test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } /* Sysctl settings for net ipv6.auto_flowlabels */ @@ -1216,6 +1217,7 @@ void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index d2ea5863eedc..b2cf243ebe44 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -426,17 +426,10 @@ struct iw_public_data { /**************************** PROTOTYPES ****************************/ /* - * Functions part of the Wireless Extensions (defined in net/core/wireless.c). - * Those may be called only within the kernel. + * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). + * Those may be called by driver modules. */ -/* First : function strictly used inside the kernel */ - -/* Handle /proc/net/wireless, called in net/code/dev.c */ -int dev_get_wireless_info(char *buffer, char **start, off_t offset, int length); - -/* Second : functions that may be called by driver modules */ - /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); diff --git a/include/net/llc_c_ac.h b/include/net/llc_c_ac.h index 3e1f76786d7b..7620a9196922 100644 --- a/include/net/llc_c_ac.h +++ b/include/net/llc_c_ac.h @@ -175,7 +175,6 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb); -int llc_conn_ac_send_i_rsp_as_ack(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb); void llc_conn_busy_tmr_cb(struct timer_list *t); diff --git a/include/net/llc_c_ev.h b/include/net/llc_c_ev.h index 3948cf111dd0..241889955157 100644 --- a/include/net/llc_c_ev.h +++ b/include/net/llc_c_ev.h @@ -158,7 +158,6 @@ int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb); -int llc_conn_ev_sendack_tmr_exp(struct sock *sk, struct sk_buff *skb); /* NOT_USED functions and their variations */ int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb); int llc_conn_ev_rx_xxx_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb); diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 6f15e6fa154e..53bd2d02a4f0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -16,9 +16,12 @@ #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) +/* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return + * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. + */ enum { LWTUNNEL_XMIT_DONE, - LWTUNNEL_XMIT_CONTINUE, + LWTUNNEL_XMIT_CONTINUE = 0x100, }; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 3a8a2d2c58c3..7c707358d15c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1192,9 +1192,11 @@ struct ieee80211_tx_info { u8 ampdu_ack_len; u8 ampdu_len; u8 antenna; + u8 pad; u16 tx_time; u8 flags; - void *status_driver_data[18 / sizeof(void *)]; + u8 pad2; + void *status_driver_data[16 / sizeof(void *)]; } status; struct { struct ieee80211_tx_rate driver_rates[ @@ -2259,6 +2261,7 @@ struct ieee80211_sta_aggregates { * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @eht_cap: EHT capabilities of this STA + * @agg: per-link data for multi-link aggregation * @bandwidth: current bandwidth the station can receive with * @rx_nss: in HT/VHT, the maximum number of spatial streams the * station can receive at the moment, changed by operating mode @@ -6612,6 +6615,7 @@ void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap, * marks frames marked in the bitmap as having been filtered. Afterwards, it * checks if any frames in the window starting from @ssn can now be released * (in case they were only waiting for frames that were filtered.) + * (Only work correctly if @max_rx_aggregation_subframes <= 64 frames) */ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, u16 ssn, u64 filtered, diff --git a/include/net/macsec.h b/include/net/macsec.h index 441ed8fd4b5f..75a6f4863c83 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -312,6 +312,8 @@ static inline bool macsec_send_sci(const struct macsec_secy *secy) return tx_sc->send_sci || (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); } +struct net_device *macsec_get_real_dev(const struct net_device *dev); +bool macsec_netdev_is_offloaded(struct net_device *dev); static inline void *macsec_netdev_priv(const struct net_device *dev) { diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 96c120160f15..88b6ef7ce1a6 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -33,6 +33,7 @@ enum gdma_request_type { GDMA_DESTROY_PD = 30, GDMA_CREATE_MR = 31, GDMA_DESTROY_MR = 32, + GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */ }; #define GDMA_RESOURCE_DOORBELL_PAGE 27 @@ -57,6 +58,8 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, GDMA_EQE_HWC_INIT_DATA = 130, GDMA_EQE_HWC_INIT_DONE = 131, + GDMA_EQE_HWC_SOC_RECONFIG = 132, + GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, }; enum { @@ -531,10 +534,12 @@ enum { * so the driver is able to reliably support features like busy_poll. */ #define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2) +#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ - GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX) + GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ + GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) #define GDMA_DRV_CAP_FLAGS2 0 @@ -664,6 +669,19 @@ struct gdma_disable_queue_req { u32 alloc_res_id_on_creation; }; /* HW DATA */ +/* GDMA_QUERY_HWC_TIMEOUT */ +struct gdma_query_hwc_timeout_req { + struct gdma_req_hdr hdr; + u32 timeout_ms; + u32 reserved; +}; + +struct gdma_query_hwc_timeout_resp { + struct gdma_resp_hdr hdr; + u32 timeout_ms; + u32 reserved; +}; + enum atb_page_size { ATB_PAGE_SIZE_4K, ATB_PAGE_SIZE_8K, diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h index 6a757a6e2732..3d3b5c881bc1 100644 --- a/include/net/mana/hw_channel.h +++ b/include/net/mana/hw_channel.h @@ -23,6 +23,10 @@ #define HWC_INIT_DATA_PF_DEST_RQ_ID 10 #define HWC_INIT_DATA_PF_DEST_CQ_ID 11 +#define HWC_DATA_CFG_HWC_TIMEOUT 1 + +#define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000 + /* Structures labeled with "HW DATA" are exchanged with the hardware. All of * them are naturally aligned and hence don't need __packed. */ @@ -182,6 +186,7 @@ struct hw_channel_context { u32 pf_dest_vrq_id; u32 pf_dest_vrcq_id; + u32 hwc_timeout; struct hwc_caller_ctx *caller_ctx; }; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 024ad8ddb27e..9f70b4332238 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -4,6 +4,8 @@ #ifndef _MANA_H #define _MANA_H +#include <net/xdp.h> + #include "gdma.h" #include "hw_channel.h" @@ -280,6 +282,7 @@ struct mana_recv_buf_oob { struct gdma_wqe_request wqe_req; void *buf_va; + bool from_pool; /* allocated from a page pool */ /* SGL of the buffer going to be sent has part of the work request. */ u32 num_sge; @@ -330,6 +333,8 @@ struct mana_rxq { bool xdp_flush; int xdp_rc; /* XDP redirect return code */ + struct page_pool *page_pool; + /* MUST BE THE LAST MEMBER: * Each receive buffer has an associated mana_recv_buf_oob. */ @@ -347,6 +352,13 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; + u64 hc_tx_bytes; + u64 hc_tx_ucast_pkts; + u64 hc_tx_ucast_bytes; + u64 hc_tx_bcast_pkts; + u64 hc_tx_bcast_bytes; + u64 hc_tx_mcast_pkts; + u64 hc_tx_mcast_bytes; u64 tx_cqe_err; u64 tx_cqe_unknown_type; u64 rx_coalesced_err; @@ -437,6 +449,7 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); +void mana_query_gf_stats(struct mana_port_context *apc); extern const struct ethtool_ops mana_ethtool_ops; @@ -578,6 +591,49 @@ struct mana_fence_rq_resp { struct gdma_resp_hdr hdr; }; /* HW DATA */ +/* Query stats RQ */ +struct mana_query_gf_stat_req { + struct gdma_req_hdr hdr; + u64 req_stats; +}; /* HW DATA */ + +struct mana_query_gf_stat_resp { + struct gdma_resp_hdr hdr; + u64 reported_stats; + /* rx errors/discards */ + u64 discard_rx_nowqe; + u64 err_rx_vport_disabled; + /* rx bytes/packets */ + u64 hc_rx_bytes; + u64 hc_rx_ucast_pkts; + u64 hc_rx_ucast_bytes; + u64 hc_rx_bcast_pkts; + u64 hc_rx_bcast_bytes; + u64 hc_rx_mcast_pkts; + u64 hc_rx_mcast_bytes; + /* tx errors */ + u64 err_tx_gf_disabled; + u64 err_tx_vport_disabled; + u64 err_tx_inval_vport_offset_pkt; + u64 err_tx_vlan_enforcement; + u64 err_tx_ethtype_enforcement; + u64 err_tx_SA_enforecement; + u64 err_tx_SQPDID_enforcement; + u64 err_tx_CQPDID_enforcement; + u64 err_tx_mtu_violation; + u64 err_tx_inval_oob; + /* tx bytes/packets */ + u64 hc_tx_bytes; + u64 hc_tx_ucast_pkts; + u64 hc_tx_ucast_bytes; + u64 hc_tx_bcast_pkts; + u64 hc_tx_bcast_bytes; + u64 hc_tx_mcast_pkts; + u64 hc_tx_mcast_bytes; + /* tx error */ + u64 err_tx_gdma; +}; /* HW DATA */ + /* Configure vPort Rx Steering */ struct mana_cfg_rx_steer_req_v2 { struct gdma_req_hdr hdr; @@ -657,6 +713,42 @@ struct mana_deregister_filter_resp { struct gdma_resp_hdr hdr; }; /* HW DATA */ +/* Requested GF stats Flags */ +/* Rx discards/Errors */ +#define STATISTICS_FLAGS_RX_DISCARDS_NO_WQE 0x0000000000000001 +#define STATISTICS_FLAGS_RX_ERRORS_VPORT_DISABLED 0x0000000000000002 +/* Rx bytes/pkts */ +#define STATISTICS_FLAGS_HC_RX_BYTES 0x0000000000000004 +#define STATISTICS_FLAGS_HC_RX_UCAST_PACKETS 0x0000000000000008 +#define STATISTICS_FLAGS_HC_RX_UCAST_BYTES 0x0000000000000010 +#define STATISTICS_FLAGS_HC_RX_MCAST_PACKETS 0x0000000000000020 +#define STATISTICS_FLAGS_HC_RX_MCAST_BYTES 0x0000000000000040 +#define STATISTICS_FLAGS_HC_RX_BCAST_PACKETS 0x0000000000000080 +#define STATISTICS_FLAGS_HC_RX_BCAST_BYTES 0x0000000000000100 +/* Tx errors */ +#define STATISTICS_FLAGS_TX_ERRORS_GF_DISABLED 0x0000000000000200 +#define STATISTICS_FLAGS_TX_ERRORS_VPORT_DISABLED 0x0000000000000400 +#define STATISTICS_FLAGS_TX_ERRORS_INVAL_VPORT_OFFSET_PACKETS \ + 0x0000000000000800 +#define STATISTICS_FLAGS_TX_ERRORS_VLAN_ENFORCEMENT 0x0000000000001000 +#define STATISTICS_FLAGS_TX_ERRORS_ETH_TYPE_ENFORCEMENT \ + 0x0000000000002000 +#define STATISTICS_FLAGS_TX_ERRORS_SA_ENFORCEMENT 0x0000000000004000 +#define STATISTICS_FLAGS_TX_ERRORS_SQPDID_ENFORCEMENT 0x0000000000008000 +#define STATISTICS_FLAGS_TX_ERRORS_CQPDID_ENFORCEMENT 0x0000000000010000 +#define STATISTICS_FLAGS_TX_ERRORS_MTU_VIOLATION 0x0000000000020000 +#define STATISTICS_FLAGS_TX_ERRORS_INVALID_OOB 0x0000000000040000 +/* Tx bytes/pkts */ +#define STATISTICS_FLAGS_HC_TX_BYTES 0x0000000000080000 +#define STATISTICS_FLAGS_HC_TX_UCAST_PACKETS 0x0000000000100000 +#define STATISTICS_FLAGS_HC_TX_UCAST_BYTES 0x0000000000200000 +#define STATISTICS_FLAGS_HC_TX_MCAST_PACKETS 0x0000000000400000 +#define STATISTICS_FLAGS_HC_TX_MCAST_BYTES 0x0000000000800000 +#define STATISTICS_FLAGS_HC_TX_BCAST_PACKETS 0x0000000001000000 +#define STATISTICS_FLAGS_HC_TX_BCAST_BYTES 0x0000000002000000 +/* Tx error */ +#define STATISTICS_FLAGS_TX_ERRORS_GDMA_ERROR 0x0000000004000000 + #define MANA_MAX_NUM_QUEUES 64 #define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3c5c68618fcc..fb996124b3d5 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -96,6 +96,27 @@ struct mptcp_out_options { #endif }; +#define MPTCP_SCHED_NAME_MAX 16 +#define MPTCP_SUBFLOWS_MAX 8 + +struct mptcp_sched_data { + bool reinject; + u8 subflows; + struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; +}; + +struct mptcp_sched_ops { + int (*get_subflow)(struct mptcp_sock *msk, + struct mptcp_sched_data *data); + + char name[MPTCP_SCHED_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + #ifdef CONFIG_MPTCP void mptcp_init(void); diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 52eae0943433..9bbdf6eaa942 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -488,9 +488,6 @@ void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); -int ndisc_ifinfo_sysctl_strategy(struct ctl_table *ctl, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen); #endif void inet6_ifinfo_notify(int event, struct inet6_dev *idev); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f6a8ecc6b1fa..6da68886fabb 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -394,8 +394,6 @@ void neigh_for_each(struct neigh_table *tbl, void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); -void pneigh_for_each(struct neigh_table *tbl, - void (*cb)(struct pneigh_entry *)); struct neigh_seq_state { struct seq_net_private p; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 78beaa765c73..9f6add96de2d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -42,6 +42,7 @@ #include <linux/idr.h> #include <linux/skbuff.h> #include <linux/notifier.h> +#include <linux/xarray.h> struct user_namespace; struct proc_dir_entry; @@ -69,7 +70,7 @@ struct net { atomic_t dev_unreg_count; unsigned int dev_base_seq; /* protected by rtnl_mutex */ - int ifindex; + u32 ifindex; spinlock_t nsid_lock; atomic_t fnhe_genid; @@ -110,6 +111,7 @@ struct net { struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + struct xarray dev_by_index; struct raw_notifier_head netdev_chain; /* Note that @hash_mix can be read millions times per second, diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h new file mode 100644 index 000000000000..cdcafb30d437 --- /dev/null +++ b/include/net/netdev_rx_queue.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NETDEV_RX_QUEUE_H +#define _LINUX_NETDEV_RX_QUEUE_H + +#include <linux/kobject.h> +#include <linux/netdevice.h> +#include <linux/sysfs.h> +#include <net/xdp.h> + +/* This structure contains an instance of an RX queue. */ +struct netdev_rx_queue { + struct xdp_rxq_info xdp_rxq; +#ifdef CONFIG_RPS + struct rps_map __rcu *rps_map; + struct rps_dev_flow_table __rcu *rps_flow_table; +#endif + struct kobject kobj; + struct net_device *dev; + netdevice_tracker dev_tracker; + +#ifdef CONFIG_XDP_SOCKETS + struct xsk_buff_pool *pool; +#endif +} ____cacheline_aligned_in_smp; + +/* + * RX queue sysfs structures and functions. + */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + const char *buf, size_t len); +}; + +static inline struct netdev_rx_queue * +__netif_get_rx_queue(struct net_device *dev, unsigned int rxq) +{ + return dev->_rx + rxq; +} + +#ifdef CONFIG_SYSFS +static inline unsigned int +get_netdev_rx_queue_index(struct netdev_rx_queue *queue) +{ + struct net_device *dev = queue->dev; + int index = queue - dev->_rx; + + BUG_ON(index >= dev->num_rx_queues); + return index; +} +#endif +#endif diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a72028dbef0c..4085765c3370 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -190,10 +190,6 @@ static inline void nf_ct_put(struct nf_conn *ct) nf_ct_destroy(&ct->ct_general); } -/* Protocol module loading */ -int nf_ct_l3proto_try_module_get(unsigned short l3proto); -void nf_ct_l3proto_module_put(unsigned short l3proto); - /* load module; enable/disable conntrack in this namespace */ int nf_ct_netns_get(struct net *net, u8 nfproto); void nf_ct_netns_put(struct net *net, u8 nfproto); diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h index 4b2b7f8914ea..a120685cac93 100644 --- a/include/net/netfilter/nf_conntrack_acct.h +++ b/include/net/netfilter/nf_conntrack_acct.h @@ -78,6 +78,4 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir, void nf_conntrack_acct_pernet_init(struct net *net); -void nf_conntrack_acct_fini(void); - #endif /* _NF_CONNTRACK_ACCT_H */ diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index cf0d81be5a96..165e7a03b8e9 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -100,7 +100,7 @@ nf_ct_expect_find_get(struct net *net, struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple); + const struct nf_conntrack_tuple *tuple, bool unlink); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index f30b1694b690..de2f956abf34 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -136,8 +136,6 @@ static inline void *nfct_help_data(const struct nf_conn *ct) return (void *)help->data; } -void nf_conntrack_helper_pernet_init(struct net *net); - int nf_conntrack_helper_init(void); void nf_conntrack_helper_fini(void); @@ -182,5 +180,4 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_nat_helper_put(struct nf_conntrack_helper *helper); -void nf_ct_set_auto_assign_helper_warned(struct net *net); #endif /*_NF_CONNTRACK_HELPER_H*/ diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index 66bab6c60d12..fcb19a4e8f2b 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -52,7 +52,6 @@ int nf_connlabels_replace(struct nf_conn *ct, const u32 *data, const u32 *mask, unsigned int words); #ifdef CONFIG_NF_CONNTRACK_LABELS -int nf_conntrack_labels_init(void); int nf_connlabels_get(struct net *net, unsigned int bit); void nf_connlabels_put(struct net *net); #else diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 640441a2f926..dd40c75011d2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -512,6 +512,7 @@ struct nft_set_elem_expr { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -533,6 +534,7 @@ struct nft_set_elem_expr { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length @@ -541,6 +543,7 @@ struct nft_set_elem_expr { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -562,7 +565,8 @@ struct nft_set { struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -583,6 +587,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); @@ -596,7 +605,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net, struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); -void *nft_set_catchall_gc(const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { @@ -813,62 +821,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); -/** - * struct nft_set_gc_batch_head - nf_tables set garbage collection batch - * - * @rcu: rcu head - * @set: set the elements belong to - * @cnt: count of elements - */ -struct nft_set_gc_batch_head { - struct rcu_head rcu; - const struct nft_set *set; - unsigned int cnt; -}; - -#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ - sizeof(struct nft_set_gc_batch_head)) / \ - sizeof(void *)) - -/** - * struct nft_set_gc_batch - nf_tables set garbage collection batch - * - * @head: GC batch head - * @elems: garbage collection elements - */ -struct nft_set_gc_batch { - struct nft_set_gc_batch_head head; - void *elems[NFT_SET_GC_BATCH_SIZE]; -}; - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp); -void nft_set_gc_batch_release(struct rcu_head *rcu); - -static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) -{ - if (gcb != NULL) - call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); -} - -static inline struct nft_set_gc_batch * -nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, - gfp_t gfp) -{ - if (gcb != NULL) { - if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) - return gcb; - nft_set_gc_batch_complete(gcb); - } - return nft_set_gc_batch_alloc(set, gfp); -} - -static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, - void *elem) -{ - gcb->elems[gcb->head.cnt++] = elem; -} - struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type @@ -1557,39 +1509,30 @@ static inline void nft_set_elem_change_active(const struct net *net, #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ -/* - * We use a free bit in the genmask field to indicate the element - * is busy, meaning it is currently being processed either by - * the netlink API or GC. - * - * Even though the genmask is only a single byte wide, this works - * because the extension structure if fully constant once initialized, - * so there are no non-atomic write accesses unless it is already - * marked busy. - */ -#define NFT_SET_ELEM_BUSY_MASK (1 << 2) +#define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT 2 +#define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif -static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) +static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); - return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); } -static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; - clear_bit(NFT_SET_ELEM_BUSY_BIT, word); + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** @@ -1732,6 +1675,38 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u8 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1758,6 +1733,8 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; unsigned int base_seq; + unsigned int gc_seq; + u8 validate_state; }; extern unsigned int nf_tables_net_id; diff --git a/include/net/netlink.h b/include/net/netlink.h index b12cd957abb4..8a7cd1170e1f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -375,12 +375,11 @@ struct nla_policy { #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } -#define __NLA_IS_UINT_TYPE(tp) \ - (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64) +#define __NLA_IS_UINT_TYPE(tp) \ + (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ + tp == NLA_U64 || tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) -#define __NLA_IS_BEINT_TYPE(tp) \ - (tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ @@ -394,7 +393,6 @@ struct nla_policy { #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ - __NLA_IS_BEINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ @@ -402,8 +400,6 @@ struct nla_policy { tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) -#define NLA_ENSURE_BEINT_TYPE(tp) \ - (__NLA_ENSURE(__NLA_IS_BEINT_TYPE(tp)) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f00374718159..7a41c4791536 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -152,7 +152,7 @@ struct netns_ipv4 { u8 sysctl_tcp_abort_on_overflow; u8 sysctl_tcp_fack; /* obsolete */ int sysctl_tcp_max_reordering; - int sysctl_tcp_adv_win_scale; + int sysctl_tcp_adv_win_scale; /* obsolete */ u8 sysctl_tcp_dsack; u8 sysctl_tcp_app_win; u8 sysctl_tcp_frto; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 8c77832d0240..cc8060c017d5 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -2,8 +2,6 @@ #ifndef _NETNS_NFTABLES_H_ #define _NETNS_NFTABLES_H_ -#include <linux/list.h> - struct netns_nftables { u8 gencursor; }; diff --git a/include/net/p8022.h b/include/net/p8022.h index b690ffcad66b..a29e224ac498 100644 --- a/include/net/p8022.h +++ b/include/net/p8022.h @@ -13,7 +13,4 @@ register_8022_client(unsigned char type, struct packet_type *pt, struct net_device *orig_dev)); void unregister_8022_client(struct datalink_proto *proto); - -struct datalink_proto *make_8023_client(void); -void destroy_8023_client(struct datalink_proto *dl); #endif diff --git a/include/net/page_pool.h b/include/net/page_pool.h deleted file mode 100644 index 126f9e294389..000000000000 --- a/include/net/page_pool.h +++ /dev/null @@ -1,402 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * page_pool.h - * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> - * Copyright (C) 2016 Red Hat, Inc. - */ - -/** - * DOC: page_pool allocator - * - * This page_pool allocator is optimized for the XDP mode that - * uses one-frame-per-page, but have fallbacks that act like the - * regular page allocator APIs. - * - * Basic use involve replacing alloc_pages() calls with the - * page_pool_alloc_pages() call. Drivers should likely use - * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). - * - * API keeps track of in-flight pages, in-order to let API user know - * when it is safe to dealloactor page_pool object. Thus, API users - * must make sure to call page_pool_release_page() when a page is - * "leaving" the page_pool. Or call page_pool_put_page() where - * appropiate. For maintaining correct accounting. - * - * API user must only call page_pool_put_page() once on a page, as it - * will either recycle the page, or in case of elevated refcnt, it - * will release the DMA mapping and in-flight state accounting. We - * hope to lift this requirement in the future. - */ -#ifndef _NET_PAGE_POOL_H -#define _NET_PAGE_POOL_H - -#include <linux/mm.h> /* Needed by ptr_ring */ -#include <linux/ptr_ring.h> -#include <linux/dma-direction.h> - -#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA - * map/unmap - */ -#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets - * from page_pool will be - * DMA-synced-for-device according to - * the length provided by the device - * driver. - * Please note DMA-sync-for-CPU is still - * device driver responsibility - */ -#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ - PP_FLAG_DMA_SYNC_DEV |\ - PP_FLAG_PAGE_FRAG) - -/* - * Fast allocation side cache array/stack - * - * The cache size and refill watermark is related to the network - * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX - * ring is usually refilled and the max consumed elements will be 64, - * thus a natural max size of objects needed in the cache. - * - * Keeping room for more objects, is due to XDP_DROP use-case. As - * XDP_DROP allows the opportunity to recycle objects directly into - * this array, as it shares the same softirq/NAPI protection. If - * cache is already full (or partly full) then the XDP_DROP recycles - * would have to take a slower code path. - */ -#define PP_ALLOC_CACHE_SIZE 128 -#define PP_ALLOC_CACHE_REFILL 64 -struct pp_alloc_cache { - u32 count; - struct page *cache[PP_ALLOC_CACHE_SIZE]; -}; - -struct page_pool_params { - unsigned int flags; - unsigned int order; - unsigned int pool_size; - int nid; /* Numa node id to allocate from pages from */ - struct device *dev; /* device, for DMA pre-mapping purposes */ - struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */ - enum dma_data_direction dma_dir; /* DMA mapping direction */ - unsigned int max_len; /* max DMA sync memory size */ - unsigned int offset; /* DMA addr offset */ - void (*init_callback)(struct page *page, void *arg); - void *init_arg; -}; - -#ifdef CONFIG_PAGE_POOL_STATS -struct page_pool_alloc_stats { - u64 fast; /* fast path allocations */ - u64 slow; /* slow-path order 0 allocations */ - u64 slow_high_order; /* slow-path high order allocations */ - u64 empty; /* failed refills due to empty ptr ring, forcing - * slow path allocation - */ - u64 refill; /* allocations via successful refill */ - u64 waive; /* failed refills due to numa zone mismatch */ -}; - -struct page_pool_recycle_stats { - u64 cached; /* recycling placed page in the cache. */ - u64 cache_full; /* cache was full */ - u64 ring; /* recycling placed page back into ptr ring */ - u64 ring_full; /* page was released from page-pool because - * PTR ring was full. - */ - u64 released_refcnt; /* page released because of elevated - * refcnt - */ -}; - -/* This struct wraps the above stats structs so users of the - * page_pool_get_stats API can pass a single argument when requesting the - * stats for the page pool. - */ -struct page_pool_stats { - struct page_pool_alloc_stats alloc_stats; - struct page_pool_recycle_stats recycle_stats; -}; - -int page_pool_ethtool_stats_get_count(void); -u8 *page_pool_ethtool_stats_get_strings(u8 *data); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); - -/* - * Drivers that wish to harvest page pool stats and report them to users - * (perhaps via ethtool, debugfs, or another mechanism) can allocate a - * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. - */ -bool page_pool_get_stats(struct page_pool *pool, - struct page_pool_stats *stats); -#else - -static inline int page_pool_ethtool_stats_get_count(void) -{ - return 0; -} - -static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) -{ - return data; -} - -static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) -{ - return data; -} - -#endif - -struct page_pool { - struct page_pool_params p; - - struct delayed_work release_dw; - void (*disconnect)(void *); - unsigned long defer_start; - unsigned long defer_warn; - - u32 pages_state_hold_cnt; - unsigned int frag_offset; - struct page *frag_page; - long frag_users; - -#ifdef CONFIG_PAGE_POOL_STATS - /* these stats are incremented while in softirq context */ - struct page_pool_alloc_stats alloc_stats; -#endif - u32 xdp_mem_id; - - /* - * Data structure for allocation side - * - * Drivers allocation side usually already perform some kind - * of resource protection. Piggyback on this protection, and - * require driver to protect allocation side. - * - * For NIC drivers this means, allocate a page_pool per - * RX-queue. As the RX-queue is already protected by - * Softirq/BH scheduling and napi_schedule. NAPI schedule - * guarantee that a single napi_struct will only be scheduled - * on a single CPU (see napi_schedule). - */ - struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; - - /* Data structure for storing recycled pages. - * - * Returning/freeing pages is more complicated synchronization - * wise, because free's can happen on remote CPUs, with no - * association with allocation resource. - * - * Use ptr_ring, as it separates consumer and producer - * effeciently, it a way that doesn't bounce cache-lines. - * - * TODO: Implement bulk return pages into this structure. - */ - struct ptr_ring ring; - -#ifdef CONFIG_PAGE_POOL_STATS - /* recycle stats are per-cpu to avoid locking */ - struct page_pool_recycle_stats __percpu *recycle_stats; -#endif - atomic_t pages_state_release_cnt; - - /* A page_pool is strictly tied to a single RX-queue being - * protected by NAPI, due to above pp_alloc_cache. This - * refcnt serves purpose is to simplify drivers error handling. - */ - refcount_t user_cnt; - - u64 destroy_cnt; -}; - -struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); - -static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) -{ - gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); - - return page_pool_alloc_pages(pool, gfp); -} - -struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, - unsigned int size, gfp_t gfp); - -static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, - unsigned int *offset, - unsigned int size) -{ - gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); - - return page_pool_alloc_frag(pool, offset, size, gfp); -} - -/* get the stored dma direction. A driver might decide to treat this locally and - * avoid the extra cache line from page_pool to determine the direction - */ -static -inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) -{ - return pool->p.dma_dir; -} - -bool page_pool_return_skb_page(struct page *page, bool napi_safe); - -struct page_pool *page_pool_create(const struct page_pool_params *params); - -struct xdp_mem_info; - -#ifdef CONFIG_PAGE_POOL -void page_pool_unlink_napi(struct page_pool *pool); -void page_pool_destroy(struct page_pool *pool); -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem); -void page_pool_release_page(struct page_pool *pool, struct page *page); -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count); -#else -static inline void page_pool_unlink_napi(struct page_pool *pool) -{ -} - -static inline void page_pool_destroy(struct page_pool *pool) -{ -} - -static inline void page_pool_use_xdp_mem(struct page_pool *pool, - void (*disconnect)(void *), - struct xdp_mem_info *mem) -{ -} -static inline void page_pool_release_page(struct page_pool *pool, - struct page *page) -{ -} - -static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) -{ -} -#endif - -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, - bool allow_direct); - -/* pp_frag_count represents the number of writers who can update the page - * either by updating skb->data or via DMA mappings for the device. - * We can't rely on the page refcnt for that as we don't know who might be - * holding page references and we can't reliably destroy or sync DMA mappings - * of the fragments. - * - * When pp_frag_count reaches 0 we can either recycle the page if the page - * refcnt is 1 or return it back to the memory allocator and destroy any - * mappings we have. - */ -static inline void page_pool_fragment_page(struct page *page, long nr) -{ - atomic_long_set(&page->pp_frag_count, nr); -} - -static inline long page_pool_defrag_page(struct page *page, long nr) -{ - long ret; - - /* If nr == pp_frag_count then we have cleared all remaining - * references to the page. No need to actually overwrite it, instead - * we can leave this to be overwritten by the calling function. - * - * The main advantage to doing this is that an atomic_read is - * generally a much cheaper operation than an atomic update, - * especially when dealing with a page that may be partitioned - * into only 2 or 3 pieces. - */ - if (atomic_long_read(&page->pp_frag_count) == nr) - return 0; - - ret = atomic_long_sub_return(nr, &page->pp_frag_count); - WARN_ON(ret < 0); - return ret; -} - -static inline bool page_pool_is_last_frag(struct page_pool *pool, - struct page *page) -{ - /* If fragments aren't enabled or count is 0 we were the last user */ - return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || - (page_pool_defrag_page(page, 1) == 0); -} - -static inline void page_pool_put_page(struct page_pool *pool, - struct page *page, - unsigned int dma_sync_size, - bool allow_direct) -{ - /* When page_pool isn't compiled-in, net/core/xdp.c doesn't - * allow registering MEM_TYPE_PAGE_POOL, but shield linker. - */ -#ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(pool, page)) - return; - - page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); -#endif -} - -/* Same as above but will try to sync the entire area pool->max_len */ -static inline void page_pool_put_full_page(struct page_pool *pool, - struct page *page, bool allow_direct) -{ - page_pool_put_page(pool, page, -1, allow_direct); -} - -/* Same as above but the caller must guarantee safe context. e.g NAPI */ -static inline void page_pool_recycle_direct(struct page_pool *pool, - struct page *page) -{ - page_pool_put_full_page(pool, page, true); -} - -#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ - (sizeof(dma_addr_t) > sizeof(unsigned long)) - -static inline dma_addr_t page_pool_get_dma_addr(struct page *page) -{ - dma_addr_t ret = page->dma_addr; - - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; - - return ret; -} - -static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) -{ - page->dma_addr = addr; - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - page->dma_addr_upper = upper_32_bits(addr); -} - -static inline bool is_page_pool_compiled_in(void) -{ -#ifdef CONFIG_PAGE_POOL - return true; -#else - return false; -#endif -} - -static inline bool page_pool_put(struct page_pool *pool) -{ - return refcount_dec_and_test(&pool->user_cnt); -} - -/* Caller must provide appropriate safe context, e.g. NAPI. */ -void page_pool_update_nid(struct page_pool *pool, int new_nid); -static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) -{ - if (unlikely(pool->p.nid != new_nid)) - page_pool_update_nid(pool, new_nid); -} - -#endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h new file mode 100644 index 000000000000..94231533a369 --- /dev/null +++ b/include/net/page_pool/helpers.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool/helpers.h + * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * The page_pool allocator is optimized for the XDP mode that + * uses one frame per-page, but it can fallback on the + * regular page allocator APIs. + * + * Basic use involves replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * API keeps track of in-flight pages, in order to let API user know + * when it is safe to free a page_pool object. Thus, API users + * must call page_pool_put_page() to free the page, or attach + * the page to a page_pool-aware objects like skbs marked with + * skb_mark_for_recycle(). + * + * API user must call page_pool_put_page() once on a page, as it + * will either recycle the page, or in case of refcnt > 1, it will + * release the DMA mapping and in-flight state accounting. + */ +#ifndef _NET_PAGE_POOL_HELPERS_H +#define _NET_PAGE_POOL_HELPERS_H + +#include <net/page_pool/types.h> + +#ifdef CONFIG_PAGE_POOL_STATS +int page_pool_ethtool_stats_get_count(void); +u8 *page_pool_ethtool_stats_get_strings(u8 *data); +u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); + +/* + * Drivers that wish to harvest page pool stats and report them to users + * (perhaps via ethtool, debugfs, or another mechanism) can allocate a + * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. + */ +bool page_pool_get_stats(struct page_pool *pool, + struct page_pool_stats *stats); +#else +static inline int page_pool_ethtool_stats_get_count(void) +{ + return 0; +} + +static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) +{ + return data; +} + +static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +{ + return data; +} +#endif + +/** + * page_pool_dev_alloc_pages() - allocate a page. + * @pool: pool from which to allocate + * + * Get a page from the page allocator or page_pool caches. + */ +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_frag(pool, offset, size, gfp); +} + +/** + * page_pool_get_dma_dir() - Retrieve the stored DMA direction. + * @pool: pool from which page was allocated + * + * Get the stored dma direction. A driver might decide to store this locally + * and avoid the extra cache line from page_pool to determine the direction. + */ +static +inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) +{ + return pool->p.dma_dir; +} + +/* pp_frag_count represents the number of writers who can update the page + * either by updating skb->data or via DMA mappings for the device. + * We can't rely on the page refcnt for that as we don't know who might be + * holding page references and we can't reliably destroy or sync DMA mappings + * of the fragments. + * + * When pp_frag_count reaches 0 we can either recycle the page if the page + * refcnt is 1 or return it back to the memory allocator and destroy any + * mappings we have. + */ +static inline void page_pool_fragment_page(struct page *page, long nr) +{ + atomic_long_set(&page->pp_frag_count, nr); +} + +static inline long page_pool_defrag_page(struct page *page, long nr) +{ + long ret; + + /* If nr == pp_frag_count then we have cleared all remaining + * references to the page. No need to actually overwrite it, instead + * we can leave this to be overwritten by the calling function. + * + * The main advantage to doing this is that an atomic_read is + * generally a much cheaper operation than an atomic update, + * especially when dealing with a page that may be partitioned + * into only 2 or 3 pieces. + */ + if (atomic_long_read(&page->pp_frag_count) == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON(ret < 0); + return ret; +} + +static inline bool page_pool_is_last_frag(struct page_pool *pool, + struct page *page) +{ + /* If fragments aren't enabled or count is 0 we were the last user */ + return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || + (page_pool_defrag_page(page, 1) == 0); +} + +/** + * page_pool_put_page() - release a reference to a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @dma_sync_size: how much of the page may have been touched by the device + * @allow_direct: released by the consumer, allow lockless caching + * + * The outcome of this depends on the page refcnt. If the driver bumps + * the refcnt > 1 this will unmap the page. If the page refcnt is 1 + * the allocator owns the page and will try to recycle it in one of the pool + * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device + * using dma_sync_single_range_for_device(). + */ +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size, + bool allow_direct) +{ + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. + */ +#ifdef CONFIG_PAGE_POOL + if (!page_pool_is_last_frag(pool, page)) + return; + + page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); +#endif +} + +/** + * page_pool_put_full_page() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @allow_direct: released by the consumer, allow lockless caching + * + * Similar to page_pool_put_page(), but will DMA sync the entire memory area + * as configured in &page_pool_params.max_len. + */ +static inline void page_pool_put_full_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + page_pool_put_page(pool, page, -1, allow_direct); +} + +/** + * page_pool_recycle_direct() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * + * Similar to page_pool_put_full_page() but caller must guarantee safe context + * (e.g NAPI), since it will recycle the page directly into the pool fast cache. + */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + page_pool_put_full_page(pool, page, true); +} + +#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) + +/** + * page_pool_get_dma_addr() - Retrieve the stored DMA address. + * @page: page allocated from a page pool + * + * Fetch the DMA address of the page. The page pool to which the page belongs + * must had been created with PP_FLAG_DMA_MAP. + */ +static inline dma_addr_t page_pool_get_dma_addr(struct page *page) +{ + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; + + return ret; +} + +static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + page->dma_addr = addr; + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + page->dma_addr_upper = upper_32_bits(addr); +} + +static inline bool page_pool_put(struct page_pool *pool) +{ + return refcount_dec_and_test(&pool->user_cnt); +} + +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} + +#endif /* _NET_PAGE_POOL_HELPERS_H */ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h new file mode 100644 index 000000000000..887e7946a597 --- /dev/null +++ b/include/net/page_pool/types.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NET_PAGE_POOL_TYPES_H +#define _NET_PAGE_POOL_TYPES_H + +#include <linux/dma-direction.h> +#include <linux/ptr_ring.h> + +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. + * Please note DMA-sync-for-CPU is still + * device driver responsibility + */ +#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ + PP_FLAG_DMA_SYNC_DEV |\ + PP_FLAG_PAGE_FRAG) + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. + */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + struct page *cache[PP_ALLOC_CACHE_SIZE]; +}; + +/** + * struct page_pool_params - page pool parameters + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG + * @order: 2^order pages on allocation + * @pool_size: size of the ptr_ring + * @nid: NUMA node id to allocate from pages from + * @dev: device, for DMA pre-mapping purposes + * @napi: NAPI which is the sole consumer of pages, otherwise NULL + * @dma_dir: DMA mapping direction + * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV + * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV + */ +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; +/* private: used by test code only */ + void (*init_callback)(struct page *page, void *arg); + void *init_arg; +}; + +#ifdef CONFIG_PAGE_POOL_STATS +/** + * struct page_pool_alloc_stats - allocation statistics + * @fast: successful fast path allocations + * @slow: slow path order-0 allocations + * @slow_high_order: slow path high order allocations + * @empty: ptr ring is empty, so a slow path allocation was forced + * @refill: an allocation which triggered a refill of the cache + * @waive: pages obtained from the ptr ring that cannot be added to + * the cache due to a NUMA mismatch + */ +struct page_pool_alloc_stats { + u64 fast; + u64 slow; + u64 slow_high_order; + u64 empty; + u64 refill; + u64 waive; +}; + +/** + * struct page_pool_recycle_stats - recycling (freeing) statistics + * @cached: recycling placed page in the page pool cache + * @cache_full: page pool cache was full + * @ring: page placed into the ptr ring + * @ring_full: page released from page pool because the ptr ring was full + * @released_refcnt: page released (and not recycled) because refcnt > 1 + */ +struct page_pool_recycle_stats { + u64 cached; + u64 cache_full; + u64 ring; + u64 ring_full; + u64 released_refcnt; +}; + +/** + * struct page_pool_stats - combined page pool use statistics + * @alloc_stats: see struct page_pool_alloc_stats + * @recycle_stats: see struct page_pool_recycle_stats + * + * Wrapper struct for combining page pool stats with different storage + * requirements. + */ +struct page_pool_stats { + struct page_pool_alloc_stats alloc_stats; + struct page_pool_recycle_stats recycle_stats; +}; +#endif + +struct page_pool { + struct page_pool_params p; + + long frag_users; + struct page *frag_page; + unsigned int frag_offset; + u32 pages_state_hold_cnt; + + struct delayed_work release_dw; + void (*disconnect)(void *pool); + unsigned long defer_start; + unsigned long defer_warn; + +#ifdef CONFIG_PAGE_POOL_STATS + /* these stats are incremented while in softirq context */ + struct page_pool_alloc_stats alloc_stats; +#endif + u32 xdp_mem_id; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * efficiently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; + +#ifdef CONFIG_PAGE_POOL_STATS + /* recycle stats are per-cpu to avoid locking */ + struct page_pool_recycle_stats __percpu *recycle_stats; +#endif + atomic_t pages_state_release_cnt; + + /* A page_pool is strictly tied to a single RX-queue being + * protected by NAPI, due to above pp_alloc_cache. This + * refcnt serves purpose is to simplify drivers error handling. + */ + refcount_t user_cnt; + + u64 destroy_cnt; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp); +struct page_pool *page_pool_create(const struct page_pool_params *params); + +struct xdp_mem_info; + +#ifdef CONFIG_PAGE_POOL +void page_pool_unlink_napi(struct page_pool *pool); +void page_pool_destroy(struct page_pool *pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + struct xdp_mem_info *mem); +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count); +#else +static inline void page_pool_unlink_napi(struct page_pool *pool) +{ +} + +static inline void page_pool_destroy(struct page_pool *pool) +{ +} + +static inline void page_pool_use_xdp_mem(struct page_pool *pool, + void (*disconnect)(void *), + struct xdp_mem_info *mem) +{ +} + +static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ +} +#endif + +void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); + +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid); + +#endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a2ea45c7b53e..f308e8268651 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -138,19 +138,6 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block) return NULL; } -static inline -int tc_setup_cb_block_register(struct tcf_block *block, flow_setup_cb_t *cb, - void *cb_priv) -{ - return 0; -} - -static inline -void tc_setup_cb_block_unregister(struct tcf_block *block, flow_setup_cb_t *cb, - void *cb_priv) -{ -} - static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, @@ -866,6 +853,7 @@ struct tc_htb_qopt_offload { u32 parent_classid; u16 classid; u16 qid; + u32 quantum; u64 rate; u64 ceil; u8 prio; diff --git a/include/net/route.h b/include/net/route.h index 5a5c726472bd..51a45b1887b5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -163,12 +163,12 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, } static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4, - struct sock *sk, + const struct sock *sk, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) { - flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, + flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); @@ -298,10 +298,10 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, { __u8 flow_flags = 0; - if (inet_sk(sk)->transparent) + if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; - flowi4_init_output(fl4, oif, sk->sk_mark, ip_sock_rt_tos(sk), + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); } @@ -309,7 +309,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { struct net *net = sock_net(sk); struct rtable *rt; @@ -330,7 +330,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt, __be16 orig_sport, __be16 orig_dport, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { if (sport != orig_sport || dport != orig_dport) { fl4->fl4_dport = dport; diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index d9076a7a430c..6506221c5fe3 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -190,8 +190,8 @@ int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr * int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh); -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr); +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr); struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e92f73bb3198..f232512505f8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -599,6 +599,7 @@ get_default_qdisc_ops(const struct net_device *dev, int ntx) struct Qdisc_class_common { u32 classid; + unsigned int filter_cnt; struct hlist_node hnode; }; @@ -633,6 +634,31 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) return NULL; } +static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl) +{ + return cl->filter_cnt > 0; +} + +static inline void qdisc_class_get(struct Qdisc_class_common *cl) +{ + unsigned int res; + + if (check_add_overflow(cl->filter_cnt, 1, &res)) + WARN(1, "Qdisc class overflow"); + + cl->filter_cnt = res; +} + +static inline void qdisc_class_put(struct Qdisc_class_common *cl) +{ + unsigned int res; + + if (check_sub_overflow(cl->filter_cnt, 1, &res)) + WARN(1, "Qdisc class underflow"); + + cl->filter_cnt = res; +} + static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) { u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; @@ -703,7 +729,7 @@ int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 2a67100b2a17..a2310fa995f6 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -148,8 +148,6 @@ void sctp_icmp_redirect(struct sock *, struct sctp_transport *, void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t); -void sctp_backlog_migrate(struct sctp_association *assoc, - struct sock *oldsk, struct sock *newsk); int sctp_transport_hashtable_init(void); void sctp_transport_hashtable_destroy(void); int sctp_hash_transport(struct sctp_transport *t); diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index f37c7a558d6d..64c42bd56bb2 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -156,7 +156,6 @@ sctp_state_fn_t sctp_sf_do_6_2_sack; sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. */ -__u8 sctp_get_chunk_type(struct sctp_chunk *chunk); const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event_type event_type, @@ -166,8 +165,6 @@ int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); -__u32 sctp_generate_verification_tag(void); -void sctp_populate_tie_tags(__u8 *cookie, __u32 curTag, __u32 hisTag); /* Prototypes for chunk-building functions. */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 5c72d1864dd6..5a24d6d8522a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1122,8 +1122,6 @@ void sctp_outq_free(struct sctp_outq*); void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); -void sctp_outq_restart(struct sctp_outq *); - void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); diff --git a/include/net/sock.h b/include/net/sock.h index 2eb916d1ff64..11d503417591 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1323,6 +1323,7 @@ struct proto { /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. + * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ @@ -1339,6 +1340,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; + unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ @@ -1420,6 +1422,12 @@ static inline bool sk_has_memory_pressure(const struct sock *sk) return sk->sk_prot->memory_pressure != NULL; } +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure && + !!READ_ONCE(*sk->sk_prot->memory_pressure); +} + static inline bool sk_under_memory_pressure(const struct sock *sk) { if (!sk->sk_prot->memory_pressure) @@ -1429,7 +1437,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; - return !!*sk->sk_prot->memory_pressure; + return !!READ_ONCE(*sk->sk_prot->memory_pressure); } static inline long @@ -1506,7 +1514,7 @@ proto_memory_pressure(struct proto *prot) { if (!prot->memory_pressure) return false; - return !!*prot->memory_pressure; + return !!READ_ONCE(*prot->memory_pressure); } @@ -2814,20 +2822,23 @@ sk_is_refcounted(struct sock *sk) * skb_steal_sock - steal a socket from an sk_buff * @skb: sk_buff to steal the socket from * @refcounted: is set to true if the socket is reference-counted + * @prefetched: is set to true if the socket was assigned from bpf */ static inline struct sock * -skb_steal_sock(struct sk_buff *skb, bool *refcounted) +skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched) { if (skb->sk) { struct sock *sk = skb->sk; *refcounted = true; - if (skb_sk_is_prefetched(skb)) + *prefetched = skb_sk_is_prefetched(skb); + if (*prefetched) *refcounted = sk_is_refcounted(sk); skb->destructor = NULL; skb->sk = NULL; return sk; } + *prefetched = false; *refcounted = false; return NULL; } diff --git a/include/net/switchdev.h b/include/net/switchdev.h index ca0312b78294..a43062d4c734 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -201,8 +201,6 @@ struct switchdev_obj_in_state_mrp { #define SWITCHDEV_OBJ_IN_STATE_MRP(OBJ) \ container_of((OBJ), struct switchdev_obj_in_state_mrp, obj) -typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); - struct switchdev_brport { struct net_device *dev; const void *ctx; @@ -231,6 +229,7 @@ enum switchdev_notifier_type { SWITCHDEV_BRPORT_OFFLOADED, SWITCHDEV_BRPORT_UNOFFLOADED, + SWITCHDEV_BRPORT_REPLAY, }; struct switchdev_notifier_info { @@ -299,6 +298,11 @@ void switchdev_bridge_port_unoffload(struct net_device *brport_dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack); void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, @@ -322,10 +326,6 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack); -void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining); - int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event, const struct switchdev_notifier_fdb_info *fdb_info, bool (*check_cb)(const struct net_device *dev), diff --git a/include/net/tcp.h b/include/net/tcp.h index 3a818fe1a8a5..91688d0dadcd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -322,7 +322,6 @@ int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); void tcp_remove_empty_skb(struct sock *sk); -int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, @@ -349,7 +348,6 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { @@ -605,7 +603,6 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); -void tcp_send_partial(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); @@ -623,7 +620,6 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk, struct sk_buff *skb); -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); @@ -1431,13 +1427,39 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); +static inline int __tcp_win_from_space(u8 scaling_ratio, int space) +{ + s64 scaled_space = (s64)space * scaling_ratio; + + return scaled_space >> TCP_RMEM_TO_WIN_SCALE; +} + static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale); + return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); +} + +/* inverse of __tcp_win_from_space() */ +static inline int __tcp_space_from_win(u8 scaling_ratio, int win) +{ + u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; - return tcp_adv_win_scale <= 0 ? - (space>>(-tcp_adv_win_scale)) : - space - (space>>tcp_adv_win_scale); + do_div(val, scaling_ratio); + return val; +} + +static inline int tcp_space_from_win(const struct sock *sk, int win) +{ + return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); +} + +static inline void tcp_scaling_ratio_init(struct sock *sk) +{ + /* Assume a conservative default of 1200 bytes of payload per 4K page. + * This may be adjusted later in tcp_measure_rcv_mss(). + */ + tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) / + SKB_TRUESIZE(4096); } /* Note: caller must be prepared to deal with negative returns */ @@ -2008,7 +2030,7 @@ static inline bool inet_sk_transparent(const struct sock *sk) case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } - return inet_sk(sk)->transparent; + return inet_test_bit(TRANSPARENT, sk); } /* Determines whether this is a thin stream (which may suffer from @@ -2335,7 +2357,6 @@ struct sk_msg; struct sk_psock; #ifdef CONFIG_BPF_SYSCALL -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/tcx.h b/include/net/tcx.h new file mode 100644 index 000000000000..264f147953ba --- /dev/null +++ b/include/net/tcx.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_TCX_H +#define __NET_TCX_H + +#include <linux/bpf.h> +#include <linux/bpf_mprog.h> + +#include <net/sch_generic.h> + +struct mini_Qdisc; + +struct tcx_entry { + struct mini_Qdisc __rcu *miniq; + struct bpf_mprog_bundle bundle; + bool miniq_active; + struct rcu_head rcu; +}; + +struct tcx_link { + struct bpf_link link; + struct net_device *dev; + u32 location; +}; + +static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) +{ +#ifdef CONFIG_NET_XGRESS + skb->tc_at_ingress = ingress; +#endif +} + +#ifdef CONFIG_NET_XGRESS +static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) +{ + struct bpf_mprog_bundle *bundle = entry->parent; + + return container_of(bundle, struct tcx_entry, bundle); +} + +static inline struct tcx_link *tcx_link(struct bpf_link *link) +{ + return container_of(link, struct tcx_link, link); +} + +static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link) +{ + return tcx_link((struct bpf_link *)link); +} + +void tcx_inc(void); +void tcx_dec(void); + +static inline void tcx_entry_sync(void) +{ + /* bpf_mprog_entry got a/b swapped, therefore ensure that + * there are no inflight users on the old one anymore. + */ + synchronize_rcu(); +} + +static inline void +tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, + bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + rcu_assign_pointer(dev->tcx_ingress, entry); + else + rcu_assign_pointer(dev->tcx_egress, entry); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch(struct net_device *dev, bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + return rcu_dereference_rtnl(dev->tcx_ingress); + else + return rcu_dereference_rtnl(dev->tcx_egress); +} + +static inline struct bpf_mprog_entry *tcx_entry_create(void) +{ + struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL); + + if (tcx) { + bpf_mprog_bundle_init(&tcx->bundle); + return &tcx->bundle.a; + } + return NULL; +} + +static inline void tcx_entry_free(struct bpf_mprog_entry *entry) +{ + kfree_rcu(tcx_entry(entry), rcu); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) +{ + struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); + + *created = false; + if (!entry) { + entry = tcx_entry_create(); + if (!entry) + return NULL; + *created = true; + } + return entry; +} + +static inline void tcx_skeys_inc(bool ingress) +{ + tcx_inc(); + if (ingress) + net_inc_ingress_queue(); + else + net_inc_egress_queue(); +} + +static inline void tcx_skeys_dec(bool ingress) +{ + if (ingress) + net_dec_ingress_queue(); + else + net_dec_egress_queue(); + tcx_dec(); +} + +static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, + const bool active) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active = active; +} + +static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; +} + +static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, + int code) +{ + switch (code) { + case TCX_PASS: + skb->tc_index = qdisc_skb_cb(skb)->tc_classid; + fallthrough; + case TCX_DROP: + case TCX_REDIRECT: + return code; + case TCX_NEXT: + default: + return TCX_NEXT; + } +} +#endif /* CONFIG_NET_XGRESS */ + +#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL) +int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +void tcx_uninstall(struct net_device *dev, bool ingress); + +int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ + ASSERT_RTNL(); + tcx_uninstall(dev, true); + tcx_uninstall(dev, false); +} +#else +static inline int tcx_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ +} +#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ +#endif /* __NET_TCX_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 5e71dd3df8ca..a2b44578dcb7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -51,16 +51,6 @@ struct tls_rec; -struct tls_cipher_size_desc { - unsigned int iv; - unsigned int key; - unsigned int salt; - unsigned int tag; - unsigned int rec_seq; -}; - -extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; - /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) @@ -69,10 +59,6 @@ extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) -#define TLS_RECORD_TYPE_ALERT 0x15 -#define TLS_RECORD_TYPE_HANDSHAKE 0x16 -#define TLS_RECORD_TYPE_DATA 0x17 - #define TLS_AAD_SPACE_SIZE 13 #define MAX_IV_SIZE 16 diff --git a/include/net/tls_prot.h b/include/net/tls_prot.h new file mode 100644 index 000000000000..68a40756440b --- /dev/null +++ b/include/net/tls_prot.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. + * + * TLS Protocol definitions + * + * From https://www.iana.org/assignments/tls-parameters/tls-parameters.xhtml + */ + +#ifndef _TLS_PROT_H +#define _TLS_PROT_H + +/* + * TLS Record protocol: ContentType + */ +enum { + TLS_RECORD_TYPE_CHANGE_CIPHER_SPEC = 20, + TLS_RECORD_TYPE_ALERT = 21, + TLS_RECORD_TYPE_HANDSHAKE = 22, + TLS_RECORD_TYPE_DATA = 23, + TLS_RECORD_TYPE_HEARTBEAT = 24, + TLS_RECORD_TYPE_TLS12_CID = 25, + TLS_RECORD_TYPE_ACK = 26, +}; + +/* + * TLS Alert protocol: AlertLevel + */ +enum { + TLS_ALERT_LEVEL_WARNING = 1, + TLS_ALERT_LEVEL_FATAL = 2, +}; + +/* + * TLS Alert protocol: AlertDescription + */ +enum { + TLS_ALERT_DESC_CLOSE_NOTIFY = 0, + TLS_ALERT_DESC_UNEXPECTED_MESSAGE = 10, + TLS_ALERT_DESC_BAD_RECORD_MAC = 20, + TLS_ALERT_DESC_RECORD_OVERFLOW = 22, + TLS_ALERT_DESC_HANDSHAKE_FAILURE = 40, + TLS_ALERT_DESC_BAD_CERTIFICATE = 42, + TLS_ALERT_DESC_UNSUPPORTED_CERTIFICATE = 43, + TLS_ALERT_DESC_CERTIFICATE_REVOKED = 44, + TLS_ALERT_DESC_CERTIFICATE_EXPIRED = 45, + TLS_ALERT_DESC_CERTIFICATE_UNKNOWN = 46, + TLS_ALERT_DESC_ILLEGAL_PARAMETER = 47, + TLS_ALERT_DESC_UNKNOWN_CA = 48, + TLS_ALERT_DESC_ACCESS_DENIED = 49, + TLS_ALERT_DESC_DECODE_ERROR = 50, + TLS_ALERT_DESC_DECRYPT_ERROR = 51, + TLS_ALERT_DESC_TOO_MANY_CIDS_REQUESTED = 52, + TLS_ALERT_DESC_PROTOCOL_VERSION = 70, + TLS_ALERT_DESC_INSUFFICIENT_SECURITY = 71, + TLS_ALERT_DESC_INTERNAL_ERROR = 80, + TLS_ALERT_DESC_INAPPROPRIATE_FALLBACK = 86, + TLS_ALERT_DESC_USER_CANCELED = 90, + TLS_ALERT_DESC_MISSING_EXTENSION = 109, + TLS_ALERT_DESC_UNSUPPORTED_EXTENSION = 110, + TLS_ALERT_DESC_UNRECOGNIZED_NAME = 112, + TLS_ALERT_DESC_BAD_CERTIFICATE_STATUS_RESPONSE = 113, + TLS_ALERT_DESC_UNKNOWN_PSK_IDENTITY = 115, + TLS_ALERT_DESC_CERTIFICATE_REQUIRED = 116, + TLS_ALERT_DESC_NO_APPLICATION_PROTOCOL = 120, +}; + +#endif /* _TLS_PROT_H */ diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index d27b1caf3753..1a97e3f32029 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -33,8 +33,6 @@ void udplitev6_exit(void); int tcpv6_init(void); void tcpv6_exit(void); -int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); - /* this does all the common and the specific ctl work */ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb); diff --git a/include/net/udp.h b/include/net/udp.h index 4d13424f8f72..488a6d2babcc 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -273,9 +273,6 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); -int udp_get_port(struct sock *sk, unsigned short snum, - int (*saddr_cmp)(const struct sock *, - const struct sock *)); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); @@ -529,7 +526,6 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb) #ifdef CONFIG_BPF_SYSCALL struct sk_psock; -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); #endif diff --git a/include/net/udplite.h b/include/net/udplite.h index 299c14ce2bb9..bd33ff2b8f42 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -81,6 +81,4 @@ static inline __wsum udplite_csum(struct sk_buff *skb) } void udplite4_register(void); -int udplite_get_port(struct sock *sk, unsigned short snum, - int (*scmp)(const struct sock *, const struct sock *)); #endif /* _UDPLITE_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 1648240c9668..6a9f8a5f387c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -556,12 +556,12 @@ static inline void vxlan_flag_attr_error(int attrtype, } static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, - int hash, + u32 hash, struct vxlan_rdst *rdst) { struct fib_nh_common *nhc; - nhc = nexthop_path_fdb_result(nh, hash); + nhc = nexthop_path_fdb_result(nh, hash >> 1); if (unlikely(!nhc)) return false; diff --git a/include/net/xdp.h b/include/net/xdp.h index d1c5381fc95f..de08c8e0d134 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -6,9 +6,10 @@ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ -#include <linux/skbuff.h> /* skb_shared_info */ -#include <uapi/linux/netdev.h> #include <linux/bitfield.h> +#include <linux/filter.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> /* skb_shared_info */ /** * DOC: XDP RX-queue information @@ -45,8 +46,6 @@ enum xdp_mem_type { MEM_TYPE_MAX, }; -typedef u32 xdp_features_t; - /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH @@ -443,6 +442,12 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; +struct xdp_metadata_ops { + int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); + int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type); +}; + #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); @@ -474,4 +479,20 @@ static inline void xdp_clear_features_flag(struct net_device *dev) xdp_set_features_flag(dev, 0); } +static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, + struct xdp_buff *xdp) +{ + /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus + * under local_bh_disable(), which provides the needed RCU protection + * for accessing map entries. + */ + u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + + if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { + if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) + act = xdp_master_redirect(xdp); + } + + return act; +} #endif /* __LINUX_NET_XDP_H__ */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e96a1151ec75..1617af380162 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -52,6 +52,7 @@ struct xdp_sock { struct xsk_buff_pool *pool; u16 queue_id; bool zc; + bool sg; enum { XSK_READY = 0, XSK_BOUND, @@ -67,6 +68,12 @@ struct xdp_sock { u64 rx_dropped; u64 rx_queue_full; + /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current + * packet, the partially built skb is saved here so that packet building can resume in next + * call of __xsk_generic_xmit(). + */ + struct sk_buff *skb; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index c243f906ebed..1f6fc8c7a84c 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -89,6 +89,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return !xp_mb_desc(desc); +} + /* Returns as many entries as possible up to max. 0 <= N <= max. */ static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { @@ -103,10 +108,45 @@ static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + struct list_head *xskb_list = &xskb->pool->xskb_list; + struct xdp_buff_xsk *pos, *tmp; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_del(&pos->xskb_list_node); + xp_free(pos); + } + + xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; +out: xp_free(xskb); } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + + list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff *ret = NULL; + struct xdp_buff_xsk *frag; + + frag = list_first_entry_or_null(&xskb->pool->xskb_list, + struct xdp_buff_xsk, xskb_list_node); + if (frag) { + list_del(&frag->xskb_list_node); + ret = &frag->xdp; + } + + return ret; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; @@ -241,6 +281,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } +static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +{ + return false; +} + static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return 0; @@ -255,6 +300,15 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 151ca95dd08d..363c7d510554 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1984,6 +1984,7 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(x); xso->dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); } } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index a8d7b8a3688a..b0bdff26fc88 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,6 +29,7 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; + struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) @@ -54,6 +55,7 @@ struct xsk_buff_pool { struct xdp_umem *umem; struct work_struct work; struct list_head free_list; + struct list_head xskb_list; u32 heads_cnt; u16 queue_id; @@ -184,6 +186,11 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } +static inline bool xp_mb_desc(struct xdp_desc *desc) +{ + return desc->options & XDP_PKT_CONTD; +} + static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) { return addr & pool->chunk_mask; |