summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2020-07-10 23:54:00 +0300
committerDavid S. Miller <davem@davemloft.net>2020-07-10 23:54:00 +0300
commit0ea460474d70d809eac0640c1cf408ec54e23966 (patch)
tree5c81ecec7badd61bc89d833bf60ed100c869b7a4 /include
parent8fb49c0109f47e4a25e8ba36abd8381afbfa7a08 (diff)
parentfb6f8970bd9e6ecce03fbe2453fe03592595ebc9 (diff)
downloadlinux-0ea460474d70d809eac0640c1cf408ec54e23966.tar.xz
Merge branch 'udp_tunnel-add-NIC-RX-port-offload-infrastructure'
Jakub Kicinski says: ==================== udp_tunnel: add NIC RX port offload infrastructure Kernel has a facility to notify drivers about the UDP tunnel ports so that devices can recognize tunneled packets. This is important mostly for RX - devices which don't support CHECKSUM_COMPLETE can report checksums of inner packets, and compute RSS over inner headers. Some drivers also match the UDP tunnel ports also for TX, although doing so may lead to false positives and negatives. Unfortunately the user experience when trying to take adavantage of these facilities is suboptimal. First of all there is no way for users to check which ports are offloaded. Many drivers resort to printing messages to aid debugging, other use debugfs. Even worse the availability of the RX features (NETIF_F_RX_UDP_TUNNEL_PORT) is established purely on the basis of the driver having the ndos installed. For most drivers, however, the ability to perform offloads is contingent on device capabilities (driver support multiple device and firmware versions). Unless driver resorts to hackish clearing of features set incorrectly by the core - users are left guessing whether their device really supports UDP tunnel port offload or not. There is currently no way to indicate or configure whether RX features include just the checksum offload or checksum and using inner headers for RSS. Many drivers default to not using inner headers for RSS because most implementations populate the source port with entropy from the inner headers. This, however, is not always the case, for example certain switches are only able to use a fixed source port during encapsulation. We have also seen many driver authors get the intricacies of UDP tunnel port offloads wrong. Most commonly the drivers forget to perform reference counting, or take sleeping locks in the callbacks. This work tries to improve the situation by pulling the UDP tunnel port table maintenance out of the drivers. It turns out that almost all drivers maintain a fixed size table of ports (in most cases one per tunnel type), so we can take care of all the refcounting in the core, and let the driver specify if they need to sleep in the callbacks or not. The new common implementation will also support replacing ports - when a port is removed from a full table it will try to find a previously missing port to take its place. This patch only implements the core functionality along with a few drivers I was hoping to test manually [1] along with a test based on a netdevsim implementation. Following patches will convert all the drivers. Once that's complete we can remove the ndos, and rely directly on the new infrastrucutre. Then after RSS (RXFH) is converted to netlink we can add the ability to configure the use of inner RSS headers for UDP tunnels. [1] Unfortunately I wasn't able to, turns out 2 of the devices I had access to were older generation or had old FW, and they did not actually support UDP tunnel port notifications (see the second paragraph). The thrid device appears to program the UDP ports correctly but it generates bad UDP checksums with or without these patches. Long story short - I'd appreciate reviews and testing here.. v4: - better build fix (hopefully this one does it..) v3: - fix build issue; - improve bnxt changes. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/debugfs.h12
-rw-r--r--include/linux/netdevice.h8
-rw-r--r--include/net/udp_tunnel.h164
-rw-r--r--include/uapi/linux/ethtool.h2
-rw-r--r--include/uapi/linux/ethtool_netlink.h55
5 files changed, 235 insertions, 6 deletions
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 63cb3606dea7..851dd1f9a8a5 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -38,6 +38,11 @@ struct debugfs_regset32 {
struct device *dev; /* Optional device for Runtime PM */
};
+struct debugfs_u32_array {
+ u32 *array;
+ u32 n_elements;
+};
+
extern struct dentry *arch_debugfs_dir;
#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \
@@ -136,7 +141,8 @@ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
int nregs, void __iomem *base, char *prefix);
void debugfs_create_u32_array(const char *name, umode_t mode,
- struct dentry *parent, u32 *array, u32 elements);
+ struct dentry *parent,
+ struct debugfs_u32_array *array);
struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
struct dentry *parent,
@@ -316,8 +322,8 @@ static inline bool debugfs_initialized(void)
}
static inline void debugfs_create_u32_array(const char *name, umode_t mode,
- struct dentry *parent, u32 *array,
- u32 elements)
+ struct dentry *parent,
+ struct debugfs_u32_array *array)
{
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 39e28e11863c..ac2cd3f49aba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -65,6 +65,8 @@ struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
+struct udp_tunnel_nic_info;
+struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;
@@ -1836,6 +1838,10 @@ enum netdev_priv_flags {
*
* @macsec_ops: MACsec offloading ops
*
+ * @udp_tunnel_nic_info: static structure describing the UDP tunnel
+ * offload capabilities of the device
+ * @udp_tunnel_nic: UDP tunnel offload state
+ *
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
@@ -2134,6 +2140,8 @@ struct net_device {
/* MACsec management functions */
const struct macsec_ops *macsec_ops;
#endif
+ const struct udp_tunnel_nic_info *udp_tunnel_nic_info;
+ struct udp_tunnel_nic *udp_tunnel_nic;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index e7312ceb2794..dd20ce99740c 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -106,15 +106,16 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
* call this function to perform Tx offloads on outgoing traffic.
*/
enum udp_parsable_tunnel_type {
- UDP_TUNNEL_TYPE_VXLAN, /* RFC 7348 */
- UDP_TUNNEL_TYPE_GENEVE, /* draft-ietf-nvo3-geneve */
- UDP_TUNNEL_TYPE_VXLAN_GPE, /* draft-ietf-nvo3-vxlan-gpe */
+ UDP_TUNNEL_TYPE_VXLAN = BIT(0), /* RFC 7348 */
+ UDP_TUNNEL_TYPE_GENEVE = BIT(1), /* draft-ietf-nvo3-geneve */
+ UDP_TUNNEL_TYPE_VXLAN_GPE = BIT(2), /* draft-ietf-nvo3-vxlan-gpe */
};
struct udp_tunnel_info {
unsigned short type;
sa_family_t sa_family;
__be16 port;
+ u8 hw_priv;
};
/* Notify network devices of offloadable types */
@@ -181,4 +182,161 @@ static inline void udp_tunnel_encap_enable(struct socket *sock)
udp_encap_enable();
}
+#define UDP_TUNNEL_NIC_MAX_TABLES 4
+
+enum udp_tunnel_nic_info_flags {
+ /* Device callbacks may sleep */
+ UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0),
+ /* Device only supports offloads when it's open, all ports
+ * will be removed before close and re-added after open.
+ */
+ UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1),
+ /* Device supports only IPv4 tunnels */
+ UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2),
+};
+
+/**
+ * struct udp_tunnel_nic_info - driver UDP tunnel offload information
+ * @set_port: callback for adding a new port
+ * @unset_port: callback for removing a port
+ * @sync_table: callback for syncing the entire port table at once
+ * @flags: device flags from enum udp_tunnel_nic_info_flags
+ * @tables: UDP port tables this device has
+ * @tables.n_entries: number of entries in this table
+ * @tables.tunnel_types: types of tunnels this table accepts
+ *
+ * Drivers are expected to provide either @set_port and @unset_port callbacks
+ * or the @sync_table callback. Callbacks are invoked with rtnl lock held.
+ *
+ * Known limitations:
+ * - UDP tunnel port notifications are fundamentally best-effort -
+ * it is likely the driver will both see skbs which use a UDP tunnel port,
+ * while not being a tunneled skb, and tunnel skbs from other ports -
+ * drivers should only use these ports for non-critical RX-side offloads,
+ * e.g. the checksum offload;
+ * - none of the devices care about the socket family at present, so we don't
+ * track it. Please extend this code if you care.
+ */
+struct udp_tunnel_nic_info {
+ /* one-by-one */
+ int (*set_port)(struct net_device *dev,
+ unsigned int table, unsigned int entry,
+ struct udp_tunnel_info *ti);
+ int (*unset_port)(struct net_device *dev,
+ unsigned int table, unsigned int entry,
+ struct udp_tunnel_info *ti);
+
+ /* all at once */
+ int (*sync_table)(struct net_device *dev, unsigned int table);
+
+ unsigned int flags;
+
+ struct udp_tunnel_nic_table_info {
+ unsigned int n_entries;
+ unsigned int tunnel_types;
+ } tables[UDP_TUNNEL_NIC_MAX_TABLES];
+};
+
+/* UDP tunnel module dependencies
+ *
+ * Tunnel drivers are expected to have a hard dependency on the udp_tunnel
+ * module. NIC drivers are not, they just attach their
+ * struct udp_tunnel_nic_info to the netdev and wait for callbacks to come.
+ * Loading a tunnel driver will cause the udp_tunnel module to be loaded
+ * and only then will all the required state structures be allocated.
+ * Since we want a weak dependency from the drivers and the core to udp_tunnel
+ * we call things through the following stubs.
+ */
+struct udp_tunnel_nic_ops {
+ void (*get_port)(struct net_device *dev, unsigned int table,
+ unsigned int idx, struct udp_tunnel_info *ti);
+ void (*set_port_priv)(struct net_device *dev, unsigned int table,
+ unsigned int idx, u8 priv);
+ void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti);
+ void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti);
+ void (*reset_ntf)(struct net_device *dev);
+
+ size_t (*dump_size)(struct net_device *dev, unsigned int table);
+ int (*dump_write)(struct net_device *dev, unsigned int table,
+ struct sk_buff *skb);
+};
+
+#ifdef CONFIG_INET
+extern const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops;
+#else
+#define udp_tunnel_nic_ops ((struct udp_tunnel_nic_ops *)NULL)
+#endif
+
+static inline void
+udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
+ unsigned int idx, struct udp_tunnel_info *ti)
+{
+ /* This helper is used from .sync_table, we indicate empty entries
+ * by zero'ed @ti. Drivers which need to know the details of a port
+ * when it gets deleted should use the .set_port / .unset_port
+ * callbacks.
+ * Zero out here, otherwise !CONFIG_INET causes uninitilized warnings.
+ */
+ memset(ti, 0, sizeof(*ti));
+
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->get_port(dev, table, idx, ti);
+}
+
+static inline void
+udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
+ unsigned int idx, u8 priv)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
+}
+
+static inline void
+udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->add_port(dev, ti);
+}
+
+static inline void
+udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->del_port(dev, ti);
+}
+
+/**
+ * udp_tunnel_nic_reset_ntf() - device-originating reset notification
+ * @dev: network interface device structure
+ *
+ * Called by the driver to inform the core that the entire UDP tunnel port
+ * state has been lost, usually due to device reset. Core will assume device
+ * forgot all the ports and issue .set_port and .sync_table callbacks as
+ * necessary.
+ *
+ * This function must be called with rtnl lock held, and will issue all
+ * the callbacks before returning.
+ */
+static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->reset_ntf(dev);
+}
+
+static inline size_t
+udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
+{
+ if (!udp_tunnel_nic_ops)
+ return 0;
+ return udp_tunnel_nic_ops->dump_size(dev, table);
+}
+
+static inline int
+udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
+ struct sk_buff *skb)
+{
+ if (!udp_tunnel_nic_ops)
+ return 0;
+ return udp_tunnel_nic_ops->dump_write(dev, table, skb);
+}
#endif
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 60856e0f9618..b4f2d134e713 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -669,6 +669,7 @@ enum ethtool_link_ext_substate_cable_issue {
* @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags
* @ETH_SS_TS_TX_TYPES: timestamping Tx types
* @ETH_SS_TS_RX_FILTERS: timestamping Rx filters
+ * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
*/
enum ethtool_stringset {
ETH_SS_TEST = 0,
@@ -686,6 +687,7 @@ enum ethtool_stringset {
ETH_SS_SOF_TIMESTAMPING,
ETH_SS_TS_TX_TYPES,
ETH_SS_TS_RX_FILTERS,
+ ETH_SS_UDP_TUNNEL_TYPES,
/* add new constants above here */
ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index c12ce4df4b6b..5dcd24cb33ea 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -41,6 +41,7 @@ enum {
ETHTOOL_MSG_TSINFO_GET,
ETHTOOL_MSG_CABLE_TEST_ACT,
ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
+ ETHTOOL_MSG_TUNNEL_INFO_GET,
/* add new constants above here */
__ETHTOOL_MSG_USER_CNT,
@@ -556,6 +557,60 @@ enum {
ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1
};
+/* TUNNEL INFO */
+
+enum {
+ ETHTOOL_UDP_TUNNEL_TYPE_VXLAN,
+ ETHTOOL_UDP_TUNNEL_TYPE_GENEVE,
+ ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE,
+
+ __ETHTOOL_UDP_TUNNEL_TYPE_CNT
+};
+
+enum {
+ ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC,
+
+ ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */
+ ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */
+
+ /* add new constants above here */
+ __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT,
+ ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1)
+};
+
+enum {
+ ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC,
+
+ ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */
+ ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */
+ ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */
+
+ /* add new constants above here */
+ __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT,
+ ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1)
+};
+
+enum {
+ ETHTOOL_A_TUNNEL_UDP_UNSPEC,
+
+ ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */
+
+ /* add new constants above here */
+ __ETHTOOL_A_TUNNEL_UDP_CNT,
+ ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1)
+};
+
+enum {
+ ETHTOOL_A_TUNNEL_INFO_UNSPEC,
+ ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */
+
+ ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */
+
+ /* add new constants above here */
+ __ETHTOOL_A_TUNNEL_INFO_CNT,
+ ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1)
+};
+
/* generic netlink info */
#define ETHTOOL_GENL_NAME "ethtool"
#define ETHTOOL_GENL_VERSION 1