summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorPaolo Abeni <pabeni@redhat.com>2025-05-13 12:13:05 +0300
committerPaolo Abeni <pabeni@redhat.com>2025-05-13 12:14:46 +0300
commitac4d1baf97fdaa6ef789273f0fd485c0d0d6f100 (patch)
tree36eee849396486fd2c4ef1465faf2252d8249f99 /include
parente39d14a760c039af0653e3df967e7525413924a0 (diff)
parent2f1a805f32ba37545209a7ddbf0845ac8802dfe9 (diff)
downloadlinux-ac4d1baf97fdaa6ef789273f0fd485c0d0d6f100.tar.xz
Merge branch 'device-memory-tcp-tx'
Mina Almasry says: ==================== Device memory TCP TX The TX path had been dropped from the Device Memory TCP patch series post RFCv1 [1], to make that series slightly easier to review. This series rebases the implementation of the TX path on top of the net_iov/netmem framework agreed upon and merged. The motivation for the feature is thoroughly described in the docs & cover letter of the original proposal, so I don't repeat the lengthy descriptions here, but they are available in [1]. Full outline on usage of the TX path is detailed in the documentation included with this series. Test example is available via the kselftest included in the series as well. The series is relatively small, as the TX path for this feature largely piggybacks on the existing MSG_ZEROCOPY implementation. Patch Overview: --------------- 1. Documentation & tests to give high level overview of the feature being added. 1. Add netmem refcounting needed for the TX path. 2. Devmem TX netlink API. 3. Devmem TX net stack implementation. 4. Make dma-buf unbinding scheduled work to handle TX cases where it gets freed from contexts where we can't sleep. 5. Add devmem TX documentation. 6. Add scaffolding enabling driver support for netmem_tx. Add helpers, driver feature flag, and docs to enable drivers to declare netmem_tx support. 7. Guard netmem_tx against being enabled against drivers that don't support it. 8. Add devmem_tx selftests. Add TX path to ncdevmem and add a test to devmem.py. Testing: -------- Testing is very similar to devmem TCP RX path. The ncdevmem test used for the RX path is now augemented with client functionality to test TX path. * Test Setup: Kernel: net-next with this RFC and memory provider API cherry-picked locally. Hardware: Google Cloud A3 VMs. NIC: GVE with header split & RSS & flow steering support. Performance results are not included with this version, unfortunately. I'm having issues running the dma-buf exporter driver against the upstream kernel on my test setup. The issues are specific to that dma-buf exporter and do not affect this patch series. I plan to follow up this series with perf fixes if the tests point to issues once they're up and running. Special thanks to Stan who took a stab at rebasing the TX implementation on top of the netmem/net_iov framework merged. Parts of his proposal [2] that are reused as-is are forked off into their own patches to give full credit. [1] https://lore.kernel.org/netdev/20240909054318.1809580-1-almasrymina@google.com/ [2] https://lore.kernel.org/netdev/20240913150913.1280238-2-sdf@fomichev.me/T/#m066dd407fbed108828e2c40ae50e3f4376ef57fd Cc: sdf@fomichev.me Cc: asml.silence@gmail.com Cc: dw@davidwei.uk Cc: Jamal Hadi Salim <jhs@mojatatu.com> Cc: Victor Nogueira <victor@mojatatu.com> Cc: Pedro Tammela <pctammela@mojatatu.com> Cc: Samiullah Khawaja <skhawaja@google.com> Cc: Kuniyuki Iwashima <kuniyu@amazon.com> v14: https://lore.kernel.org/netdev/20250429032645.363766-1-almasrymina@google.com/ v13: https://lore.kernel.org/netdev/20250425204743.617260-1-almasrymina@google.com/ v12: https://lore.kernel.org/netdev/20250423031117.907681-1-almasrymina@google.com/ v11: https://lore.kernel.org/netdev/20250423031117.907681-1-almasrymina@google.com/ v10: https://lore.kernel.org/netdev/20250417231540.2780723-1-almasrymina@google.com/ v9: https://lore.kernel.org/netdev/20250415224756.152002-1-almasrymina@google.com/ v8: https://lore.kernel.org/netdev/20250308214045.1160445-1-almasrymina@google.com/ v7: https://lore.kernel.org/netdev/20250227041209.2031104-1-almasrymina@google.com/ v6: https://lore.kernel.org/netdev/20250222191517.743530-1-almasrymina@google.com/ v5: https://lore.kernel.org/netdev/20250220020914.895431-1-almasrymina@google.com/ v4: https://lore.kernel.org/netdev/20250203223916.1064540-1-almasrymina@google.com/ v3: https://patchwork.kernel.org/project/netdevbpf/list/?series=929401&state=* RFC v2: https://patchwork.kernel.org/project/netdevbpf/list/?series=920056&state=* ==================== Link: https://patch.msgid.link/20250508004830.4100853-1-almasrymina@google.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'include')
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--include/linux/skbuff.h17
-rw-r--r--include/linux/skbuff_ref.h4
-rw-r--r--include/net/netmem.h34
-rw-r--r--include/net/sock.h1
-rw-r--r--include/uapi/linux/netdev.h1
6 files changed, 52 insertions, 7 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 773167508c82..32a1e41636a9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1772,6 +1772,7 @@ enum netdev_reg_state {
* @lltx: device supports lockless Tx. Deprecated for real HW
* drivers. Mainly used by logical interfaces, such as
* bonding and tunnels
+ * @netmem_tx: device support netmem_tx.
*
* @name: This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
@@ -2087,6 +2088,7 @@ struct net_device {
struct_group(priv_flags_fast,
unsigned long priv_flags:32;
unsigned long lltx:1;
+ unsigned long netmem_tx:1;
);
const struct net_device_ops *netdev_ops;
const struct header_ops *header_ops;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f3e72be6f634..c7397b17bb08 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg);
+ struct ubuf_info *uarg, bool devmem);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
+struct net_devmem_dmabuf_binding;
+
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
- size_t length);
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding);
int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
struct iov_iter *from, size_t length);
@@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
struct msghdr *msg, int len)
{
- return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+ return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
+ NULL);
}
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg);
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding);
/* Internal */
#define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
size_t offset, size_t size,
enum dma_data_direction dir)
{
+ if (skb_frag_is_net_iov(frag)) {
+ return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
+ frag->offset;
+ }
return dma_map_page(dev, skb_frag_page(frag),
skb_frag_off(frag) + offset, size, dir);
}
diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h
index 0f3c58007488..9e49372ef1a0 100644
--- a/include/linux/skbuff_ref.h
+++ b/include/linux/skbuff_ref.h
@@ -17,7 +17,7 @@
*/
static inline void __skb_frag_ref(skb_frag_t *frag)
{
- get_page(skb_frag_page(frag));
+ get_netmem(skb_frag_netmem(frag));
}
/**
@@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle)
if (recycle && napi_pp_put_page(netmem))
return;
#endif
- put_page(netmem_to_page(netmem));
+ put_netmem(netmem);
}
/**
diff --git a/include/net/netmem.h b/include/net/netmem.h
index c61d5b21e7b4..386164fb9c18 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -8,6 +8,7 @@
#ifndef _NET_NETMEM_H
#define _NET_NETMEM_H
+#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <net/net_debug.h>
@@ -20,8 +21,17 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
*/
#define NET_IOV 0x01UL
+enum net_iov_type {
+ NET_IOV_DMABUF,
+ NET_IOV_IOURING,
+
+ /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass.
+ */
+ NET_IOV_MAX = ULONG_MAX
+};
+
struct net_iov {
- unsigned long __unused_padding;
+ enum net_iov_type type;
unsigned long pp_magic;
struct page_pool *pp;
struct net_iov_area *owner;
@@ -264,4 +274,26 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
return __netmem_clear_lsb(netmem)->dma_addr;
}
+void get_netmem(netmem_ref netmem);
+void put_netmem(netmem_ref netmem);
+
+#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \
+ do { \
+ if (!netmem_is_net_iov(NETMEM)) \
+ dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \
+ else \
+ dma_unmap_addr_set(PTR, ADDR_NAME, 0); \
+ } while (0)
+
+static inline void netmem_dma_unmap_page_attrs(struct device *dev,
+ dma_addr_t addr, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ if (!addr)
+ return;
+
+ dma_unmap_page_attrs(dev, addr, size, dir, attrs);
+}
+
#endif /* _NET_NETMEM_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index f0fabb9fd28a..3e15d7105ad2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1851,6 +1851,7 @@ struct sockcm_cookie {
u32 tsflags;
u32 ts_opt_id;
u32 priority;
+ u32 dmabuf_id;
};
static inline void sockcm_init(struct sockcm_cookie *sockc,
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7600bf62dbdf..7eb9571786b8 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -219,6 +219,7 @@ enum {
NETDEV_CMD_QSTATS_GET,
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
+ NETDEV_CMD_BIND_TX,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)