path: root/net/core/netdev_rx_queue.c
author     Jakub Kicinski <kuba@kernel.org>    2026-04-10 04:24:34 +0300
committer  Jakub Kicinski <kuba@kernel.org>    2026-04-10 04:24:35 +0300
commit     15089225889ba4b29f0263757cd66932fa676cb0 (patch)
tree       73b8cc252fcebbafad57f5b100c2f774eb7a42c1 /net/core/netdev_rx_queue.c
parent     b6e39e48469e37057fce27a1b87cf6d3e456aa42 (diff)
parent     65d657d806848add1e1f0632562d7f47d5d5c188 (diff)
Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'
Daniel Borkmann says:

====================
netkit: Support for io_uring zero-copy and AF_XDP

Containers use virtual netdevs to route traffic from a physical netdev
in the host namespace. They do not have access to the physical netdev
in the host and thus cannot use memory providers or AF_XDP, which
require reconfiguring/restarting queues in the physical netdev.

This patchset adds the concept of queue leasing to virtual netdevs,
which allows containers to use memory providers and AF_XDP at native
speed. Leased queues are bound to a real queue in a physical netdev and
act as a proxy. Memory providers and AF_XDP operations take an ifindex
and queue id, so containers pass in the ifindex of a virtual netdev and
the queue id of a leased queue, and the operation is then proxied to
the underlying real queue.

We have implemented support for this concept in netkit and tested it
against Nvidia ConnectX-6 (mlx5) as well as Broadcom BCM957504
(bnxt_en) 100G NICs. For more details see the individual patches.
====================

Link: https://patch.msgid.link/20260402231031.447597-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
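To make the redirection concrete, here is a minimal caller-side sketch,
assuming only the helpers this merge adds in the diff below
(netif_rxq_is_leased(), netif_get_rx_queue_lease_locked(),
netif_put_rx_queue_lease_locked(), __netif_mp_open_rxq()).
open_mp_via_lease() itself is hypothetical and trims the range and
device-type checks that the real netif_mp_open_rxq() performs:

  /* Hypothetical helper, for illustration only. */
  static int open_mp_via_lease(struct net_device *dev, unsigned int rxq_idx,
                               const struct pp_memory_provider_params *p,
                               struct netlink_ext_ack *extack)
  {
          struct net_device *orig_dev = dev;
          int ret;

          /* No lease: configure the memory provider directly. */
          if (!netif_rxq_is_leased(dev, rxq_idx))
                  return __netif_mp_open_rxq(dev, rxq_idx, p, extack);

          /* Follow the lease: dev/rxq_idx now name the physical queue,
           * and the lock on the physical netdev is held.
           */
          if (!netif_get_rx_queue_lease_locked(&dev, &rxq_idx))
                  return -EBUSY;

          ret = __netif_mp_open_rxq(dev, rxq_idx, p, extack);
          netif_put_rx_queue_lease_locked(orig_dev, dev);
          return ret;
  }

The same pattern, resolve the lease, operate on the physical queue,
drop the extra lock, is what netif_mp_open_rxq() and
netif_mp_close_rxq() implement in the diff below.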
Diffstat (limited to 'net/core/netdev_rx_queue.c')
-rw-r--r--  net/core/netdev_rx_queue.c  |  202
1 file changed, 174 insertions(+), 28 deletions(-)
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 05fd2875d725..469319451ba2 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -10,15 +10,109 @@
#include "dev.h"
#include "page_pool_priv.h"
-/* See also page_pool_is_unreadable() */
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
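+/* Cross-link a virtual rx queue (rxq_dst) with the physical queue it
+ * leases (rxq_src), pinning the physical netdev for the lifetime of
+ * the lease.
+ */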
+void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
+                           struct netdev_rx_queue *rxq_src)
+{
+        netdev_assert_locked(rxq_src->dev);
+        netdev_assert_locked(rxq_dst->dev);
+
+        netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL);
+
+        WRITE_ONCE(rxq_src->lease, rxq_dst);
+        WRITE_ONCE(rxq_dst->lease, rxq_src);
+}
+
+void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
+                             struct netdev_rx_queue *rxq_src)
+{
+        netdev_assert_locked(rxq_dst->dev);
+        netdev_assert_locked(rxq_src->dev);
+
+        netif_rxq_cleanup_unlease(rxq_src, rxq_dst);
+
+        WRITE_ONCE(rxq_src->lease, NULL);
+        WRITE_ONCE(rxq_dst->lease, NULL);
+
+        netdev_put(rxq_src->dev, &rxq_src->lease_tracker);
+}
+
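+/* True if the queue at rxq_idx is currently cross-linked by a lease,
+ * on either side of it.
+ */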
+bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx)
+{
+        if (rxq_idx < dev->real_num_rx_queues)
+                return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease);
+        return false;
+}
+
+/* Virtual devices eligible for leasing have no dev->dev.parent, while
+ * physical devices always have one. Use this to enforce the correct
+ * lease traversal direction.
+ */
+static bool netif_lease_dir_ok(const struct net_device *dev,
+                               enum netif_lease_dir dir)
 {
-        struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
+        if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent)
+                return true;
+        if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent)
+                return true;
+        return false;
+}
 
-        return !!rxq->mp_params.mp_ops;
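+/* Follow a lease, if present: rewrite *dev and *rxq_idx to refer to
+ * the peer queue on the other side. Returns NULL if the traversal
+ * direction does not match the device type.
+ */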
+struct netdev_rx_queue *
+__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx,
+                           enum netif_lease_dir dir)
+{
+        struct net_device *orig_dev = *dev;
+        struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx);
+
+        if (rxq->lease) {
+                if (!netif_lease_dir_ok(orig_dev, dir))
+                        return NULL;
+                rxq = rxq->lease;
+                *rxq_idx = get_netdev_rx_queue_index(rxq);
+                *dev = rxq->dev;
+        }
+        return rxq;
+}
+
+struct netdev_rx_queue *
+netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx)
+{
+        struct net_device *orig_dev = *dev;
+        struct netdev_rx_queue *rxq;
+
+        /* Locking order is always from the virtual to the physical
+         * device; see netdev_nl_queue_create_doit().
+         */
+        netdev_ops_assert_locked(orig_dev);
+        rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS);
+        if (rxq && orig_dev != *dev)
+                netdev_lock(*dev);
+        return rxq;
+}
+
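+/* Counterpart to netif_get_rx_queue_lease_locked(): drop the lock
+ * taken on the peer netdev when the lease redirected us to it.
+ */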
+void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
+                                     struct net_device *dev)
+{
+        if (orig_dev != dev)
+                netdev_unlock(dev);
+}
+
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx)
+{
+        if (rxq_idx < dev->real_num_rx_queues)
+                return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops;
+        return false;
 }
 EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
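 
+/* Unlike netif_rxq_has_unreadable_mp(), this is true whenever any
+ * memory provider state (mp_priv) is installed on the queue.
+ */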
+bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx)
+{
+        if (rxq_idx < dev->real_num_rx_queues)
+                return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv;
+        return false;
+}
+
 static int netdev_rx_queue_reconfig(struct net_device *dev,
                                     unsigned int rxq_idx,
                                     struct netdev_queue_config *qcfg_old,
@@ -108,9 +202,9 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
 }
 EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
 
-int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
-                      const struct pp_memory_provider_params *p,
-                      struct netlink_ext_ack *extack)
+static int __netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+                               const struct pp_memory_provider_params *p,
+                               struct netlink_ext_ack *extack)
 {
         const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
         struct netdev_queue_config qcfg[2];
@@ -120,12 +214,6 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
         if (!qops)
                 return -EOPNOTSUPP;
 
-        if (rxq_idx >= dev->real_num_rx_queues) {
-                NL_SET_ERR_MSG(extack, "rx queue index out of range");
-                return -ERANGE;
-        }
-        rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
-
         if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
                 NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
                 return -EINVAL;
@@ -172,27 +260,48 @@ err_clear_mp:
         return ret;
 }
 
-int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
-                    struct pp_memory_provider_params *p)
+int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+                      const struct pp_memory_provider_params *p,
+                      struct netlink_ext_ack *extack)
 {
+        struct net_device *orig_dev = dev;
         int ret;
 
-        netdev_lock(dev);
-        ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL);
-        netdev_unlock(dev);
+        if (!netdev_need_ops_lock(dev))
+                return -EOPNOTSUPP;
+
+        if (rxq_idx >= dev->real_num_rx_queues) {
+                NL_SET_ERR_MSG(extack, "rx queue index out of range");
+                return -ERANGE;
+        }
+        rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
+
+        if (!netif_rxq_is_leased(dev, rxq_idx))
+                return __netif_mp_open_rxq(dev, rxq_idx, p, extack);
+
+        if (!netif_get_rx_queue_lease_locked(&dev, &rxq_idx)) {
+                NL_SET_ERR_MSG(extack, "rx queue leased to a virtual netdev");
+                return -EBUSY;
+        }
+        if (!dev->dev.parent) {
+                NL_SET_ERR_MSG(extack, "rx queue belongs to a virtual netdev");
+                ret = -EOPNOTSUPP;
+                goto out;
+        }
+
+        ret = __netif_mp_open_rxq(dev, rxq_idx, p, extack);
+out:
+        netif_put_rx_queue_lease_locked(orig_dev, dev);
         return ret;
 }
-void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
-                        const struct pp_memory_provider_params *old_p)
+static void __netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+                                 const struct pp_memory_provider_params *old_p)
 {
         struct netdev_queue_config qcfg[2];
         struct netdev_rx_queue *rxq;
         int err;
 
-        if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
-                return;
-
         rxq = __netif_get_rx_queue(dev, ifq_idx);
 
         /* Callers holding a netdev ref may get here after we already
@@ -214,10 +323,47 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
         WARN_ON(err && err != -ENETDOWN);
 }
 
-void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
-                      struct pp_memory_provider_params *old_p)
+void netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+                        const struct pp_memory_provider_params *old_p)
 {
-        netdev_lock(dev);
-        __net_mp_close_rxq(dev, ifq_idx, old_p);
-        netdev_unlock(dev);
+        struct net_device *orig_dev = dev;
+
+        if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
+                return;
+        if (!netif_rxq_is_leased(dev, ifq_idx))
+                return __netif_mp_close_rxq(dev, ifq_idx, old_p);
+
+        if (WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &ifq_idx)))
+                return;
+
+        __netif_mp_close_rxq(dev, ifq_idx, old_p);
+        netif_put_rx_queue_lease_locked(orig_dev, dev);
+}
+
+void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq,
+                              const struct pp_memory_provider_params *p)
+{
+        if (p->mp_ops && p->mp_ops->uninstall)
+                p->mp_ops->uninstall(p->mp_priv, rxq);
+}
+
+/* Clean up memory provider state when a queue lease is torn down. If
+ * a memory provider was installed via the lease, close it now. The
+ * memory provider is a property of the queue itself, and the lease
+ * redirection guarantees that it was installed on the physical queue.
+ * The extra __netif_mp_close_rxq() is needed because the physical
+ * queue can outlive the virtual one in the lease case, so it must be
+ * reconfigured to clear the memory provider.
+ */
+void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq,
+                               struct netdev_rx_queue *virt_rxq)
+{
+        struct pp_memory_provider_params *p = &phys_rxq->mp_params;
+        unsigned int ifq_idx = get_netdev_rx_queue_index(phys_rxq);
+
+        if (!p->mp_ops)
+                return;
+
+        __netif_mp_uninstall_rxq(virt_rxq, p);
+        __netif_mp_close_rxq(phys_rxq->dev, ifq_idx, p);
 }
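
For context, a rough sketch of how a virtual-device driver could drive
a lease's lifecycle with the helpers above. This is illustrative only:
lease_cycle_sketch() is hypothetical, the real netkit plumbing (netlink
queue creation, queue allocation) lives in the other patches of this
series, and the lock order follows the comment in
netif_get_rx_queue_lease_locked(), virtual device first, then physical:

  /* Hypothetical sketch; error handling and queue setup omitted. */
  static void lease_cycle_sketch(struct netdev_rx_queue *virt_rxq,
                                 struct netdev_rx_queue *phys_rxq)
  {
          /* Establish: cross-link both queues and pin the physical
           * netdev via a reference tracked in lease_tracker.
           */
          netdev_lock(virt_rxq->dev);
          netdev_lock(phys_rxq->dev);
          netdev_rx_queue_lease(virt_rxq, phys_rxq);
          netdev_unlock(phys_rxq->dev);
          netdev_unlock(virt_rxq->dev);

          /* The leased queue now proxies memory provider and AF_XDP
           * setup to the physical queue.
           */

          /* Teardown: netif_rxq_cleanup_unlease() closes any memory
           * provider that was installed through the lease.
           */
          netdev_lock(virt_rxq->dev);
          netdev_lock(phys_rxq->dev);
          netdev_rx_queue_unlease(virt_rxq, phys_rxq);
          netdev_unlock(phys_rxq->dev);
          netdev_unlock(virt_rxq->dev);
  }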