summary refs log tree commit diff
diff options
context:
space:
mode:
author Jakub Kicinski <kuba@kernel.org> 2026-04-10 04:24:34 +0300
committer Jakub Kicinski <kuba@kernel.org> 2026-04-10 04:24:35 +0300
commit 15089225889ba4b29f0263757cd66932fa676cb0 (patch)
tree 73b8cc252fcebbafad57f5b100c2f774eb7a42c1
parent b6e39e48469e37057fce27a1b87cf6d3e456aa42 (diff)
parent 65d657d806848add1e1f0632562d7f47d5d5c188 (diff)
download linux-15089225889ba4b29f0263757cd66932fa676cb0.tar.xz
Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'

Daniel Borkmann says:

====================
netkit: Support for io_uring zero-copy and AF_XDP

Containers use virtual netdevs to route traffic from a physical netdev
in the host namespace. They do not have access to the physical netdev
in the host and thus can't use memory providers or AF_XDP that require
reconfiguring/restarting queues in the physical netdev.

This patchset adds the concept of queue leasing to virtual netdevs that
allow containers to use memory providers and AF_XDP at native speed.
Leased queues are bound to a real queue in a physical netdev and act
as a proxy.

Memory providers and AF_XDP operations take an ifindex and queue id,
so containers would pass in an ifindex for a virtual netdev and a queue
id of a leased queue, which then gets proxied to the underlying real
queue.

We have implemented support for this concept in netkit and tested the
latter against Nvidia ConnectX-6 (mlx5) as well as Broadcom BCM957504
(bnxt_en) 100G NICs.

For more details see the individual patches.
====================

Link: https://patch.msgid.link/20260402231031.447597-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- Documentation/netlink/specs/netdev.yaml | 46
-rw-r--r-- Documentation/netlink/specs/rt-link.yaml | 11
-rw-r--r-- Documentation/networking/netdevices.rst | 6
-rw-r--r-- drivers/net/netkit.c | 412
-rw-r--r-- include/linux/netdevice.h | 11
-rw-r--r-- include/net/netdev_queues.h | 23
-rw-r--r-- include/net/netdev_rx_queue.h | 29
-rw-r--r-- include/net/page_pool/memory_provider.h | 8
-rw-r--r-- include/uapi/linux/if_link.h | 6
-rw-r--r-- include/uapi/linux/netdev.h | 11
-rw-r--r-- io_uring/zcrx.c | 12
-rw-r--r-- net/core/dev.c | 18
-rw-r--r-- net/core/dev.h | 12
-rw-r--r-- net/core/devmem.c | 6
-rw-r--r-- net/core/netdev-genl-gen.c | 20
-rw-r--r-- net/core/netdev-genl-gen.h | 2
-rw-r--r-- net/core/netdev-genl.c | 238
-rw-r--r-- net/core/netdev_queues.c | 103
-rw-r--r-- net/core/netdev_rx_queue.c | 202
-rw-r--r-- net/ethtool/channels.c | 28
-rw-r--r-- net/ethtool/ioctl.c | 21
-rw-r--r-- net/xdp/xsk.c | 75
-rw-r--r-- tools/include/uapi/linux/netdev.h | 11
-rw-r--r-- tools/testing/selftests/drivers/net/hw/Makefile | 1
-rw-r--r-- tools/testing/selftests/drivers/net/hw/lib/py/__init__.py | 4
-rwxr-xr-x tools/testing/selftests/drivers/net/hw/nk_qlease.py | 1407
26 files changed, 2558 insertions, 165 deletions
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 596c306ce52b..b93beb247a11 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -339,6 +339,15 @@ attribute-sets:
doc: XSK information for this queue, if any.
type: nest
nested-attributes: xsk-info
+ -
+ name: lease
+ doc: |
+ A queue from a virtual device can have a lease which refers to
+ another queue from a physical device. This is useful for memory
+ providers and AF_XDP operations which take an ifindex and queue id
+ to allow applications to bind against virtual devices in containers.
+ type: nest
+ nested-attributes: lease
-
name: qstats
doc: |
@@ -538,6 +547,26 @@ attribute-sets:
-
name: type
-
+ name: lease
+ attributes:
+ -
+ name: ifindex
+ doc: The netdev ifindex to lease the queue from.
+ type: u32
+ checks:
+ min: 1
+ -
+ name: queue
+ doc: The netdev queue to lease from.
+ type: nest
+ nested-attributes: queue-id
+ -
+ name: netns-id
+ doc: The network namespace id of the netdev.
+ type: s32
+ checks:
+ min: 0
+ -
name: dmabuf
attributes:
-
@@ -686,6 +715,7 @@ operations:
- dmabuf
- io-uring
- xsk
+ - lease
dump:
request:
attributes:
@@ -797,6 +827,22 @@ operations:
reply:
attributes:
- id
+ -
+ name: queue-create
+ doc: |
+ Create a new queue for the given netdevice. Whether this operation
+ is supported depends on the device and the driver.
+ attribute-set: queue
+ flags: [admin-perm]
+ do:
+ request:
+ attributes:
+ - ifindex
+ - type
+ - lease
+ reply: &queue-create-op
+ attributes:
+ - id
kernel-family:
headers: ["net/netdev_netlink.h"]
diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml
index df4b56beb818..fcb5aaf0926f 100644
--- a/Documentation/netlink/specs/rt-link.yaml
+++ b/Documentation/netlink/specs/rt-link.yaml
@@ -826,6 +826,13 @@ definitions:
- name: none
- name: default
-
+ name: netkit-pairing
+ type: enum
+ enum-name: netkit-pairing
+ entries:
+ - name: pair
+ - name: single
+ -
name: ovpn-mode
enum-name: ovpn-mode
name-prefix: ovpn-mode
@@ -2299,6 +2306,10 @@ attribute-sets:
-
name: tailroom
type: u16
+ -
+ name: pairing
+ type: u32
+ enum: netkit-pairing
-
name: linkinfo-ovpn-attrs
name-prefix: ifla-ovpn-
diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 35704d115312..83e28b96884f 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -329,6 +329,12 @@ by setting ``request_ops_lock`` to true. Code comments and docs refer
to drivers which have ops called under the instance lock as "ops locked".
See also the documentation of the ``lock`` member of struct net_device.
+There is also a case of taking two per-netdev locks in sequence when netdev
+queues are leased, that is, the netdev-scope lock is taken for both the
+virtual and the physical device. To prevent deadlocks, the virtual device's
+lock must always be acquired before the physical device's (see
+``netdev_nl_queue_create_doit``).
+
In the future, there will be an option for individual
drivers to opt out of using ``rtnl_lock`` and instead perform their control
operations directly under the netdev instance lock.
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 5c0e01396e06..5619209329d5 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -9,11 +9,21 @@
#include <linux/bpf_mprog.h>
#include <linux/indirect_call_wrapper.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/xdp_sock_drv.h>
#include <net/netkit.h>
#include <net/dst.h>
#include <net/tcx.h>
-#define DRV_NAME "netkit"
+#define NETKIT_DRV_NAME "netkit"
+
+#define NETKIT_NUM_RX_QUEUES_MAX 1024
+#define NETKIT_NUM_TX_QUEUES_MAX 1
+
+#define NETKIT_NUM_RX_QUEUES_REAL 1
+#define NETKIT_NUM_TX_QUEUES_REAL 1
struct netkit {
__cacheline_group_begin(netkit_fastpath);
@@ -26,6 +36,7 @@ struct netkit {
__cacheline_group_begin(netkit_slowpath);
enum netkit_mode mode;
+ enum netkit_pairing pair;
bool primary;
u32 headroom;
__cacheline_group_end(netkit_slowpath);
@@ -36,6 +47,8 @@ struct netkit_link {
struct net_device *dev;
};
+static struct rtnl_link_ops netkit_link_ops;
+
static __always_inline int
netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
enum netkit_action ret)
@@ -135,6 +148,10 @@ static int netkit_open(struct net_device *dev)
struct netkit *nk = netkit_priv(dev);
struct net_device *peer = rtnl_dereference(nk->peer);
+ if (nk->pair == NETKIT_DEVICE_SINGLE) {
+ netif_carrier_on(dev);
+ return 0;
+ }
if (!peer)
return -ENOTCONN;
if (peer->flags & IFF_UP) {
@@ -194,16 +211,17 @@ static void netkit_set_headroom(struct net_device *dev, int headroom)
rcu_read_lock();
peer = rcu_dereference(nk->peer);
- if (unlikely(!peer))
- goto out;
-
- nk2 = netkit_priv(peer);
- nk->headroom = headroom;
- headroom = max(nk->headroom, nk2->headroom);
+ if (!peer) {
+ nk->headroom = headroom;
+ dev->needed_headroom = headroom;
+ } else {
+ nk2 = netkit_priv(peer);
+ nk->headroom = headroom;
+ headroom = max(nk->headroom, nk2->headroom);
- peer->needed_headroom = headroom;
- dev->needed_headroom = headroom;
-out:
+ peer->needed_headroom = headroom;
+ dev->needed_headroom = headroom;
+ }
rcu_read_unlock();
}
@@ -219,9 +237,96 @@ static void netkit_get_stats(struct net_device *dev,
stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
}
+static bool netkit_xsk_supported_at_phys(const struct net_device *dev)
+{
+ if (!dev->netdev_ops->ndo_bpf ||
+ !dev->netdev_ops->ndo_xdp_xmit ||
+ !dev->netdev_ops->ndo_xsk_wakeup)
+ return false;
+ return true;
+}
+
+static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
+{
+ struct netkit *nk = netkit_priv(dev);
+ struct netdev_bpf xdp_lower;
+ struct netdev_rx_queue *rxq;
+ struct net_device *phys;
+ bool create = false;
+ int ret = -EBUSY;
+
+ switch (xdp->command) {
+ case XDP_SETUP_XSK_POOL:
+ if (nk->pair == NETKIT_DEVICE_PAIR)
+ return -EOPNOTSUPP;
+ if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
+ return -EINVAL;
+
+ rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
+ if (!rxq->lease)
+ return -EOPNOTSUPP;
+
+ phys = rxq->lease->dev;
+ if (!netkit_xsk_supported_at_phys(phys))
+ return -EOPNOTSUPP;
+
+ create = xdp->xsk.pool;
+ memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
+ xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease);
+ break;
+ case XDP_SETUP_PROG:
+ return -EOPNOTSUPP;
+ default:
+ return -EINVAL;
+ }
+
+ netdev_lock(phys);
+ if (create &&
+ (phys->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ if (!create || !dev_get_min_mp_channel_count(phys))
+ ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
+out:
+ netdev_unlock(phys);
+ return ret;
+}
+
+static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
+{
+ struct netdev_rx_queue *rxq, *rxq_lease;
+ struct net_device *phys;
+
+ if (queue_id >= dev->real_num_rx_queues)
+ return -EINVAL;
+
+ rxq = __netif_get_rx_queue(dev, queue_id);
+ rxq_lease = READ_ONCE(rxq->lease);
+ if (unlikely(!rxq_lease))
+ return -EOPNOTSUPP;
+
+ /* netkit_xsk already validated full xsk support, hence it's
+ * fine to call into ndo_xsk_wakeup right away given this
+ * was a prerequisite to get here in the first place. The
+ * phys xsk support cannot change without tearing down the
+ * device (which clears the lease first).
+ */
+ phys = rxq_lease->dev;
+ return phys->netdev_ops->ndo_xsk_wakeup(phys,
+ get_netdev_rx_queue_index(rxq_lease), flags);
+}
+
+static int netkit_init(struct net_device *dev)
+{
+ netdev_lockdep_set_classes(dev);
+ return 0;
+}
+
static void netkit_uninit(struct net_device *dev);
static const struct net_device_ops netkit_netdev_ops = {
+ .ndo_init = netkit_init,
.ndo_open = netkit_open,
.ndo_stop = netkit_close,
.ndo_start_xmit = netkit_xmit,
@@ -232,19 +337,104 @@ static const struct net_device_ops netkit_netdev_ops = {
.ndo_get_peer_dev = netkit_peer_dev,
.ndo_get_stats64 = netkit_get_stats,
.ndo_uninit = netkit_uninit,
+ .ndo_bpf = netkit_xsk,
+ .ndo_xsk_wakeup = netkit_xsk_wakeup,
.ndo_features_check = passthru_features_check,
};
static void netkit_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
- strscpy(info->driver, DRV_NAME, sizeof(info->driver));
+ strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver));
}
static const struct ethtool_ops netkit_ethtool_ops = {
.get_drvinfo = netkit_get_drvinfo,
};
+static int netkit_queue_create(struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netkit *nk = netkit_priv(dev);
+ u32 rxq_count_old, rxq_count_new;
+ int err;
+
+ rxq_count_old = dev->real_num_rx_queues;
+ rxq_count_new = rxq_count_old + 1;
+
+ /* In paired mode, only the non-primary (peer) device can
+ * create leased queues since the primary is the management
+ * side. In single device mode, leasing is always allowed.
+ */
+ if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) {
+ NL_SET_ERR_MSG(extack,
+ "netkit can only lease against the peer device");
+ return -EOPNOTSUPP;
+ }
+
+ err = netif_set_real_num_rx_queues(dev, rxq_count_new);
+ if (err) {
+ if (rxq_count_new > dev->num_rx_queues)
+ NL_SET_ERR_MSG(extack,
+ "netkit maximum queue limit reached");
+ else
+ NL_SET_ERR_MSG_FMT(extack,
+ "netkit cannot create more queues err=%d", err);
+ return err;
+ }
+
+ return rxq_count_old;
+}
+
+static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = {
+ .ndo_queue_create = netkit_queue_create,
+};
+
+static struct net_device *netkit_alloc(struct nlattr *tb[],
+ const char *ifname,
+ unsigned char name_assign_type,
+ unsigned int num_tx_queues,
+ unsigned int num_rx_queues)
+{
+ const struct rtnl_link_ops *ops = &netkit_link_ops;
+ struct net_device *dev;
+
+ if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX ||
+ num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ dev = alloc_netdev_mqs(ops->priv_size, ifname,
+ name_assign_type, ops->setup,
+ num_tx_queues, num_rx_queues);
+ if (dev) {
+ dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL;
+ dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL;
+ }
+ return dev;
+}
+
+static void netkit_queue_unlease(struct net_device *dev)
+{
+ struct netdev_rx_queue *rxq, *rxq_lease;
+ struct net_device *dev_lease;
+ int i;
+
+ if (dev->real_num_rx_queues == 1)
+ return;
+
+ netdev_lock(dev);
+ for (i = 1; i < dev->real_num_rx_queues; i++) {
+ rxq = __netif_get_rx_queue(dev, i);
+ rxq_lease = rxq->lease;
+ dev_lease = rxq_lease->dev;
+
+ netdev_lock(dev_lease);
+ netdev_rx_queue_unlease(rxq, rxq_lease);
+ netdev_unlock(dev_lease);
+ }
+ netdev_unlock(dev);
+}
+
static void netkit_setup(struct net_device *dev)
{
static const netdev_features_t netkit_features_hw_vlan =
@@ -275,8 +465,9 @@ static void netkit_setup(struct net_device *dev)
dev->priv_flags |= IFF_DISABLE_NETPOLL;
dev->lltx = true;
- dev->ethtool_ops = &netkit_ethtool_ops;
- dev->netdev_ops = &netkit_netdev_ops;
+ dev->netdev_ops = &netkit_netdev_ops;
+ dev->ethtool_ops = &netkit_ethtool_ops;
+ dev->queue_mgmt_ops = &netkit_queue_mgmt_ops;
dev->features |= netkit_features;
dev->hw_features = netkit_features;
@@ -325,8 +516,6 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
return 0;
}
-static struct rtnl_link_ops netkit_link_ops;
-
static int netkit_new_link(struct net_device *dev,
struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
@@ -335,15 +524,17 @@ static int netkit_new_link(struct net_device *dev,
enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr;
+ enum netkit_pairing pair = NETKIT_DEVICE_PAIR;
enum netkit_action policy_prim = NETKIT_PASS;
enum netkit_action policy_peer = NETKIT_PASS;
+ bool seen_peer = false, seen_scrub = false;
struct nlattr **data = params->data;
enum netkit_mode mode = NETKIT_L3;
unsigned char ifname_assign_type;
struct nlattr **tb = params->tb;
u16 headroom = 0, tailroom = 0;
struct ifinfomsg *ifmp = NULL;
- struct net_device *peer;
+ struct net_device *peer = NULL;
char ifname[IFNAMSIZ];
struct netkit *nk;
int err;
@@ -380,6 +571,13 @@ static int netkit_new_link(struct net_device *dev,
headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
if (data[IFLA_NETKIT_TAILROOM])
tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
+ if (data[IFLA_NETKIT_PAIRING])
+ pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]);
+
+ seen_scrub = data[IFLA_NETKIT_SCRUB];
+ seen_peer = data[IFLA_NETKIT_PEER_INFO] ||
+ data[IFLA_NETKIT_PEER_SCRUB] ||
+ data[IFLA_NETKIT_PEER_POLICY];
}
if (ifmp && tbp[IFLA_IFNAME]) {
@@ -392,45 +590,47 @@ static int netkit_new_link(struct net_device *dev,
if (mode != NETKIT_L2 &&
(tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
return -EOPNOTSUPP;
+ if (pair == NETKIT_DEVICE_SINGLE &&
+ (tb != tbp || seen_peer || seen_scrub ||
+ policy_prim != NETKIT_PASS))
+ return -EOPNOTSUPP;
- peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
- &netkit_link_ops, tbp, extack);
- if (IS_ERR(peer))
- return PTR_ERR(peer);
-
- netif_inherit_tso_max(peer, dev);
- if (headroom) {
- peer->needed_headroom = headroom;
- dev->needed_headroom = headroom;
- }
- if (tailroom) {
- peer->needed_tailroom = tailroom;
- dev->needed_tailroom = tailroom;
- }
-
- if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
- eth_hw_addr_random(peer);
- if (ifmp && dev->ifindex)
- peer->ifindex = ifmp->ifi_index;
-
- nk = netkit_priv(peer);
- nk->primary = false;
- nk->policy = policy_peer;
- nk->scrub = scrub_peer;
- nk->mode = mode;
- nk->headroom = headroom;
- bpf_mprog_bundle_init(&nk->bundle);
+ if (pair == NETKIT_DEVICE_PAIR) {
+ peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
+ &netkit_link_ops, tbp, extack);
+ if (IS_ERR(peer))
+ return PTR_ERR(peer);
+
+ netif_inherit_tso_max(peer, dev);
+ if (headroom)
+ peer->needed_headroom = headroom;
+ if (tailroom)
+ peer->needed_tailroom = tailroom;
+ if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
+ eth_hw_addr_random(peer);
+ if (ifmp && dev->ifindex)
+ peer->ifindex = ifmp->ifi_index;
- err = register_netdevice(peer);
- if (err < 0)
- goto err_register_peer;
- netif_carrier_off(peer);
- if (mode == NETKIT_L2)
- dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+ nk = netkit_priv(peer);
+ nk->primary = false;
+ nk->policy = policy_peer;
+ nk->scrub = scrub_peer;
+ nk->mode = mode;
+ nk->pair = pair;
+ nk->headroom = headroom;
+ bpf_mprog_bundle_init(&nk->bundle);
+
+ err = register_netdevice(peer);
+ if (err < 0)
+ goto err_register_peer;
+ netif_carrier_off(peer);
+ if (mode == NETKIT_L2)
+ dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
- err = rtnl_configure_link(peer, NULL, 0, NULL);
- if (err < 0)
- goto err_configure_peer;
+ err = rtnl_configure_link(peer, NULL, 0, NULL);
+ if (err < 0)
+ goto err_configure_peer;
+ }
if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
eth_hw_addr_random(dev);
@@ -438,15 +638,23 @@ static int netkit_new_link(struct net_device *dev,
nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
else
strscpy(dev->name, "nk%d", IFNAMSIZ);
+ if (headroom)
+ dev->needed_headroom = headroom;
+ if (tailroom)
+ dev->needed_tailroom = tailroom;
nk = netkit_priv(dev);
nk->primary = true;
nk->policy = policy_prim;
nk->scrub = scrub_prim;
nk->mode = mode;
+ nk->pair = pair;
nk->headroom = headroom;
bpf_mprog_bundle_init(&nk->bundle);
+ if (pair == NETKIT_DEVICE_SINGLE)
+ xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK);
+
err = register_netdevice(dev);
if (err < 0)
goto err_configure_peer;
@@ -455,10 +663,12 @@ static int netkit_new_link(struct net_device *dev,
dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
rcu_assign_pointer(netkit_priv(dev)->peer, peer);
- rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+ if (peer)
+ rcu_assign_pointer(netkit_priv(peer)->peer, dev);
return 0;
err_configure_peer:
- unregister_netdevice(peer);
+ if (peer)
+ unregister_netdevice(peer);
return err;
err_register_peer:
free_netdev(peer);
@@ -518,6 +728,8 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi
nk = netkit_priv(dev);
if (!nk->primary)
return ERR_PTR(-EACCES);
+ if (nk->pair == NETKIT_DEVICE_SINGLE)
+ return ERR_PTR(-EOPNOTSUPP);
if (which == BPF_NETKIT_PEER) {
dev = rcu_dereference_rtnl(nk->peer);
if (!dev)
@@ -844,6 +1056,7 @@ static void netkit_release_all(struct net_device *dev)
static void netkit_uninit(struct net_device *dev)
{
netkit_release_all(dev);
+ netkit_queue_unlease(dev);
}
static void netkit_del_link(struct net_device *dev, struct list_head *head)
@@ -856,7 +1069,15 @@ static void netkit_del_link(struct net_device *dev, struct list_head *head)
if (peer) {
nk = netkit_priv(peer);
RCU_INIT_POINTER(nk->peer, NULL);
- unregister_netdevice_queue(peer, head);
+ /* Guard against the peer already being in an unregister
+ * list (e.g. same-namespace teardown where the peer is
+ * in the caller's dev_kill_list). list_move_tail() on an
+ * already-queued device would otherwise corrupt that
+ * list's iteration. This situation can occur via netkit
+ * notifier, hence guard against this scenario.
+ */
+ if (!unregister_netdevice_queued(peer))
+ unregister_netdevice_queue(peer, head);
}
}
@@ -879,6 +1100,7 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
{ IFLA_NETKIT_PEER_INFO, "peer info" },
{ IFLA_NETKIT_HEADROOM, "headroom" },
{ IFLA_NETKIT_TAILROOM, "tailroom" },
+ { IFLA_NETKIT_PAIRING, "pairing" },
};
if (!nk->primary) {
@@ -898,9 +1120,11 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
}
if (data[IFLA_NETKIT_POLICY]) {
+ err = -EOPNOTSUPP;
attr = data[IFLA_NETKIT_POLICY];
policy = nla_get_u32(attr);
- err = netkit_check_policy(policy, attr, extack);
+ if (nk->pair == NETKIT_DEVICE_PAIR)
+ err = netkit_check_policy(policy, attr, extack);
if (err)
return err;
WRITE_ONCE(nk->policy, policy);
@@ -921,6 +1145,50 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
return 0;
}
+static void netkit_check_lease_unregister(struct net_device *dev)
+{
+ LIST_HEAD(list_kill);
+ u32 q_idx;
+
+ if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING ||
+ !dev->dev.parent)
+ return;
+
+ netdev_lock_ops(dev);
+ for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) {
+ struct net_device *tmp = dev;
+ struct netdev_rx_queue *rxq;
+ u32 tmp_q_idx = q_idx;
+
+ rxq = __netif_get_rx_queue_lease(&tmp, &tmp_q_idx,
+ NETIF_PHYS_TO_VIRT);
+ if (rxq && tmp != dev &&
+ tmp->netdev_ops == &netkit_netdev_ops) {
+ /* A single phys device can have multiple queues leased
+ * to one netkit device. We can only queue that netkit
+ * device once to the list_kill. Queues of that phys
+ * device can be leased with different individual netkit
+ * devices, hence we batch via list_kill.
+ */
+ if (unregister_netdevice_queued(tmp))
+ continue;
+ netkit_del_link(tmp, &list_kill);
+ }
+ }
+ netdev_unlock_ops(dev);
+ unregister_netdevice_many(&list_kill);
+}
+
+static int netkit_notifier(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event == NETDEV_UNREGISTER)
+ netkit_check_lease_unregister(dev);
+ return NOTIFY_DONE;
+}
+
static size_t netkit_get_size(const struct net_device *dev)
{
return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
@@ -931,6 +1199,7 @@ static size_t netkit_get_size(const struct net_device *dev)
nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */
nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
+ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */
0;
}
@@ -951,6 +1220,8 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
return -EMSGSIZE;
if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
return -EMSGSIZE;
+ if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair))
+ return -EMSGSIZE;
if (peer) {
nk = netkit_priv(peer);
@@ -972,13 +1243,15 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
[IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 },
[IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
[IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
+ [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE),
[IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT,
.reject_message = "Primary attribute is read-only" },
};
static struct rtnl_link_ops netkit_link_ops = {
- .kind = DRV_NAME,
+ .kind = NETKIT_DRV_NAME,
.priv_size = sizeof(struct netkit),
+ .alloc = netkit_alloc,
.setup = netkit_setup,
.newlink = netkit_new_link,
.dellink = netkit_del_link,
@@ -992,26 +1265,39 @@ static struct rtnl_link_ops netkit_link_ops = {
.maxtype = IFLA_NETKIT_MAX,
};
-static __init int netkit_init(void)
+static struct notifier_block netkit_netdev_notifier = {
+ .notifier_call = netkit_notifier,
+};
+
+static __init int netkit_mod_init(void)
{
+ int ret;
+
BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
(int)NETKIT_PASS != (int)TCX_PASS ||
(int)NETKIT_DROP != (int)TCX_DROP ||
(int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
- return rtnl_link_register(&netkit_link_ops);
+ ret = rtnl_link_register(&netkit_link_ops);
+ if (ret)
+ return ret;
+ ret = register_netdevice_notifier(&netkit_netdev_notifier);
+ if (ret)
+ rtnl_link_unregister(&netkit_link_ops);
+ return ret;
}
-static __exit void netkit_exit(void)
+static __exit void netkit_mod_exit(void)
{
+ unregister_netdevice_notifier(&netkit_netdev_notifier);
rtnl_link_unregister(&netkit_link_ops);
}
-module_init(netkit_init);
-module_exit(netkit_exit);
+module_init(netkit_mod_init);
+module_exit(netkit_mod_exit);
MODULE_DESCRIPTION("BPF-programmable network device");
MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
MODULE_LICENSE("GPL");
-MODULE_ALIAS_RTNL_LINK(DRV_NAME);
+MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e15367373f7c..47417b2d48a4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2561,7 +2561,14 @@ struct net_device {
* Also protects some fields in:
* struct napi_struct, struct netdev_queue, struct netdev_rx_queue
*
- * Ordering: take after rtnl_lock.
+ * Ordering:
+ *
+ * - take after rtnl_lock
+ *
+ * - for the case of netdev queue leasing, the netdev-scope lock is
+ * taken for both the virtual and the physical device; to prevent
+ * deadlocks, the virtual device's lock must always be acquired
+ * before the physical device's (see netdev_nl_queue_create_doit)
*/
struct mutex lock;
@@ -3413,6 +3420,8 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
+bool unregister_netdevice_queued(const struct net_device *dev);
+
static inline void unregister_netdevice(struct net_device *dev)
{
unregister_netdevice_queue(dev, NULL);
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 95ed28212f4e..70c9fe9e83cc 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -150,6 +150,11 @@ enum {
* When NIC-wide config is changed the callback will
* be invoked for all queues.
*
+ * @ndo_queue_create: Create a new RX queue on a virtual device that will
+ * be paired with a physical device's queue via leasing.
+ * Return the new queue id on success, negative error
+ * on failure.
+ *
* @supported_params: Bitmask of supported parameters, see QCFG_*.
*
* Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
@@ -178,6 +183,8 @@ struct netdev_queue_mgmt_ops {
struct netlink_ext_ack *extack);
struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev,
int idx);
+ int (*ndo_queue_create)(struct net_device *dev,
+ struct netlink_ext_ack *extack);
unsigned int supported_params;
};
@@ -185,7 +192,7 @@ struct netdev_queue_mgmt_ops {
void netdev_queue_config(struct net_device *dev, int rxq,
struct netdev_queue_config *qcfg);
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx);
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
/**
* DOC: Lockless queue stopping / waking helpers.
@@ -373,6 +380,14 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
get_desc, start_thrs); \
})
-struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
-
-#endif
+struct device *netdev_queue_get_dma_dev(struct net_device *dev,
+ unsigned int idx,
+ enum netdev_queue_type type);
+bool netdev_can_create_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_can_lease_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_queue_busy(struct net_device *dev, unsigned int idx,
+ enum netdev_queue_type type,
+ struct netlink_ext_ack *extack);
+#endif /* _LINUX_NET_QUEUES_H */
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 08f81329fc11..7e98c679ea84 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -31,6 +31,14 @@ struct netdev_rx_queue {
struct napi_struct *napi;
struct netdev_queue_config qcfg;
struct pp_memory_provider_params mp_params;
+
+ /* If a queue is leased, then the lease pointer is always
+ * valid. From the physical device it points to the virtual
+ * queue, and from the virtual device it points to the
+ * physical queue.
+ */
+ struct netdev_rx_queue *lease;
+ netdevice_tracker lease_tracker;
} ____cacheline_aligned_in_smp;
/*
@@ -59,6 +67,23 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
return index;
}
-int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
+enum netif_lease_dir {
+ NETIF_VIRT_TO_PHYS,
+ NETIF_PHYS_TO_VIRT,
+};
-#endif
+struct netdev_rx_queue *
+__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq,
+ enum netif_lease_dir dir);
+
+struct netdev_rx_queue *
+netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq);
+void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
+ struct net_device *dev);
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
+void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+#endif /* _LINUX_NETDEV_RX_QUEUE_H */
diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h
index ada4f968960a..255ce4cfd975 100644
--- a/include/net/page_pool/memory_provider.h
+++ b/include/net/page_pool/memory_provider.h
@@ -23,14 +23,10 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
void net_mp_niov_clear_page_pool(struct net_iov *niov);
-int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx,
- struct pp_memory_provider_params *p);
-int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *p,
struct netlink_ext_ack *extack);
-void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
- struct pp_memory_provider_params *old_p);
-void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *old_p);
/**
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 83a96c56b8ca..280bb1780512 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1296,6 +1296,11 @@ enum netkit_mode {
NETKIT_L3,
};
+enum netkit_pairing {
+ NETKIT_DEVICE_PAIR,
+ NETKIT_DEVICE_SINGLE,
+};
+
/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
* the BPF program if attached. This also means the latter can
* consume the two fields if they were populated earlier.
@@ -1320,6 +1325,7 @@ enum {
IFLA_NETKIT_PEER_SCRUB,
IFLA_NETKIT_HEADROOM,
IFLA_NETKIT_TAILROOM,
+ IFLA_NETKIT_PAIRING,
__IFLA_NETKIT_MAX,
};
#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
NETDEV_A_QUEUE_DMABUF,
NETDEV_A_QUEUE_IO_URING,
NETDEV_A_QUEUE_XSK,
+ NETDEV_A_QUEUE_LEASE,
__NETDEV_A_QUEUE_MAX,
NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,6 +204,15 @@ enum {
};
enum {
+ NETDEV_A_LEASE_IFINDEX = 1,
+ NETDEV_A_LEASE_QUEUE,
+ NETDEV_A_LEASE_NETNS_ID,
+
+ __NETDEV_A_LEASE_MAX,
+ NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
+enum {
NETDEV_A_DMABUF_IFINDEX = 1,
NETDEV_A_DMABUF_QUEUES,
NETDEV_A_DMABUF_FD,
@@ -228,6 +238,7 @@ enum {
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
NETDEV_CMD_BIND_TX,
+ NETDEV_CMD_QUEUE_CREATE,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 62d693287457..f4a7809ba0c2 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -552,8 +552,11 @@ static void io_close_queue(struct io_zcrx_ifq *ifq)
}
if (netdev) {
- if (ifq->if_rxq != -1)
- net_mp_close_rxq(netdev, ifq->if_rxq, &p);
+ if (ifq->if_rxq != -1) {
+ netdev_lock(netdev);
+ netif_mp_close_rxq(netdev, ifq->if_rxq, &p);
+ netdev_unlock(netdev);
+ }
netdev_put(netdev, &netdev_tracker);
}
ifq->if_rxq = -1;
@@ -826,7 +829,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
}
netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
- ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
+ ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq,
+ NETDEV_QUEUE_TYPE_RX);
if (!ifq->dev) {
ret = -EOPNOTSUPP;
goto netdev_put_unlock;
@@ -841,7 +845,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
mp_param.rx_page_size = 1U << ifq->niov_shift;
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
- ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
+ ret = netif_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
if (ret)
goto netdev_put_unlock;
netdev_unlock(ifq->netdev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 5a31f9d2128c..e7bc95cbd1fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1122,6 +1122,14 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
}
+/* Tracker-aware wrapper around __netdev_put_lock(): free @tracker's entry
+ * for @dev, then return whatever __netdev_put_lock(@dev, @net) returns.
+ * The caller in netdev_nl_queue_create_doit() treats a NULL result as
+ * -ENODEV and otherwise releases the device with netdev_unlock().
+ */
struct net_device *
+netdev_put_lock(struct net_device *dev, struct net *net,
+ netdevice_tracker *tracker)
+{
+ netdev_tracker_free(dev, tracker);
+ return __netdev_put_lock(dev, net);
+}
+
+struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev,
unsigned long *index)
{
@@ -12342,10 +12350,8 @@ static void dev_memory_provider_uninstall(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct netdev_rx_queue *rxq = &dev->_rx[i];
- struct pp_memory_provider_params *p = &rxq->mp_params;
- if (p->mp_ops && p->mp_ops->uninstall)
- p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
+ __netif_mp_uninstall_rxq(rxq, &rxq->mp_params);
}
}
@@ -12378,6 +12384,12 @@ static void netif_close_many_and_unlock_cond(struct list_head *close_head)
#endif
}
+/* Return true when @dev's unreg_list node is linked into a list, i.e.
+ * list_empty(&dev->unreg_list) is false. Must be called with RTNL held.
+ * NOTE(review): presumably this means the device is already queued for
+ * unregistration -- confirm against the users of dev->unreg_list.
+ */
+bool unregister_netdevice_queued(const struct net_device *dev)
+{
+ ASSERT_RTNL();
+ return !list_empty(&dev->unreg_list);
+}
+
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh)
{
diff --git a/net/core/dev.h b/net/core/dev.h
index 781619e76b3e..95edb2d4eff8 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -12,6 +12,7 @@ struct net;
struct netlink_ext_ack;
struct netdev_queue_config;
struct cpumask;
+struct pp_memory_provider_params;
/* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
@@ -31,6 +32,8 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
+struct net_device *netdev_put_lock(struct net_device *dev, struct net *net,
+ netdevice_tracker *tracker);
struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev,
unsigned long *index);
@@ -96,6 +99,15 @@ int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
struct netdev_queue_config *qcfg,
struct netlink_ext_ack *extack);
+bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx);
+bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx);
+
+void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq,
+ const struct pp_memory_provider_params *p);
+
+void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq,
+ struct netdev_rx_queue *virt_rxq);
+
/* netdev management, shared between various uAPI entry points */
struct netdev_name_node {
struct hlist_node hlist;
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 69d79aee07ef..cde4c89bc146 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -145,7 +145,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
rxq_idx = get_netdev_rx_queue_index(rxq);
- __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
+ netif_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
}
percpu_ref_kill(&binding->ref);
@@ -163,7 +163,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
u32 xa_idx;
int err;
- err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
+ err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
if (err)
return err;
@@ -176,7 +176,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
return 0;
err_close_rxq:
- __net_mp_close_rxq(dev, rxq_idx, &mp_params);
+ netif_mp_close_rxq(dev, rxq_idx, &mp_params);
return err;
}
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index ba673e81716f..81aecb5d3bc5 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -28,6 +28,12 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range
};
/* Common nested types */
+const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = {
+ [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+ [NETDEV_A_LEASE_NETNS_ID] = NLA_POLICY_MIN(NLA_S32, 0),
+};
+
const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
@@ -107,6 +113,13 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1]
[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
};
+/* NETDEV_CMD_QUEUE_CREATE - do */
+static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = {
+ [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+ [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy),
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -205,6 +218,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.maxattr = NETDEV_A_DMABUF_FD,
.flags = GENL_CMD_CAP_DO,
},
+ {
+ .cmd = NETDEV_CMD_QUEUE_CREATE,
+ .doit = netdev_nl_queue_create_doit,
+ .policy = netdev_queue_create_nl_policy,
+ .maxattr = NETDEV_A_QUEUE_LEASE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index cffc08517a41..d71b435d72c1 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -14,6 +14,7 @@
#include <net/netdev_netlink.h>
/* Common nested types */
+extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1];
extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
@@ -36,6 +37,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info);
enum {
NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 470fabbeacd9..056460d01940 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -387,11 +387,62 @@ static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi)
}
+/* Emit a NETDEV_A_QUEUE_LEASE nest describing the queue on the other side
+ * of a lease. The lookup uses NETIF_PHYS_TO_VIRT; nothing is emitted (and 0
+ * is returned) unless that lookup yields a queue on a different device.
+ * The nest carries the peer queue id plus @q_type, the peer device ifindex
+ * and, when the two devices live in different network namespaces, the id
+ * obtained from peernet2id_alloc().
+ *
+ * Returns 0 on success or -ENOMEM when an attribute cannot be added. Open
+ * nests are not cancelled here; the caller netdev_nl_queue_fill_one()
+ * cancels the whole message on failure.
+ */
static int
+netdev_nl_queue_fill_lease(struct sk_buff *rsp, struct net_device *netdev,
+ u32 q_idx, u32 q_type)
+{
+ struct net_device *orig_netdev = netdev;
+ struct nlattr *nest_lease, *nest_queue;
+ struct netdev_rx_queue *rxq;
+ struct net *net, *peer_net;
+
+ /* On a successful redirect, netdev and q_idx now name the peer queue. */
+ rxq = __netif_get_rx_queue_lease(&netdev, &q_idx,
+ NETIF_PHYS_TO_VIRT);
+ if (!rxq || orig_netdev == netdev)
+ return 0;
+
+ nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE);
+ if (!nest_lease)
+ goto nla_put_failure;
+
+ nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE);
+ if (!nest_queue)
+ goto nla_put_failure;
+ if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx))
+ goto nla_put_failure;
+ if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type))
+ goto nla_put_failure;
+ nla_nest_end(rsp, nest_queue);
+
+ if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX,
+ READ_ONCE(netdev->ifindex)))
+ goto nla_put_failure;
+
+ /* Inside the RCU read-side section the id allocation must not sleep,
+ * hence GFP_ATOMIC.
+ */
+ rcu_read_lock();
+ peer_net = dev_net_rcu(netdev);
+ net = dev_net_rcu(orig_netdev);
+ if (!net_eq(net, peer_net)) {
+ s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC);
+
+ if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id))
+ goto nla_put_failure_unlock;
+ }
+ rcu_read_unlock();
+ nla_nest_end(rsp, nest_lease);
+ return 0;
+
+nla_put_failure_unlock:
+ rcu_read_unlock();
+nla_put_failure:
+ return -ENOMEM;
+}
+
+static int
netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
u32 q_idx, u32 q_type, const struct genl_info *info)
{
struct pp_memory_provider_params *params;
- struct netdev_rx_queue *rxq;
+ struct net_device *orig_netdev = netdev;
+ struct netdev_rx_queue *rxq, *rxq_lease;
struct netdev_queue *txq;
void *hdr;
@@ -409,17 +460,22 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
rxq = __netif_get_rx_queue(netdev, q_idx);
if (nla_put_napi_id(rsp, rxq->napi))
goto nla_put_failure;
+ if (netdev_nl_queue_fill_lease(rsp, netdev, q_idx, q_type))
+ goto nla_put_failure;
+ rxq_lease = netif_get_rx_queue_lease_locked(&netdev, &q_idx);
+ if (rxq_lease)
+ rxq = rxq_lease;
params = &rxq->mp_params;
if (params->mp_ops &&
params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
- goto nla_put_failure;
+ goto nla_put_failure_lease;
#ifdef CONFIG_XDP_SOCKETS
if (rxq->pool)
if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
- goto nla_put_failure;
+ goto nla_put_failure_lease;
#endif
-
+ netif_put_rx_queue_lease_locked(orig_netdev, netdev);
break;
case NETDEV_QUEUE_TYPE_TX:
txq = netdev_get_tx_queue(netdev, q_idx);
@@ -437,6 +493,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
return 0;
+nla_put_failure_lease:
+ netif_put_rx_queue_lease_locked(orig_netdev, netdev);
nla_put_failure:
genlmsg_cancel(rsp, hdr);
return -EMSGSIZE;
@@ -918,7 +976,8 @@ netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
struct device *rxq_dma_dev;
- rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
+ rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx,
+ NETDEV_QUEUE_TYPE_RX);
if (dma_dev && rxq_dma_dev != dma_dev) {
NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
rxq_idx, prev_rxq_idx);
@@ -1095,7 +1154,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_netdev;
}
- dma_dev = netdev_queue_get_dma_dev(netdev, 0);
+ dma_dev = netdev_queue_get_dma_dev(netdev, 0, NETDEV_QUEUE_TYPE_TX);
binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
dmabuf_fd, priv, info->extack);
if (IS_ERR(binding)) {
@@ -1120,6 +1179,173 @@ err_genlmsg_free:
return err;
}
+/* NETDEV_CMD_QUEUE_CREATE handler: create a new RX queue on the device
+ * named by NETDEV_A_QUEUE_IFINDEX and lease it to an existing RX queue of
+ * the device described by the nested NETDEV_A_QUEUE_LEASE attribute
+ * (ifindex, queue id/type and an optional netns id).
+ *
+ * All attribute parsing and validation happens before any device is
+ * looked up or locked. On success the reply carries the new queue's
+ * NETDEV_A_QUEUE_ID and the result of genlmsg_reply() is returned;
+ * otherwise a negative errno is returned and the reply skb is freed.
+ */
+int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
+ const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1;
+ int err, ifindex, ifindex_lease, queue_id, queue_id_lease;
+ struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)];
+ struct netdev_rx_queue *rxq, *rxq_lease;
+ struct net_device *dev, *dev_lease;
+ netdevice_tracker dev_tracker;
+ s32 netns_lease = -1;
+ struct nlattr *nest;
+ struct sk_buff *rsp;
+ struct net *net;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE))
+ return -EINVAL;
+ /* Only RX queues can be created through this command. */
+ if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) !=
+ NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]);
+ return -EINVAL;
+ }
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
+
+ nest = info->attrs[NETDEV_A_QUEUE_LEASE];
+ err = nla_parse_nested(ltb, lmaxtype, nest,
+ netdev_lease_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+ if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) ||
+ NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE))
+ return -EINVAL;
+ /* Naming a lease device in another netns requires CAP_NET_ADMIN. */
+ if (ltb[NETDEV_A_LEASE_NETNS_ID]) {
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ netns_lease = nla_get_s32(ltb[NETDEV_A_LEASE_NETNS_ID]);
+ }
+
+ ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]);
+
+ nest = ltb[NETDEV_A_LEASE_QUEUE];
+ err = nla_parse_nested(qtb, qmaxtype, nest,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+ if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE))
+ return -EINVAL;
+ /* The queue being leased must be an RX queue as well. */
+ if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]);
+ return -EINVAL;
+ }
+
+ queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ /* Locking order is always from the virtual to the physical device
+ * since this is also the same order when applications open the
+ * memory provider later on.
+ */
+ dev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto err_genlmsg_free;
+ }
+ if (!netdev_can_create_queue(dev, info->extack)) {
+ err = -EINVAL;
+ goto err_unlock_dev;
+ }
+
+ /* The lease device is looked up in the request's netns unless a
+ * netns id was supplied; only in that case is a reference taken
+ * here, which is why every put_net() below is conditional on
+ * netns_lease >= 0.
+ */
+ net = genl_info_net(info);
+ if (netns_lease >= 0) {
+ net = get_net_ns_by_id(net, netns_lease);
+ if (!net) {
+ err = -ENONET;
+ goto err_unlock_dev;
+ }
+ }
+
+ /* Take a tracked reference first, check eligibility, and only then
+ * trade the reference for the device lock via netdev_put_lock().
+ */
+ dev_lease = netdev_get_by_index(net, ifindex_lease, &dev_tracker,
+ GFP_KERNEL);
+ if (!dev_lease) {
+ err = -ENODEV;
+ goto err_put_netns;
+ }
+ if (!netdev_can_lease_queue(dev_lease, info->extack)) {
+ netdev_put(dev_lease, &dev_tracker);
+ err = -EINVAL;
+ goto err_put_netns;
+ }
+
+ dev_lease = netdev_put_lock(dev_lease, net, &dev_tracker);
+ if (!dev_lease) {
+ err = -ENODEV;
+ goto err_put_netns;
+ }
+ if (queue_id_lease >= dev_lease->real_num_rx_queues) {
+ err = -ERANGE;
+ NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]);
+ goto err_unlock_dev_lease;
+ }
+ if (netdev_queue_busy(dev_lease, queue_id_lease, NETDEV_QUEUE_TYPE_RX,
+ info->extack)) {
+ err = -EBUSY;
+ goto err_unlock_dev_lease;
+ }
+
+ /* Inspect the last real RX queue of dev; netdev_can_create_queue()
+ * above guarantees real_num_rx_queues >= 1.
+ */
+ rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease);
+ rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1);
+
+ /* Leasing queues from different physical devices is currently
+ * not supported. Capabilities such as XDP features and DMA
+ * device may differ between physical devices, and computing
+ * a correct intersection for the virtual device is not yet
+ * implemented.
+ */
+ if (rxq->lease && rxq->lease->dev != dev_lease) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Leasing queues from different devices not supported");
+ goto err_unlock_dev_lease;
+ }
+
+ /* A negative return value is an errno, otherwise the new queue id. */
+ queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev, info->extack);
+ if (queue_id < 0) {
+ err = queue_id;
+ goto err_unlock_dev_lease;
+ }
+ rxq = __netif_get_rx_queue(dev, queue_id);
+
+ netdev_rx_queue_lease(rxq, rxq_lease);
+
+ /* NOTE(review): the return value of nla_put_u32() is ignored. The
+ * reply was allocated with GENLMSG_DEFAULT_SIZE and holds only the
+ * header at this point, so this presumably cannot fail -- confirm.
+ */
+ nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id);
+ genlmsg_end(rsp, hdr);
+
+ netdev_unlock(dev_lease);
+ netdev_unlock(dev);
+ if (netns_lease >= 0)
+ put_net(net);
+
+ return genlmsg_reply(rsp, info);
+
+err_unlock_dev_lease:
+ netdev_unlock(dev_lease);
+err_put_netns:
+ if (netns_lease >= 0)
+ put_net(net);
+err_unlock_dev:
+ netdev_unlock(dev);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
{
INIT_LIST_HEAD(&priv->bindings);
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
index 251f27a8307f..265161e12a9c 100644
--- a/net/core/netdev_queues.c
+++ b/net/core/netdev_queues.c
@@ -1,27 +1,112 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/xdp_sock_drv.h>
+
+#include "dev.h"
+
+/* Look up the DMA device for queue @idx of @dev without following queue
+ * leases: use the driver's ndo_queue_get_dma_dev() when it is provided,
+ * otherwise fall back to the device's parent. Returns NULL when no device
+ * was found or the device has no dma_mask set.
+ */
+static struct device *
+__netdev_queue_get_dma_dev(struct net_device *dev, unsigned int idx)
+{
+ const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
+ struct device *dma_dev;
+
+ if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
+ dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
+ else
+ dma_dev = dev->dev.parent;
+
+ return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
+}
/**
* netdev_queue_get_dma_dev() - get dma device for zero-copy operations
* @dev: net_device
* @idx: queue index
+ * @type: queue type (RX or TX)
*
- * Get dma device for zero-copy operations to be used for this queue.
- * When such device is not available or valid, the function will return NULL.
+ * Get dma device for zero-copy operations to be used for this queue. If
+ * the queue is an RX queue leased from a physical queue, we retrieve the
+ * physical queue's dma device. When the dma device is not available or
+ * valid, the function will return NULL.
*
* Return: Device or NULL on error
*/
-struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
+struct device *netdev_queue_get_dma_dev(struct net_device *dev,
+ unsigned int idx,
+ enum netdev_queue_type type)
{
- const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
+ struct net_device *orig_dev = dev;
struct device *dma_dev;
- if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
- dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
- else
- dma_dev = dev->dev.parent;
+ /* Only RX side supports queue leasing today. */
+ if (type != NETDEV_QUEUE_TYPE_RX || !netif_rxq_is_leased(dev, idx))
+ return __netdev_queue_get_dma_dev(dev, idx);
- return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
+ if (!netif_get_rx_queue_lease_locked(&dev, &idx))
+ return NULL;
+
+ dma_dev = __netdev_queue_get_dma_dev(dev, idx);
+ netif_put_rx_queue_lease_locked(orig_dev, dev);
+ return dma_dev;
+}
+
+/* Check whether a queue may be created on @dev via ndo_queue_create. The
+ * device must have no parent device (it is then treated as virtual), must
+ * implement queue_mgmt_ops->ndo_queue_create, and must already have at
+ * least one real RX and one real TX queue. On failure an explanatory
+ * message is stored in @extack and false is returned.
+ */
+bool netdev_can_create_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ if (dev->dev.parent) {
+ NL_SET_ERR_MSG(extack, "Device is not a virtual device");
+ return false;
+ }
+ if (!dev->queue_mgmt_ops ||
+ !dev->queue_mgmt_ops->ndo_queue_create) {
+ NL_SET_ERR_MSG(extack, "Device does not support queue creation");
+ return false;
+ }
+ if (dev->real_num_rx_queues < 1 ||
+ dev->real_num_tx_queues < 1) {
+ NL_SET_ERR_MSG(extack, "Device must have at least one real queue");
+ return false;
+ }
+ return true;
}
+/* Check whether queues of @dev may be leased out. The device must have a
+ * parent device (otherwise it is treated as virtual), must still be
+ * present according to netif_device_present(), and must provide
+ * queue_mgmt_ops. On failure an explanatory message is stored in @extack
+ * and false is returned.
+ */
+bool netdev_can_lease_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ if (!dev->dev.parent) {
+ NL_SET_ERR_MSG(extack, "Lease device is a virtual device");
+ return false;
+ }
+ if (!netif_device_present(dev)) {
+ NL_SET_ERR_MSG(extack, "Lease device has been removed from the system");
+ return false;
+ }
+ if (!dev->queue_mgmt_ops) {
+ NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations");
+ return false;
+ }
+ return true;
+}
+
+/* Report whether queue @idx of @dev is currently in use. The AF_XDP pool
+ * check applies to both queue types. For RX queues the queue is also busy
+ * when it takes part in a queue lease or has a memory provider attached.
+ * When busy, the reason is stored in @extack and true is returned.
+ */
+bool netdev_queue_busy(struct net_device *dev, unsigned int idx,
+ enum netdev_queue_type type,
+ struct netlink_ext_ack *extack)
+{
+ if (xsk_get_pool_from_qid(dev, idx)) {
+ NL_SET_ERR_MSG(extack, "Device queue in use by AF_XDP");
+ return true;
+ }
+ /* The remaining checks only apply to RX queues. */
+ if (type == NETDEV_QUEUE_TYPE_TX)
+ return false;
+ if (netif_rxq_is_leased(dev, idx)) {
+ NL_SET_ERR_MSG(extack, "Device queue in use due to queue leasing");
+ return true;
+ }
+ if (netif_rxq_has_mp(dev, idx)) {
+ NL_SET_ERR_MSG(extack, "Device queue in use by memory provider");
+ return true;
+ }
+ return false;
+}
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 05fd2875d725..469319451ba2 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -10,15 +10,109 @@
#include "dev.h"
#include "page_pool_priv.h"
-/* See also page_pool_is_unreadable() */
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
+/* Establish a lease between @rxq_dst and @rxq_src by pointing each queue's
+ * ->lease at the other. Both owning devices must be locked by the caller.
+ * A tracked reference on @rxq_src's device is taken here and released again
+ * in netdev_rx_queue_unlease(). In netdev_nl_queue_create_doit() @rxq_dst
+ * is the newly created queue and @rxq_src the queue being leased.
+ */
+void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src)
+{
+ netdev_assert_locked(rxq_src->dev);
+ netdev_assert_locked(rxq_dst->dev);
+
+ netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL);
+
+ WRITE_ONCE(rxq_src->lease, rxq_dst);
+ WRITE_ONCE(rxq_dst->lease, rxq_src);
+}
+
+/* Tear down the lease between @rxq_dst and @rxq_src. Both owning devices
+ * must be locked by the caller. Memory provider state is cleaned up while
+ * the lease pointers are still set; afterwards both ->lease pointers are
+ * cleared and the device reference taken in netdev_rx_queue_lease() is
+ * dropped.
+ */
+void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src)
+{
+ netdev_assert_locked(rxq_dst->dev);
+ netdev_assert_locked(rxq_src->dev);
+
+ netif_rxq_cleanup_unlease(rxq_src, rxq_dst);
+
+ WRITE_ONCE(rxq_src->lease, NULL);
+ WRITE_ONCE(rxq_dst->lease, NULL);
+
+ netdev_put(rxq_src->dev, &rxq_src->lease_tracker);
+}
+
+/* Return true when @rxq_idx is below @dev's real_num_rx_queues and that
+ * queue has a non-NULL ->lease pointer. Out-of-range indices yield false.
+ */
+bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx)
+{
+ if (rxq_idx < dev->real_num_rx_queues)
+ return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease);
+ return false;
+}
+
+/* Virtual devices eligible for leasing have no dev->dev.parent, while
+ * physical devices always have one. Use this to enforce the correct
+ * lease traversal direction.
+ */
+static bool netif_lease_dir_ok(const struct net_device *dev,
+ enum netif_lease_dir dir)
{
- struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
+ if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent)
+ return true;
+ if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent)
+ return true;
+ return false;
+}
- return !!rxq->mp_params.mp_ops;
+/* Resolve RX queue (*@dev, *@rxq_idx) through its lease, if it has one.
+ *
+ * - No lease: the queue itself is returned, *@dev and *@rxq_idx are left
+ *   untouched.
+ * - Leased and @dir matches the kind of *@dev: *@dev and *@rxq_idx are
+ *   rewritten to the peer's device and index and the peer queue is
+ *   returned.
+ * - Leased but @dir does not match: NULL is returned and the outputs are
+ *   left untouched.
+ *
+ * No locks are taken and *@rxq_idx is not range checked here.
+ */
+struct netdev_rx_queue *
+__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx,
+ enum netif_lease_dir dir)
+{
+ struct net_device *orig_dev = *dev;
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx);
+
+ if (rxq->lease) {
+ if (!netif_lease_dir_ok(orig_dev, dir))
+ return NULL;
+ rxq = rxq->lease;
+ *rxq_idx = get_netdev_rx_queue_index(rxq);
+ *dev = rxq->dev;
+ }
+ return rxq;
+}
+
+/* Locked variant of __netif_get_rx_queue_lease() for the virtual-to-
+ * physical direction. The caller must already hold the ops lock of *@dev.
+ * When the lookup redirects to a different device, that device is locked
+ * before returning; release it with netif_put_rx_queue_lease_locked().
+ * Returns NULL, without taking any additional lock, when the lookup fails.
+ */
+struct netdev_rx_queue *
+netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx)
+{
+ struct net_device *orig_dev = *dev;
+ struct netdev_rx_queue *rxq;
+
+ /* Locking order is always from the virtual to the physical device,
+ * see netdev_nl_queue_create_doit().
+ */
+ netdev_ops_assert_locked(orig_dev);
+ rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS);
+ if (rxq && orig_dev != *dev)
+ netdev_lock(*dev);
+ return rxq;
+}
+
+/* Counterpart of netif_get_rx_queue_lease_locked(): unlock @dev only when
+ * it differs from @orig_dev, i.e. when the lookup redirected to (and
+ * locked) another device. A no-op when both are the same device.
+ */
+void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
+ struct net_device *dev)
+{
+ if (orig_dev != dev)
+ netdev_unlock(dev);
+}
+
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx)
+{
+ if (rxq_idx < dev->real_num_rx_queues)
+ return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops;
+ return false;
}
EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+/* Return true when @rxq_idx is below @dev's real_num_rx_queues and that
+ * queue's mp_params.mp_priv is set. Out-of-range indices yield false.
+ */
+bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx)
+{
+ if (rxq_idx < dev->real_num_rx_queues)
+ return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv;
+ return false;
+}
+
static int netdev_rx_queue_reconfig(struct net_device *dev,
unsigned int rxq_idx,
struct netdev_queue_config *qcfg_old,
@@ -108,9 +202,9 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
}
EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
-int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
- const struct pp_memory_provider_params *p,
- struct netlink_ext_ack *extack)
+static int __netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
{
const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
struct netdev_queue_config qcfg[2];
@@ -120,12 +214,6 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
if (!qops)
return -EOPNOTSUPP;
- if (rxq_idx >= dev->real_num_rx_queues) {
- NL_SET_ERR_MSG(extack, "rx queue index out of range");
- return -ERANGE;
- }
- rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
-
if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
return -EINVAL;
@@ -172,27 +260,48 @@ err_clear_mp:
return ret;
}
-int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
- struct pp_memory_provider_params *p)
+int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
{
+ struct net_device *orig_dev = dev;
int ret;
- netdev_lock(dev);
- ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL);
- netdev_unlock(dev);
+ if (!netdev_need_ops_lock(dev))
+ return -EOPNOTSUPP;
+
+ if (rxq_idx >= dev->real_num_rx_queues) {
+ NL_SET_ERR_MSG(extack, "rx queue index out of range");
+ return -ERANGE;
+ }
+ rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
+
+ if (!netif_rxq_is_leased(dev, rxq_idx))
+ return __netif_mp_open_rxq(dev, rxq_idx, p, extack);
+
+ if (!netif_get_rx_queue_lease_locked(&dev, &rxq_idx)) {
+ NL_SET_ERR_MSG(extack, "rx queue leased to a virtual netdev");
+ return -EBUSY;
+ }
+ if (!dev->dev.parent) {
+ NL_SET_ERR_MSG(extack, "rx queue belongs to a virtual netdev");
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = __netif_mp_open_rxq(dev, rxq_idx, p, extack);
+out:
+ netif_put_rx_queue_lease_locked(orig_dev, dev);
return ret;
}
-void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
- const struct pp_memory_provider_params *old_p)
+static void __netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+ const struct pp_memory_provider_params *old_p)
{
struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int err;
- if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
- return;
-
rxq = __netif_get_rx_queue(dev, ifq_idx);
/* Callers holding a netdev ref may get here after we already
@@ -214,10 +323,47 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
WARN_ON(err && err != -ENETDOWN);
}
-void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
- struct pp_memory_provider_params *old_p)
+void netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+ const struct pp_memory_provider_params *old_p)
{
- netdev_lock(dev);
- __net_mp_close_rxq(dev, ifq_idx, old_p);
- netdev_unlock(dev);
+ struct net_device *orig_dev = dev;
+
+ if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
+ return;
+ if (!netif_rxq_is_leased(dev, ifq_idx))
+ return __netif_mp_close_rxq(dev, ifq_idx, old_p);
+
+ if (WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &ifq_idx)))
+ return;
+
+ __netif_mp_close_rxq(dev, ifq_idx, old_p);
+ netif_put_rx_queue_lease_locked(orig_dev, dev);
+}
+
+/* Invoke the memory provider's optional uninstall callback from @p,
+ * passing @p->mp_priv and @rxq. Does nothing when @p has no mp_ops or the
+ * ops do not implement uninstall.
+ */
+void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq,
+ const struct pp_memory_provider_params *p)
+{
+ if (p->mp_ops && p->mp_ops->uninstall)
+ p->mp_ops->uninstall(p->mp_priv, rxq);
+}
+
+/* Clean up memory provider state when a queue lease is torn down. If
+ * a memory provider was installed on the physical queue via the lease,
+ * close it now. The memory provider is a property of the queue itself,
+ * and it was _guaranteed_ to be installed on the physical queue via
+ * the lease redirection. The extra __netif_mp_close_rxq is needed
+ * since the physical queue can outlive the virtual queue in the lease
+ * case, so it needs to be reconfigured to clear the memory provider.
+ */
+void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq,
+ struct netdev_rx_queue *virt_rxq)
+{
+ struct pp_memory_provider_params *p = &phys_rxq->mp_params;
+ unsigned int ifq_idx = get_netdev_rx_queue_index(phys_rxq);
+
+ /* Nothing to do when no provider is installed on the physical queue. */
+ if (!p->mp_ops)
+ return;
+
+ /* The uninstall callback is handed the virtual queue, while the close
+ * operates on the physical queue that actually holds the provider.
+ */
+ __netif_mp_uninstall_rxq(virt_rxq, p);
+ __netif_mp_close_rxq(phys_rxq->dev, ifq_idx, p);
}
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
index 45232cf1c144..64ef8cff2005 100644
--- a/net/ethtool/channels.c
+++ b/net/ethtool/channels.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <net/xdp_sock_drv.h>
+#include <net/netdev_queues.h>
#include "common.h"
#include "netlink.h"
@@ -109,7 +109,7 @@ ethnl_set_channels_validate(struct ethnl_req_info *req_info,
static int
ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info)
{
- unsigned int from_channel, old_total, i;
+ unsigned int old_combined, old_rx, old_tx, i;
bool mod = false, mod_combined = false;
struct net_device *dev = req_info->dev;
struct ethtool_channels channels = {};
@@ -118,8 +118,9 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info)
int ret;
dev->ethtool_ops->get_channels(dev, &channels);
- old_total = channels.combined_count +
- max(channels.rx_count, channels.tx_count);
+ old_combined = channels.combined_count;
+ old_rx = channels.rx_count;
+ old_tx = channels.tx_count;
ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT],
&mod);
@@ -169,14 +170,19 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info)
if (ret)
return ret;
- /* Disabling channels, query zero-copy AF_XDP sockets */
- from_channel = channels.combined_count +
- min(channels.rx_count, channels.tx_count);
- for (i = from_channel; i < old_total; i++)
- if (xsk_get_pool_from_qid(dev, i)) {
- GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
+ /* ensure channels are not busy at the moment */
+ for (i = channels.combined_count + channels.rx_count;
+ i < old_combined + old_rx; i++) {
+ if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_RX,
+ info->extack))
return -EINVAL;
- }
+ }
+ for (i = channels.combined_count + channels.tx_count;
+ i < old_combined + old_tx; i++) {
+ if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_TX,
+ info->extack))
+ return -EINVAL;
+ }
ret = dev->ethtool_ops->set_channels(dev, &channels);
return ret < 0 ? ret : 1;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 3c713a91ad0d..bd97f9b9bf18 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -27,12 +27,12 @@
#include <linux/net.h>
#include <linux/pm_runtime.h>
#include <linux/utsname.h>
+#include <linux/ethtool_netlink.h>
#include <net/devlink.h>
#include <net/ipv6.h>
-#include <net/xdp_sock_drv.h>
#include <net/flow_offload.h>
#include <net/netdev_lock.h>
-#include <linux/ethtool_netlink.h>
+#include <net/netdev_queues.h>
#include "common.h"
@@ -2250,7 +2250,6 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
- u16 from_channel, to_channel;
unsigned int i;
int ret;
@@ -2284,13 +2283,17 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
if (ret)
return ret;
- /* Disabling channels, query zero-copy AF_XDP sockets */
- from_channel = channels.combined_count +
- min(channels.rx_count, channels.tx_count);
- to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
- for (i = from_channel; i < to_channel; i++)
- if (xsk_get_pool_from_qid(dev, i))
+ /* Disabling channels, query busy queues (AF_XDP, queue leasing) */
+ for (i = channels.combined_count + channels.rx_count;
+ i < curr.combined_count + curr.rx_count; i++) {
+ if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_RX, NULL))
return -EINVAL;
+ }
+ for (i = channels.combined_count + channels.tx_count;
+ i < curr.combined_count + curr.tx_count; i++) {
+ if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_TX, NULL))
+ return -EINVAL;
+ }
ret = dev->ethtool_ops->set_channels(dev, &channels);
if (!ret)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c8ef9e427c9c..60be6561f486 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -23,6 +23,8 @@
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <linux/vmalloc.h>
+
+#include <net/netdev_queues.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/netdev_lock.h>
@@ -117,10 +119,18 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid);
void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
- if (queue_id < dev->num_rx_queues)
- dev->_rx[queue_id].pool = NULL;
- if (queue_id < dev->num_tx_queues)
- dev->_tx[queue_id].pool = NULL;
+ struct net_device *orig_dev = dev;
+ unsigned int id = queue_id;
+
+ if (id < dev->real_num_rx_queues)
+ WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id));
+
+ if (id < dev->num_rx_queues)
+ dev->_rx[id].pool = NULL;
+ if (id < dev->num_tx_queues)
+ dev->_tx[id].pool = NULL;
+
+ netif_put_rx_queue_lease_locked(orig_dev, dev);
}
/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
@@ -130,17 +140,30 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
u16 queue_id)
{
- if (queue_id >= max_t(unsigned int,
- dev->real_num_rx_queues,
- dev->real_num_tx_queues))
+ struct net_device *orig_dev = dev;
+ unsigned int id = queue_id;
+ int ret = 0;
+
+ if (id >= max(dev->real_num_rx_queues,
+ dev->real_num_tx_queues))
return -EINVAL;
- if (queue_id < dev->real_num_rx_queues)
- dev->_rx[queue_id].pool = pool;
- if (queue_id < dev->real_num_tx_queues)
- dev->_tx[queue_id].pool = pool;
+ if (id < dev->real_num_rx_queues) {
+ if (!netif_get_rx_queue_lease_locked(&dev, &id))
+ return -EBUSY;
+ if (xsk_get_pool_from_qid(dev, id)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
- return 0;
+ if (id < dev->real_num_rx_queues)
+ dev->_rx[id].pool = pool;
+ if (id < dev->real_num_tx_queues)
+ dev->_tx[id].pool = pool;
+out:
+ netif_put_rx_queue_lease_locked(orig_dev, dev);
+ return ret;
}
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
@@ -330,12 +353,36 @@ static bool xsk_is_bound(struct xdp_sock *xs)
return false;
}
+/* Return true if the rxq described by @info is acceptable for socket @xs.
+ *
+ * That is the case when @info names exactly the device and queue index
+ * stored in xs->dev / xs->queue_id, or when that queue index is below
+ * real_num_rx_queues of xs->dev, the queue carries a lease, and @info
+ * names the device and index of the queue behind that lease.
+ */
+static bool xsk_dev_queue_valid(const struct xdp_sock *xs,
+ const struct xdp_rxq_info *info)
+{
+ struct net_device *dev = xs->dev;
+ u32 queue_index = xs->queue_id;
+ struct netdev_rx_queue *rxq;
+
+ /* Direct match against the device/queue recorded in the socket. */
+ if (info->dev == dev &&
+ info->queue_index == queue_index)
+ return true;
+
+ if (queue_index < dev->real_num_rx_queues) {
+ rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease);
+ /* No lease on this queue, so nothing else can match. */
+ if (!rxq)
+ return false;
+
+ /* Retry the comparison against the queue the lease points to. */
+ dev = rxq->dev;
+ queue_index = get_netdev_rx_queue_index(rxq);
+
+ return info->dev == dev &&
+ info->queue_index == queue_index;
+ }
+ return false;
+}
+
static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
if (!xsk_is_bound(xs))
return -ENXIO;
-
- if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+ if (!xsk_dev_queue_valid(xs, xdp->rxq))
return -EINVAL;
if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
NETDEV_A_QUEUE_DMABUF,
NETDEV_A_QUEUE_IO_URING,
NETDEV_A_QUEUE_XSK,
+ NETDEV_A_QUEUE_LEASE,
__NETDEV_A_QUEUE_MAX,
NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,6 +204,15 @@ enum {
};
enum {
+ NETDEV_A_LEASE_IFINDEX = 1,
+ NETDEV_A_LEASE_QUEUE,
+ NETDEV_A_LEASE_NETNS_ID,
+
+ __NETDEV_A_LEASE_MAX,
+ NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
+enum {
NETDEV_A_DMABUF_IFINDEX = 1,
NETDEV_A_DMABUF_QUEUES,
NETDEV_A_DMABUF_FD,
@@ -228,6 +238,7 @@ enum {
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
NETDEV_CMD_BIND_TX,
+ NETDEV_CMD_QUEUE_CREATE,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index deeca3f8d080..28d245e11bc4 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -35,6 +35,7 @@ TEST_PROGS = \
loopback.sh \
nic_timestamp.py \
nk_netns.py \
+ nk_qlease.py \
pp_alloc_fail.py \
rss_api.py \
rss_ctx.py \
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
index df4da5078c48..84a4dab6c649 100644
--- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
@@ -20,7 +20,7 @@ try:
# Import one by one to avoid pylint false positives
from net.lib.py import NetNS, NetNSEnter, NetdevSimDev
from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \
- NlError, RtnlFamily, DevlinkFamily, PSPFamily
+ NlError, RtnlFamily, DevlinkFamily, PSPFamily, Netlink
from net.lib.py import CmdExitFailure
from net.lib.py import bkg, cmd, bpftool, bpftrace, defer, ethtool, \
fd_read_timeout, ip, rand_port, rand_ports, wait_port_listen, \
@@ -36,7 +36,7 @@ try:
__all__ = ["NetNS", "NetNSEnter", "NetdevSimDev",
"EthtoolFamily", "NetdevFamily", "NetshaperFamily",
- "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily",
+ "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily", "Netlink",
"CmdExitFailure",
"bkg", "cmd", "bpftool", "bpftrace", "defer", "ethtool",
"fd_read_timeout", "ip", "rand_port", "rand_ports",
diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py
new file mode 100755
index 000000000000..2bc5ffe96c7d
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/nk_qlease.py
@@ -0,0 +1,1407 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import errno
+import re
+import time
+import threading
+from os import path
+from lib.py import (
+ ksft_run,
+ ksft_exit,
+ ksft_eq,
+ ksft_ne,
+ ksft_in,
+ ksft_not_in,
+ ksft_raises,
+)
+from lib.py import (
+ NetDrvContEnv,
+ NetNS,
+ NetNSEnter,
+ EthtoolFamily,
+ NetdevFamily,
+ RtnlFamily,
+ NetdevSimDev,
+)
+from lib.py import (
+ NlError,
+ Netlink,
+ bkg,
+ cmd,
+ defer,
+ ethtool,
+ ip,
+ rand_port,
+ wait_port_listen,
+)
+from lib.py import KsftSkipEx, CmdExitFailure
+
+
+def set_flow_rule(cfg):
+ output = ethtool(
+ f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}"
+ ).stdout
+ values = re.search(r"ID (\d+)", output).group(1)
+ return int(values)
+
+
+def create_netkit(rxqueues):
+ all_links = ip("-d link show", json=True)
+ old_idxs = {
+ link["ifindex"]
+ for link in all_links
+ if link.get("linkinfo", {}).get("info_kind") == "netkit"
+ }
+
+ rtnl = RtnlFamily()
+ rtnl.newlink(
+ {
+ "linkinfo": {
+ "kind": "netkit",
+ "data": {
+ "mode": "l2",
+ "policy": "forward",
+ "peer-policy": "forward",
+ },
+ },
+ "num-rx-queues": rxqueues,
+ },
+ flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL],
+ )
+
+ all_links = ip("-d link show", json=True)
+ nk_links = [
+ link
+ for link in all_links
+ if link.get("linkinfo", {}).get("info_kind") == "netkit"
+ and link["ifindex"] not in old_idxs
+ ]
+ nk_links.sort(key=lambda x: x["ifindex"])
+ return (
+ nk_links[1]["ifname"],
+ nk_links[1]["ifindex"],
+ nk_links[0]["ifname"],
+ nk_links[0]["ifindex"],
+ )
+
+
+def create_netkit_single(rxqueues):
+ rtnl = RtnlFamily()
+ rtnl.newlink(
+ {
+ "linkinfo": {
+ "kind": "netkit",
+ "data": {
+ "mode": "l2",
+ "pairing": "single",
+ },
+ },
+ "num-rx-queues": rxqueues,
+ },
+ flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL],
+ )
+
+ all_links = ip("-d link show", json=True)
+ nk_links = [
+ link
+ for link in all_links
+ if link.get("linkinfo", {}).get("info_kind") == "netkit"
+ and "UP" not in link.get("flags", [])
+ ]
+ return nk_links[0]["ifname"], nk_links[0]["ifindex"]
+
+
+def test_remove_phys(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ src_queue = 1
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ nk_queue_id = result["id"]
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["ifindex"], nk_guest_idx)
+ ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id)
+
+ nsimdev.remove()
+ time.sleep(0.1)
+ ret = cmd(f"ip link show dev {nk_host}", fail=False)
+ ksft_ne(ret.ret, 0)
+
+
+def test_double_lease(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=3)
+ defer(cmd, f"ip link del dev {nk_host}")
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ src_queue = 1
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(result["id"], 1)
+
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EBUSY)
+
+
+def test_virtual_lessor(netns) -> None:
+ nk_host_a, _, nk_guest_a, nk_guest_a_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host_a}")
+ ip(f"link set dev {nk_host_a} up")
+ ip(f"link set dev {nk_guest_a} up")
+
+ nk_host_b, _, nk_guest_b, nk_guest_b_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host_b}")
+
+ ip(f"link set dev {nk_guest_b} netns {netns.name}")
+ ip(f"link set dev {nk_host_b} up")
+ ip(f"link set dev {nk_guest_b} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_b_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nk_guest_a_idx,
+ "queue": {"id": 0, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_phys_lessee(_netns) -> None:
+ nsimdev_a = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev_a.remove)
+ nsim_a = nsimdev_a.nsims[0]
+ ip(f"link set dev {nsim_a.ifname} up")
+
+ nsimdev_b = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev_b.remove)
+ nsim_b = nsimdev_b.nsims[0]
+ ip(f"link set dev {nsim_b.ifname} up")
+
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nsim_a.ifindex,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim_b.ifindex,
+ "queue": {"id": 0, "type": "rx"},
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_different_lessors(netns) -> None:
+ nsimdev_a = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev_a.remove)
+ nsim_a = nsimdev_a.nsims[0]
+ ip(f"link set dev {nsim_a.ifname} up")
+
+ nsimdev_b = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev_b.remove)
+ nsim_b = nsimdev_b.nsims[0]
+ ip(f"link set dev {nsim_b.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=3)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim_a.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim_b.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EOPNOTSUPP)
+
+
+def test_queue_out_of_range(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 2, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.ERANGE)
+
+
+def test_resize_leased(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+
+ ethnl = EthtoolFamily()
+ with ksft_raises(NlError) as e:
+ ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1})
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_self_lease(_netns) -> None:
+ nk_host, _, _, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nk_guest_idx,
+ "queue": {"id": 0, "type": "rx"},
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_veth_queue_create(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ ip("link add veth0 type veth peer name veth1")
+ defer(cmd, "ip link del dev veth0", fail=False)
+
+ all_links = ip("-d link show", json=True)
+ veth_peer = [
+ link
+ for link in all_links
+ if link.get("ifname") == "veth1"
+ ]
+ veth_peer_idx = veth_peer[0]["ifindex"]
+
+ ip(f"link set dev veth1 netns {netns.name}")
+ ip("link set dev veth0 up")
+ ip("link set dev veth1 up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": veth_peer_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_create_tx_type(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "tx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_create_primary(_netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, nk_host_idx, _, _ = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_host} up")
+
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_host_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EOPNOTSUPP)
+
+
+def test_create_limit(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=1)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_link_flap_phys(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}")
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ src_queue = 1
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ nk_queue_id = result["id"]
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id)
+
+ # Link flap the physical device
+ ip(f"link set dev {nsim.ifname} down")
+ ip(f"link set dev {nsim.ifname} up")
+
+ # Verify lease survives the flap
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id)
+
+
+def test_queue_get_virtual(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}")
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ src_queue = 1
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ nk_queue_id = result["id"]
+
+ # queue-get on virtual device's leased queue should not show lease
+ # info (lease info is only shown from the physical device's side)
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nk_guest_idx, "id": nk_queue_id, "type": "rx"}
+ )
+ ksft_eq(queue_info["id"], nk_queue_id)
+ ksft_eq(queue_info["ifindex"], nk_guest_idx)
+ ksft_not_in("lease", queue_info)
+
+ # Default queue (not leased) also has no lease info
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nk_guest_idx, "id": 0, "type": "rx"}
+ )
+ ksft_not_in("lease", queue_info)
+
+
+def test_remove_virt_first(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ src_queue = 1
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(result["id"], 1)
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["queue"]["id"], result["id"])
+
+ # Delete netkit (virtual device removed first, physical stays)
+ cmd(f"ip link del dev {nk_host}")
+
+ # Verify lease is cleaned up on physical device
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_not_in("lease", queue_info)
+
+
+def test_multiple_leases(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=3)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=4)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ r1 = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ r2 = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 2, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+
+ ksft_eq(r1["id"], 1)
+ ksft_eq(r2["id"], 2)
+
+ # Verify both leases visible on physical device
+ netdevnl = NetdevFamily()
+ q1 = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": 1, "type": "rx"}
+ )
+ q2 = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": 2, "type": "rx"}
+ )
+ ksft_in("lease", q1)
+ ksft_in("lease", q2)
+ ksft_eq(q1["lease"]["ifindex"], nk_guest_idx)
+ ksft_eq(q2["lease"]["ifindex"], nk_guest_idx)
+ ksft_eq(q1["lease"]["queue"]["id"], r1["id"])
+ ksft_eq(q2["lease"]["queue"]["id"], r2["id"])
+
+
+def test_lease_queue_tx_type(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "tx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+
+def test_invalid_netns(netns) -> None:
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": 1,
+ "queue": {"id": 0, "type": "rx"},
+ "netns-id": 999,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.ENONET)
+
+
+def test_invalid_phys_ifindex(netns) -> None:
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ with ksft_raises(NlError) as e:
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": 99999,
+ "queue": {"id": 0, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(e.exception.nl_msg.error, -errno.ENODEV)
+
+
+def test_multi_netkit_remove_phys(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=3)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ # Create two netkit pairs, each leasing a different physical queue
+ nk_host_a, _, nk_guest_a, nk_guest_a_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host_a}", fail=False)
+
+ nk_host_b, _, nk_guest_b, nk_guest_b_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host_b}", fail=False)
+
+ ip(f"link set dev {nk_guest_a} netns {netns.name}")
+ ip(f"link set dev {nk_host_a} up")
+ ip(f"link set dev {nk_guest_a} up", ns=netns)
+
+ ip(f"link set dev {nk_guest_b} netns {netns.name}")
+ ip(f"link set dev {nk_host_b} up")
+ ip(f"link set dev {nk_guest_b} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_a_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_b_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 2, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+
+ # Removing the physical device should take down both netkit pairs
+ nsimdev.remove()
+ time.sleep(0.1)
+ ret = cmd(f"ip link show dev {nk_host_a}", fail=False)
+ ksft_ne(ret.ret, 0)
+ ret = cmd(f"ip link show dev {nk_host_b}", fail=False)
+ ksft_ne(ret.ret, 0)
+
+
+def test_single_remove_phys(_netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_name, nk_idx = create_netkit_single(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_name}", fail=False)
+
+ ip(f"link set dev {nk_name} up")
+
+ netdevnl = NetdevFamily()
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ },
+ }
+ )
+
+ # Removing the physical device should take down the single netkit device
+ nsimdev.remove()
+ time.sleep(0.1)
+ ret = cmd(f"ip link show dev {nk_name}", fail=False)
+ ksft_ne(ret.ret, 0)
+
+
+def test_link_flap_virt(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}")
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ src_queue = 1
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ nk_queue_id = result["id"]
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id)
+
+ # Link flap the virtual (netkit) device
+ ip(f"link set dev {nk_guest} down", ns=netns)
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ # Verify lease survives the virtual device flap
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id)
+
+
+def test_phys_queue_no_lease(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}")
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+
+ # Physical queue 0 (not leased) should have no lease info
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": 0, "type": "rx"}
+ )
+ ksft_not_in("lease", queue_info)
+
+ # Physical queue 1 (leased) should have lease info
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": 1, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+
+
+def test_same_ns_lease(_netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_name, nk_idx = create_netkit_single(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_name}", fail=False)
+
+ ip(f"link set dev {nk_name} up")
+
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ },
+ }
+ )
+ ksft_eq(result["id"], 1)
+
+ # Same namespace: lease info should NOT have netns-id
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": 1, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["ifindex"], nk_idx)
+ ksft_eq(queue_info["lease"]["queue"]["id"], result["id"])
+ ksft_not_in("netns-id", queue_info["lease"])
+
+
+def test_resize_after_unlease(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 1, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+
+ # Resize should fail while lease is active
+ ethnl = EthtoolFamily()
+ with ksft_raises(NlError) as e:
+ ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1})
+ ksft_eq(e.exception.nl_msg.error, -errno.EINVAL)
+
+ # Delete netkit, clearing the lease
+ cmd(f"ip link del dev {nk_host}")
+
+ # Resize should now succeed
+ ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1})
+
+
+def test_lease_queue_zero(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": 0, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(result["id"], 1)
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": 0, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["queue"]["id"], result["id"])
+
+
+def test_release_and_reuse(netns) -> None:
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2)
+ defer(nsimdev.remove)
+ nsim = nsimdev.nsims[0]
+ ip(f"link set dev {nsim.ifname} up")
+
+ src_queue = 1
+
+ # First lease
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+
+ # Delete netkit, freeing the lease
+ cmd(f"ip link del dev {nk_host}")
+
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_not_in("lease", queue_info)
+
+ # Re-create netkit and lease the same physical queue again
+ nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2)
+ defer(cmd, f"ip link del dev {nk_host}", fail=False)
+
+ ip(f"link set dev {nk_guest} netns {netns.name}")
+ ip(f"link set dev {nk_host} up")
+ ip(f"link set dev {nk_guest} up", ns=netns)
+
+ with NetNSEnter(str(netns)):
+ netdevnl = NetdevFamily()
+ result = netdevnl.queue_create(
+ {
+ "ifindex": nk_guest_idx,
+ "type": "rx",
+ "lease": {
+ "ifindex": nsim.ifindex,
+ "queue": {"id": src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ ksft_eq(result["id"], 1)
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"}
+ )
+ ksft_in("lease", queue_info)
+ ksft_eq(queue_info["lease"]["queue"]["id"], result["id"])
+
+
+def test_iou_zcrx(cfg) -> None:
+ cfg.require_ipver("6")
+ ethnl = EthtoolFamily()
+
+ rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}})
+ rx_rings = rings["rx"]
+ hds_thresh = rings.get("hds-thresh", 0)
+
+ ethnl.rings_set(
+ {
+ "header": {"dev-index": cfg.ifindex},
+ "tcp-data-split": "enabled",
+ "hds-thresh": 0,
+ "rx": 64,
+ }
+ )
+ defer(
+ ethnl.rings_set,
+ {
+ "header": {"dev-index": cfg.ifindex},
+ "tcp-data-split": "unknown",
+ "hds-thresh": hds_thresh,
+ "rx": rx_rings,
+ },
+ )
+
+ ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}")
+ defer(ethtool, f"-X {cfg.ifname} default")
+
+ flow_rule_id = set_flow_rule(cfg)
+ defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
+
+ rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}"
+ tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840"
+ with bkg(rx_cmd, exit_wait=True):
+ wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns)
+ cmd(tx_cmd, host=cfg.remote)
+
+
+def test_attrs(cfg) -> None:
+ cfg.require_ipver("6")
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"}
+ )
+
+ ksft_eq(queue_info["id"], cfg.src_queue)
+ ksft_eq(queue_info["type"], "rx")
+ ksft_eq(queue_info["ifindex"], cfg.ifindex)
+
+ ksft_in("lease", queue_info)
+ lease = queue_info["lease"]
+ ksft_eq(lease["ifindex"], cfg.nk_guest_ifindex)
+ ksft_eq(lease["queue"]["id"], cfg.nk_queue)
+ ksft_eq(lease["queue"]["type"], "rx")
+ ksft_in("netns-id", lease)
+
+
+def test_attach_xdp_with_mp(cfg) -> None:
+ cfg.require_ipver("6")
+ ethnl = EthtoolFamily()
+
+ rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}})
+ rx_rings = rings["rx"]
+ hds_thresh = rings.get("hds-thresh", 0)
+
+ ethnl.rings_set(
+ {
+ "header": {"dev-index": cfg.ifindex},
+ "tcp-data-split": "enabled",
+ "hds-thresh": 0,
+ "rx": 64,
+ }
+ )
+ defer(
+ ethnl.rings_set,
+ {
+ "header": {"dev-index": cfg.ifindex},
+ "tcp-data-split": "unknown",
+ "hds-thresh": hds_thresh,
+ "rx": rx_rings,
+ },
+ )
+
+ ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}")
+ defer(ethtool, f"-X {cfg.ifname} default")
+
+ netdevnl = NetdevFamily()
+
+ rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}"
+ with bkg(rx_cmd):
+ wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns)
+
+ time.sleep(0.1)
+ queue_info = netdevnl.queue_get(
+ {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"}
+ )
+ ksft_in("io-uring", queue_info)
+
+ prog = cfg.net_lib_dir / "xdp_dummy.bpf.o"
+ with ksft_raises(CmdExitFailure):
+ ip(f"link set dev {cfg.ifname} xdp obj {prog} sec xdp.frags")
+
+ time.sleep(0.1)
+ queue_info = netdevnl.queue_get(
+ {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"}
+ )
+ ksft_not_in("io-uring", queue_info)
+
+
+def test_destroy(cfg) -> None:
+ cfg.require_ipver("6")
+ ethnl = EthtoolFamily()
+
+ rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}})
+ rx_rings = rings["rx"]
+ hds_thresh = rings.get("hds-thresh", 0)
+
+ ethnl.rings_set(
+ {
+ "header": {"dev-index": cfg.ifindex},
+ "tcp-data-split": "enabled",
+ "hds-thresh": 0,
+ "rx": 64,
+ }
+ )
+ defer(
+ ethnl.rings_set,
+ {
+ "header": {"dev-index": cfg.ifindex},
+ "tcp-data-split": "unknown",
+ "hds-thresh": hds_thresh,
+ "rx": rx_rings,
+ },
+ )
+
+ ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}")
+ defer(ethtool, f"-X {cfg.ifname} default")
+
+ rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}"
+ rx_proc = cmd(rx_cmd, background=True)
+ wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns)
+
+ netdevnl = NetdevFamily()
+ queue_info = netdevnl.queue_get(
+ {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"}
+ )
+ ksft_in("io-uring", queue_info)
+
+ # ip link del will wait for all refs to drop first, but iou-zcrx is holding
+ # onto a ref. Terminate iou-zcrx async via a thread after a delay.
+ kill_timer = threading.Timer(1, rx_proc.proc.terminate)
+ kill_timer.start()
+
+ ip(f"link del dev {cfg._nk_host_ifname}")
+ kill_timer.join()
+ cfg._nk_host_ifname = None
+ cfg._nk_guest_ifname = None
+
+ queue_info = netdevnl.queue_get(
+ {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"}
+ )
+ ksft_not_in("io-uring", queue_info)
+
+ cmd(f"tc filter del dev {cfg.ifname} ingress pref {cfg._bpf_prog_pref}")
+ cfg._tc_attached = False
+
+ flow_rule_id = set_flow_rule(cfg)
+ defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
+
+ rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.src_queue}"
+ tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840"
+ with bkg(rx_cmd, exit_wait=True):
+ wait_port_listen(cfg.port, proto="tcp")
+ cmd(tx_cmd, host=cfg.remote)
+ # Short delay since iou cleanup is async and takes a bit of time.
+ time.sleep(0.1)
+ queue_info = netdevnl.queue_get(
+ {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"}
+ )
+ ksft_not_in("io-uring", queue_info)
+
+
+def main() -> None:
+ netns = NetNS()
+ cmd("ip netns attach init 1")
+ ip("netns set init 0", ns=netns)
+ ip("link set lo up", ns=netns)
+
+ ksft_run(
+ [
+ test_remove_phys,
+ test_double_lease,
+ test_virtual_lessor,
+ test_phys_lessee,
+ test_different_lessors,
+ test_queue_out_of_range,
+ test_resize_leased,
+ test_self_lease,
+ test_create_tx_type,
+ test_create_primary,
+ test_create_limit,
+ test_link_flap_phys,
+ test_queue_get_virtual,
+ test_remove_virt_first,
+ test_multiple_leases,
+ test_lease_queue_tx_type,
+ test_invalid_netns,
+ test_invalid_phys_ifindex,
+ test_multi_netkit_remove_phys,
+ test_single_remove_phys,
+ test_link_flap_virt,
+ test_phys_queue_no_lease,
+ test_same_ns_lease,
+ test_resize_after_unlease,
+ test_lease_queue_zero,
+ test_release_and_reuse,
+ test_veth_queue_create,
+ ],
+ args=(netns,),
+ )
+
+ cmd("ip netns del init", fail=False)
+ del netns
+
+ with NetDrvContEnv(__file__, rxqueues=2) as cfg:
+ cfg.bin_local = path.abspath(
+ path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx"
+ )
+ cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+ cfg.port = rand_port()
+
+ ethnl = EthtoolFamily()
+ channels = ethnl.channels_get({"header": {"dev-index": cfg.ifindex}})
+ channels = channels["combined-count"]
+ if channels < 2:
+ raise KsftSkipEx("Test requires NETIF with at least 2 combined channels")
+
+ cfg.src_queue = channels - 1
+
+ with NetNSEnter(str(cfg.netns)):
+ netdevnl = NetdevFamily()
+ bind_result = netdevnl.queue_create(
+ {
+ "ifindex": cfg.nk_guest_ifindex,
+ "type": "rx",
+ "lease": {
+ "ifindex": cfg.ifindex,
+ "queue": {"id": cfg.src_queue, "type": "rx"},
+ "netns-id": 0,
+ },
+ }
+ )
+ cfg.nk_queue = bind_result["id"]
+
+ # test_destroy must be last because it destroys the netkit devices
+ ksft_run(
+ [test_iou_zcrx, test_attrs, test_attach_xdp_with_mp, test_destroy],
+ args=(cfg,),
+ )
+ ksft_exit()
+
+
+# Run both test batches only when this file is executed as a script.
+if __name__ == "__main__":
+ main()