From 759ab1edb56c88906830fd6b2e7b12514dd32758 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jul 2023 11:55:29 -0700 Subject: net: store netdevs in an xarray Iterating over the netdev hash table for netlink dumps is hard. Dumps are done in "chunks" so we need to save the position after each chunk, so we know where to restart from. Because netdevs are stored in a hash table we remember which bucket we were in and how many devices we dumped. Since we don't hold any locks across the "chunks" - devices may come and go while we're dumping. If that happens we may miss a device (if device is deleted from the bucket we were in). We indicate to user space that this may have happened by setting NLM_F_DUMP_INTR. User space is supposed to dump again (I think) if it sees that. Somehow I doubt most user space gets this right.. To illustrate let's look at an example: System state: start: # [A, B, C] del: B # [A, C] with the hash table we may dump [A, B], missing C completely even tho it existed both before and after the "del B". Add an xarray and use it to allocate ifindexes. This way we can iterate ifindexes in order, without the worry that we'll skip one. We may still generate a dump of a state which "never existed", for example for a set of values and sequence of ops: System state: start: # [A, B] add: C # [A, C, B] del: B # [A, C] we may generate a dump of [A], if C got an index between A and B. System has never been in such state. But I'm 90% sure that's perfectly fine, important part is that we can't _miss_ devices which exist before and after. User space which wants to mirror kernel's state subscribes to notifications and does periodic dumps so it will know that C exists from the notification about its creation or from the next dump (next dump is _guaranteed_ to include C, if it doesn't get removed). To avoid any perf regressions keep the hash table for now. Most net namespaces have very few devices and microbenchmarking 1M lookups on Skylake I get the following results (not counting loopback to number of devs): #devs | hash | xa | delta 2 | 18.3 | 20.1 | + 9.8% 16 | 18.3 | 20.1 | + 9.5% 64 | 18.3 | 26.3 | +43.8% 128 | 20.4 | 26.3 | +28.6% 256 | 20.0 | 26.4 | +32.1% 1024 | 26.6 | 26.7 | + 0.2% 8192 |541.3 | 33.5 | -93.8% No surprises since the hash table has 256 entries. The microbenchmark scans indexes in order, if the pattern is more random xa starts to win at 512 devices already. But that's a lot of devices, in practice. Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230726185530.2247698-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 78beaa765c73..9f6add96de2d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -42,6 +42,7 @@ #include #include #include +#include struct user_namespace; struct proc_dir_entry; @@ -69,7 +70,7 @@ struct net { atomic_t dev_unreg_count; unsigned int dev_base_seq; /* protected by rtnl_mutex */ - int ifindex; + u32 ifindex; spinlock_t nsid_lock; atomic_t fnhe_genid; @@ -110,6 +111,7 @@ struct net { struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + struct xarray dev_by_index; struct raw_notifier_head netdev_chain; /* Note that @hash_mix can be read millions times per second, -- cgit v1.2.3 From 84e00d9bd4e472bd9b145ed40dbd132dd7a15462 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jul 2023 11:55:30 -0700 Subject: net: convert some netlink netdev iterators to depend on the xarray Reap the benefits of easier iteration thanks to the xarray. Convert just the genetlink ones, those are easier to test. Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230726185530.2247698-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++ net/core/netdev-genl.c | 37 ++++++------------------ net/ethtool/netlink.c | 65 ++++++++++++------------------------------- net/ethtool/tunnels.c | 71 +++++++++++++++++------------------------------ 4 files changed, 54 insertions(+), 122 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 32a4cdf37dd4..84c36a7f873f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3016,6 +3016,9 @@ extern rwlock_t dev_base_lock; /* Device list lock */ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) +#define for_each_netdev_dump(net, d, ifindex) \ + xa_for_each_start(&(net)->dev_by_index, (ifindex), (d), (ifindex)) + static inline struct net_device *next_net_device(struct net_device *dev) { struct list_head *lh; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 65ef4867fc49..797c813c7c77 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -101,43 +101,22 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *netdev; - int idx = 0, s_idx; - int h, s_h; - int err; - - s_h = cb->args[0]; - s_idx = cb->args[1]; + int err = 0; rtnl_lock(); - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(netdev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = netdev_nl_dev_fill(netdev, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NETDEV_CMD_DEV_GET); - if (err < 0) - break; -cont: - idx++; - } + for_each_netdev_dump(net, netdev, cb->args[0]) { + err = netdev_nl_dev_fill(netdev, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + NETDEV_CMD_DEV_GET); + if (err < 0) + break; } - rtnl_unlock(); if (err != -EMSGSIZE) return err; - cb->args[1] = idx; - cb->args[0] = h; - cb->seq = net->dev_base_seq; - return skb->len; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 39a459b0111b..ae344f1b0bbd 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -252,8 +252,7 @@ int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply - * @pos_hash: saved iteration position - hashbucket - * @pos_idx: saved iteration position - index + * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used @@ -263,8 +262,7 @@ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; - int pos_hash; - int pos_idx; + unsigned long pos_ifindex; }; static const struct ethnl_request_ops * @@ -490,55 +488,27 @@ static int ethnl_default_dumpit(struct sk_buff *skb, { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; rtnl_lock(); - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - unsigned int seq; - - head = &net->dev_index_head[h]; - -restart_chain: - seq = net->dev_base_seq; - cb->seq = seq; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - dev_hold(dev); - rtnl_unlock(); - - ret = ethnl_default_dump_one(skb, dev, ctx, cb); - dev_put(dev); - if (ret < 0) { - if (ret == -EOPNOTSUPP) - goto lock_and_cont; - if (likely(skb->len)) - ret = skb->len; - goto out; - } -lock_and_cont: - rtnl_lock(); - if (net->dev_base_seq != seq) { - s_idx = idx + 1; - goto restart_chain; - } -cont: - idx++; - } + for_each_netdev_dump(net, dev, ctx->pos_ifindex) { + dev_hold(dev); + rtnl_unlock(); + + ret = ethnl_default_dump_one(skb, dev, ctx, cb); + + rtnl_lock(); + dev_put(dev); + if (ret < 0 && ret != -EOPNOTSUPP) { + if (likely(skb->len)) + ret = skb->len; + break; + } } rtnl_unlock(); -out: - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - return ret; } @@ -584,8 +554,7 @@ static int ethnl_default_start(struct netlink_callback *cb) ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; - ctx->pos_hash = 0; - ctx->pos_idx = 0; + ctx->pos_ifindex = 0; return 0; diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index 67fb414ca859..05f752557b5e 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -212,8 +212,7 @@ err_unlock_rtnl: struct ethnl_tunnel_info_dump_ctx { struct ethnl_req_info req_info; - int pos_hash; - int pos_idx; + unsigned long ifindex; }; int ethnl_tunnel_info_start(struct netlink_callback *cb) @@ -243,57 +242,39 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; void *ehdr; rtnl_lock(); - cb->seq = net->dev_base_seq; - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - - head = &net->dev_index_head[h]; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - - ehdr = ethnl_dump_put(skb, cb, - ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); - if (!ehdr) { - ret = -EMSGSIZE; - goto out; - } - - ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - goto out; - } - - ctx->req_info.dev = dev; - ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); - ctx->req_info.dev = NULL; - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - if (ret == -EOPNOTSUPP) - goto cont; - goto out; - } - genlmsg_end(skb, ehdr); -cont: - idx++; + for_each_netdev_dump(net, dev, ctx->ifindex) { + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); + if (!ehdr) { + ret = -EMSGSIZE; + break; } + + ret = ethnl_fill_reply_header(skb, dev, + ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + break; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + continue; + break; + } + genlmsg_end(skb, ehdr); } -out: rtnl_unlock(); - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - if (ret == -EMSGSIZE && skb->len) return skb->len; return ret; -- cgit v1.2.3