From 80d2eefcb4c84aa9018b2a997ab3a4c567bc821a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 Mar 2024 08:40:30 +0100 Subject: net: Use backlog-NAPI to clean up the defer_list. The defer_list is a per-CPU list which is used to free skbs outside of the socket lock and on the CPU on which they have been allocated. The list is processed during NAPI callbacks so ideally the list is cleaned up. Should the amount of skbs on the list exceed a certain water mark then the softirq is triggered remotely on the target CPU by invoking a remote function call. The raise of the softirqs via a remote function call leads to waking the ksoftirqd on PREEMPT_RT which is undesired. The backlog-NAPI threads already provide the infrastructure which can be utilized to perform the cleanup of the defer_list. The NAPI state is updated with the input_pkt_queue.lock acquired. It order not to break the state, it is needed to also wake the backlog-NAPI thread with the lock held. This requires to acquire the use the lock in rps_lock_irq*() if the backlog-NAPI threads are used even with RPS disabled. Move the logic of remotely starting softirqs to clean up the defer_list into kick_defer_list_purge(). Make sure a lock is held in rps_lock_irq*() if backlog-NAPI threads are used. Schedule backlog-NAPI for defer_list cleanup if backlog-NAPI is available. Acked-by: Jakub Kicinski Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb37817d6382..e41d30ebaca6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3287,6 +3287,7 @@ static inline void dev_xmit_recursion_dec(void) __this_cpu_dec(softnet_data.xmit.recursion); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); -- cgit v1.2.3 From 117aef12a7b1b797bce9f66b156c65eab850b5b5 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:52 +0100 Subject: ip_tunnel: use a separate struct to store tunnel params in the kernel Unlike IPv6 tunnels which use purely-kernel __ip6_tnl_parm structure to store params inside the kernel, IPv4 tunnel code uses the same ip_tunnel_parm which is being used to talk with the userspace. This makes it difficult to alter or add any fields or use a different format for whatever data. Define struct ip_tunnel_parm_kern, a 1:1 copy of ip_tunnel_parm for now, and use it throughout the code. Define the pieces, where the copy user <-> kernel happens, as standalone functions, and copy the data there field-by-field, so that the kernel-side structure could be easily modified later on and the users wouldn't have to care about this. Reviewed-by: Simon Horman Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_ipip.c | 30 ++++++----- .../net/ethernet/mellanox/mlxsw/spectrum_ipip.h | 2 +- .../net/ethernet/mellanox/mlxsw/spectrum_span.c | 4 +- include/linux/netdevice.h | 7 +-- include/net/ip_tunnels.h | 25 +++++++-- net/ipv4/ip_gre.c | 17 +++--- net/ipv4/ip_tunnel.c | 63 +++++++++++++++++----- net/ipv4/ip_tunnel_core.c | 2 +- net/ipv4/ip_vti.c | 12 ++--- net/ipv4/ipip.c | 12 ++--- net/ipv4/ipmr.c | 2 +- net/ipv6/addrconf.c | 3 +- net/ipv6/sit.c | 33 ++++++------ 13 files changed, 137 insertions(+), 75 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c index 3340b4a694c3..d67df358a52f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c @@ -8,7 +8,7 @@ #include "spectrum_ipip.h" #include "reg.h" -struct ip_tunnel_parm +struct ip_tunnel_parm_kern mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev) { struct ip_tunnel *tun = netdev_priv(ol_dev); @@ -24,7 +24,8 @@ mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev) return tun->parms; } -static bool mlxsw_sp_ipip_parms4_has_ikey(const struct ip_tunnel_parm *parms) +static bool +mlxsw_sp_ipip_parms4_has_ikey(const struct ip_tunnel_parm_kern *parms) { return !!(parms->i_flags & TUNNEL_KEY); } @@ -34,7 +35,8 @@ static bool mlxsw_sp_ipip_parms6_has_ikey(const struct __ip6_tnl_parm *parms) return !!(parms->i_flags & TUNNEL_KEY); } -static bool mlxsw_sp_ipip_parms4_has_okey(const struct ip_tunnel_parm *parms) +static bool +mlxsw_sp_ipip_parms4_has_okey(const struct ip_tunnel_parm_kern *parms) { return !!(parms->o_flags & TUNNEL_KEY); } @@ -44,7 +46,7 @@ static bool mlxsw_sp_ipip_parms6_has_okey(const struct __ip6_tnl_parm *parms) return !!(parms->o_flags & TUNNEL_KEY); } -static u32 mlxsw_sp_ipip_parms4_ikey(const struct ip_tunnel_parm *parms) +static u32 mlxsw_sp_ipip_parms4_ikey(const struct ip_tunnel_parm_kern *parms) { return mlxsw_sp_ipip_parms4_has_ikey(parms) ? be32_to_cpu(parms->i_key) : 0; @@ -56,7 +58,7 @@ static u32 mlxsw_sp_ipip_parms6_ikey(const struct __ip6_tnl_parm *parms) be32_to_cpu(parms->i_key) : 0; } -static u32 mlxsw_sp_ipip_parms4_okey(const struct ip_tunnel_parm *parms) +static u32 mlxsw_sp_ipip_parms4_okey(const struct ip_tunnel_parm_kern *parms) { return mlxsw_sp_ipip_parms4_has_okey(parms) ? be32_to_cpu(parms->o_key) : 0; @@ -69,7 +71,7 @@ static u32 mlxsw_sp_ipip_parms6_okey(const struct __ip6_tnl_parm *parms) } static union mlxsw_sp_l3addr -mlxsw_sp_ipip_parms4_saddr(const struct ip_tunnel_parm *parms) +mlxsw_sp_ipip_parms4_saddr(const struct ip_tunnel_parm_kern *parms) { return (union mlxsw_sp_l3addr) { .addr4 = parms->iph.saddr }; } @@ -81,7 +83,7 @@ mlxsw_sp_ipip_parms6_saddr(const struct __ip6_tnl_parm *parms) } static union mlxsw_sp_l3addr -mlxsw_sp_ipip_parms4_daddr(const struct ip_tunnel_parm *parms) +mlxsw_sp_ipip_parms4_daddr(const struct ip_tunnel_parm_kern *parms) { return (union mlxsw_sp_l3addr) { .addr4 = parms->iph.daddr }; } @@ -96,7 +98,7 @@ union mlxsw_sp_l3addr mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto, const struct net_device *ol_dev) { - struct ip_tunnel_parm parms4; + struct ip_tunnel_parm_kern parms4; struct __ip6_tnl_parm parms6; switch (proto) { @@ -115,7 +117,9 @@ mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto, static __be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev) { - struct ip_tunnel_parm parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev); + struct ip_tunnel_parm_kern parms4; + + parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev); return mlxsw_sp_ipip_parms4_daddr(&parms4).addr4; } @@ -124,7 +128,7 @@ static union mlxsw_sp_l3addr mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto, const struct net_device *ol_dev) { - struct ip_tunnel_parm parms4; + struct ip_tunnel_parm_kern parms4; struct __ip6_tnl_parm parms6; switch (proto) { @@ -150,7 +154,7 @@ bool mlxsw_sp_l3addr_is_zero(union mlxsw_sp_l3addr addr) static struct mlxsw_sp_ipip_parms mlxsw_sp_ipip_netdev_parms_init_gre4(const struct net_device *ol_dev) { - struct ip_tunnel_parm parms = mlxsw_sp_ipip_netdev_parms4(ol_dev); + struct ip_tunnel_parm_kern parms = mlxsw_sp_ipip_netdev_parms4(ol_dev); return (struct mlxsw_sp_ipip_parms) { .proto = MLXSW_SP_L3_PROTO_IPV4, @@ -187,8 +191,8 @@ mlxsw_sp_ipip_decap_config_gre4(struct mlxsw_sp *mlxsw_sp, { u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb); u16 ul_rif_id = mlxsw_sp_ipip_lb_ul_rif_id(ipip_entry->ol_lb); + struct ip_tunnel_parm_kern parms; char rtdp_pl[MLXSW_REG_RTDP_LEN]; - struct ip_tunnel_parm parms; unsigned int type_check; bool has_ikey; u32 daddr4; @@ -252,7 +256,7 @@ static struct mlxsw_sp_rif_ipip_lb_config mlxsw_sp_ipip_ol_loopback_config_gre4(struct mlxsw_sp *mlxsw_sp, const struct net_device *ol_dev) { - struct ip_tunnel_parm parms = mlxsw_sp_ipip_netdev_parms4(ol_dev); + struct ip_tunnel_parm_kern parms = mlxsw_sp_ipip_netdev_parms4(ol_dev); enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt; lb_ipipt = mlxsw_sp_ipip_parms4_has_okey(&parms) ? diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h index a35f009da561..a66173779641 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h @@ -9,7 +9,7 @@ #include #include -struct ip_tunnel_parm +struct ip_tunnel_parm_kern mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev); struct __ip6_tnl_parm mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index af50ff9e5f26..f0cbc6b9b041 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -413,8 +413,8 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, __be32 *saddrp, __be32 *daddrp) { struct ip_tunnel *tun = netdev_priv(to_dev); + struct ip_tunnel_parm_kern parms; struct net_device *dev = NULL; - struct ip_tunnel_parm parms; struct rtable *rt = NULL; struct flowi4 fl4; @@ -451,7 +451,7 @@ mlxsw_sp_span_entry_gretap4_parms(struct mlxsw_sp *mlxsw_sp, const struct net_device *to_dev, struct mlxsw_sp_span_parms *sparmsp) { - struct ip_tunnel_parm tparm = mlxsw_sp_ipip_netdev_parms4(to_dev); + struct ip_tunnel_parm_kern tparm = mlxsw_sp_ipip_netdev_parms4(to_dev); union mlxsw_sp_l3addr saddr = { .addr4 = tparm.iph.saddr }; union mlxsw_sp_l3addr daddr = { .addr4 = tparm.iph.daddr }; bool inherit_tos = tparm.iph.tos & 0x1; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e41d30ebaca6..55c7cf9404a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,7 @@ struct ethtool_ops; struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; -struct ip_tunnel_parm; +struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; struct netdev_name_node; @@ -1327,7 +1327,7 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, + * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); @@ -1583,7 +1583,8 @@ struct net_device_ops { int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); int (*ndo_tunnel_ctl)(struct net_device *dev, - struct ip_tunnel_parm *p, int cmd); + struct ip_tunnel_parm_kern *p, + int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5cd64bb2104d..20f0319ab149 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -110,6 +110,17 @@ struct ip_tunnel_prl_entry { struct metadata_dst; +/* Kernel-side copy of ip_tunnel_parm */ +struct ip_tunnel_parm_kern { + char name[IFNAMSIZ]; + int link; + __be16 i_flags; + __be16 o_flags; + __be32 i_key; + __be32 o_key; + struct iphdr iph; +}; + struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; @@ -136,7 +147,7 @@ struct ip_tunnel { struct dst_cache dst_cache; - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ @@ -291,7 +302,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data); +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); @@ -307,16 +322,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); + struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); + struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms); + struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7b16c211b904..8853c065a985 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -785,7 +785,8 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, +static int ipgre_tunnel_ctl(struct net_device *dev, + struct ip_tunnel_parm_kern *p, int cmd) { int err; @@ -1131,7 +1132,7 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1198,7 +1199,7 @@ static int ipgre_netlink_parms(struct net_device *dev, static int erspan_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1357,7 +1358,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1375,7 +1376,7 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1394,8 +1395,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1423,8 +1424,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1496,7 +1497,7 @@ static size_t ipgre_get_size(const struct net_device *dev) static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; + struct ip_tunnel_parm_kern *p = &t->parms; __be16 o_flags = p->o_flags; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1b8d8ff9a237..d10f41f81463 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,7 +56,7 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, +static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p, __be16 flags, __be32 key) { if (p->i_flags & TUNNEL_KEY) { @@ -171,7 +171,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { unsigned int h; __be32 remote; @@ -206,7 +206,7 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, int type) { __be32 remote = parms->iph.daddr; @@ -230,7 +230,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { int err; struct ip_tunnel *tunnel; @@ -326,7 +326,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { struct ip_tunnel *nt; struct net_device *dev; @@ -871,7 +871,7 @@ EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, - struct ip_tunnel_parm *p, + struct ip_tunnel_parm_kern *p, bool set_mtu, __u32 fwmark) { @@ -903,7 +903,8 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, netdev_state_change(dev); } -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -1005,16 +1006,52 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data) +{ + struct ip_tunnel_parm p; + + if (copy_from_user(&p, data, sizeof(p))) + return false; + + strscpy(kp->name, p.name); + kp->link = p.link; + kp->i_flags = p.i_flags; + kp->o_flags = p.o_flags; + kp->i_key = p.i_key; + kp->o_key = p.o_key; + memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph))); + + return true; +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user); + +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp) +{ + struct ip_tunnel_parm p; + + strscpy(p.name, kp->name); + p.link = kp->link; + p.i_flags = kp->i_flags; + p.o_flags = kp->o_flags; + p.i_key = kp->i_key; + p.o_key = kp->o_key; + memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph))); + + return !copy_to_user(data, &p, sizeof(p)); +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user); + int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; int err; - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); - if (!err && copy_to_user(data, &p, sizeof(p))) + if (!err && !ip_tunnel_parm_to_user(data, &p)) return -EFAULT; return err; } @@ -1093,7 +1130,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; unsigned int i; itn->rtnl_link_ops = ops; @@ -1171,7 +1208,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *nt; struct net *net = dev_net(dev); @@ -1225,7 +1262,7 @@ err_register_netdevice: EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 80ccd6661aa3..98f136b07191 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -1116,7 +1116,7 @@ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); void ip_tunnel_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index ee587adb169f..7e595f619615 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -167,7 +167,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parms = &tunnel->parms; + struct ip_tunnel_parm_kern *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; @@ -373,7 +373,7 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { int err = 0; @@ -531,7 +531,7 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void vti_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -566,7 +566,7 @@ static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); @@ -578,8 +578,8 @@ static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); @@ -606,7 +606,7 @@ static size_t vti_get_size(const struct net_device *dev) static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; + struct ip_tunnel_parm_kern *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index f2696eaadbe6..db960cd08a3b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -330,7 +330,7 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || @@ -405,8 +405,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, bool *collect_md, - __u32 *fwmark) + struct ip_tunnel_parm_kern *parms, + bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -432,8 +432,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { @@ -452,8 +452,8 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; @@ -510,7 +510,7 @@ static size_t ipip_get_size(const struct net_device *dev) static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fd5c01c8489f..6c750bd13dd8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -441,7 +441,7 @@ static bool ipmr_init_vif_indev(const struct net_device *dev) static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 92db9b474f2b..82ad6b8fa007 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -63,6 +63,7 @@ #include #include +#include #include #include #include @@ -2917,7 +2918,7 @@ put: static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, struct in6_ifreq *ireq) { - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 655c9b1a19b8..894e6a4c4be3 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -132,8 +132,8 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, - struct ip_tunnel_parm *parms) +static struct ip_tunnel __rcu ** +__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -226,7 +226,8 @@ out: } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) + struct ip_tunnel_parm_kern *parms, + int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -1135,7 +1136,8 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; } -static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, +static void ipip6_tunnel_update(struct ip_tunnel *t, + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct net *net = t->net; @@ -1196,11 +1198,11 @@ static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; struct ip_tunnel_6rd ip6rd; - struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1251,7 +1253,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) +__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1268,7 +1270,7 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1281,7 +1283,7 @@ ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1297,7 +1299,7 @@ ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1328,7 +1330,7 @@ ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1348,7 +1350,8 @@ ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { switch (cmd) { case SIOCGETTUNNEL: @@ -1490,7 +1493,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip6_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -1599,8 +1602,8 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD @@ -1687,7 +1690,7 @@ static size_t ipip6_get_size(const struct net_device *dev) static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || -- cgit v1.2.3 From b9495b564d91a0afe4125db64d2bc25c310bda9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:18 +0000 Subject: net: move kick_defer_list_purge() to net/core/dev.h kick_defer_list_purge() is defined in net/core/dev.c and used from net/core/skubff.c Because we need softnet_data, include from net/core/dev.h Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - net/core/dev.h | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 55c7cf9404a4..15e70527b703 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3288,7 +3288,6 @@ static inline void dev_xmit_recursion_dec(void) __this_cpu_dec(softnet_data.xmit.recursion); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); diff --git a/net/core/dev.h b/net/core/dev.h index 2bcaf8eee50c..9d0f8b4587f8 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -4,11 +4,9 @@ #include #include +#include struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; @@ -150,4 +148,6 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif struct napi_struct *napi_by_id(unsigned int napi_id); +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + #endif -- cgit v1.2.3 From 2fe50a4d7225cf10748775e290361896637091a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:19 +0000 Subject: net: move dev_xmit_recursion() helpers to net/core/dev.h Move dev_xmit_recursion() and friends to net/core/dev.h They are only used from net/core/dev.c and net/core/filter.c. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 17 ----------------- net/core/dev.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 15e70527b703..0cd9ee83f554 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3271,23 +3271,6 @@ static inline int dev_recursion_level(void) return this_cpu_read(softnet_data.xmit.recursion); } -#define XMIT_RECURSION_LIMIT 8 -static inline bool dev_xmit_recursion(void) -{ - return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > - XMIT_RECURSION_LIMIT); -} - -static inline void dev_xmit_recursion_inc(void) -{ - __this_cpu_inc(softnet_data.xmit.recursion); -} - -static inline void dev_xmit_recursion_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.recursion); -} - void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); diff --git a/net/core/dev.h b/net/core/dev.h index 9d0f8b4587f8..8572d2c8dc4a 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -150,4 +150,21 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { } struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +#define XMIT_RECURSION_LIMIT 8 +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} + #endif -- cgit v1.2.3 From a7ae7b0b2ea014ff3ed4be812c3efa1b1d86e153 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:21 +0000 Subject: net: make softnet_data.dropped an atomic_t If under extreme cpu backlog pressure enqueue_to_backlog() has to drop a packet, it could do this without dirtying a cache line and potentially slowing down the target cpu. Move sd->dropped into a separate cache line, and make it atomic. In non pressure mode, this field is not touched, no need to consume valuable space in a hot cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- net/core/dev.c | 13 +++++++++---- net/core/net-procfs.c | 3 ++- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0cd9ee83f554..be9f071a340d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3237,10 +3237,11 @@ struct softnet_data { unsigned int input_queue_tail; #endif unsigned int received_rps; - unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + atomic_t dropped ____cacheline_aligned_in_smp; + /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; int defer_count; diff --git a/net/core/dev.c b/net/core/dev.c index 4ad7836365e6..02c98f115243 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4800,17 +4800,22 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) goto bad_dev; + reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog)) + goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(net_hotdata.max_backlog) && - !skb_flow_limit(skb, qlen)) { + if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -4826,11 +4831,11 @@ enqueue: napi_schedule_rps(sd); goto enqueue; } - reason = SKB_DROP_REASON_CPU_BACKLOG; - sd->dropped++; backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: + atomic_inc(&sd->dropped); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index a97eceb84e61..fa6d3969734a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -144,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + sd->processed, atomic_read(&sd->dropped), + sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ sd->received_rps, flow_limit_count, -- cgit v1.2.3 From 36b83ffcf209a2e6099dae1070df6a2001dfab27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:23 +0000 Subject: net: rps: change input_queue_tail_incr_save() input_queue_tail_incr_save() is incrementing the sd queue_tail and save it in the flow last_qtail. Two issues here : - no lock protects the write on last_qtail, we should use appropriate annotations. - We can perform this write after releasing the per-cpu backlog lock, to decrease this lock hold duration (move away the cache line miss) Also move input_queue_head_incr() and rps helpers to include/net/rps.h, while adding rps_ prefix to better reflect their role. v2: Fixed a build issue (Jakub and kernel build bots) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 15 --------------- include/net/rps.h | 23 +++++++++++++++++++++++ net/core/dev.c | 20 ++++++++++++-------- 3 files changed, 35 insertions(+), 23 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be9f071a340d..2c11c295cf64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3250,21 +3250,6 @@ struct softnet_data { call_single_data_t defer_csd; }; -static inline void input_queue_head_incr(struct softnet_data *sd) -{ -#ifdef CONFIG_RPS - sd->input_queue_head++; -#endif -} - -static inline void input_queue_tail_incr_save(struct softnet_data *sd, - unsigned int *qtail) -{ -#ifdef CONFIG_RPS - *qtail = ++sd->input_queue_tail; -#endif -} - DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); static inline int dev_recursion_level(void) diff --git a/include/net/rps.h b/include/net/rps.h index 7660243e905b..10ca25731c1e 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -122,4 +122,27 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } +static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return ++sd->input_queue_tail; +#else + return 0; +#endif +} + +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(*dest, tail); +#endif +} + +static inline void rps_input_queue_head_incr(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + sd->input_queue_head++; +#endif +} + #endif /* _NET_RPS_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 0a8ccb0451c3..79073bbc9a64 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4611,7 +4611,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) { + READ_ONCE(rflow->last_qtail))) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } @@ -4666,7 +4666,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < + READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } @@ -4801,6 +4801,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned long flags; unsigned int qlen; int max_backlog; + u32 tail; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) @@ -4825,8 +4826,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, napi_schedule_rps(sd); } __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); + tail = rps_input_queue_tail_incr(sd); backlog_unlock_irq_restore(sd, &flags); + + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } @@ -5904,7 +5908,7 @@ static void flush_backlog(struct work_struct *work) if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } backlog_unlock_irq_enable(sd); @@ -5913,7 +5917,7 @@ static void flush_backlog(struct work_struct *work) if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } local_bh_enable(); @@ -6041,7 +6045,7 @@ static int process_backlog(struct napi_struct *napi, int quota) rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); if (++work >= quota) return work; @@ -11455,11 +11459,11 @@ static int dev_cpu_dead(unsigned int oldcpu) /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } return 0; -- cgit v1.2.3 From d3ae5f4632c107d3c2eeb97a60fecc6a6f9d6fbe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:25 +0000 Subject: net: rps: move received_rps field to a better location Commit 14d898f3c1b3 ("dev: Move received_rps counter next to RPS members in softnet data") was unfortunate: received_rps is dirtied by a cpu and never read by other cpus in fast path. Its presence in the hot RPS cache line (shared by many cpus) is hurting RPS/RFS performance. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2c11c295cf64..7d12b5a9380f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3204,6 +3204,7 @@ struct softnet_data { struct softnet_data *rps_ipi_list; #endif + unsigned int received_rps; bool in_net_rx_action; bool in_napi_threaded_poll; @@ -3236,7 +3237,6 @@ struct softnet_data { unsigned int cpu; unsigned int input_queue_tail; #endif - unsigned int received_rps; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; -- cgit v1.2.3 From b1f81b9a535b48b2c9ca460720a2bc73fd2001de Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 Mar 2024 08:27:50 +0100 Subject: netdevice: add DEFINE_FREE() for dev_put For short netdev holds within a function there are still a lot of users of dev_put() rather than netdev_put(). Add DEFINE_FREE() to allow making those safer. Signed-off-by: Johannes Berg Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d12b5a9380f..0c198620ac93 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4097,6 +4097,8 @@ static inline void dev_put(struct net_device *dev) netdev_put(dev, NULL); } +DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T)) + static inline void netdev_ref_replace(struct net_device *odev, struct net_device *ndev, netdevice_tracker *tracker, -- cgit v1.2.3 From 6916e461e7933d3d003441291c543938f2ccb371 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 4 Apr 2024 11:29:51 +0200 Subject: net: phy: Introduce ethernet link topology representation Link topologies containing multiple network PHYs attached to the same net_device can be found when using a PHY as a media converter for use with an SFP connector, on which an SFP transceiver containing a PHY can be used. With the current model, the transceiver's PHY can't be used for operations such as cable testing, timestamping, macsec offload, etc. The reason being that most of the logic for these configuration, coming from either ethtool netlink or ioctls tend to use netdev->phydev, which in multi-phy systems will reference the PHY closest to the MAC. Introduce a numbering scheme allowing to enumerate PHY devices that belong to any netdev, which can in turn allow userspace to take more precise decisions with regard to each PHY's configuration. The numbering is maintained per-netdev, in a phy_device_list. The numbering works similarly to a netdevice's ifindex, with identifiers that are only recycled once INT_MAX has been reached. This prevents races that could occur between PHY listing and SFP transceiver removal/insertion. The identifiers are assigned at phy_attach time, as the numbering depends on the netdevice the phy is attached to. The PHY index can be re-used for PHYs that are persistent. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- MAINTAINERS | 2 + drivers/net/phy/Makefile | 2 +- drivers/net/phy/phy_device.c | 7 +++ drivers/net/phy/phy_link_topology.c | 105 +++++++++++++++++++++++++++++++++ include/linux/netdevice.h | 4 +- include/linux/phy.h | 4 ++ include/linux/phy_link_topology.h | 72 ++++++++++++++++++++++ include/linux/phy_link_topology_core.h | 25 ++++++++ include/uapi/linux/ethtool.h | 16 +++++ net/core/dev.c | 9 +++ 10 files changed, 244 insertions(+), 2 deletions(-) create mode 100644 drivers/net/phy/phy_link_topology.c create mode 100644 include/linux/phy_link_topology.h create mode 100644 include/linux/phy_link_topology_core.h (limited to 'include/linux/netdevice.h') diff --git a/MAINTAINERS b/MAINTAINERS index 046598b471b7..4745ea94d463 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8015,6 +8015,8 @@ F: include/linux/mii.h F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h +F: include/linux/phy_link_topology.h +F: include/linux/phy_link_topology_core.h F: include/linux/phylib_stubs.h F: include/linux/platform_data/mdio-bcm-unimac.h F: include/linux/platform_data/mdio-gpio.h diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 202ed7f450da..1d8be374915f 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -2,7 +2,7 @@ # Makefile for Linux PHY drivers libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ - linkmode.o + linkmode.o phy_link_topology.o mdio-bus-y += mdio_bus.o mdio_device.o ifdef CONFIG_MDIO_DEVICE diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 6c6ec9475709..452fc8b3406d 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1511,6 +1512,11 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, if (phydev->sfp_bus_attached) dev->sfp_bus = phydev->sfp_bus; + + err = phy_link_topo_add_phy(dev->link_topo, phydev, + PHY_UPSTREAM_MAC, dev); + if (err) + goto error; } /* Some Ethernet drivers try to connect to a PHY device before @@ -1938,6 +1944,7 @@ void phy_detach(struct phy_device *phydev) if (dev) { phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; + phy_link_topo_del_phy(dev->link_topo, phydev); } phydev->phylink = NULL; diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c new file mode 100644 index 000000000000..985941c5c558 --- /dev/null +++ b/drivers/net/phy/phy_link_topology.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Infrastructure to handle all PHY devices connected to a given netdev, + * either directly or indirectly attached. + * + * Copyright (c) 2023 Maxime Chevallier + */ + +#include +#include +#include +#include +#include + +struct phy_link_topology *phy_link_topo_create(struct net_device *dev) +{ + struct phy_link_topology *topo; + + topo = kzalloc(sizeof(*topo), GFP_KERNEL); + if (!topo) + return ERR_PTR(-ENOMEM); + + xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1); + topo->next_phy_index = 1; + + return topo; +} + +void phy_link_topo_destroy(struct phy_link_topology *topo) +{ + if (!topo) + return; + + xa_destroy(&topo->phys); + kfree(topo); +} + +int phy_link_topo_add_phy(struct phy_link_topology *topo, + struct phy_device *phy, + enum phy_upstream upt, void *upstream) +{ + struct phy_device_node *pdn; + int ret; + + pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); + if (!pdn) + return -ENOMEM; + + pdn->phy = phy; + switch (upt) { + case PHY_UPSTREAM_MAC: + pdn->upstream.netdev = (struct net_device *)upstream; + if (phy_on_sfp(phy)) + pdn->parent_sfp_bus = pdn->upstream.netdev->sfp_bus; + break; + case PHY_UPSTREAM_PHY: + pdn->upstream.phydev = (struct phy_device *)upstream; + if (phy_on_sfp(phy)) + pdn->parent_sfp_bus = pdn->upstream.phydev->sfp_bus; + break; + default: + ret = -EINVAL; + goto err; + } + pdn->upstream_type = upt; + + /* Attempt to re-use a previously allocated phy_index */ + if (phy->phyindex) { + ret = xa_insert(&topo->phys, phy->phyindex, pdn, GFP_KERNEL); + + /* Errors could be either -ENOMEM or -EBUSY. If the phy has an + * index, and there's another entry at the same index, this is + * unexpected and we still error-out + */ + if (ret) + goto err; + return 0; + } + + ret = xa_alloc_cyclic(&topo->phys, &phy->phyindex, pdn, xa_limit_32b, + &topo->next_phy_index, GFP_KERNEL); + if (ret) + goto err; + + return 0; + +err: + kfree(pdn); + return ret; +} +EXPORT_SYMBOL_GPL(phy_link_topo_add_phy); + +void phy_link_topo_del_phy(struct phy_link_topology *topo, + struct phy_device *phy) +{ + struct phy_device_node *pdn = xa_erase(&topo->phys, phy->phyindex); + + /* We delete the PHY from the topology, however we don't re-set the + * phy->phyindex field. If the PHY isn't gone, we can re-assign it the + * same index next time it's added back to the topology + */ + + kfree(pdn); +} +EXPORT_SYMBOL_GPL(phy_link_topo_del_phy); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0c198620ac93..d45f330d083d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,7 +40,6 @@ #include #endif #include - #include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -1974,6 +1974,7 @@ enum netdev_reg_state { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one + * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. @@ -2364,6 +2365,7 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif + struct phy_link_topology *link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; diff --git a/include/linux/phy.h b/include/linux/phy.h index e6e83304558e..8c848c79b1fd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -550,6 +550,9 @@ struct macsec_ops; * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. + * @phyindex: Unique id across the phy's parent tree of phys to address the PHY + * from userspace, similar to ifindex. A zero index means the PHY + * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -650,6 +653,7 @@ struct phy_device { struct device_link *devlink; + u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h new file mode 100644 index 000000000000..6b79feb607e7 --- /dev/null +++ b/include/linux/phy_link_topology.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PHY device list allow maintaining a list of PHY devices that are + * part of a netdevice's link topology. PHYs can for example be chained, + * as is the case when using a PHY that exposes an SFP module, on which an + * SFP transceiver that embeds a PHY is connected. + * + * This list can then be used by userspace to leverage individual PHY + * capabilities. + */ +#ifndef __PHY_LINK_TOPOLOGY_H +#define __PHY_LINK_TOPOLOGY_H + +#include +#include + +struct xarray; +struct phy_device; +struct net_device; +struct sfp_bus; + +struct phy_device_node { + enum phy_upstream upstream_type; + + union { + struct net_device *netdev; + struct phy_device *phydev; + } upstream; + + struct sfp_bus *parent_sfp_bus; + + struct phy_device *phy; +}; + +struct phy_link_topology { + struct xarray phys; + u32 next_phy_index; +}; + +static inline struct phy_device * +phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex) +{ + struct phy_device_node *pdn = xa_load(&topo->phys, phyindex); + + if (pdn) + return pdn->phy; + + return NULL; +} + +#if IS_REACHABLE(CONFIG_PHYLIB) +int phy_link_topo_add_phy(struct phy_link_topology *topo, + struct phy_device *phy, + enum phy_upstream upt, void *upstream); + +void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy); + +#else +static inline int phy_link_topo_add_phy(struct phy_link_topology *topo, + struct phy_device *phy, + enum phy_upstream upt, void *upstream) +{ + return 0; +} + +static inline void phy_link_topo_del_phy(struct phy_link_topology *topo, + struct phy_device *phy) +{ +} +#endif + +#endif /* __PHY_LINK_TOPOLOGY_H */ diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h new file mode 100644 index 000000000000..0a6479055745 --- /dev/null +++ b/include/linux/phy_link_topology_core.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PHY_LINK_TOPOLOGY_CORE_H +#define __PHY_LINK_TOPOLOGY_CORE_H + +struct phy_link_topology; + +#if IS_REACHABLE(CONFIG_PHYLIB) + +struct phy_link_topology *phy_link_topo_create(struct net_device *dev); +void phy_link_topo_destroy(struct phy_link_topology *topo); + +#else + +static inline struct phy_link_topology *phy_link_topo_create(struct net_device *dev) +{ + return NULL; +} + +static inline void phy_link_topo_destroy(struct phy_link_topology *topo) +{ +} + +#endif + +#endif /* __PHY_LINK_TOPOLOGY_CORE_H */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 11fc18988bc2..95c2f09f0d0a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2268,4 +2268,20 @@ struct ethtool_link_settings { * __u32 map_lp_advertising[link_mode_masks_nwords]; */ }; + +/** + * enum phy_upstream - Represents the upstream component a given PHY device + * is connected to, as in what is on the other end of the MII bus. Most PHYs + * will be attached to an Ethernet MAC controller, but in some cases, there's + * an intermediate PHY used as a media-converter, which will driver another + * MII interface as its output. + * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port, + * or ethernet controller) + * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter) + */ +enum phy_upstream { + PHY_UPSTREAM_MAC, + PHY_UPSTREAM_PHY, +}; + #endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 92f5bddbc2de..854a3a28a8d8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -158,6 +158,7 @@ #include #include #include +#include #include "dev.h" #include "net-sysfs.h" @@ -10962,6 +10963,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + dev->link_topo = phy_link_topo_create(dev); + if (IS_ERR(dev->link_topo)) { + dev->link_topo = NULL; + goto free_all; + } + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11050,6 +11057,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; + phy_link_topo_destroy(dev->link_topo); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); -- cgit v1.2.3 From c661050f93d3fd37a33c06041bb18a89688de7d2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 22 Apr 2024 05:38:56 -0700 Subject: net: create a dummy net_device allocator It is impossible to use init_dummy_netdev together with alloc_netdev() as the 'setup' argument. This is because alloc_netdev() initializes some fields in the net_device structure, and later init_dummy_netdev() memzero them all. This causes some problems as reported here: https://lore.kernel.org/all/20240322082336.49f110cc@kernel.org/ Split the init_dummy_netdev() function in two. Create a new function called init_dummy_netdev_core() that does not memzero the net_device structure. Then have init_dummy_netdev() memzero-ing and calling init_dummy_netdev_core(), keeping the old behaviour. init_dummy_netdev_core() is the new function that could be called as an argument for alloc_netdev(). Also, create a helper to allocate and initialize dummy net devices, leveraging init_dummy_netdev_core() as the setup argument. This function basically simplify the allocation of dummy devices, by allocating and initializing it. Freeing the device continue to be done through free_netdev() Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 56 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 18 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d45f330d083d..f849e7d110ed 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4519,6 +4519,9 @@ static inline void netif_addr_unlock_bh(struct net_device *dev) void ether_setup(struct net_device *dev); +/* Allocate dummy net_device */ +struct net_device *alloc_netdev_dummy(int sizeof_priv); + /* Support for loadable net-drivers */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, diff --git a/net/core/dev.c b/net/core/dev.c index 62b39d6b1d8f..e09aa3785c15 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10420,25 +10420,12 @@ err_free_name: } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initializes the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * This is useful if you are calling this function after alloc_netdev(), + * since it does not memset the net_device fields. */ -void init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev_core(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * as they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ @@ -10459,8 +10446,28 @@ void init_dummy_netdev(struct net_device *dev) * its refcount. */ } -EXPORT_SYMBOL_GPL(init_dummy_netdev); +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initializes the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +void init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * as they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + init_dummy_netdev_core(dev); +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); /** * register_netdev - register a network device @@ -11080,6 +11087,19 @@ void free_netdev(struct net_device *dev) } EXPORT_SYMBOL(free_netdev); +/** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev_core); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + /** * synchronize_net - Synchronize with packet receive processing * -- cgit v1.2.3 From 0840556e5a3a331b6932ef17dd4bc94445df3297 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 29 Apr 2024 18:58:12 -0700 Subject: net: Protect dev->name by seqlock. We will convert ioctl(SIOCGARP) to RCU, and then we need to copy dev->name which is currently protected by rtnl_lock(). This patch does the following: 1) Add seqlock netdev_rename_lock to protect dev->name 2) Add netdev_copy_name() that copies dev->name to buffer under netdev_rename_lock 3) Use netdev_copy_name() in netdev_get_name() and drop devnet_rename_sem Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89iJEWs7AYSJqGCUABeVqOCTkErponfZdT5kV-iD=-SajnQ@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240430015813.71143-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 27 +++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f849e7d110ed..41853424b41d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3136,6 +3136,7 @@ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); +void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, diff --git a/net/core/dev.c b/net/core/dev.c index cd7ba50eac15..e02d2363347e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -940,6 +940,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) } EXPORT_SYMBOL(dev_get_by_napi_id); +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} + /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace @@ -951,7 +963,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex) struct net_device *dev; int ret; - down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -960,12 +971,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex) goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); - up_read(&devnet_rename_sem); return ret; } @@ -1217,7 +1227,10 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); + write_sequnlock(&netdev_rename_lock); + if (err < 0) { up_write(&devnet_rename_sem); return err; @@ -1257,7 +1270,9 @@ rollback: if (err >= 0) { err = ret; down_write(&devnet_rename_sem); + write_seqlock(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; @@ -11403,8 +11418,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_net_set(dev, net); dev->ifindex = new_ifindex; - if (new_name[0]) /* Rename the netdev to prepared name */ + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); + } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); -- cgit v1.2.3 From c1742dcb6bda5fd535fbaa2145f0a180bc329aa6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 May 2024 17:39:26 +0000 Subject: net: no longer acquire RTNL in threaded_show() dev->threaded can be read locklessly, if we add corresponding READ_ONCE()/WRITE_ONCE() annotations. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240502173926.2010646-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- net/core/dev.c | 4 ++-- net/core/net-sysfs.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 41853424b41d..2814a15eed73 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2370,8 +2370,8 @@ struct net_device { struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; bool proto_down; + bool threaded; unsigned wol_enabled:1; - unsigned threaded:1; struct list_head net_notifier_list; diff --git a/net/core/dev.c b/net/core/dev.c index e02d2363347e..d6b24749eb2e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6531,7 +6531,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } } - dev->threaded = threaded; + WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. @@ -6622,7 +6622,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = 0; + dev->threaded = false; netif_napi_set_irq(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1f7f09e56771..4c27a360c294 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -605,13 +605,13 @@ static ssize_t threaded_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + rcu_read_lock(); if (dev_isalive(netdev)) - ret = sysfs_emit(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + + rcu_read_unlock(); - rtnl_unlock(); return ret; } -- cgit v1.2.3 From 087b24de5c825c53f15a9481b94f757223c20610 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 1 May 2024 23:25:40 +0000 Subject: queue_api: define queue api This API enables the net stack to reset the queues used for devmem TCP. Signed-off-by: Mina Almasry Signed-off-by: Shailend Chand Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/net/netdev_queues.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2814a15eed73..cf261fb89d73 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1957,6 +1957,7 @@ enum netdev_reg_state { * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops * @stat_ops: Optional ops for queue-aware statistics + * @queue_mgmt_ops: Optional ops for queue management * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size @@ -2340,6 +2341,8 @@ struct net_device { const struct netdev_stat_ops *stat_ops; + const struct netdev_queue_mgmt_ops *queue_mgmt_ops; + /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index c7ac4539eafc..e7b84f018cee 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -87,6 +87,37 @@ struct netdev_stat_ops { struct netdev_queue_stats_tx *tx); }; +/** + * struct netdev_queue_mgmt_ops - netdev ops for queue management + * + * @ndo_queue_mem_size: Size of the struct that describes a queue's memory. + * + * @ndo_queue_mem_alloc: Allocate memory for an RX queue at the specified index. + * The new memory is written at the specified address. + * + * @ndo_queue_mem_free: Free memory from an RX queue. + * + * @ndo_queue_start: Start an RX queue with the specified memory and at the + * specified index. + * + * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped + * queue's memory is written at the specified address. + */ +struct netdev_queue_mgmt_ops { + size_t ndo_queue_mem_size; + int (*ndo_queue_mem_alloc)(struct net_device *dev, + void *per_queue_mem, + int idx); + void (*ndo_queue_mem_free)(struct net_device *dev, + void *per_queue_mem); + int (*ndo_queue_start)(struct net_device *dev, + void *per_queue_mem, + int idx); + int (*ndo_queue_stop)(struct net_device *dev, + void *per_queue_mem, + int idx); +}; + /** * DOC: Lockless queue stopping / waking helpers. * -- cgit v1.2.3 From 5c1672705a1a2389f5ad78e0fea6f08ed32d6f18 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 13 May 2024 08:41:55 -0700 Subject: net: revert partially applied PHY topology series The series is causing issues with PHY drivers built as modules. Since it was only partially applied and the merge window has opened let's revert and try again for v6.11. Revert 6916e461e793 ("net: phy: Introduce ethernet link topology representation") Revert 0ec5ed6c130e ("net: sfp: pass the phy_device when disconnecting an sfp module's PHY") Revert e75e4e074c44 ("net: phy: add helpers to handle sfp phy connect/disconnect") Revert fdd353965b52 ("net: sfp: Add helper to return the SFP bus name") Revert 841942bc6212 ("net: ethtool: Allow passing a phy index for some commands") Link: https://lore.kernel.org/all/171242462917.4000.9759453824684907063.git-patchwork-notify@kernel.org/ Link: https://lore.kernel.org/all/20240507102822.2023826-1-maxime.chevallier@bootlin.com/ Link: https://lore.kernel.org/r/20240513154156.104281-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 7 -- MAINTAINERS | 2 - drivers/net/phy/Makefile | 2 +- drivers/net/phy/marvell-88x2222.c | 2 - drivers/net/phy/marvell.c | 2 - drivers/net/phy/marvell10g.c | 2 - drivers/net/phy/phy_device.c | 55 -------------- drivers/net/phy/phy_link_topology.c | 105 --------------------------- drivers/net/phy/phylink.c | 3 +- drivers/net/phy/qcom/at803x.c | 2 - drivers/net/phy/qcom/qca807x.c | 2 - drivers/net/phy/sfp-bus.c | 15 +--- include/linux/netdevice.h | 4 +- include/linux/phy.h | 6 -- include/linux/phy_link_topology.h | 72 ------------------ include/linux/phy_link_topology_core.h | 25 ------- include/linux/sfp.h | 8 +- include/uapi/linux/ethtool.h | 16 ---- include/uapi/linux/ethtool_netlink.h | 1 - net/core/dev.c | 9 --- net/ethtool/netlink.c | 48 +----------- net/ethtool/netlink.h | 5 -- 22 files changed, 8 insertions(+), 385 deletions(-) delete mode 100644 drivers/net/phy/phy_link_topology.c delete mode 100644 include/linux/phy_link_topology.h delete mode 100644 include/linux/phy_link_topology_core.h (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 8bc71f249448..160bfb0ae8ba 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -57,7 +57,6 @@ Structure of this header is ``ETHTOOL_A_HEADER_DEV_INDEX`` u32 device ifindex ``ETHTOOL_A_HEADER_DEV_NAME`` string device name ``ETHTOOL_A_HEADER_FLAGS`` u32 flags common for all requests - ``ETHTOOL_A_HEADER_PHY_INDEX`` u32 phy device index ============================== ====== ============================= ``ETHTOOL_A_HEADER_DEV_INDEX`` and ``ETHTOOL_A_HEADER_DEV_NAME`` identify the @@ -82,12 +81,6 @@ the behaviour is backward compatible, i.e. requests from old clients not aware of the flag should be interpreted the way the client expects. A client must not set flags it does not understand. -``ETHTOOL_A_HEADER_PHY_INDEX`` identifies the Ethernet PHY the message relates to. -As there are numerous commands that are related to PHY configuration, and because -there may be more than one PHY on the link, the PHY index can be passed in the -request for the commands that needs it. It is, however, not mandatory, and if it -is not passed for commands that target a PHY, the net_device.phydev pointer -is used. Bit sets ======== diff --git a/MAINTAINERS b/MAINTAINERS index 40eb26e6c60c..b4ed6480e919 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8027,8 +8027,6 @@ F: include/linux/mii.h F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h -F: include/linux/phy_link_topology.h -F: include/linux/phy_link_topology_core.h F: include/linux/phylib_stubs.h F: include/linux/platform_data/mdio-bcm-unimac.h F: include/linux/platform_data/mdio-gpio.h diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 1d8be374915f..202ed7f450da 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -2,7 +2,7 @@ # Makefile for Linux PHY drivers libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ - linkmode.o phy_link_topology.o + linkmode.o mdio-bus-y += mdio_bus.o mdio_device.o ifdef CONFIG_MDIO_DEVICE diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c index 0b777cdd7078..b88398e6872b 100644 --- a/drivers/net/phy/marvell-88x2222.c +++ b/drivers/net/phy/marvell-88x2222.c @@ -553,8 +553,6 @@ static const struct sfp_upstream_ops sfp_phy_ops = { .link_down = mv2222_sfp_link_down, .attach = phy_sfp_attach, .detach = phy_sfp_detach, - .connect_phy = phy_sfp_connect_phy, - .disconnect_phy = phy_sfp_disconnect_phy, }; static int mv2222_probe(struct phy_device *phydev) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 9964bf3dea2f..b89fbffa6a93 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -3613,8 +3613,6 @@ static const struct sfp_upstream_ops m88e1510_sfp_ops = { .module_remove = m88e1510_sfp_remove, .attach = phy_sfp_attach, .detach = phy_sfp_detach, - .connect_phy = phy_sfp_connect_phy, - .disconnect_phy = phy_sfp_disconnect_phy, }; static int m88e1510_probe(struct phy_device *phydev) diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 6642eb642d4b..ad43e280930c 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -503,8 +503,6 @@ static int mv3310_sfp_insert(void *upstream, const struct sfp_eeprom_id *id) static const struct sfp_upstream_ops mv3310_sfp_ops = { .attach = phy_sfp_attach, .detach = phy_sfp_detach, - .connect_phy = phy_sfp_connect_phy, - .disconnect_phy = phy_sfp_disconnect_phy, .module_insert = mv3310_sfp_insert, }; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 616bd7ba46cb..6c6ec9475709 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -277,14 +276,6 @@ static void phy_mdio_device_remove(struct mdio_device *mdiodev) static struct phy_driver genphy_driver; -static struct phy_link_topology *phy_get_link_topology(struct phy_device *phydev) -{ - if (phydev->attached_dev) - return phydev->attached_dev->link_topo; - - return NULL; -} - static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); @@ -1378,46 +1369,6 @@ phy_standalone_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(phy_standalone); -/** - * phy_sfp_connect_phy - Connect the SFP module's PHY to the upstream PHY - * @upstream: pointer to the upstream phy device - * @phy: pointer to the SFP module's phy device - * - * This helper allows keeping track of PHY devices on the link. It adds the - * SFP module's phy to the phy namespace of the upstream phy - */ -int phy_sfp_connect_phy(void *upstream, struct phy_device *phy) -{ - struct phy_device *phydev = upstream; - struct phy_link_topology *topo = phy_get_link_topology(phydev); - - if (topo) - return phy_link_topo_add_phy(topo, phy, PHY_UPSTREAM_PHY, phydev); - - return 0; -} -EXPORT_SYMBOL(phy_sfp_connect_phy); - -/** - * phy_sfp_disconnect_phy - Disconnect the SFP module's PHY from the upstream PHY - * @upstream: pointer to the upstream phy device - * @phy: pointer to the SFP module's phy device - * - * This helper allows keeping track of PHY devices on the link. It removes the - * SFP module's phy to the phy namespace of the upstream phy. As the module phy - * will be destroyed, re-inserting the same module will add a new phy with a - * new index. - */ -void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy) -{ - struct phy_device *phydev = upstream; - struct phy_link_topology *topo = phy_get_link_topology(phydev); - - if (topo) - phy_link_topo_del_phy(topo, phy); -} -EXPORT_SYMBOL(phy_sfp_disconnect_phy); - /** * phy_sfp_attach - attach the SFP bus to the PHY upstream network device * @upstream: pointer to the phy device @@ -1560,11 +1511,6 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, if (phydev->sfp_bus_attached) dev->sfp_bus = phydev->sfp_bus; - - err = phy_link_topo_add_phy(dev->link_topo, phydev, - PHY_UPSTREAM_MAC, dev); - if (err) - goto error; } /* Some Ethernet drivers try to connect to a PHY device before @@ -1992,7 +1938,6 @@ void phy_detach(struct phy_device *phydev) if (dev) { phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; - phy_link_topo_del_phy(dev->link_topo, phydev); } phydev->phylink = NULL; diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c deleted file mode 100644 index 985941c5c558..000000000000 --- a/drivers/net/phy/phy_link_topology.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Infrastructure to handle all PHY devices connected to a given netdev, - * either directly or indirectly attached. - * - * Copyright (c) 2023 Maxime Chevallier - */ - -#include -#include -#include -#include -#include - -struct phy_link_topology *phy_link_topo_create(struct net_device *dev) -{ - struct phy_link_topology *topo; - - topo = kzalloc(sizeof(*topo), GFP_KERNEL); - if (!topo) - return ERR_PTR(-ENOMEM); - - xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1); - topo->next_phy_index = 1; - - return topo; -} - -void phy_link_topo_destroy(struct phy_link_topology *topo) -{ - if (!topo) - return; - - xa_destroy(&topo->phys); - kfree(topo); -} - -int phy_link_topo_add_phy(struct phy_link_topology *topo, - struct phy_device *phy, - enum phy_upstream upt, void *upstream) -{ - struct phy_device_node *pdn; - int ret; - - pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); - if (!pdn) - return -ENOMEM; - - pdn->phy = phy; - switch (upt) { - case PHY_UPSTREAM_MAC: - pdn->upstream.netdev = (struct net_device *)upstream; - if (phy_on_sfp(phy)) - pdn->parent_sfp_bus = pdn->upstream.netdev->sfp_bus; - break; - case PHY_UPSTREAM_PHY: - pdn->upstream.phydev = (struct phy_device *)upstream; - if (phy_on_sfp(phy)) - pdn->parent_sfp_bus = pdn->upstream.phydev->sfp_bus; - break; - default: - ret = -EINVAL; - goto err; - } - pdn->upstream_type = upt; - - /* Attempt to re-use a previously allocated phy_index */ - if (phy->phyindex) { - ret = xa_insert(&topo->phys, phy->phyindex, pdn, GFP_KERNEL); - - /* Errors could be either -ENOMEM or -EBUSY. If the phy has an - * index, and there's another entry at the same index, this is - * unexpected and we still error-out - */ - if (ret) - goto err; - return 0; - } - - ret = xa_alloc_cyclic(&topo->phys, &phy->phyindex, pdn, xa_limit_32b, - &topo->next_phy_index, GFP_KERNEL); - if (ret) - goto err; - - return 0; - -err: - kfree(pdn); - return ret; -} -EXPORT_SYMBOL_GPL(phy_link_topo_add_phy); - -void phy_link_topo_del_phy(struct phy_link_topology *topo, - struct phy_device *phy) -{ - struct phy_device_node *pdn = xa_erase(&topo->phys, phy->phyindex); - - /* We delete the PHY from the topology, however we don't re-set the - * phy->phyindex field. If the PHY isn't gone, we can re-assign it the - * same index next time it's added back to the topology - */ - - kfree(pdn); -} -EXPORT_SYMBOL_GPL(phy_link_topo_del_phy); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index b7e5c669dc8e..994471fad833 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -3411,8 +3411,7 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) return ret; } -static void phylink_sfp_disconnect_phy(void *upstream, - struct phy_device *phydev) +static void phylink_sfp_disconnect_phy(void *upstream) { phylink_disconnect_phy(upstream); } diff --git a/drivers/net/phy/qcom/at803x.c b/drivers/net/phy/qcom/at803x.c index 105602581a03..c8f83e5f78ab 100644 --- a/drivers/net/phy/qcom/at803x.c +++ b/drivers/net/phy/qcom/at803x.c @@ -770,8 +770,6 @@ static const struct sfp_upstream_ops at8031_sfp_ops = { .attach = phy_sfp_attach, .detach = phy_sfp_detach, .module_insert = at8031_sfp_insert, - .connect_phy = phy_sfp_connect_phy, - .disconnect_phy = phy_sfp_disconnect_phy, }; static int at8031_parse_dt(struct phy_device *phydev) diff --git a/drivers/net/phy/qcom/qca807x.c b/drivers/net/phy/qcom/qca807x.c index 5eb0ab1cb70e..672c6929119a 100644 --- a/drivers/net/phy/qcom/qca807x.c +++ b/drivers/net/phy/qcom/qca807x.c @@ -699,8 +699,6 @@ static const struct sfp_upstream_ops qca807x_sfp_ops = { .detach = phy_sfp_detach, .module_insert = qca807x_sfp_insert, .module_remove = qca807x_sfp_remove, - .connect_phy = phy_sfp_connect_phy, - .disconnect_phy = phy_sfp_disconnect_phy, }; static int qca807x_probe(struct phy_device *phydev) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 37c85f1e6534..2f44fc51848f 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -487,7 +487,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus) bus->socket_ops->stop(bus->sfp); bus->socket_ops->detach(bus->sfp); if (bus->phydev && ops && ops->disconnect_phy) - ops->disconnect_phy(bus->upstream, bus->phydev); + ops->disconnect_phy(bus->upstream); } bus->registered = false; } @@ -743,7 +743,7 @@ void sfp_remove_phy(struct sfp_bus *bus) const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); if (ops && ops->disconnect_phy) - ops->disconnect_phy(bus->upstream, bus->phydev); + ops->disconnect_phy(bus->upstream); bus->phydev = NULL; } EXPORT_SYMBOL_GPL(sfp_remove_phy); @@ -860,14 +860,3 @@ void sfp_unregister_socket(struct sfp_bus *bus) sfp_bus_put(bus); } EXPORT_SYMBOL_GPL(sfp_unregister_socket); - -const char *sfp_get_name(struct sfp_bus *bus) -{ - ASSERT_RTNL(); - - if (bus->sfp_dev) - return dev_name(bus->sfp_dev); - - return NULL; -} -EXPORT_SYMBOL_GPL(sfp_get_name); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf261fb89d73..d20c6c99eb88 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,6 +40,7 @@ #include #endif #include + #include #include #include @@ -51,7 +52,6 @@ #include #include #include -#include struct netpoll_info; struct device; @@ -1975,7 +1975,6 @@ enum netdev_reg_state { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one - * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. @@ -2368,7 +2367,6 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif - struct phy_link_topology *link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; diff --git a/include/linux/phy.h b/include/linux/phy.h index 3ddfe7fe781a..e6e83304558e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -550,9 +550,6 @@ struct macsec_ops; * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. - * @phyindex: Unique id across the phy's parent tree of phys to address the PHY - * from userspace, similar to ifindex. A zero index means the PHY - * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -653,7 +650,6 @@ struct phy_device { struct device_link *devlink; - u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; @@ -1758,8 +1754,6 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); -int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); -void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h deleted file mode 100644 index 6b79feb607e7..000000000000 --- a/include/linux/phy_link_topology.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PHY device list allow maintaining a list of PHY devices that are - * part of a netdevice's link topology. PHYs can for example be chained, - * as is the case when using a PHY that exposes an SFP module, on which an - * SFP transceiver that embeds a PHY is connected. - * - * This list can then be used by userspace to leverage individual PHY - * capabilities. - */ -#ifndef __PHY_LINK_TOPOLOGY_H -#define __PHY_LINK_TOPOLOGY_H - -#include -#include - -struct xarray; -struct phy_device; -struct net_device; -struct sfp_bus; - -struct phy_device_node { - enum phy_upstream upstream_type; - - union { - struct net_device *netdev; - struct phy_device *phydev; - } upstream; - - struct sfp_bus *parent_sfp_bus; - - struct phy_device *phy; -}; - -struct phy_link_topology { - struct xarray phys; - u32 next_phy_index; -}; - -static inline struct phy_device * -phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex) -{ - struct phy_device_node *pdn = xa_load(&topo->phys, phyindex); - - if (pdn) - return pdn->phy; - - return NULL; -} - -#if IS_REACHABLE(CONFIG_PHYLIB) -int phy_link_topo_add_phy(struct phy_link_topology *topo, - struct phy_device *phy, - enum phy_upstream upt, void *upstream); - -void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy); - -#else -static inline int phy_link_topo_add_phy(struct phy_link_topology *topo, - struct phy_device *phy, - enum phy_upstream upt, void *upstream) -{ - return 0; -} - -static inline void phy_link_topo_del_phy(struct phy_link_topology *topo, - struct phy_device *phy) -{ -} -#endif - -#endif /* __PHY_LINK_TOPOLOGY_H */ diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h deleted file mode 100644 index 0a6479055745..000000000000 --- a/include/linux/phy_link_topology_core.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PHY_LINK_TOPOLOGY_CORE_H -#define __PHY_LINK_TOPOLOGY_CORE_H - -struct phy_link_topology; - -#if IS_REACHABLE(CONFIG_PHYLIB) - -struct phy_link_topology *phy_link_topo_create(struct net_device *dev); -void phy_link_topo_destroy(struct phy_link_topology *topo); - -#else - -static inline struct phy_link_topology *phy_link_topo_create(struct net_device *dev) -{ - return NULL; -} - -static inline void phy_link_topo_destroy(struct phy_link_topology *topo) -{ -} - -#endif - -#endif /* __PHY_LINK_TOPOLOGY_CORE_H */ diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 5ebc57f78c95..a45da7eef9a2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -544,7 +544,7 @@ struct sfp_upstream_ops { void (*link_down)(void *priv); void (*link_up)(void *priv); int (*connect_phy)(void *priv, struct phy_device *); - void (*disconnect_phy)(void *priv, struct phy_device *); + void (*disconnect_phy)(void *priv); }; #if IS_ENABLED(CONFIG_SFP) @@ -570,7 +570,6 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode); int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, const struct sfp_upstream_ops *ops); void sfp_bus_del_upstream(struct sfp_bus *bus); -const char *sfp_get_name(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -649,11 +648,6 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } - -static inline const char *sfp_get_name(struct sfp_bus *bus) -{ - return NULL; -} #endif #endif diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 041e09c3515d..8733a3117902 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2323,20 +2323,4 @@ struct ethtool_link_settings { * __u32 map_lp_advertising[link_mode_masks_nwords]; */ }; - -/** - * enum phy_upstream - Represents the upstream component a given PHY device - * is connected to, as in what is on the other end of the MII bus. Most PHYs - * will be attached to an Ethernet MAC controller, but in some cases, there's - * an intermediate PHY used as a media-converter, which will driver another - * MII interface as its output. - * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port, - * or ethernet controller) - * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter) - */ -enum phy_upstream { - PHY_UPSTREAM_MAC, - PHY_UPSTREAM_PHY, -}; - #endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index f17dbe54bf5e..b49b804b9495 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -132,7 +132,6 @@ enum { ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ ETHTOOL_A_HEADER_DEV_NAME, /* string */ ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ - ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_HEADER_CNT, diff --git a/net/core/dev.c b/net/core/dev.c index d2ce91a334c1..e1bb6d7856d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -158,7 +158,6 @@ #include #include #include -#include #include "dev.h" #include "net-sysfs.h" @@ -10998,12 +10997,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif - dev->link_topo = phy_link_topo_create(dev); - if (IS_ERR(dev->link_topo)) { - dev->link_topo = NULL; - goto free_all; - } - dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11092,8 +11085,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - phy_link_topo_destroy(dev->link_topo); - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 563e94e0cbd8..bd04f28d5cf4 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -4,7 +4,6 @@ #include #include #include "netlink.h" -#include static struct genl_family ethtool_genl_family; @@ -31,24 +30,6 @@ const struct nla_policy ethnl_header_policy_stats[] = { ETHTOOL_FLAGS_STATS), }; -const struct nla_policy ethnl_header_policy_phy[] = { - [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, - [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, - .len = ALTIFNAMSIZ - 1 }, - [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, - ETHTOOL_FLAGS_BASIC), - [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), -}; - -const struct nla_policy ethnl_header_policy_phy_stats[] = { - [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, - [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, - .len = ALTIFNAMSIZ - 1 }, - [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, - ETHTOOL_FLAGS_STATS), - [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), -}; - int ethnl_ops_begin(struct net_device *dev) { int ret; @@ -108,9 +89,8 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *header, struct net *net, struct netlink_ext_ack *extack, bool require_dev) { - struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)]; + struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy)]; const struct nlattr *devname_attr; - struct phy_device *phydev = NULL; struct net_device *dev = NULL; u32 flags = 0; int ret; @@ -124,7 +104,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, /* No validation here, command policy should have a nested policy set * for the header, therefore validation should have already been done. */ - ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header, + ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy) - 1, header, NULL, extack); if (ret < 0) return ret; @@ -165,30 +145,6 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, return -EINVAL; } - if (dev) { - if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { - struct nlattr *phy_id; - - phy_id = tb[ETHTOOL_A_HEADER_PHY_INDEX]; - phydev = phy_link_topo_get_phy(dev->link_topo, - nla_get_u32(phy_id)); - if (!phydev) { - NL_SET_BAD_ATTR(extack, phy_id); - return -ENODEV; - } - } else { - /* If we need a PHY but no phy index is specified, fallback - * to dev->phydev - */ - phydev = dev->phydev; - } - } else if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { - NL_SET_ERR_MSG_ATTR(extack, header, - "can't target a PHY without a netdev"); - return -EINVAL; - } - - req_info->phydev = phydev; req_info->dev = dev; req_info->flags = flags; return 0; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index d57a890b5d9e..9a333a8d04c1 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -250,7 +250,6 @@ static inline unsigned int ethnl_reply_header_size(void) * @dev: network device the request is for (may be null) * @dev_tracker: refcount tracker for @dev reference * @flags: request flags common for all request types - * @phydev: phy_device connected to @dev this request is for (may be null) * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at @@ -260,7 +259,6 @@ struct ethnl_req_info { struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; - struct phy_device *phydev; }; static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) @@ -397,12 +395,9 @@ extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; -extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; -extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1]; -extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; -- cgit v1.2.3