Diffstat (limited to 'net/core')
 net/core/Makefile         |   2
 net/core/bpf_sk_storage.c |   4
 net/core/dev.c            | 342
 net/core/dev_addr_lists.c | 144
 net/core/dev_ioctl.c      | 264
 net/core/devlink.c        | 680
 net/core/drop_monitor.c   |   6
 net/core/dst.c            |   6
 net/core/fib_rules.c      |   4
 net/core/filter.c         |  96
 net/core/flow_dissector.c |  12
 net/core/flow_offload.c   |  90
 net/core/lwtunnel.c       |   2
 net/core/neighbour.c      |  29
 net/core/net-procfs.c     |  24
 net/core/net_namespace.c  |  52
 net/core/page_pool.c      | 114
 net/core/pktgen.c         | 167
 net/core/rtnetlink.c      |  31
 net/core/scm.c            |   4
 net/core/selftests.c      |  12
 net/core/skbuff.c         |  75
 net/core/sock.c           |  31
 net/core/sock_map.c       |  22
24 files changed, 1503 insertions(+), 710 deletions(-)
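
One recurring change in the diff below is the dev_addr_lists.c rework: each netdev hardware-address list gains an rbtree index ordered first by the raw address bytes (memcmp) and then by address type, so duplicate detection in __hw_addr_add_ex() and deletion via __hw_addr_lookup() become O(log n) tree walks instead of linear list scans. Below is a minimal userspace sketch of that ordering, using POSIX tsearch()/tfind() as a stand-in for the kernel rbtree; the struct and names are illustrative, not the kernel's.

/* Sketch: tree keyed the same way as __hw_addr_add_ex() in the diff
 * below -- address bytes first, then address type.
 */
#include <search.h>
#include <stdio.h>
#include <string.h>

#define ADDR_LEN 6

struct hw_addr {
	unsigned char addr[ADDR_LEN];
	unsigned char type;
};

/* Three-way comparison: raw address bytes, then type as tiebreaker. */
static int hw_addr_cmp(const void *a, const void *b)
{
	const struct hw_addr *ha = a, *hb = b;
	int diff = memcmp(ha->addr, hb->addr, ADDR_LEN);

	return diff ? diff : (int)ha->type - (int)hb->type;
}

int main(void)
{
	static struct hw_addr entries[] = {
		{ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, 1 },
		{ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x66 }, 1 },
	};
	struct hw_addr key = { { 0x00, 0x11, 0x22, 0x33, 0x44, 0x66 }, 1 };
	void *root = NULL;
	size_t i;

	for (i = 0; i < 2; i++)
		tsearch(&entries[i], &root, hw_addr_cmp);

	/* tfind() is the O(log n) lookup that replaces the old list walk. */
	printf("found: %s\n", tfind(&key, &root, hw_addr_cmp) ? "yes" : "no");
	return 0;
}
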
diff --git a/net/core/Makefile b/net/core/Makefile index f7f16650fe9e..35ced6201814 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,8 +33,6 @@ obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o -ifeq ($(CONFIG_INET),y) obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o -endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index f564f82e91d9..68d2cbf8331a 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -416,7 +416,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { - if (in_irq() || in_nmi()) + if (in_hardirq() || in_nmi()) return (unsigned long)NULL; return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); @@ -425,7 +425,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { - if (in_irq() || in_nmi()) + if (in_hardirq() || in_nmi()) return -EPERM; return ____bpf_sk_storage_delete(map, sk); diff --git a/net/core/dev.c b/net/core/dev.c index 8f1a47ad6781..74fd402d26dd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -676,131 +676,6 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); -/****************************************************************************** - * - * Device Boot-time Settings Routines - * - ******************************************************************************/ - -/* Boot time configuration table */ -static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; - -/** - * netdev_boot_setup_add - add new setup entry - * @name: name of the device - * @map: configured settings for the device - * - * Adds new setup entry to the dev_boot_setup list. The function - * returns 0 on error and 1 on success. This is a generic routine to - * all netdevices. - */ -static int netdev_boot_setup_add(char *name, struct ifmap *map) -{ - struct netdev_boot_setup *s; - int i; - - s = dev_boot_setup; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { - memset(s[i].name, 0, sizeof(s[i].name)); - strlcpy(s[i].name, name, IFNAMSIZ); - memcpy(&s[i].map, map, sizeof(s[i].map)); - break; - } - } - - return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; -} - -/** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice - * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. - */ -int netdev_boot_setup_check(struct net_device *dev) -{ - struct netdev_boot_setup *s = dev_boot_setup; - int i; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; - return 1; - } - } - return 0; -} -EXPORT_SYMBOL(netdev_boot_setup_check); - - -/** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device - * - * Check boot time settings for the base address of device. 
- * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. - */ -unsigned long netdev_boot_base(const char *prefix, int unit) -{ - const struct netdev_boot_setup *s = dev_boot_setup; - char name[IFNAMSIZ]; - int i; - - sprintf(name, "%s%d", prefix, unit); - - /* - * If device already registered then return base of 1 - * to indicate not to probe for this interface - */ - if (__dev_get_by_name(&init_net, name)) - return 1; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) - if (!strcmp(name, s[i].name)) - return s[i].map.base_addr; - return 0; -} - -/* - * Saves at boot time configured settings for any netdevice. - */ -int __init netdev_boot_setup(char *str) -{ - int ints[5]; - struct ifmap map; - - str = get_options(str, ARRAY_SIZE(ints), ints); - if (!str || !*str) - return 0; - - /* Save settings */ - memset(&map, 0, sizeof(map)); - if (ints[0] > 0) - map.irq = ints[1]; - if (ints[0] > 1) - map.base_addr = ints[2]; - if (ints[0] > 2) - map.mem_start = ints[3]; - if (ints[0] > 3) - map.mem_end = ints[4]; - - /* Add new entry to the list */ - return netdev_boot_setup_add(str, &map); -} - -__setup("netdev=", netdev_boot_setup); - /******************************************************************************* * * Device Interface Subroutines @@ -956,8 +831,7 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -1030,8 +904,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -3099,6 +2972,50 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** + * netif_set_real_num_queues - set actual number of RX and TX queues used + * @dev: Network device + * @txq: Actual number of TX queues + * @rxq: Actual number of RX queues + * + * Set the real number of both TX and RX queues. + * Does nothing if the number of queues is already correct. + */ +int netif_set_real_num_queues(struct net_device *dev, + unsigned int txq, unsigned int rxq) +{ + unsigned int old_rxq = dev->real_num_rx_queues; + int err; + + if (txq < 1 || txq > dev->num_tx_queues || + rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + /* Start from increases, so the error path only does decreases - + * decreases can't fail. 
+ */ + if (rxq > dev->real_num_rx_queues) { + err = netif_set_real_num_rx_queues(dev, rxq); + if (err) + return err; + } + if (txq > dev->real_num_tx_queues) { + err = netif_set_real_num_tx_queues(dev, txq); + if (err) + goto undo_rx; + } + if (rxq < dev->real_num_rx_queues) + WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); + if (txq < dev->real_num_tx_queues) + WARN_ON(netif_set_real_num_tx_queues(dev, txq)); + + return 0; +undo_rx: + WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); + return err; +} +EXPORT_SYMBOL(netif_set_real_num_queues); + +/** * netif_get_num_default_rss_queues - default number of RSS queues * * This routine should set an upper limit on the number of RSS queues @@ -3190,7 +3107,7 @@ EXPORT_SYMBOL(__dev_kfree_skb_irq); void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { - if (in_irq() || irqs_disabled()) + if (in_hardirq() || irqs_disabled()) __dev_kfree_skb_irq(skb, reason); else dev_kfree_skb(skb); @@ -4012,7 +3929,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -4756,45 +4673,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) return rxqueue; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - u32 metalen, act = XDP_DROP; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; + u32 metalen, act; int off; - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_is_redirected(skb)) - return XDP_PASS; - - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. - */ - if (skb_cloned(skb) || skb_is_nonlinear(skb) || - skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) - goto do_drop; - } - /* The XDP program wants to see the packet starting at the MAC * header. */ @@ -4849,6 +4739,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb->protocol = eth_type_trans(skb, skb->dev); } + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). 
+ */ switch (act) { case XDP_REDIRECT: case XDP_TX: @@ -4859,6 +4756,49 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (metalen) skb_metadata_set(skb, metalen); break; + } + + return act; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (skb_linearize(skb)) + goto do_drop; + } + + act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; default: bpf_warn_invalid_xdp_action(act); fallthrough; @@ -5141,8 +5081,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, - &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -5324,7 +5263,6 @@ another_round: ret = NET_RX_DROP; goto out; } - skb_reset_mac_len(skb); } if (eth_type_vlan(skb->protocol)) { @@ -5650,25 +5588,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; - if (new) { - u32 i; - - mutex_lock(&new->aux->used_maps_mutex); - - /* generic XDP does not work with DEVMAPs that can - * have a bpf_prog installed on an entry - */ - for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i]) || - cpu_map_prog_allowed(new->aux->used_maps[i])) { - mutex_unlock(&new->aux->used_maps_mutex); - return -EINVAL; - } - } - - mutex_unlock(&new->aux->used_maps_mutex); - } - switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -5876,7 +5795,7 @@ static void flush_all_backlogs(void) */ ASSERT_RTNL(); - get_online_cpus(); + cpus_read_lock(); cpumask_clear(&flush_cpus); for_each_online_cpu(cpu) { @@ -5894,7 +5813,7 @@ static void flush_all_backlogs(void) for_each_cpu(cpu, &flush_cpus) flush_work(per_cpu_ptr(&flush_works, cpu)); - put_online_cpus(); + cpus_read_unlock(); } /* Pass the currently batched GRO_NORMAL SKBs up to the stack. 
*/ @@ -6011,7 +5930,6 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); - diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), @@ -6021,17 +5939,30 @@ static void gro_list_prepare(const struct list_head *head, skb_mac_header(skb), maclen); - diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + /* in most common scenarions 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { #if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - if (!diffs) { - struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT); - struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT); + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); diffs |= (!!p_ext) ^ (!!skb_ext); if (!diffs && unlikely(skb_ext)) diffs |= p_ext->chain ^ skb_ext->chain; - } #endif + } NAPI_GRO_CB(p)->same_flow = !diffs; } @@ -6296,8 +6227,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - skb_ext_reset(skb); - nf_reset_ct(skb); + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } napi->skb = skb; } @@ -7597,7 +7532,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, { struct netdev_adjacent *lower; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); @@ -9362,7 +9297,7 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev, return dev->xdp_state[mode].prog; } -static u8 dev_xdp_prog_count(struct net_device *dev) +u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; @@ -9372,6 +9307,7 @@ static u8 dev_xdp_prog_count(struct net_device *dev) count++; return count; } +EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { @@ -9465,6 +9401,8 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; + struct net_device *upper; + struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; @@ -9503,6 +9441,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* don't allow if an upper device already has a program */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (dev_xdp_prog_count(upper) > 0) { + NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); + return -EEXIST; + } + } + cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { @@ -10134,7 +10080,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - 
rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; @@ -10201,7 +10147,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; @@ -10841,7 +10787,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!p) return NULL; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 45ae6eeb2964..8c39283c26ae 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -16,10 +16,9 @@ * General list handling functions */ -static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, - const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global, - bool sync) +static struct netdev_hw_addr* +__hw_addr_create(const unsigned char *addr, int addr_len, + unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; @@ -29,32 +28,44 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) - return -ENOMEM; + return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; ha->synced = sync ? 1 : 0; ha->sync_cnt = 0; - list_add_tail_rcu(&ha->list, &list->list); - list->count++; - return 0; + return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync, - int sync_count) + int sync_count, bool exclusive) { + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; - list_for_each_entry(ha, &list->list, list) { - if (ha->type == addr_type && - !memcmp(ha->addr, addr, addr_len)) { + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + parent = *ins_point; + if (diff < 0) { + ins_point = &parent->rb_left; + } else if (diff > 0) { + ins_point = &parent->rb_right; + } else { + if (exclusive) + return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) @@ -73,8 +84,25 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } } - return __hw_addr_create_ex(list, addr, addr_len, addr_type, global, - sync); + ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); + if (!ha) + return -ENOMEM; + + /* The first address in dev->dev_addrs is pointed to by dev->dev_addr + * and mutated freely by device drivers and netdev ops, so if we insert + * it into the tree we'll end up with an invalid rbtree. 
+ */ + if (list->count > 0) { + rb_link_node(&ha->node, parent, ins_point); + rb_insert_color(&ha->node, &list->tree); + } else { + RB_CLEAR_NODE(&ha->node); + } + + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + + return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, @@ -82,7 +110,7 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, - 0); + 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, @@ -103,24 +131,61 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, if (--ha->refcount) return 0; + + if (!RB_EMPTY_NODE(&ha->node)) + rb_erase(&ha->node, &list->tree); + list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } +static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + struct rb_node *node; + + /* The first address isn't inserted into the tree because in the dev->dev_addrs + * list it's the address pointed to by dev->dev_addr which is freely mutated + * in place, so we need to check it separately. + */ + ha = list_first_entry(&list->list, struct netdev_hw_addr, list); + if (ha && !memcmp(addr, ha->addr, addr_len) && + (!addr_type || addr_type == ha->type)) + return ha; + + node = list->tree.rb_node; + + while (node) { + struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); + int diff = memcmp(addr, ha->addr, addr_len); + + if (diff == 0 && addr_type) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + if (diff < 0) + node = node->rb_left; + else if (diff > 0) + node = node->rb_right; + else + return ha; + } + + return NULL; +} + static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { - struct netdev_hw_addr *ha; + struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - (ha->type == addr_type || !addr_type)) - return __hw_addr_del_entry(list, ha, global, sync); - } - return -ENOENT; + if (!ha) + return -ENOENT; + return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, @@ -137,7 +202,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, - false, true, ha->sync_cnt); + false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; @@ -407,6 +472,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; + list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); @@ -418,6 +484,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; + list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); @@ -552,22 +619,14 @@ EXPORT_SYMBOL(dev_addr_del); */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->uc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_UNICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len, - 
NETDEV_HW_ADDR_T_UNICAST, true, false); + err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -745,22 +804,14 @@ EXPORT_SYMBOL(dev_uc_init); */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->mc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_MULTICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, true, false); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -773,7 +824,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global, false, 0); + NETDEV_HW_ADDR_T_MULTICAST, global, false, + 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 478d032f34ac..0e87237fd871 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> +#include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/wireless.h> +#include <linux/if_bridge.h> #include <net/dsa.h> #include <net/wext.h> @@ -25,79 +27,108 @@ static int dev_ifname(struct net *net, struct ifreq *ifr) return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } -static gifconf_func_t *gifconf_list[NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} -EXPORT_SYMBOL(register_gifconf); - /* * Perform a SIOCGIFCONF call. This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ - -int dev_ifconf(struct net *net, struct ifconf *ifc, int size) +int dev_ifconf(struct net *net, struct ifconf __user *uifc) { struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + void __user *pos; + size_t size; + int len, total = 0, done; - /* - * Fetch the caller's info block. - */ + /* both the ifconf and the ifreq structures are slightly different */ + if (in_compat_syscall()) { + struct compat_ifconf ifc32; - pos = ifc->ifc_buf; - len = ifc->ifc_len; + if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) + return -EFAULT; - /* - * Loop over the interfaces, and write an info block for each. 
- */ + pos = compat_ptr(ifc32.ifcbuf); + len = ifc32.ifc_len; + size = sizeof(struct compat_ifreq); + } else { + struct ifconf ifc; + + if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) + return -EFAULT; - total = 0; + pos = ifc.ifc_buf; + len = ifc.ifc_len; + size = sizeof(struct ifreq); + } + + /* Loop over the interfaces, and write an info block for each. */ + rtnl_lock(); for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0, size); - else - done = gifconf_list[i](dev, pos + total, - len - total, size); - if (done < 0) - return -EFAULT; - total += done; - } + if (!pos) + done = inet_gifconf(dev, NULL, 0, size); + else + done = inet_gifconf(dev, pos + total, + len - total, size); + if (done < 0) { + rtnl_unlock(); + return -EFAULT; } + total += done; } + rtnl_unlock(); - /* - * All done. Write the updated control block back to the caller. - */ - ifc->ifc_len = total; + return put_user(total, &uifc->ifc_len); +} + +static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct ifmap *ifmap = &ifr->ifr_map; + + if (in_compat_syscall()) { + struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; + + cifmap->mem_start = dev->mem_start; + cifmap->mem_end = dev->mem_end; + cifmap->base_addr = dev->base_addr; + cifmap->irq = dev->irq; + cifmap->dma = dev->dma; + cifmap->port = dev->if_port; + + return 0; + } + + ifmap->mem_start = dev->mem_start; + ifmap->mem_end = dev->mem_end; + ifmap->base_addr = dev->base_addr; + ifmap->irq = dev->irq; + ifmap->dma = dev->dma; + ifmap->port = dev->if_port; - /* - * Both BSD and Solaris return 0 here, so we do too. - */ return 0; } +static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; + + if (!dev->netdev_ops->ndo_set_config) + return -EOPNOTSUPP; + + if (in_compat_syscall()) { + struct ifmap ifmap = { + .mem_start = cifmap->mem_start, + .mem_end = cifmap->mem_end, + .base_addr = cifmap->base_addr, + .irq = cifmap->irq, + .dma = cifmap->dma, + .port = cifmap->port, + }; + + return dev->netdev_ops->ndo_set_config(dev, &ifmap); + } + + return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); +} + /* * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ @@ -128,13 +159,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm break; case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; + return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; @@ -215,19 +240,19 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } -static int dev_do_ioctl(struct net_device *dev, - struct ifreq *ifr, unsigned int cmd) +static int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; int err; - err = dsa_ndo_do_ioctl(dev, ifr, cmd); + err = dsa_ndo_eth_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) return err; - if (ops->ndo_do_ioctl) { + if (ops->ndo_eth_ioctl) { if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); + err = ops->ndo_eth_ioctl(dev, ifr, cmd); else err = -ENODEV; } @@ -235,10 +260,55 @@ static int dev_do_ioctl(struct net_device *dev, return err; } +static int dev_siocbond(struct 
net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocbond) { + if (netif_device_present(dev)) + return ops->ndo_siocbond(dev, ifr, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocdevprivate) { + if (netif_device_present(dev)) + return ops->ndo_siocdevprivate(dev, ifr, data, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocwandev) { + if (netif_device_present(dev)) + return ops->ndo_siocwandev(dev, ifs); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + /* * Perform the SIOCxIFxxx calls, inside rtnl_lock() */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, + unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); @@ -275,12 +345,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return 0; case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; + return dev_setifmap(dev, ifr); case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -307,6 +372,22 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCWANDEV: + return dev_siocwandev(dev, &ifr->ifr_settings); + + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!netif_device_present(dev)) + return -ENODEV; + if (!netif_is_bridge_master(dev)) + return -EOPNOTSUPP; + dev_hold(dev); + rtnl_unlock(); + err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); + dev_put(dev); + rtnl_lock(); + return err; + case SIOCSHWTSTAMP: err = net_hwtstamp_validate(ifr); if (err) @@ -317,23 +398,23 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) * Unknown or private ioctl */ default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) + return dev_siocdevprivate(dev, ifr, data, cmd); + + if (cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCSHWTSTAMP || + cmd == SIOCGHWTSTAMP) { + err = dev_eth_ioctl(dev, ifr, cmd); + } else if (cmd == SIOCBONDENSLAVE || cmd == SIOCBONDRELEASE || cmd == SIOCBONDSETHWADDR || cmd == SIOCBONDSLAVEINFOQUERY || cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCGHWTSTAMP || - cmd == SIOCWANDEV) { - err = dev_do_ioctl(dev, ifr, cmd); + cmd == SIOCBONDCHANGEACTIVE) { + err = dev_siocbond(dev, ifr, cmd); } else err = -EINVAL; @@ -386,7 +467,8 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. 
*/ -int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, + void __user *data, bool *need_copyout) { int ret; char *colon; @@ -437,7 +519,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCETHTOOL: dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ethtool(net, ifr); + ret = dev_ethtool(net, ifr, data); rtnl_unlock(); if (colon) *colon = ':'; @@ -456,7 +538,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (colon) *colon = ':'; @@ -502,7 +584,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (need_copyout) *need_copyout = false; @@ -527,7 +609,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); return ret; } diff --git a/net/core/devlink.c b/net/core/devlink.c index 85032626de24..a856ae401ea5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -92,7 +92,8 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ DEVLINK_PORT_FN_STATE_ACTIVE), }; -static LIST_HEAD(devlink_list); +static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); +#define DEVLINK_REGISTERED XA_MARK_1 /* devlink_mutex * @@ -108,23 +109,23 @@ struct net *devlink_net(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_net); -static void __devlink_net_set(struct devlink *devlink, struct net *net) +static void devlink_put(struct devlink *devlink) { - write_pnet(&devlink->_net, net); + if (refcount_dec_and_test(&devlink->refcount)) + complete(&devlink->comp); } -void devlink_net_set(struct devlink *devlink, struct net *net) +static bool __must_check devlink_try_get(struct devlink *devlink) { - if (WARN_ON(devlink->registered)) - return; - __devlink_net_set(devlink, net); + return refcount_inc_not_zero(&devlink->refcount); } -EXPORT_SYMBOL_GPL(devlink_net_set); static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { struct devlink *devlink; + unsigned long index; + bool found = false; char *busname; char *devname; @@ -136,19 +137,19 @@ static struct devlink *devlink_get_from_attrs(struct net *net, lockdep_assert_held(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (strcmp(devlink->dev->bus->name, busname) == 0 && strcmp(dev_name(devlink->dev), devname) == 0 && - net_eq(devlink_net(devlink), net)) - return devlink; + net_eq(devlink_net(devlink), net)) { + found = true; + break; + } } - return ERR_PTR(-ENODEV); -} + if (!found || !devlink_try_get(devlink)) + devlink = ERR_PTR(-ENODEV); -static struct devlink *devlink_get_from_info(struct genl_info *info) -{ - return devlink_get_from_attrs(genl_info_net(info), info->attrs); + return devlink; } static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, @@ -499,7 +500,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, int err; 
mutex_lock(&devlink_mutex); - devlink = devlink_get_from_info(info); + devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); if (IS_ERR(devlink)) { mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); @@ -542,6 +543,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, unlock: if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return err; } @@ -554,6 +556,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, devlink = info->user_ptr[0]; if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); } @@ -817,10 +820,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } -static int -devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops, - struct devlink_port *port, struct sk_buff *msg, - struct netlink_ext_ack *extack, bool *msg_updated) +static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops, + struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; @@ -829,7 +833,8 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * if (!ops->port_function_hw_addr_get) return 0; - err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len, + extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -843,12 +848,11 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * } static int devlink_nl_rate_fill(struct sk_buff *msg, - struct devlink *devlink, struct devlink_rate *devlink_rate, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct netlink_ext_ack *extack) + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) { + struct devlink *devlink = devlink_rate->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -906,12 +910,11 @@ devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } -static int -devlink_port_fn_state_fill(struct devlink *devlink, - const struct devlink_ops *ops, - struct devlink_port *port, struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) +static int devlink_port_fn_state_fill(const struct devlink_ops *ops, + struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; @@ -920,7 +923,7 @@ devlink_port_fn_state_fill(struct devlink *devlink, if (!ops->port_fn_state_get) return 0; - err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack); + err = ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -948,7 +951,6 @@ static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { - struct devlink *devlink = port->devlink; const struct devlink_ops *ops; struct nlattr *function_attr; bool msg_updated = false; @@ -958,13 +960,12 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por if (!function_attr) return -EMSGSIZE; - ops = devlink->ops; - err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg, - extack, &msg_updated); + ops = 
port->devlink->ops; + err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack, + &msg_updated); if (err) goto out; - err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack, - &msg_updated); + err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); @@ -973,12 +974,12 @@ out: return err; } -static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, +static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct netlink_ext_ack *extack) + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) { + struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -1039,53 +1040,47 @@ nla_put_failure: static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { - struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; - if (!devlink_port->registered) - return; - WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0, - NULL); + err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(devlink_port->devlink), msg, 0, + DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { - struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; - WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && - cmd != DEVLINK_CMD_RATE_DEL); + WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_rate_fill(msg, devlink, devlink_rate, - cmd, 0, 0, 0, NULL); + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(devlink_rate->devlink), msg, 0, + DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, @@ -1094,13 +1089,18 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, struct devlink_rate *devlink_rate; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; @@ -1110,18 +1110,19 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_rate_fill(msg, devlink, - devlink_rate, - cmd, id, + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, cb->nlh->nlmsg_seq, NLM_F_MULTI, NULL); if (err) { 
mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -1136,7 +1137,6 @@ static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_rate *devlink_rate = info->user_ptr[1]; - struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; @@ -1144,8 +1144,7 @@ static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_nl_rate_fill(msg, devlink, devlink_rate, - DEVLINK_CMD_RATE_NEW, + err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { @@ -1193,20 +1192,30 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) { + devlink_put(devlink); + continue; + } + if (idx < start) { idx++; + devlink_put(devlink); continue; } + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); + devlink_put(devlink); if (err) goto out; idx++; @@ -1222,7 +1231,6 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; @@ -1230,8 +1238,7 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_nl_port_fill(msg, devlink, devlink_port, - DEVLINK_CMD_PORT_NEW, + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { @@ -1248,32 +1255,39 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_port *devlink_port; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; continue; } - err = devlink_nl_port_fill(msg, devlink, devlink_port, + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); + NLM_F_MULTI, cb->extack); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -1282,31 +1296,33 @@ out: return msg->len; } -static int devlink_port_type_set(struct devlink *devlink, - struct devlink_port *devlink_port, +static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; - if (devlink->ops->port_type_set) { - if (port_type == devlink_port->type) - return 0; - err = devlink->ops->port_type_set(devlink_port, 
port_type); - if (err) - return err; - devlink_port->desired_type = port_type; - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + if (!devlink_port->devlink->ops->port_type_set) + return -EOPNOTSUPP; + + if (port_type == devlink_port->type) return 0; - } - return -EOPNOTSUPP; + + err = devlink_port->devlink->ops->port_type_set(devlink_port, + port_type); + if (err) + return err; + + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; } -static int -devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *port, - const struct nlattr *attr, struct netlink_ext_ack *extack) +static int devlink_port_function_hw_addr_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) { - const struct devlink_ops *ops; + const struct devlink_ops *ops = port->devlink->ops; const u8 *hw_addr; int hw_addr_len; @@ -1327,17 +1343,16 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * } } - ops = devlink->ops; if (!ops->port_function_hw_addr_set) { NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes"); return -EOPNOTSUPP; } - return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); + return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len, + extack); } -static int devlink_port_fn_state_set(struct devlink *devlink, - struct devlink_port *port, +static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { @@ -1345,18 +1360,18 @@ static int devlink_port_fn_state_set(struct devlink *devlink, const struct devlink_ops *ops; state = nla_get_u8(attr); - ops = devlink->ops; + ops = port->devlink->ops; if (!ops->port_fn_state_set) { NL_SET_ERR_MSG_MOD(extack, "Function does not support state setting"); return -EOPNOTSUPP; } - return ops->port_fn_state_set(devlink, port, state, extack); + return ops->port_fn_state_set(port, state, extack); } -static int -devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, - const struct nlattr *attr, struct netlink_ext_ack *extack) +static int devlink_port_function_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; @@ -1370,7 +1385,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { - err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } @@ -1380,7 +1395,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, */ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) - err = devlink_port_fn_state_set(devlink, port, attr, extack); + err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); @@ -1391,14 +1406,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); - err = devlink_port_type_set(devlink, devlink_port, port_type); + err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } @@ -1407,7 +1421,7 @@ static int 
devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; - err = devlink_port_function_set(devlink, devlink_port, attr, extack); + err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } @@ -1502,9 +1516,8 @@ static int devlink_port_new_notifiy(struct devlink *devlink, goto out; } - err = devlink_nl_port_fill(msg, devlink, devlink_port, - DEVLINK_CMD_NEW, info->snd_portid, - info->snd_seq, 0, NULL); + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0, NULL); if (err) goto out; @@ -1908,13 +1921,18 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { @@ -1928,11 +1946,14 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2052,14 +2073,19 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_pool_get) - continue; + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, @@ -2070,10 +2096,13 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2265,14 +2294,19 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_port_pool_get) - continue; + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, @@ -2283,10 +2317,13 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2506,14 +2543,18 @@ 
devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_tc_pool_bind_get) - continue; + goto retry; mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { @@ -2526,10 +2567,13 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -3801,10 +3845,12 @@ static void devlink_param_notify(struct devlink *devlink, struct devlink_param_item *param_item, enum devlink_command cmd); -static void devlink_reload_netns_change(struct devlink *devlink, - struct net *dest_net) +static void devlink_ns_change_notify(struct devlink *devlink, + struct net *dest_net, struct net *curr_net, + bool new) { struct devlink_param_item *param_item; + enum devlink_command cmd; /* Userspace needs to be notified about devlink objects * removed from original and entering new network namespace. @@ -3812,17 +3858,18 @@ static void devlink_reload_netns_change(struct devlink *devlink, * reload process so the notifications are generated separatelly. */ - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_DEL); - devlink_notify(devlink, DEVLINK_CMD_DEL); + if (!dest_net || net_eq(dest_net, curr_net)) + return; - __devlink_net_set(devlink, dest_net); + if (new) + devlink_notify(devlink, DEVLINK_CMD_NEW); - devlink_notify(devlink, DEVLINK_CMD_NEW); + cmd = new ? 
DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL; list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, 0, param_item, cmd); + + if (!new) + devlink_notify(devlink, DEVLINK_CMD_DEL); } static bool devlink_reload_supported(const struct devlink_ops *ops) @@ -3902,6 +3949,7 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, u32 *actions_performed, struct netlink_ext_ack *extack) { u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + struct net *curr_net; int err; if (!devlink->reload_enabled) @@ -3909,18 +3957,22 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); + + curr_net = devlink_net(devlink); + devlink_ns_change_notify(devlink, dest_net, curr_net, false); err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; - if (dest_net && !net_eq(dest_net, devlink_net(devlink))) - devlink_reload_netns_change(devlink, dest_net); + if (dest_net && !net_eq(dest_net, curr_net)) + write_pnet(&devlink->_net, dest_net); err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); if (err) return err; + devlink_ns_change_notify(devlink, dest_net, curr_net, true); WARN_ON(!(*actions_performed & BIT(action))); /* Catch driver on updating the remote action within devlink reload */ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, @@ -4117,7 +4169,7 @@ out_free_msg: static void devlink_flash_update_begin_notify(struct devlink *devlink) { - struct devlink_flash_notify params = { 0 }; + struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, @@ -4126,7 +4178,7 @@ static void devlink_flash_update_begin_notify(struct devlink *devlink) static void devlink_flash_update_end_notify(struct devlink *devlink) { - struct devlink_flash_notify params = { 0 }; + struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, @@ -4283,6 +4335,21 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -4553,13 +4620,18 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink_param_item *param_item; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); 
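/*
 * Sketch of the resulting notification order for a cross-namespace
 * reload (assuming dest_net differs from curr_net; both
 * devlink_ns_change_notify() calls return early otherwise):
 *
 *   devlink_ns_change_notify(new=false)    PARAM_DEL per param, then CMD_DEL
 *   ops->reload_down()
 *   write_pnet(&devlink->_net, dest_net)   instance switches namespaces
 *   ops->reload_up()
 *   devlink_ns_change_notify(new=true)     CMD_NEW, then PARAM_NEW per param
 */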
list_for_each_entry(param_item, &devlink->param_list, list) { if (idx < start) { @@ -4575,11 +4647,14 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -4821,13 +4896,18 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink_port *devlink_port; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { list_for_each_entry(param_item, @@ -4847,12 +4927,15 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -5062,7 +5145,6 @@ static void devlink_nl_region_notify(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd) { - struct devlink *devlink = region->devlink; struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); @@ -5071,8 +5153,9 @@ static void devlink_nl_region_notify(struct devlink_region *region, if (IS_ERR(msg)) return; - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, + devlink_net(region->devlink), msg, 0, + DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** @@ -5390,15 +5473,22 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, &idx, start); +retry: + devlink_put(devlink); if (err) goto out; } @@ -5761,6 +5851,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return skb->len; @@ -5769,6 +5860,7 @@ nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: mutex_unlock(&devlink->lock); + devlink_put(devlink); out_dev: mutex_unlock(&devlink_mutex); return err; @@ -5915,22 +6007,20 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; - if (idx < start) { - idx++; - 
continue; - } - if (!devlink->ops->info_get) { - idx++; - continue; - } + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + + if (idx < start || !devlink->ops->info_get) + goto inc; mutex_lock(&devlink->lock); err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, @@ -5940,9 +6030,14 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, mutex_unlock(&devlink->lock); if (err == -EOPNOTSUPP) err = 0; - else if (err) + else if (err) { + devlink_put(devlink); break; + } +inc: idx++; +retry: + devlink_put(devlink); } mutex_unlock(&devlink_mutex); @@ -6756,11 +6851,11 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); static int devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, struct devlink_health_reporter *reporter, enum devlink_command cmd, u32 portid, u32 seq, int flags) { + struct devlink *devlink = reporter->devlink; struct nlattr *reporter_attr; void *hdr; @@ -6837,8 +6932,7 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, if (!msg) return; - err = devlink_nl_health_reporter_fill(msg, reporter->devlink, - reporter, cmd, 0, 0, 0); + err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; @@ -7028,6 +7122,7 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) goto unlock; reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return reporter; unlock: @@ -7071,7 +7166,7 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, goto out; } - err = devlink_nl_health_reporter_fill(msg, devlink, reporter, + err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, info->snd_portid, info->snd_seq, 0); @@ -7094,13 +7189,18 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, struct devlink_port *port; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry_rep; + mutex_lock(&devlink->reporters_lock); list_for_each_entry(reporter, &devlink->reporter_list, list) { @@ -7108,24 +7208,29 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_health_reporter_fill(msg, devlink, - reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill( + msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + NLM_F_MULTI); if (err) { mutex_unlock(&devlink->reporters_lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->reporters_lock); +retry_rep: + devlink_put(devlink); } - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry_port; + mutex_lock(&devlink->lock); list_for_each_entry(port, &devlink->port_list, list) { mutex_lock(&port->reporters_lock); @@ -7134,14 +7239,15 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff 
*msg, idx++; continue; } - err = devlink_nl_health_reporter_fill(msg, devlink, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill( + msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { mutex_unlock(&port->reporters_lock); mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; @@ -7149,6 +7255,8 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, mutex_unlock(&port->reporters_lock); } mutex_unlock(&devlink->lock); +retry_port: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -7677,13 +7785,18 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, struct devlink_trap_item *trap_item; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(trap_item, &devlink->trap_list, list) { if (idx < start) { @@ -7697,11 +7810,14 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -7896,13 +8012,18 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, u32 portid = NETLINK_CB(cb->skb).portid; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(group_item, &devlink->trap_group_list, list) { @@ -7917,11 +8038,14 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -8202,13 +8326,18 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, u32 portid = NETLINK_CB(cb->skb).portid; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { @@ -8223,11 +8352,14 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -8768,30 +8900,44 @@ 
static bool devlink_reload_actions_valid(const struct devlink_ops *ops) } /** - * devlink_alloc - Allocate new devlink instance resources + * devlink_alloc_ns - Allocate new devlink instance resources + * in specific namespace * * @ops: ops * @priv_size: size of user private data + * @net: net namespace + * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. */ -struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net, + struct device *dev) { struct devlink *devlink; + static u32 last_id; + int ret; - if (WARN_ON(!ops)) - return NULL; - + WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; + + ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret < 0) { + kfree(devlink); + return NULL; + } + + devlink->dev = dev; devlink->ops = ops; xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - __devlink_net_set(devlink, &init_net); + write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->sb_list); @@ -8805,22 +8951,22 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->trap_policer_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); + refcount_set(&devlink->refcount, 1); + init_completion(&devlink->comp); + return devlink; } -EXPORT_SYMBOL_GPL(devlink_alloc); +EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_register - Register devlink instance * * @devlink: devlink - * @dev: parent device */ -int devlink_register(struct devlink *devlink, struct device *dev) +int devlink_register(struct devlink *devlink) { - devlink->dev = dev; - devlink->registered = true; mutex_lock(&devlink_mutex); - list_add_tail(&devlink->list, &devlink_list); + xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); return 0; @@ -8834,11 +8980,14 @@ EXPORT_SYMBOL_GPL(devlink_register); */ void devlink_unregister(struct devlink *devlink) { + devlink_put(devlink); + wait_for_completion(&devlink->comp); + mutex_lock(&devlink_mutex); WARN_ON(devlink_reload_supported(devlink->ops) && devlink->reload_enabled); devlink_notify(devlink, DEVLINK_CMD_DEL); - list_del(&devlink->list); + xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_unregister); @@ -8900,6 +9049,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); + xa_erase(&devlinks, devlink->index); kfree(devlink); } @@ -8960,9 +9110,10 @@ int devlink_port_register(struct devlink *devlink, mutex_unlock(&devlink->lock); return -EEXIST; } + + WARN_ON(devlink_port->devlink); devlink_port->devlink = devlink; devlink_port->index = port_index; - devlink_port->registered = true; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); mutex_init(&devlink_port->reporters_lock); @@ -9001,7 +9152,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { - if (WARN_ON(!devlink_port->registered)) + if (WARN_ON(!devlink_port->devlink)) return; devlink_port_type_warn_cancel(devlink_port); 
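/*
 * Driver-side sketch of the reworked lifecycle (all example_* names are
 * illustrative; devlink_put() completing devlink->comp on the final
 * reference drop is assumed from the rest of the series):
 */
static int example_probe(struct device *dev)
{
	struct devlink *devlink;

	/* index comes from xa_alloc_cyclic(); dev is fixed at alloc time */
	devlink = devlink_alloc_ns(&example_devlink_ops,
				   sizeof(struct example_priv), &init_net, dev);
	if (!devlink)
		return -ENOMEM;
	return devlink_register(devlink);	/* no struct device argument */
}

static void example_remove(struct devlink *devlink)
{
	devlink_unregister(devlink);	/* drops a ref, waits on ->comp */
	devlink_free(devlink);		/* erases the xarray slot */
}

/*
 * Ports: with devlink_port->registered gone, ->devlink doubles as the
 * "already registered" flag, so attributes must be set before
 * registration (the setter hunks that follow convert their WARN_ON
 * checks accordingly):
 */
static int example_port_init(struct devlink *devlink,
			     struct devlink_port *dl_port, unsigned int index)
{
	struct devlink_port_attrs attrs = {
		.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL,
		.phys.port_number = index,
	};

	devlink_port_attrs_set(dl_port, &attrs);  /* WARNs once registered */
	return devlink_port_register(devlink, dl_port, index);
}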
spin_lock_bh(&devlink_port->type_lock); @@ -9121,7 +9272,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); @@ -9145,7 +9296,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); @@ -9172,7 +9323,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); @@ -9200,7 +9351,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); @@ -9788,6 +9939,22 @@ static int devlink_param_verify(const struct devlink_param *param) return devlink_param_driver_verify(param); } +static int __devlink_param_register_one(struct devlink *devlink, + unsigned int port_index, + struct list_head *param_list, + const struct devlink_param *param, + enum devlink_command reg_cmd) +{ + int err; + + err = devlink_param_verify(param); + if (err) + return err; + + return devlink_param_register_one(devlink, port_index, + param_list, param, reg_cmd); +} + static int __devlink_params_register(struct devlink *devlink, unsigned int port_index, struct list_head *param_list, @@ -9802,12 +9969,8 @@ static int __devlink_params_register(struct devlink *devlink, mutex_lock(&devlink->lock); for (i = 0; i < params_count; i++, param++) { - err = devlink_param_verify(param); - if (err) - goto rollback; - - err = devlink_param_register_one(devlink, port_index, - param_list, param, reg_cmd); + err = __devlink_param_register_one(devlink, port_index, + param_list, param, reg_cmd); if (err) goto rollback; } @@ -9880,6 +10043,43 @@ void devlink_params_unregister(struct devlink *devlink, EXPORT_SYMBOL_GPL(devlink_params_unregister); /** + * devlink_param_register - register one configuration parameter + * + * @devlink: devlink + * @param: one configuration parameter + * + * Register the configuration parameter supported by the driver. + * Return: returns 0 on successful registration or error code otherwise. 
+ */ +int devlink_param_register(struct devlink *devlink, + const struct devlink_param *param) +{ + int err; + + mutex_lock(&devlink->lock); + err = __devlink_param_register_one(devlink, 0, &devlink->param_list, + param, DEVLINK_CMD_PARAM_NEW); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_param_register); + +/** + * devlink_param_unregister - unregister one configuration parameter + * @devlink: devlink + * @param: configuration parameter to unregister + */ +void devlink_param_unregister(struct devlink *devlink, + const struct devlink_param *param) +{ + mutex_lock(&devlink->lock); + devlink_param_unregister_one(devlink, 0, &devlink->param_list, param, + DEVLINK_CMD_PARAM_DEL); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_param_unregister); + +/** * devlink_params_publish - publish configuration parameters * * @devlink: devlink @@ -9922,6 +10122,54 @@ void devlink_params_unpublish(struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_params_unpublish); /** + * devlink_param_publish - publish one configuration parameter + * + * @devlink: devlink + * @param: one configuration parameter + * + * Publish previously registered configuration parameter. + */ +void devlink_param_publish(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, &devlink->param_list, list) { + if (param_item->param != param || param_item->published) + continue; + param_item->published = true; + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); + break; + } +} +EXPORT_SYMBOL_GPL(devlink_param_publish); + +/** + * devlink_param_unpublish - unpublish one configuration parameter + * + * @devlink: devlink + * @param: one configuration parameter + * + * Unpublish previously registered configuration parameter. + */ +void devlink_param_unpublish(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, &devlink->param_list, list) { + if (param_item->param != param || !param_item->published) + continue; + param_item->published = false; + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + break; + } +} +EXPORT_SYMBOL_GPL(devlink_param_unpublish); + +/** * devlink_port_params_register - register port configuration parameters * * @devlink_port: devlink port @@ -11276,23 +11524,29 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; + unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. 
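/*
 * Usage sketch for the new single-parameter API above (the
 * DEVLINK_PARAM_DRIVER() initializer from include/net/devlink.h and all
 * example_* names are assumptions for illustration):
 */
static int example_param_get(struct devlink *devlink, u32 id,
			     struct devlink_param_gset_ctx *ctx)
{
	ctx->val.vbool = true;
	return 0;
}

static const struct devlink_param example_param =
	DEVLINK_PARAM_DRIVER(1, "example_enable", DEVLINK_PARAM_TYPE_BOOL,
			     BIT(DEVLINK_PARAM_CMODE_RUNTIME),
			     example_param_get, NULL, NULL);

static int example_params_init(struct devlink *devlink)
{
	int err;

	err = devlink_param_register(devlink, &example_param);
	if (err)
		return err;
	devlink_param_publish(devlink, &example_param);
	return 0;
}

/* teardown mirrors this: devlink_param_unpublish(), then
 * devlink_param_unregister() */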
*/ mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (net_eq(devlink_net(devlink), net)) { - if (WARN_ON(!devlink_reload_supported(devlink->ops))) - continue; - err = devlink_reload(devlink, &init_net, - DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_LIMIT_UNSPEC, - &actions_performed, NULL); - if (err && err != -EOPNOTSUPP) - pr_warn("Failed to reload devlink instance into init_net\n"); - } + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), net)) + goto retry; + + WARN_ON(!devlink_reload_supported(devlink->ops)); + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, + &actions_performed, NULL); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); +retry: + devlink_put(devlink); } mutex_unlock(&devlink_mutex); } diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index ead2a8aa57b4..49442cae6f69 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -850,8 +850,7 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) } hw_metadata->input_dev = metadata->input_dev; - if (hw_metadata->input_dev) - dev_hold(hw_metadata->input_dev); + dev_hold(hw_metadata->input_dev); return hw_metadata; @@ -867,8 +866,7 @@ free_hw_metadata: static void net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata) { - if (hw_metadata->input_dev) - dev_put(hw_metadata->input_dev); + dev_put(hw_metadata->input_dev); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); diff --git a/net/core/dst.c b/net/core/dst.c index fb3bcba87744..497ef9b3fc6a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -49,8 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, unsigned short flags) { dst->dev = dev; - if (dev) - dev_hold(dev); + dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; @@ -118,8 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) if (dst->ops->destroy) dst->ops->destroy(dst); - if (dst->dev) - dev_put(dst->dev); + dev_put(dst->dev); lwtstate_put(dst->lwtstate); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a9f937975080..79df7cd9dbc1 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -57,7 +57,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; @@ -541,7 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - nlrule = kzalloc(ops->rule_size, GFP_KERNEL); + nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; diff --git a/net/core/filter.c b/net/core/filter.c index d70187ce851b..3aca07c44fad 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -77,6 +77,7 @@ #include <net/transp_v6.h> #include <linux/btf_ids.h> #include <net/tls.h> +#include <net/xdp.h> static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -2179,17 +2180,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, skb->tstamp = 0; if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, hh_len); - if (unlikely(!skb2)) { - kfree_skb(skb); + skb = 
skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } rcu_read_lock_bh(); @@ -2213,8 +2206,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, } rcu_read_unlock_bh(); if (dst) - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; @@ -2286,17 +2278,9 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, skb->tstamp = 0; if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, hh_len); - if (unlikely(!skb2)) { - kfree_skb(skb); + skb = skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } rcu_read_lock_bh(); @@ -3880,8 +3864,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; - if (unlikely((metalen & (sizeof(__u32) - 1)) || - (metalen > 32))) + if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; @@ -3950,6 +3933,31 @@ void bpf_clear_redirect_map(struct bpf_map *map) } } +DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); +EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); + +u32 xdp_master_redirect(struct xdp_buff *xdp) +{ + struct net_device *master, *slave; + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); + slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); + if (slave && slave != xdp->rxq->dev) { + /* The target device is different from the receiving device, so + * redirect it to the new device. + * Using XDP_REDIRECT gets the correct behaviour from XDP enabled + * drivers to unmap the packet from their rx ring. 
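/*
 * How xdp_master_redirect() is meant to be reached (a sketch; the
 * companion filter.h change is not part of this hunk): the XDP program
 * runner upgrades XDP_TX verdicts on bond slaves roughly like this,
 * gated by the static key exported above:
 */
static __always_inline u32 example_run_xdp(const struct bpf_prog *prog,
					   struct xdp_buff *xdp)
{
	u32 act = bpf_prog_run(prog, xdp);

	if (static_branch_unlikely(&bpf_master_redirect_enabled_key) &&
	    act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
		act = xdp_master_redirect(xdp);

	return act;
}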
+ */ + ri->tgt_index = slave->ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + return XDP_REDIRECT; + } + return XDP_TX; +} +EXPORT_SYMBOL_GPL(xdp_master_redirect); + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { @@ -4040,8 +4048,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, goto err; consume_skb(skb); break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_generic_redirect(fwd, skb); + if (unlikely(err)) + goto err; + break; default: - /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } @@ -5012,6 +5024,40 @@ err_clear: return -EINVAL; } +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 4b2415d34873..bac0184cf3de 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1056,8 +1056,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); + memcpy(&key_addrs->v4addrs.src, &iph->saddr, + sizeof(key_addrs->v4addrs.src)); + memcpy(&key_addrs->v4addrs.dst, &iph->daddr, + sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } @@ -1101,8 +1103,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &iph->saddr, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &iph->saddr, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &iph->daddr, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 715b67f6c62f..6beaea13564a 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -321,13 +321,13 @@ EXPORT_SYMBOL(flow_block_cb_setup_simple); static DEFINE_MUTEX(flow_indr_block_lock); static LIST_HEAD(flow_block_indr_list); static LIST_HEAD(flow_block_indr_dev_list); +static LIST_HEAD(flow_indir_dev_list); struct flow_indr_dev { struct list_head list; flow_indr_block_bind_cb_t *cb; void *cb_priv; refcount_t refcnt; - struct rcu_head rcu; }; static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, @@ -346,6 +346,33 @@ static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, return indr_dev; } +struct flow_indir_dev_info { + void *data; + struct net_device *dev; + struct Qdisc *sch; + enum tc_setup_type type; + void 
(*cleanup)(struct flow_block_cb *block_cb); + struct list_head list; + enum flow_block_command command; + enum flow_block_binder_type binder_type; + struct list_head *cb_list; +}; + +static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_block_offload bo; + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + memset(&bo, 0, sizeof(bo)); + bo.command = cur->command; + bo.binder_type = cur->binder_type; + INIT_LIST_HEAD(&bo.cb_list); + cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup); + list_splice(&bo.cb_list, cur->cb_list); + } +} + int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_indr_dev *indr_dev; @@ -367,6 +394,7 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) } list_add(&indr_dev->list, &flow_block_indr_dev_list); + existing_qdiscs_register(cb, cb_priv); mutex_unlock(&flow_indr_block_lock); return 0; @@ -463,7 +491,59 @@ out: } EXPORT_SYMBOL(flow_indr_block_cb_alloc); -int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, +static struct flow_indir_dev_info *find_indir_dev(void *data) +{ + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + if (cur->data == data) + return cur; + } + return NULL; +} + +static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch, + enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb), + struct flow_block_offload *bo) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (info) + return -EEXIST; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->data = data; + info->dev = dev; + info->sch = sch; + info->type = type; + info->cleanup = cleanup; + info->command = bo->command; + info->binder_type = bo->binder_type; + info->cb_list = bo->cb_list_head; + + list_add(&info->list, &flow_indir_dev_list); + return 0; +} + +static int indir_dev_remove(void *data) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (!info) + return -ENOENT; + + list_del(&info->list); + + kfree(info); + return 0; +} + +int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)) @@ -471,6 +551,12 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, struct flow_indr_dev *this; mutex_lock(&flow_indr_block_lock); + + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + list_for_each_entry(this, &flow_block_indr_dev_list, list) this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 8ec7d13d2860..d0ae987d2de9 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -43,6 +43,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; + case LWTUNNEL_ENCAP_IOAM6: + return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 53e85c70c6e5..2d5bc3a75fae 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -741,12 +741,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, write_pnet(&n->net, net); memcpy(n->key, 
pkey, key_len); n->dev = dev; - if (dev) - dev_hold(dev); + dev_hold(dev); if (tbl->pconstructor && tbl->pconstructor(n)) { - if (dev) - dev_put(dev); + dev_put(dev); kfree(n); n = NULL; goto out; @@ -778,8 +776,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); + dev_put(n->dev); kfree(n); return 0; } @@ -812,8 +809,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); + dev_put(n->dev); kfree(n); } return -ENOENT; @@ -1662,8 +1658,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); + dev_put(parms->dev); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -2533,6 +2528,13 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) return false; master = dev ? netdev_master_upper_dev_get(dev) : NULL; + + /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another + * invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + if (!master || master->ifindex != master_idx) return true; @@ -3315,12 +3317,13 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); + seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } - seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx %08lx\n", + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx " + "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index d8b9dbabd4a4..eab5fc88a002 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -77,8 +77,8 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " - "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + seq_printf(seq, "%9s: %16llu %12llu %4llu %6llu %4llu %5llu %10llu %9llu " + "%16llu %12llu %4llu %6llu %4llu %5llu %7llu %10llu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, @@ -103,11 +103,11 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) static int dev_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); + seq_puts(seq, "Interface| Receive " + " | Transmit\n" + " | bytes packets errs drop fifo frame " + "compressed multicast| bytes packets errs " + " drop fifo colls carrier compressed\n"); else dev_seq_printf_stats(seq, v); return 0; @@ -259,14 +259,14 @@ static int 
ptype_seq_show(struct seq_file *seq, void *v) struct packet_type *pt = v; if (v == SEQ_START_TOKEN) - seq_puts(seq, "Type Device Function\n"); + seq_puts(seq, "Type Device Function\n"); else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-8s %ps\n", + seq_printf(seq, " %-9s %ps\n", pt->dev ? pt->dev->name : "", pt->func); } @@ -327,12 +327,14 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) struct netdev_hw_addr *ha; struct net_device *dev = v; - if (v == SEQ_START_TOKEN) + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Ifindex Interface Refcount Global_use Address\n"); return 0; + } netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { - seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", + seq_printf(seq, "%-7d %-9s %-8d %-10d %*phN\n", dev->ifindex, dev->name, ha->refcount, ha->global_use, (int)dev->addr_len, ha->addr); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9b5a767eddd5..a448a9b5bb2d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -98,7 +98,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) } ng = net_alloc_generic(); - if (ng == NULL) + if (!ng) return -ENOMEM; /* @@ -148,13 +148,6 @@ out: return err; } -static void ops_free(const struct pernet_operations *ops, struct net *net) -{ - if (ops->id && ops->size) { - kfree(net_generic(net, *ops->id)); - } -} - static void ops_pre_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { @@ -184,7 +177,7 @@ static void ops_free_list(const struct pernet_operations *ops, struct net *net; if (ops->size && ops->id) { list_for_each_entry(net, net_exit_list, exit_list) - ops_free(ops, net); + kfree(net_generic(net, *ops->id)); } } @@ -433,15 +426,18 @@ out_free: static void net_free(struct net *net) { - kfree(rcu_access_pointer(net->gen)); - kmem_cache_free(net_cachep, net); + if (refcount_dec_and_test(&net->passive)) { + kfree(rcu_access_pointer(net->gen)); + kmem_cache_free(net_cachep, net); + } } void net_drop_ns(void *p) { - struct net *ns = p; - if (ns && refcount_dec_and_test(&ns->passive)) - net_free(ns); + struct net *net = (struct net *)p; + + if (net) + net_free(net); } struct net *copy_net_ns(unsigned long flags, @@ -479,7 +475,7 @@ struct net *copy_net_ns(unsigned long flags, put_userns: key_remove_domain(net->key_domain); put_user_ns(user_ns); - net_drop_ns(net); + net_free(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -611,7 +607,7 @@ static void cleanup_net(struct work_struct *work) dec_net_namespaces(net->ucounts); key_remove_domain(net->key_domain); put_user_ns(net->user_ns); - net_drop_ns(net); + net_free(net); } } @@ -1120,6 +1116,14 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); +static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) +{ + ops_pre_exit_list(ops, net_exit_list); + synchronize_rcu(); + ops_exit_list(ops, net_exit_list); + ops_free_list(ops, net_exit_list); +} + #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) @@ -1145,10 +1149,7 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, 
&net_exit_list); + free_exit_list(ops, &net_exit_list); return error; } @@ -1161,10 +1162,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + + free_exit_list(ops, &net_exit_list); } #else @@ -1187,10 +1186,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + free_exit_list(ops, &net_exit_list); } } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 8ab7b402244c..1a6978427d6c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -24,6 +24,8 @@ #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) +#define BIAS_MAX LONG_MAX + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -67,6 +69,10 @@ static int page_pool_init(struct page_pool *pool, */ } + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && + pool->p.flags & PP_FLAG_PAGE_FRAG) + return -EINVAL; + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -206,6 +212,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; } +static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -222,7 +241,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } - page->pp_magic |= PP_SIGNATURE; + page_pool_set_pp_info(pool, page); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -266,7 +285,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } - page->pp_magic |= PP_SIGNATURE; + + page_pool_set_pp_info(pool, page); pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -345,12 +365,12 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: - page->pp_magic = 0; + page_pool_clear_pp_info(page); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ - count = atomic_inc_return(&pool->pages_state_release_cnt); + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } EXPORT_SYMBOL(page_pool_release_page); @@ -405,6 +425,11 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + /* It is not the last user for the page frag case */ + if (pool->p.flags & PP_FLAG_PAGE_FRAG && + page_pool_atomic_sub_frag_count_return(page, 1)) + return NULL; + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. 
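/*
 * Driver-side sketch of the page-splitting API added below (field
 * values and the 2K slice size are illustrative; PP_FLAG_PAGE_FRAG must
 * be requested at create time, and the init hunk above rejects it on
 * configs where the DMA address would overlap the frag count):
 */
static struct page_pool *example_pool_create(struct device *dma_dev)
{
	struct page_pool_params pp = {
		.flags		= PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP,
		.order		= 0,
		.pool_size	= 1024,
		.nid		= NUMA_NO_NODE,
		.dev		= dma_dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp);
}

static struct page *example_rx_frag(struct page_pool *pool,
				    unsigned int *offset)
{
	/* each returned slice holds one share of the page's frag count;
	 * release it with page_pool_put_full_page() as usual */
	return page_pool_alloc_frag(pool, offset, 2048, GFP_ATOMIC);
}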
@@ -497,6 +522,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, } EXPORT_SYMBOL(page_pool_put_page_bulk); +static struct page *page_pool_drain_frag(struct page_pool *pool, + struct page *page) +{ + long drain_count = BIAS_MAX - pool->frag_users; + + /* Some user is still using the page frag */ + if (likely(page_pool_atomic_sub_frag_count_return(page, + drain_count))) + return NULL; + + if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, -1); + + return page; + } + + page_pool_return_page(pool, page); + return NULL; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_users; + struct page *page = pool->frag_page; + + pool->frag_page = NULL; + + if (!page || + page_pool_atomic_sub_frag_count_return(page, drain_count)) + return; + + page_pool_return_page(pool, page); +} + +struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page = pool->frag_page; + + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || + size > max_size)) + return NULL; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (page && *offset + size > max_size) { + page = page_pool_drain_frag(pool, page); + if (page) + goto frag_reset; + } + + if (!page) { + page = page_pool_alloc_pages(pool, gfp); + if (unlikely(!page)) { + pool->frag_page = NULL; + return NULL; + } + + pool->frag_page = page; + +frag_reset: + pool->frag_users = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_set_frag_count(page, BIAS_MAX); + return page; + } + + pool->frag_users++; + pool->frag_offset = *offset + size; + return page; +} +EXPORT_SYMBOL(page_pool_alloc_frag); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -602,6 +705,8 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return; + page_pool_free_frag(pool); + if (!page_pool_release(pool)) return; @@ -652,7 +757,6 @@ bool page_pool_return_skb_page(struct page *page) * The page will be returned to the pool here regardless of the * 'flipped' fragment being in use or not. */ - page->pp = NULL; page_pool_put_full_page(pp, page, false); return true; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e258d255e90..9e5a3249373c 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -175,6 +175,9 @@ #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) +/* Max number of internet mix entries that can be specified in imix_weights. */ +#define MAX_IMIX_ENTRIES 20 +#define IMIX_PRECISION 100 /* Precision of IMIX distribution */ #define func_enter() pr_debug("entering %s\n", __func__); @@ -242,6 +245,12 @@ static char *pkt_flag_names[] = { #define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) #define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) +struct imix_pkt { + u64 size; + u64 weight; + u64 count_so_far; +}; + struct flow_state { __be32 cur_daddr; int count; @@ -343,6 +352,12 @@ struct pktgen_dev { __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 
4) */ + /* IMIX */ + unsigned int n_imix_entries; + struct imix_pkt imix_entries[MAX_IMIX_ENTRIES]; + /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/ + __u8 imix_distribution[IMIX_PRECISION]; + /* MPLS */ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */ __be32 labels[MAX_MPLS_LABELS]; @@ -471,6 +486,7 @@ static void pktgen_stop_all_threads(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static void fill_imix_distribution(struct pktgen_dev *pkt_dev); /* Module parameters, defaults. */ static int pg_count_d __read_mostly = 1000; @@ -552,6 +568,16 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + if (pkt_dev->n_imix_entries > 0) { + seq_puts(seq, " imix_weights: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].weight); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " frags: %d delay: %llu clone_skb: %d ifname: %s\n", pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, @@ -669,6 +695,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->sofar, (unsigned long long)pkt_dev->errors); + if (pkt_dev->n_imix_entries > 0) { + int i; + + seq_puts(seq, " imix_size_counts: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].count_so_far); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " started: %lluus stopped: %lluus idle: %lluus\n", (unsigned long long) ktime_to_us(pkt_dev->started_at), @@ -792,6 +830,62 @@ done_str: return i; } +/* Parses imix entries from user buffer. + * The user buffer should consist of imix entries separated by spaces + * where each entry consists of size and weight delimited by commas. + * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. 
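/*
 * Worked example with illustrative weights: writing
 *
 *	imix_weights 64,7 512,4 1518,1
 *
 * to a pktgen device file gives total_weight = 12, so
 * fill_imix_distribution() below computes cumulative_probabilites =
 * {58, 91, 100} (7 * 100 / 12 = 58, 58 + 4 * 100 / 12 = 91) and maps
 * imix_distribution[0..57] to the 64-byte entry, [58..90] to 512 bytes
 * and [91..99] to 1518 bytes - roughly a 58/33/9 percent size mix.
 */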
+ */ +static ssize_t get_imix_entries(const char __user *buffer, + struct pktgen_dev *pkt_dev) +{ + const int max_digits = 10; + int i = 0; + long len; + char c; + + pkt_dev->n_imix_entries = 0; + + do { + unsigned long weight; + unsigned long size; + + len = num_arg(&buffer[i], max_digits, &size); + if (len < 0) + return len; + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + /* Check for comma between size_i and weight_i */ + if (c != ',') + return -EINVAL; + i++; + + if (size < 14 + 20 + 8) + size = 14 + 20 + 8; + + len = num_arg(&buffer[i], max_digits, &weight); + if (len < 0) + return len; + if (weight <= 0) + return -EINVAL; + + pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size; + pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; + + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + + i++; + pkt_dev->n_imix_entries++; + + if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) + return -E2BIG; + } while (c == ' '); + + return i; +} + static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) { unsigned int n = 0; @@ -960,6 +1054,20 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + if (!strcmp(name, "imix_weights")) { + if (pkt_dev->clone_skb > 0) + return -EINVAL; + + len = get_imix_entries(&user_buffer[i], pkt_dev); + if (len < 0) + return len; + + fill_imix_distribution(pkt_dev); + + i += len; + return count; + } + if (!strcmp(name, "debug")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -1082,10 +1190,16 @@ static ssize_t pktgen_if_write(struct file *file, len = num_arg(&user_buffer[i], 10, &value); if (len < 0) return len; + /* clone_skb is not supported for netif_receive xmit_mode and + * IMIX mode. + */ if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; + if (value > 0 && pkt_dev->n_imix_entries > 0) + return -EINVAL; + i += len; pkt_dev->clone_skb = value; @@ -1190,11 +1304,6 @@ static ssize_t pktgen_if_write(struct file *file, * pktgen_xmit() is called */ pkt_dev->last_ok = 1; - - /* override clone_skb if user passed default value - * at module loading time - */ - pkt_dev->clone_skb = 0; } else if (strcmp(f, "queue_xmit") == 0) { pkt_dev->xmit_mode = M_QUEUE_XMIT; pkt_dev->last_ok = 1; @@ -2477,6 +2586,14 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) t = pkt_dev->min_pkt_size; } pkt_dev->cur_pkt_size = t; + } else if (pkt_dev->n_imix_entries > 0) { + struct imix_pkt *entry; + __u32 t = prandom_u32() % IMIX_PRECISION; + __u8 entry_index = pkt_dev->imix_distribution[t]; + + entry = &pkt_dev->imix_entries[entry_index]; + entry->count_so_far++; + pkt_dev->cur_pkt_size = entry->size; } set_cur_queue_map(pkt_dev); @@ -2484,6 +2601,32 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].count++; } +static void fill_imix_distribution(struct pktgen_dev *pkt_dev) +{ + int cumulative_probabilites[MAX_IMIX_ENTRIES]; + int j = 0; + __u64 cumulative_prob = 0; + __u64 total_weight = 0; + int i = 0; + + for (i = 0; i < pkt_dev->n_imix_entries; i++) + total_weight += pkt_dev->imix_entries[i].weight; + + /* Fill cumulative_probabilites with sum of normalized probabilities */ + for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) { + cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight * + IMIX_PRECISION, + total_weight); + cumulative_probabilites[i] = cumulative_prob; + } + cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100; + + for (i = 0; i < IMIX_PRECISION; i++) 
{ + if (i == cumulative_probabilites[j]) + j++; + pkt_dev->imix_distribution[i] = j; + } +} #ifdef CONFIG_XFRM static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { @@ -3145,7 +3288,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, ktime_to_ns(elapsed)); - bps = pps * 8 * pkt_dev->cur_pkt_size; + if (pkt_dev->n_imix_entries > 0) { + int i; + struct imix_pkt *entry; + + bps = 0; + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + entry = &pkt_dev->imix_entries[i]; + bps += entry->size * entry->count_so_far; + } + bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed)); + } else { + bps = pps * 8 * pkt_dev->cur_pkt_size; + } mbps = bps; do_div(mbps, 1000000); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 662eb1c37f47..972c8cb303a5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -710,15 +710,8 @@ out: int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; - int err = 0; - NETLINK_CB(skb).dst_group = group; - if (echo) - refcount_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); - if (echo) - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - return err; + return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) @@ -733,12 +726,8 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; - int report = 0; - if (nlh) - report = nlmsg_report(nlh); - - nlmsg_notify(rtnl, skb, pid, group, report, flags); + nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); @@ -1970,6 +1959,13 @@ static bool link_master_filtered(struct net_device *dev, int master_idx) return false; master = netdev_master_upper_dev_get(dev); + + /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need + * another invalid value for ifindex to denote "no master". 
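/*
 * Userspace-visible effect of the two master_idx hunks (here and in
 * neigh_master_filtered() earlier): passing IFLA_MASTER, or NDA_MASTER
 * for neighbour dumps, with the value -1 now filters out every entry
 * that has a master device, so the dump lists only masterless entries;
 * 0 keeps meaning "attribute not supplied".
 */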
 if (!master || master->ifindex != master_idx)
 return true;
 
@@ -2268,7 +2264,8 @@ invalid_attr:
 return -EINVAL;
 }
 
-static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
 {
 if (dev) {
 if (tb[IFLA_ADDRESS] &&
@@ -2295,7 +2292,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 return -EOPNOTSUPP;
 
 if (af_ops->validate_link_af) {
- err = af_ops->validate_link_af(dev, af);
+ err = af_ops->validate_link_af(dev, af, extack);
 if (err < 0)
 return err;
 }
@@ -2603,7 +2600,7 @@ static int do_setlink(const struct sk_buff *skb,
 const struct net_device_ops *ops = dev->netdev_ops;
 int err;
 
- err = validate_linkmsg(dev, tb);
+ err = validate_linkmsg(dev, tb, extack);
 if (err < 0)
 return err;
 
@@ -3302,7 +3299,7 @@ replay:
 m_ops = master_dev->rtnl_link_ops;
 }
 
- err = validate_linkmsg(dev, tb);
+ err = validate_linkmsg(dev, tb, extack);
 if (err < 0)
 return err;
 
diff --git a/net/core/scm.c b/net/core/scm.c
index ae3085d9aae8..5c356f0dee30 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -79,7 +79,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 
 if (!fpl) {
- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
 if (!fpl)
 return -ENOMEM;
 *fplp = fpl;
@@ -355,7 +355,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 return NULL;
 
 new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
 if (new_fpl) {
 for (i = 0; i < fpl->count; i++)
 get_file(fpl->fp[i]);
diff --git a/net/core/selftests.c b/net/core/selftests.c
index ba7b0171974c..9077fa969892 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -318,6 +318,15 @@ static int net_test_phy_loopback_udp(struct net_device *ndev)
 return __net_test_loopback(ndev, &attr);
 }
 
+static int net_test_phy_loopback_udp_mtu(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.max_size = ndev->mtu;
+ return __net_test_loopback(ndev, &attr);
+}
+
 static int net_test_phy_loopback_tcp(struct net_device *ndev)
 {
 struct net_packet_attrs attr = { };
@@ -345,6 +354,9 @@ static const struct net_test {
 .name = "PHY internal loopback, UDP ",
 .fn = net_test_phy_loopback_udp,
 }, {
+ .name = "PHY internal loopback, MTU ",
+ .fn = net_test_phy_loopback_udp_mtu,
+ }, {
 .name = "PHY internal loopback, TCP ",
 .fn = net_test_phy_loopback_tcp,
 }, {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fc7942c0dddc..f9311762cc47 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -156,7 +156,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 void *data;
 
 fragsz = SKB_DATA_ALIGN(fragsz);
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
 nc = this_cpu_ptr(&netdev_alloc_cache);
 data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
 } else {
@@ -502,7 +502,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 if (sk_memalloc_socks())
 gfp_mask |= __GFP_MEMALLOC;
 
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
 nc = this_cpu_ptr(&netdev_alloc_cache);
 data = page_frag_alloc(nc, len, gfp_mask);
 pfmemalloc = nc->pfmemalloc;
@@ -724,7 +724,7 @@ void skb_release_head_state(struct sk_buff *skb)
 {
 skb_dst_drop(skb);
 if (skb->destructor) {
- WARN_ON(in_irq());
+ WARN_ON(in_hardirq());
 skb->destructor(skb);
 }
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -954,9 +954,13 @@ void __kfree_skb_defer(struct sk_buff *skb)
 
 void napi_skb_free_stolen_head(struct sk_buff *skb)
 {
- nf_reset_ct(skb);
- skb_dst_drop(skb);
- skb_ext_put(skb);
+ if (unlikely(skb->slow_gro)) {
+ nf_reset_ct(skb);
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
+ skb_orphan(skb);
+ skb->slow_gro = 0;
+ }
 napi_skb_cache_put(skb);
 }
 
@@ -1786,6 +1790,48 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
 EXPORT_SYMBOL(skb_realloc_headroom);
 
 /**
+ * skb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @headroom: needed headroom
+ *
+ * Unlike skb_realloc_headroom, this one does not allocate a new skb
+ * if possible; copies skb->sk to the new skb as needed
+ * and frees the original skb in case of failures.
+ *
+ * It expects an increased headroom and generates a warning otherwise.
+ */
+
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
+{
+ int delta = headroom - skb_headroom(skb);
+
+ if (WARN_ONCE(delta <= 0,
+ "%s is expecting an increase in the headroom", __func__))
+ return skb;
+
+ /* pskb_expand_head() might crash, if skb is shared */
+ if (skb_shared(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (likely(nskb)) {
+ if (skb->sk)
+ skb_set_owner_w(nskb, skb->sk);
+ consume_skb(skb);
+ } else {
+ kfree_skb(skb);
+ }
+ skb = nskb;
+ }
+ if (skb &&
+ pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ kfree_skb(skb);
+ skb = NULL;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(skb_expand_head);
+
+/**
 * skb_copy_expand - copy and expand sk_buff
 * @skb: buffer to copy
 * @newheadroom: new free bytes at head
@@ -3889,6 +3935,9 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
 NAPI_GRO_CB(p)->last = skb;
 NAPI_GRO_CB(p)->count++;
 p->data_len += skb->len;
+
+ /* sk ownership - if any - is completely transferred to the aggregated packet */
+ skb->destructor = NULL;
 p->truesize += skb->truesize;
 p->len += skb->len;
 
@@ -4256,6 +4305,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 unsigned int headlen = skb_headlen(skb);
 unsigned int len = skb_gro_len(skb);
 unsigned int delta_truesize;
+ unsigned int new_truesize;
 struct sk_buff *lp;
 
 if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
@@ -4287,10 +4337,10 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 skb_frag_size_sub(frag, offset);
 
 /* all fragments truesize : remove (head size + sk_buff) */
- delta_truesize = skb->truesize -
- SKB_TRUESIZE(skb_end_offset(skb));
+ new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ delta_truesize = skb->truesize - new_truesize;
 
- skb->truesize -= skb->data_len;
+ skb->truesize = new_truesize;
 skb->len -= skb->data_len;
 skb->data_len = 0;
 
@@ -4319,12 +4369,16 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
 /* We dont need to clear skbinfo->nr_frags here */
 
- delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ delta_truesize = skb->truesize - new_truesize;
+ skb->truesize = new_truesize;
 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
 goto done;
 }
 
 merge:
+ /* sk ownership - if any - is completely transferred to the aggregated packet */
+ skb->destructor = NULL;
 delta_truesize = skb->truesize;
 if (offset > headlen) {
 unsigned int eat = offset - headlen;
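skb_expand_head(), added above, differs from skb_realloc_headroom() in two ways a caller must respect: sk ownership is carried over to the hidden clone, and the original skb is consumed on failure. A sketch of the expected calling pattern (the caller and header length are hypothetical, for illustration only):

/* Hypothetical caller: make room for an encapsulation header of hlen
 * bytes. skb_expand_head() frees the original skb on failure, so the
 * old pointer must not be touched after the call.
 */
static struct sk_buff *push_encap_header(struct sk_buff *skb, int hlen)
{
        if (skb_headroom(skb) < hlen) {
                skb = skb_expand_head(skb, hlen);
                if (!skb)
                        return NULL; /* original skb already freed */
        }
        __skb_push(skb, hlen);
        /* ... write the new header at skb->data ... */
        return skb;
}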
@@ -6449,6 +6503,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
 new->chunks = newlen;
 new->offset[id] = newoff;
 set_active:
+ skb->slow_gro = 1;
 skb->extensions = new;
 skb->active_extensions |= 1 << id;
 return skb_ext_get_ptr(new, id);
diff --git a/net/core/sock.c b/net/core/sock.c
index a3eea6e0b30a..62627e868e03 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,6 +226,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MCTP" , \
 x "AF_MAX"
 
 static const char *const af_family_key_strings[AF_MAX+1] = {
@@ -1357,6 +1358,15 @@ set_sndbuf:
 ret = sock_bindtoindex_locked(sk, val);
 break;
 
+ case SO_BUF_LOCK:
+ if (val & ~SOCK_BUF_LOCK_MASK) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_userlocks = val | (sk->sk_userlocks &
+ ~SOCK_BUF_LOCK_MASK);
+ break;
+
 default:
 ret = -ENOPROTOOPT;
 break;
@@ -1719,6 +1729,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 v.val64 = sock_net(sk)->net_cookie;
 break;
 
+ case SO_BUF_LOCK:
+ v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
+ break;
+
 default:
 /* We implement the SO_SNDLOWAT etc to not be settable
 * (1003.1g 7).
@@ -2560,7 +2574,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
 }
 }
 
-#define SKB_FRAG_PAGE_ORDER get_order(32768)
 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
 
 /**
@@ -2714,10 +2727,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
 struct proto *prot = sk->sk_prot;
 long allocated = sk_memory_allocated_add(sk, amt);
+ bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
 bool charged = true;
 
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
+ if (memcg_charge &&
+ !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge())))
 goto suppress_allocation;
 
 /* Under limit. */
@@ -2771,8 +2786,14 @@ suppress_allocation:
 /* Fail only if socket is _under_ its sndbuf.
 * In this case we cannot block, so that we have to fail.
 */
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
+ /* Force charge with __GFP_NOFAIL */
+ if (memcg_charge && !charged) {
+ mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
+ }
 return 1;
+ }
 }
 
 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
@@ -2780,7 +2801,7 @@ suppress_allocation:
 
 sk_memory_allocated_sub(sk, amt);
 
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ if (memcg_charge && charged)
 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
 
 return 0;
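SO_BUF_LOCK exposes the two buffer bits of sk_userlocks, so an application (or an agent that resized the buffers, e.g. via BPF) can pin sk_sndbuf/sk_rcvbuf against later automatic tuning. A hedged userspace probe; the option and flag values are assumed to match the uapi headers of this series and are guarded here in case the toolchain headers predate them:

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_BUF_LOCK
#define SO_BUF_LOCK 72 /* assumed value from asm-generic/socket.h */
#endif
#ifndef SOCK_SNDBUF_LOCK
#define SOCK_SNDBUF_LOCK 1
#define SOCK_RCVBUF_LOCK 2
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int lock = SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK;
        socklen_t len = sizeof(lock);

        /* Pin both buffer sizes against automatic tuning. */
        if (setsockopt(fd, SOL_SOCKET, SO_BUF_LOCK, &lock, sizeof(lock)))
                perror("setsockopt(SO_BUF_LOCK)");

        lock = 0;
        if (!getsockopt(fd, SOL_SOCKET, SO_BUF_LOCK, &lock, &len))
                printf("buffer lock flags: %#x\n", lock);
        return 0;
}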
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 60decd6420ca..ae5fa4338d9c 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -211,8 +211,6 @@ out:
 return psock;
 }
 
-static bool sock_map_redirect_allowed(const struct sock *sk);
-
 static int sock_map_link(struct bpf_map *map, struct sock *sk)
 {
 struct sk_psock_progs *progs = sock_map_progs(map);
@@ -223,13 +221,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 struct sk_psock *psock;
 int ret;
 
- /* Only sockets we can redirect into/from in BPF need to hold
- * refs to parser/verdict progs and have their sk_data_ready
- * and sk_write_space callbacks overridden.
- */
- if (!sock_map_redirect_allowed(sk))
- goto no_progs;
-
 stream_verdict = READ_ONCE(progs->stream_verdict);
 if (stream_verdict) {
 stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
@@ -264,7 +255,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 }
 }
 
-no_progs:
 psock = sock_map_psock_get_checked(sk);
 if (IS_ERR(psock)) {
 ret = PTR_ERR(psock);
@@ -527,12 +517,6 @@ static bool sk_is_tcp(const struct sock *sk)
 sk->sk_protocol == IPPROTO_TCP;
 }
 
-static bool sk_is_udp(const struct sock *sk)
-{
- return sk->sk_type == SOCK_DGRAM &&
- sk->sk_protocol == IPPROTO_UDP;
-}
-
 static bool sock_map_redirect_allowed(const struct sock *sk)
 {
 if (sk_is_tcp(sk))
@@ -550,10 +534,7 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
 {
 if (sk_is_tcp(sk))
 return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
- else if (sk_is_udp(sk))
- return sk_hashed(sk);
-
- return false;
+ return true;
 }
 
 static int sock_hash_update_common(struct bpf_map *map, void *key,
@@ -1536,6 +1517,7 @@ void sock_map_close(struct sock *sk, long timeout)
 release_sock(sk);
 saved_close(sk, timeout);
 }
+EXPORT_SYMBOL_GPL(sock_map_close);
 
 static int sock_map_iter_attach_target(struct bpf_prog *prog,
 union bpf_iter_link_info *linfo,
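Taken together, the sock_map.c hunks stop special-casing TCP and UDP: sock_map_link() now installs whatever programs are present for any socket, sock_map_sk_state_allowed() defaults to true for non-TCP sockets, and the exported sock_map_close() becomes usable from modules. The userspace API is unchanged; a minimal libbpf-based sketch of inserting a socket (map creation and socket setup assumed done elsewhere; the helper name is illustrative):

#include <linux/types.h>
#include <bpf/bpf.h>

/* Insert an already-created socket into a BPF_MAP_TYPE_SOCKMAP.
 * map_fd and sock_fd are assumed valid; sockmap values are socket fds.
 */
static int sockmap_add(int map_fd, int key, int sock_fd)
{
        __u64 value = sock_fd;

        return bpf_map_update_elem(map_fd, &key, &value, BPF_ANY);
}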