Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile               2
-rw-r--r--  net/core/bpf_sk_storage.c       3
-rw-r--r--  net/core/dev.c                121
-rw-r--r--  net/core/dev.h                  3
-rw-r--r--  net/core/dev_ioctl.c            7
-rw-r--r--  net/core/drop_monitor.c         2
-rw-r--r--  net/core/fib_rules.c            4
-rw-r--r--  net/core/filter.c              29
-rw-r--r--  net/core/link_watch.c           8
-rw-r--r--  net/core/net-sysfs.c           17
-rw-r--r--  net/core/net_namespace.c       49
-rw-r--r--  net/core/netdev-genl-gen.c    110
-rw-r--r--  net/core/netdev-genl-gen.h     16
-rw-r--r--  net/core/netdev-genl.c        344
-rw-r--r--  net/core/page_pool.c          117
-rw-r--r--  net/core/page_pool_priv.h      12
-rw-r--r--  net/core/page_pool_user.c     410
-rw-r--r--  net/core/pktgen.c               6
-rw-r--r--  net/core/rtnetlink.c           84
-rw-r--r--  net/core/scm.c                  2
-rw-r--r--  net/core/skbuff.c              84
-rw-r--r--  net/core/sock.c                 8
-rw-r--r--  net/core/sysctl_net_core.c     15
-rw-r--r--  net/core/xdp.c                 33
24 files changed, 1356 insertions, 130 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 0cb734cbc24b..821aec06abf1 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -18,7 +18,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
-obj-$(CONFIG_PAGE_POOL) += page_pool.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index cca7594be92e..6c4d90b24d46 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -275,9 +275,10 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
void *owner, u32 size)
{
- int optmem_max = READ_ONCE(sysctl_optmem_max);
struct sock *sk = (struct sock *)owner;
+ int optmem_max;
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
/* same check as in sock_kmalloc() */
if (size <= optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
diff --git a/net/core/dev.c b/net/core/dev.c
index ad20bebe153f..f01a9b858347 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -165,7 +165,6 @@ static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack);
-static struct napi_struct *napi_by_id(unsigned int napi_id);
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -3757,6 +3756,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+
if (q->flags & TCQ_F_NOLOCK) {
if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
qdisc_run_begin(q)) {
@@ -3786,7 +3787,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
no_lock_out:
if (unlikely(to_free))
kfree_skb_list_reason(to_free,
- SKB_DROP_REASON_QDISC_DROP);
+ tcf_get_drop_reason(to_free));
return rc;
}
@@ -3841,7 +3842,8 @@ no_lock_out:
}
spin_unlock(root_lock);
if (unlikely(to_free))
- kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
+ kfree_skb_list_reason(to_free,
+ tcf_get_drop_reason(to_free));
if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
@@ -3927,14 +3929,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
- res.drop_reason = *drop_reason;
+ tcf_set_drop_reason(skb, *drop_reason);
mini_qdisc_bstats_cpu_update(miniq, skb);
ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
/* Only tcf related quirks below. */
switch (ret) {
case TC_ACT_SHOT:
- *drop_reason = res.drop_reason;
+ *drop_reason = tcf_get_drop_reason(skb);
mini_qdisc_qstats_cpu_drop(miniq);
break;
case TC_ACT_OK:
@@ -6142,7 +6144,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
-static struct napi_struct *napi_by_id(unsigned int napi_id)
+struct napi_struct *napi_by_id(unsigned int napi_id)
{
unsigned int hash = napi_id % HASH_SIZE(napi_hash);
struct napi_struct *napi;
@@ -6403,6 +6405,43 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
}
EXPORT_SYMBOL(dev_set_threaded);
+/**
+ * netif_queue_set_napi - Associate queue with the napi
+ * @dev: device to which NAPI and queue belong
+ * @queue_index: Index of queue
+ * @type: queue type as RX or TX
+ * @napi: NAPI context, pass NULL to clear previously set NAPI
+ *
+ * Set queue with its corresponding napi context. This should be done after
+ * registering the NAPI handler for the queue-vector and the queues have been
+ * mapped to the corresponding interrupt vector.
+ */
+void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
+ enum netdev_queue_type type, struct napi_struct *napi)
+{
+ struct netdev_rx_queue *rxq;
+ struct netdev_queue *txq;
+
+ if (WARN_ON_ONCE(napi && !napi->dev))
+ return;
+ if (dev->reg_state >= NETREG_REGISTERED)
+ ASSERT_RTNL();
+
+ switch (type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ rxq = __netif_get_rx_queue(dev, queue_index);
+ rxq->napi = napi;
+ return;
+ case NETDEV_QUEUE_TYPE_TX:
+ txq = netdev_get_tx_queue(dev, queue_index);
+ txq->napi = napi;
+ return;
+ default:
+ return;
+ }
+}
+EXPORT_SYMBOL(netif_queue_set_napi);
+
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6438,6 +6477,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
*/
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = 0;
+ netif_napi_set_irq(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight);
@@ -10514,7 +10554,7 @@ void netdev_run_todo(void)
write_lock(&dev_base_lock);
dev->reg_state = NETREG_UNREGISTERED;
write_unlock(&dev_base_lock);
- linkwatch_forget_dev(dev);
+ linkwatch_sync_dev(dev);
}
while (!list_empty(&list)) {
@@ -11239,17 +11279,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
dev_net_set(dev, net);
dev->ifindex = new_ifindex;
- /* Send a netdev-add uevent to the new namespace */
- kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
- netdev_adjacent_add_links(dev);
-
if (new_name[0]) /* Rename the netdev to prepared name */
strscpy(dev->name, new_name, IFNAMSIZ);
/* Fixup kobjects */
+ dev_set_uevent_suppress(&dev->dev, 1);
err = device_rename(&dev->dev, dev->name);
+ dev_set_uevent_suppress(&dev->dev, 0);
WARN_ON(err);
+ /* Send a netdev-add uevent to the new namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+ netdev_adjacent_add_links(dev);
+
/* Adapt owner in case owning user namespace of target network
* namespace is different from the original one.
*/
@@ -11573,6 +11615,61 @@ static struct pernet_operations __net_initdata default_device_ops = {
.exit_batch = default_device_exit_batch,
};
+static void __init net_dev_struct_check(void)
+{
+ /* TX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
+#ifdef CONFIG_XPS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
+
+ /* TXRX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 30);
+
+ /* RX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
+#ifdef CONFIG_NETPOLL
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
+}
+
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
@@ -11590,6 +11687,8 @@ static int __init net_dev_init(void)
BUG_ON(!dev_boot_phase);
+ net_dev_struct_check();
+
if (dev_proc_init())
goto out;
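
The dev.c changes above add netif_queue_set_napi() so a driver can record which NAPI instance services each RX/TX queue; the netdev-genl code later in this diff reports that mapping to user space. A minimal driver-side sketch of the intended call order follows; only the netif_* helpers and the NETDEV_QUEUE_TYPE_* values come from this series, the example_* names are hypothetical.

/* Illustrative driver fragment, not part of this patch. */
struct example_ring {
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* ... clean TX completions, receive up to @budget packets ... */
	if (work_done < budget)
		napi_complete_done(napi, work_done);
	return work_done;
}

static void example_attach_vector(struct net_device *dev,
				  struct example_ring *ring, int qid, int irq)
{
	netif_napi_add(dev, &ring->napi, example_poll);
	netif_napi_set_irq(&ring->napi, irq);

	/* Needs rtnl_lock() once the netdev is registered. */
	netif_queue_set_napi(dev, qid, NETDEV_QUEUE_TYPE_RX, &ring->napi);
	netif_queue_set_napi(dev, qid, NETDEV_QUEUE_TYPE_TX, &ring->napi);
}

static void example_detach_vector(struct net_device *dev,
				  struct example_ring *ring, int qid)
{
	/* Clear the association before the NAPI goes away. */
	netif_queue_set_napi(dev, qid, NETDEV_QUEUE_TYPE_RX, NULL);
	netif_queue_set_napi(dev, qid, NETDEV_QUEUE_TYPE_TX, NULL);
	netif_napi_del(&ring->napi);
}
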
diff --git a/net/core/dev.h b/net/core/dev.h
index 5aa45f0fd4ae..cf93e188785b 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -30,7 +30,6 @@ int __init dev_proc_init(void);
#endif
void linkwatch_init_dev(struct net_device *dev);
-void linkwatch_forget_dev(struct net_device *dev);
void linkwatch_run_queue(void);
void dev_addr_flush(struct net_device *dev);
@@ -145,4 +144,6 @@ void xdp_do_check_flushed(struct napi_struct *napi);
#else
static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
#endif
+
+struct napi_struct *napi_by_id(unsigned int napi_id);
#endif
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index feeddf95f450..9a66cf5015f2 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -322,9 +322,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
* frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in
* dev->priv_flags.
*/
-static int dev_set_hwtstamp_phylib(struct net_device *dev,
- struct kernel_hwtstamp_config *cfg,
- struct netlink_ext_ack *extack)
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
bool phy_ts = phy_has_hwtstamp(dev->phydev);
@@ -363,6 +363,7 @@ static int dev_set_hwtstamp_phylib(struct net_device *dev,
return 0;
}
+EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib);
static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
{
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index b240d9aae4a6..b0f221d658be 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -183,7 +183,7 @@ out:
}
static const struct genl_multicast_group dropmon_mcgrps[] = {
- { .name = "events", .cap_sys_admin = 1 },
+ { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, },
};
static void send_dm_alert(struct work_struct *work)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 75282222e0b4..3f933ffcefc3 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -53,7 +53,7 @@ bool fib_rule_matchall(const struct fib_rule *rule)
EXPORT_SYMBOL_GPL(fib_rule_matchall);
int fib_default_rule_add(struct fib_rules_ops *ops,
- u32 pref, u32 table, u32 flags)
+ u32 pref, u32 table)
{
struct fib_rule *r;
@@ -65,7 +65,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->action = FR_ACT_TO_TBL;
r->pref = pref;
r->table = table;
- r->flags = flags;
r->proto = RTPROT_KERNEL;
r->fr_net = ops->fro_net;
r->uid_range = fib_kuid_range_unset;
@@ -594,7 +593,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[FRA_TUN_ID])
nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
- err = -EINVAL;
if (tb[FRA_L3MDEV] &&
fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
goto errout_free;
diff --git a/net/core/filter.c b/net/core/filter.c
index 1737884be52f..24061f29c9dd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -203,7 +203,7 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
nla = (struct nlattr *) &skb->data[a];
- if (nla->nla_len > skb->len - a)
+ if (!nla_ok(nla, skb->len - a))
return 0;
nla = nla_find_nested(nla, x);
@@ -1219,8 +1219,8 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
*/
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
+ int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
u32 filter_size = bpf_prog_size(fp->prog->len);
- int optmem_max = READ_ONCE(sysctl_optmem_max);
/* same check as in sock_kmalloc() */
if (filter_size <= optmem_max &&
@@ -1550,12 +1550,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct bpf_prog *prog = __get_filter(fprog, sk);
- int err;
+ int err, optmem_max;
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max))
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max)
err = -ENOMEM;
else
err = reuseport_attach_prog(sk, prog);
@@ -1594,7 +1595,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog;
- int err;
+ int err, optmem_max;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
@@ -1622,7 +1623,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
}
} else {
/* BPF_PROG_TYPE_SOCKET_FILTER */
- if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) {
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max) {
err = -ENOMEM;
goto err_prog_put;
}
@@ -7257,7 +7259,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
- u32 cookie;
int ret;
if (unlikely(!sk || th_len < sizeof(*th)))
@@ -7279,8 +7280,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
if (tcp_synq_no_recent_overflow(sk))
return -ENOENT;
- cookie = ntohl(th->ack_seq) - 1;
-
/* Both struct iphdr and struct ipv6hdr have the version field at the
* same offset so we can cast to the shorter header (struct iphdr).
*/
@@ -7289,7 +7288,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
return -EINVAL;
- ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
+ ret = __cookie_v4_check((struct iphdr *)iph, th);
break;
#if IS_BUILTIN(CONFIG_IPV6)
@@ -7300,7 +7299,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
if (sk->sk_family != AF_INET6)
return -EINVAL;
- ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
+ ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
break;
#endif /* CONFIG_IPV6 */
@@ -7753,9 +7752,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
struct tcphdr *, th)
{
- u32 cookie = ntohl(th->ack_seq) - 1;
-
- if (__cookie_v4_check(iph, th, cookie) > 0)
+ if (__cookie_v4_check(iph, th) > 0)
return 0;
return -EACCES;
@@ -7776,9 +7773,7 @@ BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
struct tcphdr *, th)
{
#if IS_BUILTIN(CONFIG_IPV6)
- u32 cookie = ntohl(th->ack_seq) - 1;
-
- if (__cookie_v6_check(iph, th, cookie) > 0)
+ if (__cookie_v6_check(iph, th) > 0)
return 0;
return -EACCES;
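
Several hunks in this diff (bpf_sk_storage.c and filter.c above, plus the per-netns default added in net_namespace.c below) switch from the global sysctl_optmem_max to a per-network-namespace copy. The pattern is the same everywhere: read the limit once with READ_ONCE() from the socket's namespace, then apply the sock_kmalloc()-style check. A condensed sketch, using a hypothetical helper name purely for illustration:

/* Hypothetical helper showing the recurring pattern: charge @size bytes
 * of option memory against the per-netns optmem limit.
 */
static bool example_omem_charge(struct sock *sk, u32 size)
{
	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);

	/* Same check as in sock_kmalloc(): the request must fit on its own,
	 * and the socket's running total must stay under the limit.
	 */
	if (size <= optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
		atomic_add(size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}
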
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index c469d1c4db5d..429571c258da 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -192,7 +192,10 @@ static void __linkwatch_run_queue(int urgent_only)
#define MAX_DO_DEV_PER_LOOP 100
int do_dev = MAX_DO_DEV_PER_LOOP;
- struct net_device *dev;
+ /* Use a local list here since we add non-urgent
+ * events back to the global one when called with
+ * urgent_only=1.
+ */
LIST_HEAD(wrk);
/* Give urgent case more budget */
@@ -218,6 +221,7 @@ static void __linkwatch_run_queue(int urgent_only)
list_splice_init(&lweventlist, &wrk);
while (!list_empty(&wrk) && do_dev > 0) {
+ struct net_device *dev;
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
@@ -245,7 +249,7 @@ static void __linkwatch_run_queue(int urgent_only)
spin_unlock_irq(&lweventlist_lock);
}
-void linkwatch_forget_dev(struct net_device *dev)
+void linkwatch_sync_dev(struct net_device *dev)
{
unsigned long flags;
int clean = 0;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index fccaa5bac0ed..a09d507c5b03 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -193,11 +193,22 @@ static ssize_t carrier_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
- if (netif_running(netdev))
- return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ if (!rtnl_trylock())
+ return restart_syscall();
- return -EINVAL;
+ if (netif_running(netdev)) {
+ /* Synchronize carrier state with link watch,
+ * see also rtnl_getlink().
+ */
+ linkwatch_sync_dev(netdev);
+
+ ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ }
+ rtnl_unlock();
+
+ return ret;
}
static DEVICE_ATTR_RW(carrier);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index f4183c4c1ec8..72799533426b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -372,6 +372,10 @@ out_undo:
static int __net_init net_defaults_init_net(struct net *net)
{
net->core.sysctl_somaxconn = SOMAXCONN;
+ /* Limits per socket sk_omem_alloc usage.
+ * TCP zerocopy regular usage needs 128 KB.
+ */
+ net->core.sysctl_optmem_max = 128 * 1024;
net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
return 0;
@@ -1099,11 +1103,56 @@ out:
rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}
+#ifdef CONFIG_NET_NS
+static void __init netns_ipv4_struct_check(void)
+{
+ /* TX readonly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_early_retrans);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_tso_win_divisor);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_tso_rtt_log);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_autocorking);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_min_snd_mss);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_notsent_lowat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_limit_output_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_min_rtt_wlen);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_wmem);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_ip_fwd_use_pmtu);
+ CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);
+
+ /* TXRX readonly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
+ sysctl_tcp_moderate_rcvbuf);
+ CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
+
+ /* RX readonly hotpath cache line */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_ip_early_demux);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_early_demux);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_reordering);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_rmem);
+ CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18);
+}
+#endif
+
void __init net_ns_init(void)
{
struct net_generic *ng;
#ifdef CONFIG_NET_NS
+ netns_ipv4_struct_check();
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
SLAB_PANIC|SLAB_ACCOUNT, NULL);
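
The new net_dev_struct_check() in dev.c and netns_ipv4_struct_check() above are guards for the cacheline reorganisation of struct net_device and struct netns_ipv4: hot read-mostly fields are bracketed with __cacheline_group_begin()/__cacheline_group_end() markers in the struct definitions, and CACHELINE_ASSERT_GROUP_MEMBER()/CACHELINE_ASSERT_GROUP_SIZE() turn any accidental field move or group growth into a build failure. A toy sketch of the mechanism; struct example and its fields are invented for illustration:

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/types.h>

struct example {
	__cacheline_group_begin(example_read_tx);
	u32 a;
	u64 b;
	__cacheline_group_end(example_read_tx);
	/* cold, rarely read fields would follow here */
};

static void __init example_struct_check(void)
{
	/* Compile-time failure if a field moves out of its group ... */
	CACHELINE_ASSERT_GROUP_MEMBER(struct example, example_read_tx, a);
	CACHELINE_ASSERT_GROUP_MEMBER(struct example, example_read_tx, b);
	/* ... or if the group grows beyond the expected byte count. */
	CACHELINE_ASSERT_GROUP_SIZE(struct example, example_read_tx, 16);
}
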
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index ea9231378aa6..be7f2ebd61b2 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -10,11 +10,64 @@
#include <uapi/linux/netdev.h>
+/* Integer value ranges */
+static const struct netlink_range_validation netdev_a_page_pool_id_range = {
+ .min = 1ULL,
+ .max = 4294967295ULL,
+};
+
+static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = {
+ .min = 1ULL,
+ .max = 2147483647ULL,
+};
+
+/* Common nested types */
+const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
+ [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
+ [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
+};
+
/* NETDEV_CMD_DEV_GET - do */
static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
[NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
};
+/* NETDEV_CMD_PAGE_POOL_GET - do */
+#ifdef CONFIG_PAGE_POOL
+static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = {
+ [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
+};
+#endif /* CONFIG_PAGE_POOL */
+
+/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */
+#ifdef CONFIG_PAGE_POOL_STATS
+static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = {
+ [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy),
+};
+#endif /* CONFIG_PAGE_POOL_STATS */
+
+/* NETDEV_CMD_QUEUE_GET - do */
+static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = {
+ [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+ [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, },
+};
+
+/* NETDEV_CMD_QUEUE_GET - dump */
+static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = {
+ [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_NAPI_GET - do */
+static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+};
+
+/* NETDEV_CMD_NAPI_GET - dump */
+static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = {
+ [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -29,10 +82,67 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.dumpit = netdev_nl_dev_get_dumpit,
.flags = GENL_CMD_CAP_DUMP,
},
+#ifdef CONFIG_PAGE_POOL
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_GET,
+ .doit = netdev_nl_page_pool_get_doit,
+ .policy = netdev_page_pool_get_nl_policy,
+ .maxattr = NETDEV_A_PAGE_POOL_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_GET,
+ .dumpit = netdev_nl_page_pool_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#endif /* CONFIG_PAGE_POOL */
+#ifdef CONFIG_PAGE_POOL_STATS
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET,
+ .doit = netdev_nl_page_pool_stats_get_doit,
+ .policy = netdev_page_pool_stats_get_nl_policy,
+ .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET,
+ .dumpit = netdev_nl_page_pool_stats_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#endif /* CONFIG_PAGE_POOL_STATS */
+ {
+ .cmd = NETDEV_CMD_QUEUE_GET,
+ .doit = netdev_nl_queue_get_doit,
+ .policy = netdev_queue_get_do_nl_policy,
+ .maxattr = NETDEV_A_QUEUE_TYPE,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_QUEUE_GET,
+ .dumpit = netdev_nl_queue_get_dumpit,
+ .policy = netdev_queue_get_dump_nl_policy,
+ .maxattr = NETDEV_A_QUEUE_IFINDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_GET,
+ .doit = netdev_nl_napi_get_doit,
+ .policy = netdev_napi_get_do_nl_policy,
+ .maxattr = NETDEV_A_NAPI_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_GET,
+ .dumpit = netdev_nl_napi_get_dumpit,
+ .policy = netdev_napi_get_dump_nl_policy,
+ .maxattr = NETDEV_A_NAPI_IFINDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
[NETDEV_NLGRP_MGMT] = { "mgmt", },
+ [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", },
};
struct genl_family netdev_nl_family __ro_after_init = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 7b370c073e7d..a47f2bcbe4fa 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -11,11 +11,27 @@
#include <uapi/linux/netdev.h>
+/* Common nested types */
+extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
+
int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_queue_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
enum {
NETDEV_NLGRP_MGMT,
+ NETDEV_NLGRP_PAGE_POOL,
};
extern struct genl_family netdev_nl_family;
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index fe61f85bcf33..fd98936da3ae 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -6,13 +6,32 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/xdp.h>
+#include <net/xdp_sock.h>
+#include <net/netdev_rx_queue.h>
+#include <net/busy_poll.h>
#include "netdev-genl-gen.h"
+#include "dev.h"
+
+struct netdev_nl_dump_ctx {
+ unsigned long ifindex;
+ unsigned int rxq_idx;
+ unsigned int txq_idx;
+ unsigned int napi_id;
+};
+
+static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb)
+{
+ NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx);
+
+ return (struct netdev_nl_dump_ctx *)cb->ctx;
+}
static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
+ u64 xsk_features = 0;
u64 xdp_rx_meta = 0;
void *hdr;
@@ -26,11 +45,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
+ if (netdev->xsk_tx_metadata_ops) {
+ if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ }
+
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
netdev->xdp_features, NETDEV_A_DEV_PAD) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
- xdp_rx_meta, NETDEV_A_DEV_PAD)) {
+ xdp_rx_meta, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
+ xsk_features, NETDEV_A_DEV_PAD)) {
genlmsg_cancel(rsp, hdr);
return -EINVAL;
}
@@ -111,12 +139,13 @@ err_free_msg:
int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
struct net *net = sock_net(skb->sk);
struct net_device *netdev;
int err = 0;
rtnl_lock();
- for_each_netdev_dump(net, netdev, cb->args[0]) {
+ for_each_netdev_dump(net, netdev, ctx->ifindex) {
err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb));
if (err < 0)
break;
@@ -129,6 +158,317 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
+static int
+netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
+ const struct genl_info *info)
+{
+ void *hdr;
+ pid_t pid;
+
+ if (WARN_ON_ONCE(!napi->dev))
+ return -EINVAL;
+ if (!(napi->dev->flags & IFF_UP))
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (napi->napi_id >= MIN_NAPI_ID &&
+ nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
+ goto nla_put_failure;
+
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
+ goto nla_put_failure;
+
+ if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq))
+ goto nla_put_failure;
+
+ if (napi->thread) {
+ pid = task_pid_nr(napi->thread);
+ if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid))
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct napi_struct *napi;
+ struct sk_buff *rsp;
+ u32 napi_id;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+ return -EINVAL;
+
+ napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ rtnl_lock();
+
+ napi = napi_by_id(napi_id);
+ if (napi)
+ err = netdev_nl_napi_fill_one(rsp, napi, info);
+ else
+ err = -EINVAL;
+
+ rtnl_unlock();
+
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+static int
+netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ struct napi_struct *napi;
+ int err = 0;
+
+ if (!(netdev->flags & IFF_UP))
+ return err;
+
+ list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
+ continue;
+
+ err = netdev_nl_napi_fill_one(rsp, napi, info);
+ if (err)
+ return err;
+ ctx->napi_id = napi->napi_id;
+ }
+ return err;
+}
+
+int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ u32 ifindex = 0;
+ int err = 0;
+
+ if (info->attrs[NETDEV_A_NAPI_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);
+
+ rtnl_lock();
+ if (ifindex) {
+ netdev = __dev_get_by_index(net, ifindex);
+ if (netdev)
+ err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
+ else
+ err = -ENODEV;
+ } else {
+ for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
+ if (err < 0)
+ break;
+ ctx->napi_id = 0;
+ }
+ }
+ rtnl_unlock();
+
+ if (err != -EMSGSIZE)
+ return err;
+
+ return skb->len;
+}
+
+static int
+netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
+ u32 q_idx, u32 q_type, const struct genl_info *info)
+{
+ struct netdev_rx_queue *rxq;
+ struct netdev_queue *txq;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) ||
+ nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) ||
+ nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ rxq = __netif_get_rx_queue(netdev, q_idx);
+ if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID,
+ rxq->napi->napi_id))
+ goto nla_put_failure;
+ break;
+ case NETDEV_QUEUE_TYPE_TX:
+ txq = netdev_get_tx_queue(netdev, q_idx);
+ if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID,
+ txq->napi->napi_id))
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id,
+ u32 q_type)
+{
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ if (q_id >= netdev->real_num_rx_queues)
+ return -EINVAL;
+ return 0;
+ case NETDEV_QUEUE_TYPE_TX:
+ if (q_id >= netdev->real_num_tx_queues)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int
+netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
+ u32 q_type, const struct genl_info *info)
+{
+ int err = 0;
+
+ if (!(netdev->flags & IFF_UP))
+ return err;
+
+ err = netdev_nl_queue_validate(netdev, q_idx, q_type);
+ if (err)
+ return err;
+
+ return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info);
+}
+
+int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 q_id, q_type, ifindex;
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX))
+ return -EINVAL;
+
+ q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]);
+ q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]);
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ rtnl_lock();
+
+ netdev = __dev_get_by_index(genl_info_net(info), ifindex);
+ if (netdev)
+ err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
+ else
+ err = -ENODEV;
+
+ rtnl_unlock();
+
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+static int
+netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ int err = 0;
+ int i;
+
+ if (!(netdev->flags & IFF_UP))
+ return err;
+
+ for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, i,
+ NETDEV_QUEUE_TYPE_RX, info);
+ if (err)
+ return err;
+ ctx->rxq_idx = i++;
+ }
+ for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, i,
+ NETDEV_QUEUE_TYPE_TX, info);
+ if (err)
+ return err;
+ ctx->txq_idx = i++;
+ }
+
+ return err;
+}
+
+int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ u32 ifindex = 0;
+ int err = 0;
+
+ if (info->attrs[NETDEV_A_QUEUE_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
+
+ rtnl_lock();
+ if (ifindex) {
+ netdev = __dev_get_by_index(net, ifindex);
+ if (netdev)
+ err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
+ else
+ err = -ENODEV;
+ } else {
+ for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
+ if (err < 0)
+ break;
+ ctx->rxq_idx = 0;
+ ctx->txq_idx = 0;
+ }
+ }
+ rtnl_unlock();
+
+ if (err != -EMSGSIZE)
+ return err;
+
+ return skb->len;
+}
+
static int netdev_genl_netdevice_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
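
Note how netdev-genl.c replaces the raw cb->args[0] cursor with a typed netdev_nl_dump_ctx overlaid on the netlink callback scratch area; NL_ASSERT_DUMP_CTX_FITS() guarantees the struct fits into cb->ctx. Each dump handler persists its position (ifindex plus queue index or napi id) so a dump that ran out of room with -EMSGSIZE resumes where it stopped. A stripped-down sketch of the idiom, with example_* names standing in for real ops:

struct example_dump_ctx {
	unsigned long ifindex;		/* next device to visit */
	unsigned int item_idx;		/* next item within that device */
};

static struct example_dump_ctx *example_dump_ctx(struct netlink_callback *cb)
{
	NL_ASSERT_DUMP_CTX_FITS(struct example_dump_ctx);

	return (struct example_dump_ctx *)cb->ctx;
}

/* Hypothetical per-device fill: emits one message per item starting at
 * ctx->item_idx and returns -EMSGSIZE once @skb is full.
 */
static int example_dump_one(struct net_device *netdev, struct sk_buff *skb,
			    struct netlink_callback *cb,
			    struct example_dump_ctx *ctx)
{
	return 0;
}

static int example_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct example_dump_ctx *ctx = example_dump_ctx(cb);
	struct net *net = sock_net(skb->sk);
	struct net_device *netdev;
	int err = 0;

	rtnl_lock();
	for_each_netdev_dump(net, netdev, ctx->ifindex) {
		err = example_dump_one(netdev, skb, cb, ctx);
		if (err < 0)
			break;		/* resume from ctx on the next call */
		ctx->item_idx = 0;	/* finished this device, reset cursor */
	}
	rtnl_unlock();

	/* -EMSGSIZE just means the buffer filled up; report progress. */
	if (err != -EMSGSIZE)
		return err;
	return skb->len;
}
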
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dec544337236..4933762e5a6b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -23,10 +23,12 @@
#include <trace/events/page_pool.h>
+#include "page_pool_priv.h"
+
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
-#define BIAS_MAX LONG_MAX
+#define BIAS_MAX (LONG_MAX >> 1)
#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
@@ -69,7 +71,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = {
* is passed to this API which is filled in. The caller can then report
* those stats to the user (perhaps via ethtool, debugfs, etc.).
*/
-bool page_pool_get_stats(struct page_pool *pool,
+bool page_pool_get_stats(const struct page_pool *pool,
struct page_pool_stats *stats)
{
int cpu = 0;
@@ -173,7 +175,8 @@ static int page_pool_init(struct page_pool *pool,
{
unsigned int ring_qsize = 1024; /* Default */
- memcpy(&pool->p, params, sizeof(pool->p));
+ memcpy(&pool->p, &params->fast, sizeof(pool->p));
+ memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
/* Validate only known flags were used */
if (pool->p.flags & ~(PP_FLAG_ALL))
@@ -211,6 +214,8 @@ static int page_pool_init(struct page_pool *pool,
*/
}
+ pool->has_init_callback = !!pool->slow.init_callback;
+
#ifdef CONFIG_PAGE_POOL_STATS
pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
if (!pool->recycle_stats)
@@ -235,6 +240,18 @@ static int page_pool_init(struct page_pool *pool,
return 0;
}
+static void page_pool_uninit(struct page_pool *pool)
+{
+ ptr_ring_cleanup(&pool->ring, NULL);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ put_device(pool->p.dev);
+
+#ifdef CONFIG_PAGE_POOL_STATS
+ free_percpu(pool->recycle_stats);
+#endif
+}
+
/**
* page_pool_create() - create a page pool.
* @params: parameters, see struct page_pool_params
@@ -249,13 +266,21 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
return ERR_PTR(-ENOMEM);
err = page_pool_init(pool, params);
- if (err < 0) {
- pr_warn("%s() gave up with errno %d\n", __func__, err);
- kfree(pool);
- return ERR_PTR(err);
- }
+ if (err < 0)
+ goto err_free;
+
+ err = page_pool_list(pool);
+ if (err)
+ goto err_uninit;
return pool;
+
+err_uninit:
+ page_pool_uninit(pool);
+err_free:
+ pr_warn("%s() gave up with errno %d\n", __func__, err);
+ kfree(pool);
+ return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create);
@@ -388,8 +413,8 @@ static void page_pool_set_pp_info(struct page_pool *pool,
* the overhead is negligible.
*/
page_pool_fragment_page(page, 1);
- if (pool->p.init_callback)
- pool->p.init_callback(page, pool->p.init_arg);
+ if (pool->has_init_callback)
+ pool->slow.init_callback(page, pool->slow.init_arg);
}
static void page_pool_clear_pp_info(struct page *page)
@@ -504,7 +529,7 @@ EXPORT_SYMBOL(page_pool_alloc_pages);
*/
#define _distance(a, b) (s32)((a) - (b))
-static s32 page_pool_inflight(struct page_pool *pool)
+s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
@@ -512,27 +537,27 @@ static s32 page_pool_inflight(struct page_pool *pool)
inflight = _distance(hold_cnt, release_cnt);
- trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
- WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+ if (strict) {
+ trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
+ WARN(inflight < 0, "Negative(%d) inflight packet-pages",
+ inflight);
+ } else {
+ inflight = max(0, inflight);
+ }
return inflight;
}
-/* Disconnects a page (from a page_pool). API users can have a need
- * to disconnect a page (from a page_pool), to allow it to be used as
- * a regular page (that will eventually be returned to the normal
- * page-allocator via put_page).
- */
-static void page_pool_return_page(struct page_pool *pool, struct page *page)
+static __always_inline
+void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
- int count;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
/* Always account for inflight pages, even if we didn't
* map them
*/
- goto skip_dma_unmap;
+ return;
dma = page_pool_get_dma_addr(page);
@@ -541,7 +566,19 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page)
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
page_pool_set_dma_addr(page, 0);
-skip_dma_unmap:
+}
+
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+ int count;
+
+ __page_pool_release_page_dma(pool, page);
+
page_pool_clear_pp_info(page);
/* This may be the last page returned, releasing the pool, so
@@ -647,8 +684,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
return NULL;
}
-void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
if (page && !page_pool_recycle_in_ring(pool, page)) {
@@ -657,7 +694,7 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
page_pool_return_page(pool, page);
}
}
-EXPORT_SYMBOL(page_pool_put_defragged_page);
+EXPORT_SYMBOL(page_pool_put_unrefed_page);
/**
* page_pool_put_page_bulk() - release references on multiple pages
@@ -684,7 +721,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
struct page *page = virt_to_head_page(data[i]);
/* It is not the last user for the page frag case */
- if (!page_pool_is_last_frag(page))
+ if (!page_pool_is_last_ref(page))
continue;
page = __page_pool_put_page(pool, page, -1, false);
@@ -726,7 +763,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
long drain_count = BIAS_MAX - pool->frag_users;
/* Some user is still using the page frag */
- if (likely(page_pool_defrag_page(page, drain_count)))
+ if (likely(page_pool_unref_page(page, drain_count)))
return NULL;
if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
@@ -747,7 +784,7 @@ static void page_pool_free_frag(struct page_pool *pool)
pool->frag_page = NULL;
- if (!page || page_pool_defrag_page(page, drain_count))
+ if (!page || page_pool_unref_page(page, drain_count))
return;
page_pool_return_page(pool, page);
@@ -818,14 +855,8 @@ static void __page_pool_destroy(struct page_pool *pool)
if (pool->disconnect)
pool->disconnect(pool);
- ptr_ring_cleanup(&pool->ring, NULL);
-
- if (pool->p.flags & PP_FLAG_DMA_MAP)
- put_device(pool->p.dev);
-
-#ifdef CONFIG_PAGE_POOL_STATS
- free_percpu(pool->recycle_stats);
-#endif
+ page_pool_unlist(pool);
+ page_pool_uninit(pool);
kfree(pool);
}
@@ -862,7 +893,7 @@ static int page_pool_release(struct page_pool *pool)
int inflight;
page_pool_scrub(pool);
- inflight = page_pool_inflight(pool);
+ inflight = page_pool_inflight(pool, true);
if (!inflight)
__page_pool_destroy(pool);
@@ -873,18 +904,21 @@ static void page_pool_release_retry(struct work_struct *wq)
{
struct delayed_work *dwq = to_delayed_work(wq);
struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+ void *netdev;
int inflight;
inflight = page_pool_release(pool);
if (!inflight)
return;
- /* Periodic warning */
- if (time_after_eq(jiffies, pool->defer_warn)) {
+ /* Periodic warning for page pools the user can't see */
+ netdev = READ_ONCE(pool->slow.netdev);
+ if (time_after_eq(jiffies, pool->defer_warn) &&
+ (!netdev || netdev == NET_PTR_POISON)) {
int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
- pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
- __func__, inflight, sec);
+ pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
+ __func__, pool->user.id, inflight, sec);
pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
}
@@ -929,6 +963,7 @@ void page_pool_destroy(struct page_pool *pool)
if (!page_pool_release(pool))
return;
+ page_pool_detached(pool);
pool->defer_start = jiffies;
pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
new file mode 100644
index 000000000000..90665d40f1eb
--- /dev/null
+++ b/net/core/page_pool_priv.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PAGE_POOL_PRIV_H
+#define __PAGE_POOL_PRIV_H
+
+s32 page_pool_inflight(const struct page_pool *pool, bool strict);
+
+int page_pool_list(struct page_pool *pool);
+void page_pool_detached(struct page_pool *pool);
+void page_pool_unlist(struct page_pool *pool);
+
+#endif
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
new file mode 100644
index 000000000000..ffe5244e5597
--- /dev/null
+++ b/net/core/page_pool_user.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/xarray.h>
+#include <net/net_debug.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
+#include <net/sock.h>
+
+#include "page_pool_priv.h"
+#include "netdev-genl-gen.h"
+
+static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
+/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user.
+ * Ordering: inside rtnl_lock
+ */
+static DEFINE_MUTEX(page_pools_lock);
+
+/* Page pools are only reachable from user space (via netlink) if they are
+ * linked to a netdev at creation time. Following page pool "visibility"
+ * states are possible:
+ * - normal
+ * - user.list: linked to real netdev, netdev: real netdev
+ * - orphaned - real netdev has disappeared
+ * - user.list: linked to lo, netdev: lo
+ * - invisible - either (a) created without netdev linking, (b) unlisted due
+ * to error, or (c) the entire namespace which owned this pool disappeared
+ * - user.list: unhashed, netdev: unknown
+ */
+
+typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info);
+
+static int
+netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill)
+{
+ struct page_pool *pool;
+ struct sk_buff *rsp;
+ int err;
+
+ mutex_lock(&page_pools_lock);
+ pool = xa_load(&page_pools, id);
+ if (!pool || hlist_unhashed(&pool->user.list) ||
+ !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) {
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp) {
+ err = -ENOMEM;
+ goto err_unlock;
+ }
+
+ err = fill(rsp, pool, info);
+ if (err)
+ goto err_free_msg;
+
+ mutex_unlock(&page_pools_lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+err_unlock:
+ mutex_unlock(&page_pools_lock);
+ return err;
+}
+
+struct page_pool_dump_cb {
+ unsigned long ifindex;
+ u32 pp_id;
+};
+
+static int
+netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ pp_nl_fill_cb fill)
+{
+ struct page_pool_dump_cb *state = (void *)cb->ctx;
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ struct page_pool *pool;
+ int err = 0;
+
+ rtnl_lock();
+ mutex_lock(&page_pools_lock);
+ for_each_netdev_dump(net, netdev, state->ifindex) {
+ hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
+ if (state->pp_id && state->pp_id < pool->user.id)
+ continue;
+
+ state->pp_id = pool->user.id;
+ err = fill(skb, pool, info);
+ if (err)
+ break;
+ }
+
+ state->pp_id = 0;
+ }
+ mutex_unlock(&page_pools_lock);
+ rtnl_unlock();
+
+ if (skb->len && err == -EMSGSIZE)
+ return skb->len;
+ return err;
+}
+
+static int
+page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info)
+{
+#ifdef CONFIG_PAGE_POOL_STATS
+ struct page_pool_stats stats = {};
+ struct nlattr *nest;
+ void *hdr;
+
+ if (!page_pool_get_stats(pool, &stats))
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO);
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) ||
+ (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX &&
+ nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
+ pool->slow.netdev->ifindex)))
+ goto err_cancel_nest;
+
+ nla_nest_end(rsp, nest);
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST,
+ stats.alloc_stats.fast) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW,
+ stats.alloc_stats.slow) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER,
+ stats.alloc_stats.slow_high_order) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY,
+ stats.alloc_stats.empty) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL,
+ stats.alloc_stats.refill) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE,
+ stats.alloc_stats.waive) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED,
+ stats.recycle_stats.cached) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL,
+ stats.recycle_stats.cache_full) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING,
+ stats.recycle_stats.ring) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL,
+ stats.recycle_stats.ring_full) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT,
+ stats.recycle_stats.released_refcnt))
+ goto err_cancel_msg;
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+err_cancel_nest:
+ nla_nest_cancel(rsp, nest);
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+#else
+ GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS");
+ return -EOPNOTSUPP;
+#endif
+}
+
+int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)];
+ struct nlattr *nest;
+ int err;
+ u32 id;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO))
+ return -EINVAL;
+
+ nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO];
+ err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest,
+ netdev_page_pool_info_nl_policy,
+ info->extack);
+ if (err)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID))
+ return -EINVAL;
+ if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NETDEV_A_PAGE_POOL_IFINDEX],
+ "selecting by ifindex not supported");
+ return -EINVAL;
+ }
+
+ id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]);
+
+ return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill);
+}
+
+int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill);
+}
+
+static int
+page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info)
+{
+ size_t inflight, refsz;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id))
+ goto err_cancel;
+
+ if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX &&
+ nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
+ pool->slow.netdev->ifindex))
+ goto err_cancel;
+ if (pool->user.napi_id &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id))
+ goto err_cancel;
+
+ inflight = page_pool_inflight(pool, false);
+ refsz = PAGE_SIZE << pool->p.order;
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
+ inflight * refsz))
+ goto err_cancel;
+ if (pool->user.detach_time &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME,
+ pool->user.detach_time))
+ goto err_cancel;
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+err_cancel:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd)
+{
+ struct genl_info info;
+ struct sk_buff *ntf;
+ struct net *net;
+
+ lockdep_assert_held(&page_pools_lock);
+
+ /* 'invisible' page pools don't matter */
+ if (hlist_unhashed(&pool->user.list))
+ return;
+ net = dev_net(pool->slow.netdev);
+
+ if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL))
+ return;
+
+ genl_info_init_ntf(&info, &netdev_nl_family, cmd);
+
+ ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ntf)
+ return;
+
+ if (page_pool_nl_fill(ntf, pool, &info)) {
+ nlmsg_free(ntf);
+ return;
+ }
+
+ genlmsg_multicast_netns(&netdev_nl_family, net, ntf,
+ 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL);
+}
+
+int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 id;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID))
+ return -EINVAL;
+
+ id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]);
+
+ return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill);
+}
+
+int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill);
+}
+
+int page_pool_list(struct page_pool *pool)
+{
+ static u32 id_alloc_next;
+ int err;
+
+ mutex_lock(&page_pools_lock);
+ err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b,
+ &id_alloc_next, GFP_KERNEL);
+ if (err < 0)
+ goto err_unlock;
+
+ INIT_HLIST_NODE(&pool->user.list);
+ if (pool->slow.netdev) {
+ hlist_add_head(&pool->user.list,
+ &pool->slow.netdev->page_pools);
+ pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0;
+
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF);
+ }
+
+ mutex_unlock(&page_pools_lock);
+ return 0;
+
+err_unlock:
+ mutex_unlock(&page_pools_lock);
+ return err;
+}
+
+void page_pool_detached(struct page_pool *pool)
+{
+ mutex_lock(&page_pools_lock);
+ pool->user.detach_time = ktime_get_boottime_seconds();
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
+ mutex_unlock(&page_pools_lock);
+}
+
+void page_pool_unlist(struct page_pool *pool)
+{
+ mutex_lock(&page_pools_lock);
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF);
+ xa_erase(&page_pools, pool->user.id);
+ if (!hlist_unhashed(&pool->user.list))
+ hlist_del(&pool->user.list);
+ mutex_unlock(&page_pools_lock);
+}
+
+static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
+{
+ struct page_pool *pool;
+ struct hlist_node *n;
+
+ mutex_lock(&page_pools_lock);
+ hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) {
+ hlist_del_init(&pool->user.list);
+ pool->slow.netdev = NET_PTR_POISON;
+ }
+ mutex_unlock(&page_pools_lock);
+}
+
+static void page_pool_unreg_netdev(struct net_device *netdev)
+{
+ struct page_pool *pool, *last;
+ struct net_device *lo;
+
+ lo = dev_net(netdev)->loopback_dev;
+
+ mutex_lock(&page_pools_lock);
+ last = NULL;
+ hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
+ pool->slow.netdev = lo;
+ netdev_nl_page_pool_event(pool,
+ NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
+ last = pool;
+ }
+ if (last)
+ hlist_splice_init(&netdev->page_pools, &last->user.list,
+ &lo->page_pools);
+ mutex_unlock(&page_pools_lock);
+}
+
+static int
+page_pool_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ if (hlist_empty(&netdev->page_pools))
+ return NOTIFY_OK;
+
+ if (netdev->ifindex != LOOPBACK_IFINDEX)
+ page_pool_unreg_netdev(netdev);
+ else
+ page_pool_unreg_netdev_wipe(netdev);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block page_pool_netdevice_nb = {
+ .notifier_call = page_pool_netdevice_event,
+};
+
+static int __init page_pool_user_init(void)
+{
+ return register_netdevice_notifier(&page_pool_netdevice_nb);
+}
+
+subsys_initcall(page_pool_user_init);
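
As the visibility comment at the top of page_pool_user.c explains, a pool is only reachable through the new netlink ops if it was linked to a net_device when it was created. On the driver side that means filling in the netdev (and, optionally, the servicing NAPI) in struct page_pool_params, which this series splits into fast and slow halves. A rough sketch of such a creation path; the example_* naming is illustrative, only the page_pool API itself comes from the kernel:

/* Illustrative RX-ring setup: linking the pool to its netdev and NAPI is
 * what makes it show up in the page-pool netlink GETs and notifications
 * added above, including NETDEV_A_PAGE_POOL_NAPI_ID.
 */
static struct page_pool *example_create_rx_pool(struct net_device *dev,
						struct napi_struct *napi,
						struct device *dma_dev,
						unsigned int ring_size)
{
	struct page_pool_params pp = {
		.flags		= PP_FLAG_DMA_MAP,
		.order		= 0,
		.pool_size	= ring_size,
		.nid		= NUMA_NO_NODE,
		.dev		= dma_dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	pp.netdev = dev;	/* "slow" field: ties the pool to the netdev */
	pp.napi = napi;		/* lets user space see the owning NAPI */

	return page_pool_create(&pp);	/* ERR_PTR() on failure */
}

User space can then list pools and their statistics through the netdev generic netlink family (for example with the YNL tooling under tools/net/ynl/ and the netdev family spec).
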
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 57cea67b7562..ea55a758a475 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3669,10 +3669,8 @@ static int pktgen_thread_worker(void *arg)
if (unlikely(!pkt_dev && t->control == 0)) {
if (t->net->pktgen_exiting)
break;
- wait_event_interruptible_timeout(t->queue,
- t->control != 0,
- HZ/10);
- try_to_freeze();
+ wait_event_freezable_timeout(t->queue,
+ t->control != 0, HZ / 10);
continue;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e8431c6c8490..5f6ed6da3cfc 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -342,8 +342,7 @@ int rtnl_unregister(int protocol, int msgtype)
return -ENOENT;
}
- link = rtnl_dereference(tab[msgindex]);
- RCU_INIT_POINTER(tab[msgindex], NULL);
+ link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
rtnl_unlock();
kfree_rcu(link, rcu);
@@ -368,18 +367,13 @@ void rtnl_unregister_all(int protocol)
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
rtnl_lock();
- tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
+ tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL);
if (!tab) {
rtnl_unlock();
return;
}
- RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
- link = rtnl_dereference(tab[msgindex]);
- if (!link)
- continue;
-
- RCU_INIT_POINTER(tab[msgindex], NULL);
+ link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
kfree_rcu(link, rcu);
}
rtnl_unlock();
@@ -3849,10 +3843,18 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
err = -ENOBUFS;
- nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
+ nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask));
if (nskb == NULL)
goto out;
+ /* Synchronize the carrier state so we don't report a state
+ * that we're not actually going to honour immediately; if
+ * the driver just did a carrier off->on transition, we can
+ * only TX if link watch work has run, but without this we'd
+ * already report carrier on, even if it doesn't work yet.
+ */
+ linkwatch_sync_dev(dev);
+
err = rtnl_fill_ifinfo(nskb, dev, net,
RTM_NEWLINK, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, 0, 0, ext_filter_mask,
@@ -6408,17 +6410,64 @@ static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack);
}
+static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+ struct br_mdb_entry zero_entry = {};
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
+ NL_SET_ERR_MSG(extack, "Unknown entry state");
+ return -EINVAL;
+ }
+
+ if (entry->flags) {
+ NL_SET_ERR_MSG(extack, "Entry flags cannot be set");
+ return -EINVAL;
+ }
+
+ if (entry->vid >= VLAN_N_VID - 1) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) {
+ NL_SET_ERR_MSG(extack, "Entry address cannot be set");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = {
+ [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry_del_bulk,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+
static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1];
struct net *net = sock_net(skb->sk);
struct br_port_msg *bpm;
struct net_device *dev;
int err;
- err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
- MDBA_SET_ENTRY_MAX, mdba_policy, extack);
+ if (!del_bulk)
+ err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
+ MDBA_SET_ENTRY_MAX, mdba_policy,
+ extack);
+ else
+ err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX,
+ mdba_del_bulk_policy, extack);
if (err)
return err;
@@ -6439,6 +6488,14 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
+ if (del_bulk) {
+ if (!dev->netdev_ops->ndo_mdb_del_bulk) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion");
+ return -EOPNOTSUPP;
+ }
+ return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack);
+ }
+
if (!dev->netdev_ops->ndo_mdb_del) {
NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
return -EOPNOTSUPP;
@@ -6684,5 +6741,6 @@ void __init rtnetlink_init(void)
rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0);
rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL,
+ RTNL_FLAG_BULK_DEL_SUPPORTED);
}
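
For reference, a request accepted by rtnl_validate_mdb_entry_del_bulk() above is an RTM_DELMDB message with NLM_F_BULK set, a struct br_port_msg header and an MDBA_SET_ENTRY attribute whose entry has a zeroed address, zero flags and a state of MDB_PERMANENT or MDB_TEMPORARY. A hedged userspace sketch that only assembles such a message (the precise wildcarding semantics, and any MDBA_SET_ENTRY_ATTRS filters, are defined by the bridge's ndo_mdb_del_bulk implementation and are omitted here):

#include <string.h>
#include <sys/socket.h>		/* AF_BRIDGE */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_bridge.h>

/* buf must be zeroed and at least ~128 bytes; returns the message length */
static size_t build_mdb_del_bulk(void *buf, int br_ifindex)
{
	struct nlmsghdr *nlh = buf;
	struct br_port_msg *bpm = NLMSG_DATA(nlh);
	struct nlattr *nla;
	struct br_mdb_entry entry = {
		.state = MDB_PERMANENT,	/* must be PERMANENT or TEMPORARY */
		/* .flags, .vid and .addr stay zero, as validated above */
	};

	nlh->nlmsg_type = RTM_DELMDB;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_BULK;
	nlh->nlmsg_seq = 1;

	bpm->family = AF_BRIDGE;
	bpm->ifindex = br_ifindex;	/* ifindex of the bridge device */

	nla = (void *)((char *)bpm + NLMSG_ALIGN(sizeof(*bpm)));
	nla->nla_type = MDBA_SET_ENTRY;
	nla->nla_len = NLA_HDRLEN + sizeof(entry);
	memcpy((char *)nla + NLA_HDRLEN, &entry, sizeof(entry));

	nlh->nlmsg_len = NLMSG_ALIGN(NLMSG_LENGTH(sizeof(*bpm))) +
			 NLA_ALIGN(nla->nla_len);
	return nlh->nlmsg_len;
}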
diff --git a/net/core/scm.c b/net/core/scm.c
index db3f7cd519c2..d0e0852a24d5 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -105,7 +105,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF;
/* don't allow io_uring files */
- if (io_uring_get_socket(file)) {
+ if (io_is_uring_fops(file)) {
fput(file);
return -EINVAL;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ee648829849..edbbef563d4d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -890,6 +890,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+static bool is_pp_page(struct page *page)
+{
+ return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
+}
+
#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(struct page *page, bool napi_safe)
{
@@ -905,7 +910,7 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
* and page_is_pfmemalloc() is checked in __page_pool_put_page()
* to avoid recycling the pfmemalloc page.
*/
- if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ if (unlikely(!is_pp_page(page)))
return false;
pp = page->pp;
@@ -942,6 +947,37 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
return napi_pp_put_page(virt_to_page(data), napi_safe);
}
+/**
+ * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
+ * @skb: page pool aware skb
+ *
+ * Increase the fragment reference count (pp_ref_count) of a skb. This is
+ * intended to gain fragment references only for page pool aware skbs,
+ * i.e. when skb->pp_recycle is true, and not for fragments in a
+ * non-pp-recycling skb. It has a fallback to increase references on normal
+ * pages, as page pool aware skbs may also have normal page fragments.
+ */
+static int skb_pp_frag_ref(struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo;
+ struct page *head_page;
+ int i;
+
+ if (!skb->pp_recycle)
+ return -EINVAL;
+
+ shinfo = skb_shinfo(skb);
+
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
+ if (likely(is_pp_page(head_page)))
+ page_pool_ref_page(head_page);
+ else
+ page_ref_inc(head_page);
+ }
+ return 0;
+}
+
static void skb_kfree_head(void *head, unsigned int end_offset)
{
if (end_offset == SKB_SMALL_HEAD_HEADROOM)
@@ -5769,17 +5805,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
return false;
/* In general, avoid mixing page_pool and non-page_pool allocated
- * pages within the same SKB. Additionally avoid dealing with clones
- * with page_pool pages, in case the SKB is using page_pool fragment
- * references (page_pool_alloc_frag()). Since we only take full page
- * references for cloned SKBs at the moment that would result in
- * inconsistent reference counts.
- * In theory we could take full references if @from is cloned and
- * !@to->pp_recycle but its tricky (due to potential race with
- * the clone disappearing) and rare, so not worth dealing with.
+ * pages within the same SKB. In theory we could take full
+ * references if @from is cloned and !@to->pp_recycle but it's
+ * tricky (due to a potential race with the clone disappearing) and
+ * rare, so not worth dealing with.
*/
- if (to->pp_recycle != from->pp_recycle ||
- (from->pp_recycle && skb_cloned(from)))
+ if (to->pp_recycle != from->pp_recycle)
return false;
if (len <= skb_tailroom(to)) {
@@ -5836,8 +5867,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
/* if the skb is not cloned this does nothing
* since we set nr_frags to 0.
*/
- for (i = 0; i < from_shinfo->nr_frags; i++)
- __skb_frag_ref(&from_shinfo->frags[i]);
+ if (skb_pp_frag_ref(from)) {
+ for (i = 0; i < from_shinfo->nr_frags; i++)
+ __skb_frag_ref(&from_shinfo->frags[i]);
+ }
to->truesize += delta;
to->len += len;
@@ -5964,6 +5997,31 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
}
EXPORT_SYMBOL(skb_ensure_writable);
+int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
+{
+ int needed_headroom = dev->needed_headroom;
+ int needed_tailroom = dev->needed_tailroom;
+
+ /* For tail taggers, we need to pad short frames ourselves, to ensure
+ * that the tail tag does not fail at its role of being at the end of
+ * the packet, once the conduit interface pads the frame. Account for
+ * that pad length here, and pad later.
+ */
+ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
+ needed_tailroom += ETH_ZLEN - skb->len;
+ /* skb_headroom() returns unsigned int... */
+ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
+ needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
+
+ if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
+ /* No reallocation needed, yay! */
+ return 0;
+
+ return pskb_expand_head(skb, needed_headroom, needed_tailroom,
+ GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable_head_tail);
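
The new helper is aimed at transmit paths that must write into both ends of a frame, such as switch tag drivers. A hedged sketch of a hypothetical tail-tagging xmit function; the 4-byte trailer layout, the port encoding and the function name are made up, and dev->needed_tailroom is assumed to already cover the tag:

static struct sk_buff *my_tagger_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	u8 *trailer;

	/* Reallocate head/tail room (and padding space) in one go. */
	if (skb_ensure_writable_head_tail(skb, dev))
		goto free_skb;

	/* Pad short frames first so the tag really ends the packet. */
	if (__skb_put_padto(skb, ETH_ZLEN, false))
		goto free_skb;

	trailer = skb_put(skb, 4);	/* hypothetical 4-byte tail tag */
	trailer[0] = 0x80;
	trailer[1] = dev->dev_port;	/* illustrative port encoding */
	trailer[2] = 0;
	trailer[3] = 0;

	return skb;

free_skb:
	kfree_skb(skb);
	return NULL;
}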
+
/* remove VLAN header from packet and update csum accordingly.
* expects a non skb_vlan_tag_present skb with a vlan tag payload
*/
diff --git a/net/core/sock.c b/net/core/sock.c
index d02534c77413..158dbdebce6a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -283,10 +283,6 @@ EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-/* Maximal space eaten by iovec or ancillary data plus some space */
-int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
-EXPORT_SYMBOL(sysctl_optmem_max);
-
int sysctl_tstamp_allow_data __read_mostly = 1;
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
@@ -2658,7 +2654,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
- READ_ONCE(sysctl_optmem_max))
+ READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
return NULL;
skb = alloc_skb(size, priority);
@@ -2676,7 +2672,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- int optmem_max = READ_ONCE(sysctl_optmem_max);
+ int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
if ((unsigned int)size <= optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 03f1edb948d7..0f0cb1465e08 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -509,13 +509,6 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "optmem_max",
- .data = &sysctl_optmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tstamp_allow_data",
.data = &sysctl_tstamp_allow_data,
.maxlen = sizeof(int),
@@ -674,6 +667,14 @@ static struct ctl_table netns_core_table[] = {
.proc_handler = proc_dointvec_minmax
},
{
+ .procname = "optmem_max",
+ .data = &init_net.core.sysctl_optmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_dointvec_minmax
+ },
+ {
.procname = "txrehash",
.data = &init_net.core.sysctl_txrehash,
.maxlen = sizeof(u8),
diff --git a/net/core/xdp.c b/net/core/xdp.c
index b6f1d6dab3f2..4869c1c2d8f3 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -736,6 +736,39 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
return -EOPNOTSUPP;
}
+/**
+ * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag
+ * @ctx: XDP context pointer.
+ * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID).
+ * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP)
+ *
+ * In case of success, ``vlan_proto`` contains the *Tag protocol identifier (TPID)*,
+ * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use
+ * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)**
+ * and should be used as follows:
+ * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();``
+ *
+ * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag.
+ * The driver is expected to provide those in **host byte order (usually LE)**,
+ * so the bpf program should not perform byte conversion.
+ * According to 802.1Q standard, *VLAN TCI (Tag control information)*
+ * is a bit field that contains:
+ * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``,
+ * *Drop eligible indicator (DEI)* - 1 bit,
+ * *Priority code point (PCP)* - 3 bits.
+ * For detailed meaning of DEI and PCP, please refer to other sources.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
+ * * ``-ENODATA`` : VLAN tag was not stripped or is not available
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
+ __be16 *vlan_proto, u16 *vlan_tci)
+{
+ return -EOPNOTSUPP;
+}
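
The stub above only fixes the kfunc's signature and documentation; a driver supplies the real implementation through its xdp_metadata_ops. A hedged sketch of the BPF-side caller, following the byte-order rules documented above (the section name, the VID being matched and the drop policy are illustrative):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

extern int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
					__be16 *vlan_proto,
					__u16 *vlan_tci) __ksym;

SEC("xdp")
int drop_vid10(struct xdp_md *ctx)
{
	__be16 proto;
	__u16 tci;

	if (bpf_xdp_metadata_rx_vlan_tag(ctx, &proto, &tci))
		return XDP_PASS;	/* -EOPNOTSUPP or -ENODATA: no tag info */

	/* TPID is big endian, TCI is host order, per the kernel-doc above. */
	if (proto == bpf_htons(ETH_P_8021Q) && (tci & 0xfff) == 10)
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";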
+
__bpf_kfunc_end_defs();
BTF_SET8_START(xdp_metadata_kfunc_ids)