Diffstat (limited to 'net/core')
-rw-r--r--  net/core/datagram.c          12
-rw-r--r--  net/core/dev.c              603
-rw-r--r--  net/core/dev_ioctl.c         20
-rw-r--r--  net/core/devlink.c          315
-rw-r--r--  net/core/filter.c           230
-rw-r--r--  net/core/flow_dissector.c    35
-rw-r--r--  net/core/neighbour.c          1
-rw-r--r--  net/core/net-sysfs.c         53
-rw-r--r--  net/core/net_namespace.c     19
-rw-r--r--  net/core/netpoll.c           22
-rw-r--r--  net/core/page_pool.c         14
-rw-r--r--  net/core/pktgen.c             2
-rw-r--r--  net/core/rtnetlink.c          4
-rw-r--r--  net/core/skbuff.c           524
-rw-r--r--  net/core/skmsg.c              3
-rw-r--r--  net/core/sock.c             212
-rw-r--r--  net/core/sock_map.c           2
-rw-r--r--  net/core/sysctl_net_core.c    2
-rw-r--r--  net/core/xdp.c               70
19 files changed, 1473 insertions, 670 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 81809fa735a7..15ab9ffb27fe 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -721,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
__wsum *csump)
{
- return __skb_datagram_iter(skb, offset, to, len, true,
- csum_and_copy_to_iter, csump);
+ struct csum_state csdata = { .csum = *csump };
+ int ret;
+
+ ret = __skb_datagram_iter(skb, offset, to, len, true,
+ csum_and_copy_to_iter, &csdata);
+ if (ret)
+ return ret;
+
+ *csump = csdata.csum;
+ return 0;
}
/**
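For context on the datagram.c change above: csum_and_copy_to_iter() now takes a struct csum_state instead of a bare __wsum, so a copy that faults partway through can be resumed without corrupting the running checksum. A minimal sketch of that state (mirroring include/linux/uio.h; shown here only as an annotation, not part of the patch):

	struct csum_state {
		__wsum csum;	/* running checksum of the bytes copied so far */
		size_t off;	/* offset at which those bytes were checksummed */
	};

The caller-visible contract in the hunk stays the same: *csump is only written back once the whole copy has succeeded.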
diff --git a/net/core/dev.c b/net/core/dev.c
index a979b86dbacd..6c5967e80132 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -91,6 +91,7 @@
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
+#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
@@ -101,6 +102,7 @@
#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/gro.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -1493,6 +1495,27 @@ void netdev_notify_peers(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_notify_peers);
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+ int err = 0;
+
+ /* Create and wake up the kthread once to put it in
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
+ * warning and work with loadavg.
+ */
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(n->thread)) {
+ err = PTR_ERR(n->thread);
+ pr_err("kthread_run failed with err %d\n", err);
+ n->thread = NULL;
+ }
+
+ return err;
+}
+
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -2194,28 +2217,14 @@ static inline void net_timestamp_set(struct sk_buff *skb)
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
- unsigned int len;
-
- if (!(dev->flags & IFF_UP))
- return false;
-
- len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
- if (skb->len <= len)
- return true;
-
- /* if TSO is enabled, we don't care about the length as the packet
- * could be forwarded without being segmented before
- */
- if (skb_is_gso(skb))
- return true;
-
- return false;
+ return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
-int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
+ bool check_mtu)
{
- int ret = ____dev_forward_skb(dev, skb);
+ int ret = ____dev_forward_skb(dev, skb, check_mtu);
if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
@@ -2224,6 +2233,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, true);
+}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
@@ -2250,6 +2264,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
+int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
+}
+
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
@@ -3617,11 +3636,22 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
{
- if (unlikely(skb->csum_not_inet))
+ if (unlikely(skb_csum_is_sctp(skb)))
return !!(features & NETIF_F_SCTP_CRC) ? 0 :
skb_crc32c_csum_help(skb);
- return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
+ if (features & NETIF_F_HW_CSUM)
+ return 0;
+
+ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
+ switch (skb->csum_offset) {
+ case offsetof(struct tcphdr, check):
+ case offsetof(struct udphdr, check):
+ return 0;
+ }
+ }
+
+ return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
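A quick decision summary for the reworked skb_csum_hwoffload_help() (annotation, assuming the standard meaning of the NETIF_F_* feature bits):

	/*
	 * SCTP CRC32c needed   -> offload iff NETIF_F_SCTP_CRC,
	 *                         else fall back to skb_crc32c_csum_help()
	 * NETIF_F_HW_CSUM      -> device checksums any csum_start/csum_offset
	 * NETIF_F_IP(V6)_CSUM  -> device only handles the TCP/UDP checksum
	 *                         offsets; anything else is done in software
	 * no usable feature    -> skb_checksum_help() computes it on the CPU
	 */

The practical effect is that drivers advertising only IP/IPV6 checksum offload are no longer handed packets whose csum_offset they cannot honour.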
@@ -3878,6 +3908,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
qdisc_skb_cb(skb)->mru = 0;
+ qdisc_skb_cb(skb)->post_ct = false;
mini_qdisc_bstats_cpu_update(miniq, skb);
switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
@@ -4083,7 +4114,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
- __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
@@ -4252,6 +4283,22 @@ int gro_normal_batch __read_mostly = 8;
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
+ struct task_struct *thread;
+
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+ /* Paired with smp_mb__before_atomic() in
+ * napi_enable()/dev_set_threaded().
+ * Use READ_ONCE() to guarantee a complete
+ * read on napi->thread. Only call
+ * wake_up_process() when it's not NULL.
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+ wake_up_process(thread);
+ return;
+ }
+ }
+
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
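The wake-up path above relies on publish/subscribe ordering between the thread pointer and the THREADED bit. A side-by-side sketch of the contract (annotation only; the real code lives in dev_set_threaded()/napi_enable() and ____napi_schedule()):

	/* writer (dev_set_threaded)          reader (____napi_schedule)
	 *
	 * napi->thread = kthread;            if (test_bit(NAPI_STATE_THREADED,
	 * smp_mb__before_atomic();                        &napi->state)) {
	 * set_bit(NAPI_STATE_THREADED,           thread = READ_ONCE(napi->thread);
	 *         &napi->state);                 if (thread)
	 *                                            wake_up_process(thread);
	 *                                    }
	 */

If the reader observes the bit, the barrier guarantees it also observes the thread pointer; the NULL check keeps the fallback to softirq scheduling safe in any remaining window.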
@@ -4603,14 +4650,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
+ void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
- void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
+ u32 mac_len, frame_sz;
__be16 orig_eth_type;
struct ethhdr *eth;
bool orig_bcast;
- int hlen, off;
- u32 mac_len;
+ int off;
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
@@ -4642,15 +4689,16 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
* header.
*/
mac_len = skb->data - skb_mac_header(skb);
- hlen = skb_headlen(skb) + mac_len;
- xdp->data = skb->data - mac_len;
- xdp->data_meta = xdp->data;
- xdp->data_end = xdp->data + hlen;
- xdp->data_hard_start = skb->data - skb_headroom(skb);
+ hard_start = skb->data - skb_headroom(skb);
/* SKB "head" area always have tailroom for skb_shared_info */
- xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
- xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ frame_sz = (void *)skb_end_pointer(skb) - hard_start;
+ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ rxqueue = netif_get_rxqueue(skb);
+ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
+ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
+ skb_headlen(skb) + mac_len, true);
orig_data_end = xdp->data_end;
orig_data = xdp->data;
@@ -4658,9 +4706,6 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
orig_eth_type = eth->h_proto;
- rxqueue = netif_get_rxqueue(skb);
- xdp->rxq = &rxqueue->xdp_rxq;
-
act = bpf_prog_run_xdp(xdp_prog, xdp);
/* check if bpf_xdp_adjust_head was used */
@@ -4895,8 +4940,6 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
else
__kfree_skb_defer(skb);
}
-
- __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -4962,6 +5005,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
qdisc_skb_cb(skb)->pkt_len = skb->len;
qdisc_skb_cb(skb)->mru = 0;
+ qdisc_skb_cb(skb)->post_ct = false;
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
@@ -5151,8 +5195,7 @@ another_round:
skb_reset_mac_len(skb);
}
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
- skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ if (eth_type_vlan(skb->protocol)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
@@ -5236,8 +5279,7 @@ check_vlan_id:
* find vlan device.
*/
skb->pkt_type = PACKET_OTHERHOST;
- } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
- skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ } else if (eth_type_vlan(skb->protocol)) {
/* Outer header is 802.1P with vlan 0, inner header is
* 802.1Q or 802.1AD and vlan_do_receive() above could
* not find vlan dev for vlan id 0.
@@ -5713,7 +5755,7 @@ static void flush_all_backlogs(void)
}
/* we can have in flight packet[s] on the cpus we are not flushing,
- * synchronize_net() in rollback_registered_many() will take care of
+ * synchronize_net() in unregister_netdevice_many() will take care of
* them
*/
for_each_cpu(cpu, &flush_cpus)
@@ -5735,15 +5777,14 @@ static void gro_normal_list(struct napi_struct *napi)
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
* pass the whole batch up to the stack.
*/
-static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
{
list_add_tail(&skb->list, &napi->rx_list);
- if (++napi->rx_count >= gro_normal_batch)
+ napi->rx_count += segs;
+ if (napi->rx_count >= gro_normal_batch)
gro_normal_list(napi);
}
-INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
-INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
struct packet_offload *ptype;
@@ -5777,7 +5818,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
}
out:
- gro_normal_one(napi, skb);
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
return NET_RX_SUCCESS;
}
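Note on the gro_normal_one() change above: the batch counter now measures segments rather than SKB heads, so a merged GRO super-packet cannot sneak an arbitrary number of segments past gro_normal_batch. Illustration (annotation, assuming the default batch size of 8):

	/* before: one GRO skb with NAPI_GRO_CB(skb)->count == 17 counted as 1,
	 *         so several such skbs could queue up before a flush
	 * after:  rx_count += 17 >= 8, so the list is flushed immediately
	 */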
@@ -5912,10 +5953,6 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
napi_gro_complete(napi, oldest);
}
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
- struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
- struct sk_buff *));
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
@@ -6054,31 +6091,20 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
-static void napi_skb_free_stolen_head(struct sk_buff *skb)
-{
- skb_dst_drop(skb);
- skb_ext_put(skb);
- kmem_cache_free(skbuff_head_cache, skb);
-}
-
static gro_result_t napi_skb_finish(struct napi_struct *napi,
struct sk_buff *skb,
gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
- gro_normal_one(napi, skb);
- break;
-
- case GRO_DROP:
- kfree_skb(skb);
+ gro_normal_one(napi, skb, 1);
break;
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
else
- __kfree_skb(skb);
+ __kfree_skb_defer(skb);
break;
case GRO_HELD:
@@ -6155,11 +6181,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_NORMAL)
- gro_normal_one(napi, skb);
- break;
-
- case GRO_DROP:
- napi_reuse_skb(napi, skb);
+ gro_normal_one(napi, skb, 1);
break;
case GRO_MERGED_FREE:
@@ -6223,9 +6245,6 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
gro_result_t ret;
struct sk_buff *skb = napi_frags_skb(napi);
- if (!skb)
- return GRO_DROP;
-
trace_napi_gro_frags_entry(skb);
ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
@@ -6708,6 +6727,49 @@ static void init_gro_hash(struct napi_struct *napi)
napi->gro_bitmask = 0;
}
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+ struct napi_struct *napi;
+ int err = 0;
+
+ if (dev->threaded == threaded)
+ return 0;
+
+ if (threaded) {
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (!napi->thread) {
+ err = napi_kthread_create(napi);
+ if (err) {
+ threaded = false;
+ break;
+ }
+ }
+ }
+ }
+
+ dev->threaded = threaded;
+
+ /* Make sure kthread is created before THREADED bit
+ * is set.
+ */
+ smp_mb__before_atomic();
+
+ /* Setting/unsetting threaded mode on a napi might not immediately
+ * take effect if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (threaded)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return err;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6735,6 +6797,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
napi_hash_add(napi);
+ /* Create kthread for this napi if dev->threaded is set.
+ * Clear dev->threaded if kthread creation failed so that
+ * threaded mode will not be enabled in napi_enable().
+ */
+ if (dev->threaded && napi_kthread_create(napi))
+ dev->threaded = 0;
}
EXPORT_SYMBOL(netif_napi_add);
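Both hunks above hang the per-NAPI kthread lifecycle off dev->threaded. A hypothetical driver-side sketch (illustrative only; in this patch the knob is driven through the /sys/class/net/<dev>/threaded attribute added further down, and dev_set_threaded() must run with RTNL held):

	static int example_ndo_open(struct net_device *dev)
	{
		int err;

		/* opt all of this device's NAPI instances into kthread polling */
		err = dev_set_threaded(dev, true);
		if (err)
			netdev_warn(dev, "threaded NAPI unavailable: %d\n", err);
		return 0;
	}

A failed kthread_run() leaves dev->threaded false, so the device simply keeps polling from the NET_RX softirq.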
@@ -6752,9 +6820,28 @@ void napi_disable(struct napi_struct *n)
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
+ clear_bit(NAPI_STATE_THREADED, &n->state);
}
EXPORT_SYMBOL(napi_disable);
+/**
+ * napi_enable - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ smp_mb__before_atomic();
+ clear_bit(NAPI_STATE_SCHED, &n->state);
+ clear_bit(NAPI_STATE_NPSVC, &n->state);
+ if (n->dev->threaded && n->thread)
+ set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
static void flush_gro_hash(struct napi_struct *napi)
{
int i;
@@ -6780,18 +6867,18 @@ void __netif_napi_del(struct napi_struct *napi)
flush_gro_hash(napi);
napi->gro_bitmask = 0;
+
+ if (napi->thread) {
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+ }
}
EXPORT_SYMBOL(__netif_napi_del);
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+static int __napi_poll(struct napi_struct *n, bool *repoll)
{
- void *have;
int work, weight;
- list_del_init(&n->poll_list);
-
- have = netpoll_poll_lock(n);
-
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
@@ -6811,7 +6898,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
n->poll, work, weight);
if (likely(work < weight))
- goto out_unlock;
+ return work;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
@@ -6820,7 +6907,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
- goto out_unlock;
+ return work;
}
/* The NAPI context has more processing work, but busy-polling
@@ -6833,7 +6920,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
*/
napi_schedule(n);
}
- goto out_unlock;
+ return work;
}
if (n->gro_bitmask) {
@@ -6851,17 +6938,78 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
- goto out_unlock;
+ return work;
}
- list_add_tail(&n->poll_list, repoll);
+ *repoll = true;
+
+ return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ bool do_repoll = false;
+ void *have;
+ int work;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ work = __napi_poll(n, &do_repoll);
+
+ if (do_repoll)
+ list_add_tail(&n->poll_list, repoll);
-out_unlock:
netpoll_poll_unlock(have);
return work;
}
+static int napi_thread_wait(struct napi_struct *napi)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+ if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+ void *have;
+
+ while (!napi_thread_wait(napi)) {
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
+
+ local_bh_enable();
+
+ if (!repoll)
+ break;
+
+ cond_resched();
+ }
+ }
+ return 0;
+}
+
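Each kthread created above shows up as an ordinary task named "napi/<dev>-<napi_id>", so standard scheduler tooling applies. A userspace sketch (the PID is assumed to have been found already, e.g. by matching the comm under /proc):

	#define _GNU_SOURCE
	#include <sched.h>

	/* pin one "napi/eth0-..." poller to CPU 2 */
	static int pin_napi_thread(pid_t napi_pid)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(2, &set);
		return sched_setaffinity(napi_pid, sizeof(set), &set);
	}

This is the operational win over softirq polling: the poller's CPU placement and priority become administrator policy instead of following whichever CPU the IRQ landed on.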
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -6880,7 +7028,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- goto out;
+ return;
break;
}
@@ -6907,8 +7055,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
-out:
- __kfree_skb_flush();
}
struct netdev_adjacent {
@@ -8120,6 +8266,39 @@ struct net_device *netdev_get_xmit_slave(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_get_xmit_slave);
+static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
+ struct sock *sk)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_sk_get_lower_dev)
+ return NULL;
+ return ops->ndo_sk_get_lower_dev(dev, sk);
+}
+
+/**
+ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
+ * @dev: device
+ * @sk: the socket
+ *
+ * %NULL is returned if no lower device is found.
+ */
+
+struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
+ struct sock *sk)
+{
+ struct net_device *lower;
+
+ lower = netdev_sk_get_lower_dev(dev, sk);
+ while (lower) {
+ dev = lower;
+ lower = netdev_sk_get_lower_dev(dev, sk);
+ }
+
+ return dev;
+}
+EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
+
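The walk above keeps descending as long as a lower device is reported. Illustration on a hypothetical topology, a socket egressing a bond whose active slave is eth1:

	/* bond0 implements ndo_sk_get_lower_dev() and returns eth1 for sk;
	 * eth1 has no such op, so the loop stops and eth1 is returned. */
	struct net_device *lowest;

	lowest = netdev_sk_get_lowest_dev(bond_dev, sk);	/* == eth1 */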
static void netdev_adjacent_add_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
@@ -8742,6 +8921,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
}
EXPORT_SYMBOL(dev_set_mac_address);
+static DECLARE_RWSEM(dev_addr_sem);
+
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ ret = dev_set_mac_address(dev, sa, extack);
+ up_write(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+{
+ size_t size = sizeof(sa->sa_data);
+ struct net_device *dev;
+ int ret = 0;
+
+ down_read(&dev_addr_sem);
+ rcu_read_lock();
+
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+ if (!dev->addr_len)
+ memset(sa->sa_data, 0, size);
+ else
+ memcpy(sa->sa_data, dev->dev_addr,
+ min_t(size_t, size, dev->addr_len));
+ sa->sa_family = dev->type;
+
+unlock:
+ rcu_read_unlock();
+ up_read(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_get_mac_address);
+
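The new dev_addr_sem closes a race where SIOCGIFHWADDR could read a MAC address mid-update from a concurrent SIOCSIFHWADDR. The userspace view is unchanged; a minimal reader for reference (sketch; "eth0" and the socket fd are illustrative):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/if.h>

	static int read_mac(int sock_fd, unsigned char mac[6])
	{
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
		if (ioctl(sock_fd, SIOCGIFHWADDR, &ifr) < 0)
			return -1;
		memcpy(mac, ifr.ifr_hwaddr.sa_data, 6);	/* no longer a torn read */
		return 0;
	}

The ioctl plumbing itself moves to dev_get_mac_address() in the dev_ioctl.c hunks below.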
/**
* dev_change_carrier - Change device carrier
* @dev: device
@@ -9441,106 +9662,6 @@ static void net_set_todo(struct net_device *dev)
dev_net(dev)->dev_unreg_count++;
}
-static void rollback_registered_many(struct list_head *head)
-{
- struct net_device *dev, *tmp;
- LIST_HEAD(close_head);
-
- BUG_ON(dev_boot_phase);
- ASSERT_RTNL();
-
- list_for_each_entry_safe(dev, tmp, head, unreg_list) {
- /* Some devices call without registering
- * for initialization unwind. Remove those
- * devices and proceed with the remaining.
- */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- pr_debug("unregister_netdevice: device %s/%p never was registered\n",
- dev->name, dev);
-
- WARN_ON(1);
- list_del(&dev->unreg_list);
- continue;
- }
- dev->dismantle = true;
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
- }
-
- /* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
- list_add_tail(&dev->close_list, &close_head);
- dev_close_many(&close_head, true);
-
- list_for_each_entry(dev, head, unreg_list) {
- /* And unlink it from device chain. */
- unlist_netdevice(dev);
-
- dev->reg_state = NETREG_UNREGISTERING;
- }
- flush_all_backlogs();
-
- synchronize_net();
-
- list_for_each_entry(dev, head, unreg_list) {
- struct sk_buff *skb = NULL;
-
- /* Shutdown queueing discipline. */
- dev_shutdown(dev);
-
- dev_xdp_uninstall(dev);
-
- /* Notify protocols, that we are about to destroy
- * this device. They should clean all the things.
- */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
- GFP_KERNEL, NULL, 0);
-
- /*
- * Flush the unicast and multicast chains
- */
- dev_uc_flush(dev);
- dev_mc_flush(dev);
-
- netdev_name_node_alt_flush(dev);
- netdev_name_node_free(dev->name_node);
-
- if (dev->netdev_ops->ndo_uninit)
- dev->netdev_ops->ndo_uninit(dev);
-
- if (skb)
- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
-
- /* Notifier chain MUST detach us all upper devices. */
- WARN_ON(netdev_has_any_upper_dev(dev));
- WARN_ON(netdev_has_any_lower_dev(dev));
-
- /* Remove entries from kobject tree */
- netdev_unregister_kobject(dev);
-#ifdef CONFIG_XPS
- /* Remove XPS queueing entries */
- netif_reset_xps_queues_gt(dev, 0);
-#endif
- }
-
- synchronize_net();
-
- list_for_each_entry(dev, head, unreg_list)
- dev_put(dev);
-}
-
-static void rollback_registered(struct net_device *dev)
-{
- LIST_HEAD(single);
-
- list_add(&dev->unreg_list, &single);
- rollback_registered_many(&single);
- list_del(&single);
-}
-
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
struct net_device *upper, netdev_features_t features)
{
@@ -10013,7 +10134,7 @@ int register_netdevice(struct net_device *dev)
dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
dev->features |= NETIF_F_SOFT_FEATURES;
- if (dev->netdev_ops->ndo_udp_tunnel_add) {
+ if (dev->udp_tunnel_nic_info) {
dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
}
@@ -10090,8 +10211,7 @@ int register_netdevice(struct net_device *dev)
if (ret) {
/* Expect explicit free_netdev() on failure */
dev->needs_free_netdev = false;
- rollback_registered(dev);
- net_set_todo(dev);
+ unregister_netdevice_queue(dev, NULL);
goto out;
}
/*
@@ -10713,9 +10833,10 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
if (head) {
list_move_tail(&dev->unreg_list, head);
} else {
- rollback_registered(dev);
- /* Finish processing unregister after unlock */
- net_set_todo(dev);
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ unregister_netdevice_many(&single);
}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
@@ -10729,14 +10850,100 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
*/
void unregister_netdevice_many(struct list_head *head)
{
- struct net_device *dev;
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
+
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ if (list_empty(head))
+ return;
+
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+ /* Some devices call without registering
+ * for initialization unwind. Remove those
+ * devices and proceed with the remaining.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
+
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
+ }
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
+
+ /* If device is running, close it first. */
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head, true);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
+
+ dev->reg_state = NETREG_UNREGISTERING;
+ }
+ flush_all_backlogs();
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+ dev_xdp_uninstall(dev);
+
+ /* Notify protocols, that we are about to destroy
+ * this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
+ GFP_KERNEL, NULL, 0);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ netdev_name_node_alt_flush(dev);
+ netdev_name_node_free(dev->name_node);
+
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
- if (!list_empty(head)) {
- rollback_registered_many(head);
- list_for_each_entry(dev, head, unreg_list)
- net_set_todo(dev);
- list_del(head);
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
}
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ dev_put(dev);
+ net_set_todo(dev);
+ }
+
+ list_del(head);
}
EXPORT_SYMBOL(unregister_netdevice_many);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index db8a0ff86f36..478d032f34ac 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -123,17 +123,6 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
ifr->ifr_mtu = dev->mtu;
return 0;
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0,
- sizeof(ifr->ifr_hwaddr.sa_data));
- else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof(ifr->ifr_hwaddr.sa_data),
- (size_t)dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
-
case SIOCGIFSLAVE:
err = -EINVAL;
break;
@@ -274,7 +263,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
case SIOCSIFHWADDR:
if (dev->addr_len > sizeof(struct sockaddr))
return -EINVAL;
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL);
+ return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
@@ -418,6 +407,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
*/
switch (cmd) {
+ case SIOCGIFHWADDR:
+ dev_load(net, ifr->ifr_name);
+ ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name);
+ if (colon)
+ *colon = ':';
+ return ret;
/*
* These ioctl calls:
* - can be done by all.
@@ -427,7 +422,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCGIFFLAGS:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
- case SIOCGIFHWADDR:
case SIOCGIFSLAVE:
case SIOCGIFMAP:
case SIOCGIFINDEX:
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 738d4344d679..737b61c2976e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -87,6 +87,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
+ [DEVLINK_PORT_FN_ATTR_STATE] =
+ NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
+ DEVLINK_PORT_FN_STATE_ACTIVE),
};
static LIST_HEAD(devlink_list);
@@ -690,6 +693,15 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
return -EMSGSIZE;
break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_sf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_sf.pf) ||
+ nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
+ attrs->pci_sf.sf))
+ return -EMSGSIZE;
+ break;
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
@@ -713,42 +725,105 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
}
static int
+devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops,
+ struct devlink_port *port, struct sk_buff *msg,
+ struct netlink_ext_ack *extack, bool *msg_updated)
+{
+ u8 hw_addr[MAX_ADDR_LEN];
+ int hw_addr_len;
+ int err;
+
+ if (!ops->port_function_hw_addr_get)
+ return 0;
+
+ err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr);
+ if (err)
+ return err;
+ *msg_updated = true;
+ return 0;
+}
+
+static bool
+devlink_port_fn_state_valid(enum devlink_port_fn_state state)
+{
+ return state == DEVLINK_PORT_FN_STATE_INACTIVE ||
+ state == DEVLINK_PORT_FN_STATE_ACTIVE;
+}
+
+static bool
+devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
+{
+ return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED ||
+ opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
+}
+
+static int
+devlink_port_fn_state_fill(struct devlink *devlink,
+ const struct devlink_ops *ops,
+ struct devlink_port *port, struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ enum devlink_port_fn_opstate opstate;
+ enum devlink_port_fn_state state;
+ int err;
+
+ if (!ops->port_fn_state_get)
+ return 0;
+
+ err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ if (!devlink_port_fn_state_valid(state)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver");
+ return -EINVAL;
+ }
+ if (!devlink_port_fn_opstate_valid(opstate)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid operational state read from driver");
+ return -EINVAL;
+ }
+ if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) ||
+ nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate))
+ return -EMSGSIZE;
+ *msg_updated = true;
+ return 0;
+}
+
+static int
devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
struct netlink_ext_ack *extack)
{
struct devlink *devlink = port->devlink;
const struct devlink_ops *ops;
struct nlattr *function_attr;
- bool empty_nest = true;
- int err = 0;
+ bool msg_updated = false;
+ int err;
function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION);
if (!function_attr)
return -EMSGSIZE;
ops = devlink->ops;
- if (ops->port_function_hw_addr_get) {
- int hw_addr_len;
- u8 hw_addr[MAX_ADDR_LEN];
-
- err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack);
- if (err == -EOPNOTSUPP) {
- /* Port function attributes are optional for a port. If port doesn't
- * support function attribute, returning -EOPNOTSUPP is not an error.
- */
- err = 0;
- goto out;
- } else if (err) {
- goto out;
- }
- err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr);
- if (err)
- goto out;
- empty_nest = false;
- }
-
+ err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg,
+ extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack,
+ &msg_updated);
out:
- if (err || empty_nest)
+ if (err || !msg_updated)
nla_nest_cancel(msg, function_attr);
else
nla_nest_end(msg, function_attr);
@@ -986,7 +1061,6 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *
const struct devlink_ops *ops;
const u8 *hw_addr;
int hw_addr_len;
- int err;
hw_addr = nla_data(attr);
hw_addr_len = nla_len(attr);
@@ -1011,12 +1085,25 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *
return -EOPNOTSUPP;
}
- err = ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack);
- if (err)
- return err;
+ return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack);
+}
- devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
- return 0;
+static int devlink_port_fn_state_set(struct devlink *devlink,
+ struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ enum devlink_port_fn_state state;
+ const struct devlink_ops *ops;
+
+ state = nla_get_u8(attr);
+ ops = devlink->ops;
+ if (!ops->port_fn_state_set) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Function does not support state setting");
+ return -EOPNOTSUPP;
+ }
+ return ops->port_fn_state_set(devlink, port, state, extack);
}
static int
@@ -1034,9 +1121,21 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port,
}
attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
- if (attr)
+ if (attr) {
err = devlink_port_function_hw_addr_set(devlink, port, attr, extack);
+ if (err)
+ return err;
+ }
+ /* Keep this as the last function attribute set, so that when
+ * multiple port function attributes are set along with state,
+ * the others can be applied before the state is activated.
+ */
+ attr = tb[DEVLINK_PORT_FN_ATTR_STATE];
+ if (attr)
+ err = devlink_port_fn_state_set(devlink, port, attr, extack);
+ if (!err)
+ devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
return err;
}
@@ -1136,6 +1235,111 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
return devlink_port_unsplit(devlink, port_index, info->extack);
}
+static int devlink_port_new_notify(struct devlink *devlink,
+ unsigned int port_index,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port;
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ mutex_lock(&devlink->lock);
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_NEW, info->snd_portid,
+ info->snd_seq, 0, NULL);
+ if (err)
+ goto out;
+
+ err = genlmsg_reply(msg, info);
+ mutex_unlock(&devlink->lock);
+ return err;
+
+out:
+ mutex_unlock(&devlink->lock);
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink_port_new_attrs new_attrs = {};
+ struct devlink *devlink = info->user_ptr[0];
+ unsigned int new_port_index;
+ int err;
+
+ if (!devlink->ops->port_new || !devlink->ops->port_del)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] ||
+ !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) {
+ NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified");
+ return -EINVAL;
+ }
+ new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]);
+ new_attrs.pfnum =
+ nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]);
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ /* Port index of the new port being created by driver. */
+ new_attrs.port_index =
+ nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ new_attrs.port_index_valid = true;
+ }
+ if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) {
+ new_attrs.controller =
+ nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]);
+ new_attrs.controller_valid = true;
+ }
+ if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF &&
+ info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) {
+ new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]);
+ new_attrs.sfnum_valid = true;
+ }
+
+ err = devlink->ops->port_new(devlink, &new_attrs, extack,
+ &new_port_index);
+ if (err)
+ return err;
+
+ err = devlink_port_new_notify(devlink, new_port_index, info);
+ if (err && err != -ENODEV) {
+ /* Failed to send the response; destroy the newly created port. */
+ devlink->ops->port_del(devlink, new_port_index, extack);
+ }
+ return err;
+}
+
+static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ unsigned int port_index;
+
+ if (!devlink->ops->port_del)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ NL_SET_ERR_MSG_MOD(extack, "Port index is not specified");
+ return -EINVAL;
+ }
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ return devlink->ops->port_del(devlink, port_index, extack);
+}
+
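Driver-side, the new DEVLINK_CMD_PORT_NEW/DEL handlers require both callbacks as a pair; a hedged sketch of the wiring (all names illustrative, signatures per the ops calls above):

	static int example_port_new(struct devlink *devlink,
				    const struct devlink_port_new_attrs *attrs,
				    struct netlink_ext_ack *extack,
				    unsigned int *new_port_index)
	{
		/* allocate the SF, register its devlink_port,
		 * then report the chosen index back */
		*new_port_index = 0;
		return 0;
	}

	static int example_port_del(struct devlink *devlink,
				    unsigned int port_index,
				    struct netlink_ext_ack *extack)
	{
		return 0;	/* tear down the SF port */
	}

	static const struct devlink_ops example_devlink_ops = {
		.port_new = example_port_new,
		.port_del = example_port_del,
	};

Note how devlink_nl_cmd_port_new_doit() deletes the port again if the reply cannot be sent, so userspace never ends up owning a port it was not told about.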
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_sb *devlink_sb,
enum devlink_command cmd, u32 portid,
@@ -7594,6 +7798,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
DEVLINK_RELOAD_ACTION_MAX),
[DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK),
+ [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 },
};
static const struct genl_small_ops devlink_nl_ops[] = {
@@ -7634,6 +7842,18 @@ static const struct genl_small_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NO_LOCK,
},
{
+ .cmd = DEVLINK_CMD_PORT_NEW,
+ .doit = devlink_nl_cmd_port_new_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_DEL,
+ .doit = devlink_nl_cmd_port_del_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NO_LOCK,
+ },
+ {
.cmd = DEVLINK_CMD_SB_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_get_doit,
@@ -8372,6 +8592,32 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
+/**
+ * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PF for the devlink port instance
+ * @sf: associated SF of a PF for the devlink port instance
+ */
+void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u32 sf)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
+
+ if (WARN_ON(devlink_port->registered))
+ return;
+ ret = __devlink_port_attrs_set(devlink_port,
+ DEVLINK_PORT_FLAVOUR_PCI_SF);
+ if (ret)
+ return;
+ attrs->pci_sf.controller = controller;
+ attrs->pci_sf.pf = pf;
+ attrs->pci_sf.sf = sf;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
+
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
char *name, size_t len)
{
@@ -8420,6 +8666,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
n = snprintf(name, len, "pf%uvf%u",
attrs->pci_vf.pf, attrs->pci_vf.vf);
break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
+ attrs->pci_sf.sf);
+ break;
}
if (n >= len)
@@ -8617,6 +8867,10 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
* @resource_id: resource's id
* @parent_resource_id: resource's parent id
* @size_params: size parameters
+ *
+ * Generic resources should reuse the same names across drivers.
+ * Please see the generic resources list at:
+ * Documentation/networking/devlink/devlink-resource.rst
*/
int devlink_resource_register(struct devlink *devlink,
const char *resource_name,
@@ -9508,6 +9762,7 @@ static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(GTP_PARSING, DROP),
DEVLINK_TRAP(ESP_PARSING, DROP),
DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP),
+ DEVLINK_TRAP(DMAC_FILTER, DROP),
};
#define DEVLINK_TRAP_GROUP(_id) \
diff --git a/net/core/filter.c b/net/core/filter.c
index 255aeee72402..adfdad234674 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2083,13 +2083,13 @@ static const struct bpf_func_proto bpf_csum_level_proto = {
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
- return dev_forward_skb(dev, skb);
+ return dev_forward_skb_nomtu(dev, skb);
}
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
struct sk_buff *skb)
{
- int ret = ____dev_forward_skb(dev, skb);
+ int ret = ____dev_forward_skb(dev, skb, false);
if (likely(!ret)) {
skb->dev = dev;
@@ -2480,7 +2480,7 @@ int skb_do_redirect(struct sk_buff *skb)
goto out_drop;
dev = ops->ndo_get_peer_dev(dev);
if (unlikely(!dev ||
- !is_skb_forwardable(dev, skb) ||
+ !(dev->flags & IFF_UP) ||
net_eq(net, dev_net(dev))))
goto out_drop;
skb->dev = dev;
@@ -3552,11 +3552,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
return 0;
}
-static u32 __bpf_skb_max_len(const struct sk_buff *skb)
-{
- return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
- SKB_MAX_ALLOC;
-}
+#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
@@ -3605,7 +3601,7 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
{
u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb);
- u32 len_max = __bpf_skb_max_len(skb);
+ u32 len_max = BPF_SKB_MAX_LEN;
__be16 proto = skb->protocol;
bool shrink = len_diff < 0;
u32 off;
@@ -3688,7 +3684,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 min_len = __bpf_skb_min_len(skb);
int ret;
@@ -3764,7 +3760,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = {
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 new_len = skb->len + head_room;
int ret;
@@ -4631,6 +4627,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
+{
+ return sk ? sock_gen_cookie(sk) : 0;
+}
+
+const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
+ .func = bpf_get_socket_ptr_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
return __sock_gen_cookie(ctx->sk);
@@ -4645,11 +4653,9 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
static u64 __bpf_get_netns_cookie(struct sock *sk)
{
-#ifdef CONFIG_NET_NS
- return __net_gen_cookie(sk ? sk->sk_net.net : &init_net);
-#else
- return 0;
-#endif
+ const struct net *net = sk ? sock_net(sk) : &init_net;
+
+ return net->net_cookie;
}
BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
@@ -4770,6 +4776,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
ifindex = dev->ifindex;
dev_put(dev);
}
+ fallthrough;
+ case SO_BINDTOIFINDEX:
+ if (optname == SO_BINDTOIFINDEX)
+ ifindex = val;
ret = sock_bindtoindex(sk, ifindex, false);
break;
case SO_KEEPALIVE:
@@ -4932,8 +4942,25 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
sock_owned_by_me(sk);
+ if (level == SOL_SOCKET) {
+ if (optlen != sizeof(int))
+ goto err_clear;
+
+ switch (optname) {
+ case SO_MARK:
+ *((int *)optval) = sk->sk_mark;
+ break;
+ case SO_PRIORITY:
+ *((int *)optval) = sk->sk_priority;
+ break;
+ case SO_BINDTOIFINDEX:
+ *((int *)optval) = sk->sk_bound_dev_if;
+ break;
+ default:
+ goto err_clear;
+ }
#ifdef CONFIG_INET
- if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
+ } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
struct inet_connection_sock *icsk;
struct tcp_sock *tp;
@@ -4987,11 +5014,11 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
goto err_clear;
}
#endif
+#endif
} else {
goto err_clear;
}
return 0;
-#endif
err_clear:
memset(optval, 0, optlen);
return -EINVAL;
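With the SOL_SOCKET branch above, BPF programs can read these three options directly. A hedged sockops sketch (assumes the SOL_SOCKET/SO_MARK definitions reach the BPF build, e.g. via the usual socket headers or vmlinux.h):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("sockops")
	int read_mark(struct bpf_sock_ops *ops)
	{
		int mark = 0;

		/* previously -EINVAL; now served from sk->sk_mark */
		bpf_getsockopt(ops, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
		return 1;
	}

	char LICENSE[] SEC("license") = "GPL";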
@@ -5272,12 +5299,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
const struct neighbour *neigh,
- const struct net_device *dev)
+ const struct net_device *dev, u32 mtu)
{
memcpy(params->dmac, neigh->ha, ETH_ALEN);
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
+ if (mtu)
+ params->mtu_result = mtu; /* union with tot_len */
return 0;
}
@@ -5293,8 +5322,8 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
struct net_device *dev;
struct fib_result res;
struct flowi4 fl4;
+ u32 mtu = 0;
int err;
- u32 mtu;
dev = dev_get_by_index_rcu(net, params->ifindex);
if (unlikely(!dev))
@@ -5361,8 +5390,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) {
mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
- if (params->tot_len > mtu)
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
}
nhc = res.nhc;
@@ -5396,7 +5427,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;
- return bpf_fib_set_fwd_params(params, neigh, dev);
+ return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
}
#endif
@@ -5413,7 +5444,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
struct flowi6 fl6;
int strict = 0;
int oif, err;
- u32 mtu;
+ u32 mtu = 0;
/* link local addresses are never forwarded */
if (rt6_need_strict(dst) || rt6_need_strict(src))
@@ -5488,8 +5519,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) {
mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
- if (params->tot_len > mtu)
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
}
if (res.nh->fib_nh_lws)
@@ -5509,7 +5542,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;
- return bpf_fib_set_fwd_params(params, neigh, dev);
+ return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
}
#endif
@@ -5552,6 +5585,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
{
struct net *net = dev_net(skb->dev);
int rc = -EAFNOSUPPORT;
+ bool check_mtu = false;
if (plen < sizeof(*params))
return -EINVAL;
@@ -5559,25 +5593,33 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
return -EINVAL;
+ if (params->tot_len)
+ check_mtu = true;
+
switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
case AF_INET:
- rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
break;
#endif
}
- if (!rc) {
+ if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
struct net_device *dev;
+ /* When tot_len isn't provided by the user, check the skb
+ * against the MTU of the net_device found by the FIB lookup
+ */
dev = dev_get_by_index_rcu(net, params->ifindex);
if (!is_skb_forwardable(dev, skb))
rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+
+ params->mtu_result = dev->mtu; /* union with tot_len */
}
return rc;
@@ -5593,6 +5635,116 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
.arg4_type = ARG_ANYTHING,
};
+static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
+ u32 ifindex)
+{
+ struct net *netns = dev_net(dev_curr);
+
+ /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
+ if (ifindex == 0)
+ return dev_curr;
+
+ return dev_get_by_index_rcu(netns, ifindex);
+}
+
+BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+ struct net_device *dev = skb->dev;
+ int skb_len, dev_len;
+ int mtu;
+
+ if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
+ return -EINVAL;
+
+ if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+
+ dev_len = mtu + dev->hard_header_len;
+ skb_len = skb->len + len_diff; /* minus result pass check */
+ if (skb_len <= dev_len) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+ goto out;
+ }
+ /* At this point skb->len exceeds the MTU, but as it includes the length
+ * of all segments, it can still be below the MTU. The SKB can possibly get
+ * re-segmented in the transmit path (see validate_xmit_skb). Thus, the user
+ * must choose whether segments are to be MTU checked.
+ */
+ if (skb_is_gso(skb)) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+
+ if (flags & BPF_MTU_CHK_SEGS &&
+ !skb_gso_validate_network_len(skb, mtu))
+ ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+ }
+out:
+ /* BPF verifier guarantees valid pointer */
+ *mtu_len = mtu;
+
+ return ret;
+}
+
+BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ struct net_device *dev = xdp->rxq->dev;
+ int xdp_len = xdp->data_end - xdp->data;
+ int ret = BPF_MTU_CHK_RET_SUCCESS;
+ int mtu, dev_len;
+
+ /* XDP variant doesn't support multi-buffer segment check (yet) */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+
+ /* Add L2-header as dev MTU is L3 size */
+ dev_len = mtu + dev->hard_header_len;
+
+ xdp_len += len_diff; /* minus result pass check */
+ if (xdp_len > dev_len)
+ ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+
+ /* BPF verifier guarantees valid pointer */
+ *mtu_len = mtu;
+
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
+ .func = bpf_skb_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_INT,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
+ .func = bpf_xdp_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_INT,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
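The two helpers above surface as bpf_check_mtu(). A hedged tc-BPF usage sketch (header availability assumed; ifindex 0 means "the skb's own device", per __dev_via_ifindex() above):

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	SEC("tc")
	int mtu_guard(struct __sk_buff *skb)
	{
		__u32 mtu_len = 0;

		/* would the packet still fit after growing it by 20 bytes? */
		if (bpf_check_mtu(skb, 0, &mtu_len, 20, 0))
			return TC_ACT_SHOT;	/* no: mtu_len holds the device MTU */
		return TC_ACT_OK;
	}

	char LICENSE[] SEC("license") = "GPL";

Unlike open-coded skb->len checks, the helper also gets the GSO case right: a gso skb passes by default and can be strictly segment-checked with BPF_MTU_CHK_SEGS.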
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
@@ -7002,6 +7154,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
return &bpf_sock_addr_setsockopt_proto;
default:
return NULL;
@@ -7012,6 +7172,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
return &bpf_sock_addr_getsockopt_proto;
default:
return NULL;
@@ -7162,6 +7330,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
return &bpf_skb_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_skb_check_mtu_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
@@ -7231,6 +7401,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_adjust_tail_proto;
case BPF_FUNC_fib_lookup:
return &bpf_xdp_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_xdp_check_mtu_proto;
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_udp:
return &bpf_xdp_sk_lookup_udp_proto;
@@ -8795,7 +8967,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
target_size));
break;
case offsetof(struct bpf_sock, rx_queue_mapping):
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
*insn++ = BPF_LDX_MEM(
BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
si->dst_reg, si->src_reg,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6f1adba6695f..2ef2224b3bff 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -23,6 +23,7 @@
#include <linux/if_ether.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
+#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
@@ -236,9 +237,8 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
void
skb_flow_dissect_ct(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- u16 *ctinfo_map,
- size_t mapsize)
+ void *target_container, u16 *ctinfo_map,
+ size_t mapsize, bool post_ct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
struct flow_dissector_key_ct *key;
@@ -250,13 +250,19 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
return;
ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
+ if (!ct && !post_ct)
return;
key = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_CT,
target_container);
+ if (!ct) {
+ key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_INVALID;
+ return;
+ }
+
if (ctinfo < mapsize)
key->ct_state = ctinfo_map[ctinfo];
#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
@@ -1050,6 +1056,9 @@ proto_again:
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
}
+ __skb_flow_dissect_ipv4(skb, flow_dissector,
+ target_container, data, iph);
+
if (ip_is_fragment(iph)) {
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@ -1066,9 +1075,6 @@ proto_again:
}
}
- __skb_flow_dissect_ipv4(skb, flow_dissector,
- target_container, data, iph);
-
break;
}
case htons(ETH_P_IPV6): {
@@ -1251,6 +1257,21 @@ proto_again:
&proto, &nhoff, hlen, flags);
break;
+ case htons(ETH_P_1588): {
+ struct ptp_header *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += ntohs(hdr->message_length);
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
default:
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6d2d557442dc..e2982b3970b8 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -41,7 +41,6 @@
#include <trace/events/neigh.h>
-#define DEBUG
#define NEIGH_DEBUG 1
#define neigh_dbg(level, fmt, ...) \
do { \
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index daf502c13d6d..307628fdf380 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(phys_switch_id);
+static ssize_t threaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev))
+ ret = sprintf(buf, fmt_dec, netdev->threaded);
+
+ rtnl_unlock();
+ return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ int ret;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ if (val != 0 && val != 1)
+ return -EOPNOTSUPP;
+
+ ret = dev_set_threaded(dev, val);
+
+ return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -570,6 +609,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_proto_down.attr,
&dev_attr_carrier_up_count.attr,
&dev_attr_carrier_down_count.attr,
+ &dev_attr_threaded.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
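
From user space the new attribute appears as /sys/class/net/<dev>/threaded: writing 1 switches the device's NAPI instances to kthread polling, writing 0 reverts to softirq mode, and devices with no NAPI instances return -EOPNOTSUPP. A minimal sketch; the device name and error handling here are illustrative:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Enable or disable threaded NAPI for one device via sysfs. */
	static int set_threaded(const char *dev, int on)
	{
		char path[128];
		int fd;

		snprintf(path, sizeof(path), "/sys/class/net/%s/threaded", dev);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;		/* no such device or old kernel */
		if (write(fd, on ? "1" : "0", 1) != 1) {
			close(fd);
			return -1;		/* e.g. -EOPNOTSUPP: no NAPI */
		}
		return close(fd);
	}

	int main(void)
	{
		return set_threaded("eth0", 1) ? 1 : 0;
	}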
@@ -1136,18 +1176,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
char *buf)
{
struct net_device *dev = queue->dev;
+ int num_tc, tc;
int index;
- int tc;
if (!netif_is_multiqueue(dev))
return -ENOENT;
+ if (!rtnl_trylock())
+ return restart_syscall();
+
index = get_netdev_queue_index(queue);
/* If queue belongs to subordinate dev use its TC mapping */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+ num_tc = dev->num_tc;
tc = netdev_txq_to_tc(dev, index);
+
+ rtnl_unlock();
+
if (tc < 0)
return -EINVAL;
@@ -1158,8 +1205,8 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
* belongs to the root device it will be reported with just the
* traffic class, so just "0" for TC 0 for example.
*/
- return dev->num_tc < 0 ? sprintf(buf, "%d%d\n", tc, dev->num_tc) :
- sprintf(buf, "%d\n", tc);
+ return num_tc < 0 ? sprintf(buf, "%d%d\n", tc, num_tc) :
+ sprintf(buf, "%d\n", tc);
}
#ifdef CONFIG_XPS
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2ef3b4557f40..43b6ac4c4439 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -72,18 +72,6 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
DEFINE_COOKIE(net_cookie);
-u64 __net_gen_cookie(struct net *net)
-{
- while (1) {
- u64 res = atomic64_read(&net->net_cookie);
-
- if (res)
- return res;
- res = gen_cookie_next(&net_cookie);
- atomic64_cmpxchg(&net->net_cookie, 0, res);
- }
-}
-
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
@@ -332,6 +320,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
refcount_set(&net->ns.count, 1);
refcount_set(&net->passive, 1);
get_random_bytes(&net->hash_mix, sizeof(u32));
+ preempt_disable();
+ net->net_cookie = gen_cookie_next(&net_cookie);
+ preempt_enable();
net->dev_base_seq = 1;
net->user_ns = user_ns;
idr_init(&net->netns_ids);
@@ -1103,10 +1094,6 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
- preempt_disable();
- __net_gen_cookie(&init_net);
- preempt_enable();
-
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 960948290001..c310c7c1cef7 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/if_vlan.h>
-#include <net/dsa.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/addrconf.h>
@@ -658,15 +657,15 @@ EXPORT_SYMBOL_GPL(__netpoll_setup);
int netpoll_setup(struct netpoll *np)
{
- struct net_device *ndev = NULL, *dev = NULL;
- struct net *net = current->nsproxy->net_ns;
+ struct net_device *ndev = NULL;
struct in_device *in_dev;
int err;
rtnl_lock();
- if (np->dev_name[0])
+ if (np->dev_name[0]) {
+ struct net *net = current->nsproxy->net_ns;
ndev = __dev_get_by_name(net, np->dev_name);
-
+ }
if (!ndev) {
np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
err = -ENODEV;
@@ -674,19 +673,6 @@ int netpoll_setup(struct netpoll *np)
}
dev_hold(ndev);
- /* bring up DSA management network devices up first */
- for_each_netdev(net, dev) {
- if (!netdev_uses_dsa(dev))
- continue;
-
- err = dev_change_flags(dev, dev->flags | IFF_UP, NULL);
- if (err < 0) {
- np_err(np, "%s failed to open %s\n",
- np->dev_name, dev->name);
- goto put;
- }
- }
-
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n", np->dev_name);
err = -EBUSY;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f3c690b8c8e3..ad8b0707af04 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -350,14 +350,6 @@ static bool page_pool_recycle_in_cache(struct page *page,
return true;
}
-/* page is NOT reusable when:
- * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
- */
-static bool pool_page_reusable(struct page_pool *pool, struct page *page)
-{
- return !page_is_pfmemalloc(page);
-}
-
/* If the page refcnt == 1, this will try to recycle the page.
* if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
* the configured size min(dma_sync_size, pool->max_len).
@@ -373,9 +365,11 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* regular page allocator APIs.
*
* refcnt == 1 means page_pool owns page, and can recycle it.
+ *
+	 * page is NOT reusable when it was allocated while the system
+	 * was under memory pressure (page_is_pfmemalloc).
*/
- if (likely(page_ref_count(page) == 1 &&
- pool_page_reusable(pool, page))) {
+ if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
/* Read barrier done in page_ref_count / READ_ONCE */
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 105978604ffd..3fba429f1f57 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3464,7 +3464,7 @@ static int pktgen_thread_worker(void *arg)
struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- BUG_ON(smp_processor_id() != cpu);
+ WARN_ON(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
complete(&t->start_done);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3d6ab194d0f5..0edc0b2baaa4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -55,7 +55,7 @@
#include <net/net_namespace.h>
#define RTNL_MAX_TYPE 50
-#define RTNL_SLAVE_MAX_TYPE 36
+#define RTNL_SLAVE_MAX_TYPE 40
struct rtnl_link {
rtnl_doit_func doit;
@@ -2660,7 +2660,7 @@ static int do_setlink(const struct sk_buff *skb,
sa->sa_family = dev->type;
memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
dev->addr_len);
- err = dev_set_mac_address(dev, sa, extack);
+ err = dev_set_mac_address_user(dev, sa, extack);
kfree(sa);
if (err)
goto errout;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 785daff48030..545a472273a5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -119,151 +119,75 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
skb_panic(skb, sz, addr, __func__);
}
-/*
- * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
- * the caller if emergency pfmemalloc reserves are being used. If it is and
- * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
- * may be used. Otherwise, the packet data may be discarded until enough
- * memory is free
- */
-#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
- __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
-
-static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
- unsigned long ip, bool *pfmemalloc)
-{
- void *obj;
- bool ret_pfmemalloc = false;
+#define NAPI_SKB_CACHE_SIZE 64
+#define NAPI_SKB_CACHE_BULK 16
+#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
- /*
- * Try a regular allocation, when that fails and we're not entitled
- * to the reserves, fail.
- */
- obj = kmalloc_node_track_caller(size,
- flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
- node);
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
- goto out;
+struct napi_alloc_cache {
+ struct page_frag_cache page;
+ unsigned int skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
- /* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(size, flags, node);
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-out:
- if (pfmemalloc)
- *pfmemalloc = ret_pfmemalloc;
+static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return obj;
+ return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
}
-/* Allocate a new skbuff. We do this ourselves so we can fill in a few
- * 'private' fields and also do memory statistics to find all the
- * [BEEP] leaks.
- *
- */
-
-/**
- * __alloc_skb - allocate a network buffer
- * @size: size to allocate
- * @gfp_mask: allocation mask
- * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
- * instead of head cache and allocate a cloned (child) skb.
- * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- * allocations in case the data is required for writeback
- * @node: numa node to allocate memory on
- *
- * Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of at least size bytes. The object has a reference count
- * of one. The return is the buffer. On a failure the return is %NULL.
- *
- * Buffers may only be allocated from interrupts using a @gfp_mask of
- * %GFP_ATOMIC.
- */
-struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int flags, int node)
+void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
- struct kmem_cache *cache;
- struct skb_shared_info *shinfo;
- struct sk_buff *skb;
- u8 *data;
- bool pfmemalloc;
-
- cache = (flags & SKB_ALLOC_FCLONE)
- ? skbuff_fclone_cache : skbuff_head_cache;
-
- if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
- gfp_mask |= __GFP_MEMALLOC;
-
- /* Get the HEAD */
- skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
- if (!skb)
- goto out;
- prefetchw(skb);
-
- /* We do our best to align skb_shared_info on a separate cache
- * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
- * aligned memory blocks, unless SLUB/SLAB debug is enabled.
- * Both skb->head and skb_shared_info are cache line aligned.
- */
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
- if (!data)
- goto nodata;
- /* kmalloc(size) might give us more room than requested.
- * Put skb_shared_info exactly at the end of allocated zone,
- * to allow max possible filling before reallocation.
- */
- size = SKB_WITH_OVERHEAD(ksize(data));
- prefetchw(data + size);
-
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- /* Account for allocated memory : skb + skb->head */
- skb->truesize = SKB_TRUESIZE(size);
- skb->pfmemalloc = pfmemalloc;
- refcount_set(&skb->users, 1);
- skb->head = data;
- skb->data = data;
- skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
- skb->mac_header = (typeof(skb->mac_header))~0U;
- skb->transport_header = (typeof(skb->transport_header))~0U;
+ fragsz = SKB_DATA_ALIGN(fragsz);
- /* make sure we initialize shinfo sequentially */
- shinfo = skb_shinfo(skb);
- memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
- atomic_set(&shinfo->dataref, 1);
+ return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
+}
+EXPORT_SYMBOL(__napi_alloc_frag_align);
- if (flags & SKB_ALLOC_FCLONE) {
- struct sk_buff_fclones *fclones;
+void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+{
+ struct page_frag_cache *nc;
+ void *data;
- fclones = container_of(skb, struct sk_buff_fclones, skb1);
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+ } else {
+ local_bh_disable();
+ data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
+ local_bh_enable();
+ }
+ return data;
+}
+EXPORT_SYMBOL(__netdev_alloc_frag_align);
- skb->fclone = SKB_FCLONE_ORIG;
- refcount_set(&fclones->fclone_ref, 1);
+static struct sk_buff *napi_skb_cache_get(void)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb;
- fclones->skb2.fclone = SKB_FCLONE_CLONE;
- }
+ if (unlikely(!nc->skb_count))
+ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+ GFP_ATOMIC,
+ NAPI_SKB_CACHE_BULK,
+ nc->skb_cache);
+ if (unlikely(!nc->skb_count))
+ return NULL;
- skb_set_kcov_handle(skb, kcov_common_handle());
+ skb = nc->skb_cache[--nc->skb_count];
+ kasan_unpoison_object_data(skbuff_head_cache, skb);
-out:
return skb;
-nodata:
- kmem_cache_free(cache, skb);
- skb = NULL;
- goto out;
}
-EXPORT_SYMBOL(__alloc_skb);
/* Caller must provide SKB that is memset cleared */
-static struct sk_buff *__build_skb_around(struct sk_buff *skb,
- void *data, unsigned int frag_size)
+static void __build_skb_around(struct sk_buff *skb, void *data,
+ unsigned int frag_size)
{
struct skb_shared_info *shinfo;
unsigned int size = frag_size ? : ksize(data);
@@ -286,8 +210,6 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb,
atomic_set(&shinfo->dataref, 1);
skb_set_kcov_handle(skb, kcov_common_handle());
-
- return skb;
}
/**
@@ -318,8 +240,9 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
return NULL;
memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
- return __build_skb_around(skb, data, frag_size);
+ return skb;
}
/* build_skb() is wrapper over __build_skb(), that specifically
@@ -352,9 +275,9 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
if (unlikely(!skb))
return NULL;
- skb = __build_skb_around(skb, data, frag_size);
+ __build_skb_around(skb, data, frag_size);
- if (skb && frag_size) {
+ if (frag_size) {
skb->head_frag = 1;
if (page_is_pfmemalloc(virt_to_head_page(data)))
skb->pfmemalloc = 1;
@@ -363,56 +286,178 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
}
EXPORT_SYMBOL(build_skb_around);
-#define NAPI_SKB_CACHE_SIZE 64
+/**
+ * __napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __build_skb() that uses NAPI percpu caches to obtain
+ * skbuff_head instead of an in-place allocation.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb;
-struct napi_alloc_cache {
- struct page_frag_cache page;
- unsigned int skb_count;
- void *skb_cache[NAPI_SKB_CACHE_SIZE];
-};
+ skb = napi_skb_cache_get();
+ if (unlikely(!skb))
+ return NULL;
-static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
-static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+ return skb;
+}
+
+/**
+ * napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __napi_build_skb() that takes care of skb->head_frag
+ * and skb->pfmemalloc when the data is a page or page fragment.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb = __napi_build_skb(data, frag_size);
- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+ if (likely(skb) && frag_size) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+ }
+
+ return skb;
}
+EXPORT_SYMBOL(napi_build_skb);
-void *napi_alloc_frag(unsigned int fragsz)
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+ bool *pfmemalloc)
{
- fragsz = SKB_DATA_ALIGN(fragsz);
+ void *obj;
+ bool ret_pfmemalloc = false;
- return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
}
-EXPORT_SYMBOL(napi_alloc_frag);
+
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
/**
- * netdev_alloc_frag - allocate a page fragment
- * @fragsz: fragment size
+ * __alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
*
- * Allocates a frag from a page for receive buffer.
- * Uses GFP_ATOMIC allocations.
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
*/
-void *netdev_alloc_frag(unsigned int fragsz)
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int flags, int node)
{
- struct page_frag_cache *nc;
- void *data;
+ struct kmem_cache *cache;
+ struct sk_buff *skb;
+ u8 *data;
+ bool pfmemalloc;
- fragsz = SKB_DATA_ALIGN(fragsz);
- if (in_irq() || irqs_disabled()) {
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
- } else {
- local_bh_disable();
- data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
- local_bh_enable();
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ /* Get the HEAD */
+ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
+ likely(node == NUMA_NO_NODE || node == numa_mem_id()))
+ skb = napi_skb_cache_get();
+ else
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
+ if (unlikely(!skb))
+ return NULL;
+ prefetchw(skb);
+
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
+ if (unlikely(!data))
+ goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ size = SKB_WITH_OVERHEAD(ksize(data));
+ prefetchw(data + size);
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, 0);
+ skb->pfmemalloc = pfmemalloc;
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ refcount_set(&fclones->fclone_ref, 1);
+
+ fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
- return data;
+
+ return skb;
+
+nodata:
+ kmem_cache_free(cache, skb);
+ return NULL;
}
-EXPORT_SYMBOL(netdev_alloc_frag);
+EXPORT_SYMBOL(__alloc_skb);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
@@ -517,7 +562,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (len <= SKB_WITH_OVERHEAD(1024) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
- skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
+ NUMA_NO_NODE);
if (!skb)
goto skb_fail;
goto skb_success;
@@ -534,7 +580,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (unlikely(!data))
return NULL;
- skb = __build_skb(data, len);
+ skb = __napi_build_skb(data, len);
if (unlikely(!skb)) {
skb_free_frag(data);
return NULL;
@@ -614,13 +660,14 @@ static void skb_release_data(struct sk_buff *skb)
&shinfo->dataref))
return;
+ skb_zcopy_clear(skb, true);
+
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i]);
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
- skb_zcopy_clear(skb, true);
skb_free_head(skb);
}
@@ -864,43 +911,36 @@ void __consume_stateless_skb(struct sk_buff *skb)
kfree_skbmem(skb);
}
-void __kfree_skb_flush(void)
+static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 i;
- /* flush skb_cache if containing objects */
- if (nc->skb_count) {
- kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
- nc->skb_cache);
- nc->skb_count = 0;
- }
-}
-
-static inline void _kfree_skb_defer(struct sk_buff *skb)
-{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
- /* drop skb->head and call any destructors for packet */
- skb_release_all(skb);
-
- /* record skb to CPU local list */
+ kasan_poison_object_data(skbuff_head_cache, skb);
nc->skb_cache[nc->skb_count++] = skb;
-#ifdef CONFIG_SLUB
- /* SLUB writes into objects when freeing */
- prefetchw(skb);
-#endif
-
- /* flush skb_cache if it is filled */
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
- kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
- nc->skb_cache);
- nc->skb_count = 0;
+ for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
+ kasan_unpoison_object_data(skbuff_head_cache,
+ nc->skb_cache[i]);
+
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+ nc->skb_cache + NAPI_SKB_CACHE_HALF);
+ nc->skb_count = NAPI_SKB_CACHE_HALF;
}
}
+
void __kfree_skb_defer(struct sk_buff *skb)
{
- _kfree_skb_defer(skb);
+ skb_release_all(skb);
+ napi_skb_cache_put(skb);
+}
+
+void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
+ napi_skb_cache_put(skb);
}
void napi_consume_skb(struct sk_buff *skb, int budget)
@@ -925,7 +965,8 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- _kfree_skb_defer(skb);
+ skb_release_all(skb);
+ napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
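
Instead of draining the whole per-CPU cache as the removed __kfree_skb_flush() did, napi_skb_cache_put() frees only the upper half once NAPI_SKB_CACHE_SIZE entries accumulate, keeping NAPI_SKB_CACHE_HALF heads warm for napi_skb_cache_get(). A user-space model of that high-watermark policy; plain malloc/free stand in for the bulk slab calls, and the KASAN poisoning is omitted:

	#include <stdio.h>
	#include <stdlib.h>

	#define CACHE_SIZE 64
	#define CACHE_HALF (CACHE_SIZE / 2)

	static void *cache[CACHE_SIZE];
	static unsigned int count;

	static void cache_put(void *obj)
	{
		cache[count++] = obj;
		if (count == CACHE_SIZE) {
			/* Free the upper half in one go, keep the lower
			 * half warm for the next cache_get() calls. */
			for (unsigned int i = CACHE_HALF; i < CACHE_SIZE; i++)
				free(cache[i]);
			count = CACHE_HALF;
		}
	}

	static void *cache_get(void)
	{
		return count ? cache[--count] : malloc(256);
	}

	int main(void)
	{
		for (int i = 0; i < 1000; i++)
			cache_put(malloc(256));		/* burst of frees */
		printf("cached after burst: %u\n", count);
		for (int i = 0; i < 1000; i++)
			free(cache_get());		/* burst of allocs */
		while (count)
			free(cache[--count]);
		return 0;
	}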
@@ -1102,7 +1143,7 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
struct ubuf_info *uarg;
struct sk_buff *skb;
@@ -1122,25 +1163,26 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
return NULL;
}
- uarg->callback = sock_zerocopy_callback;
+ uarg->callback = msg_zerocopy_callback;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
+ uarg->flags = SKBFL_ZEROCOPY_FRAG;
refcount_set(&uarg->refcnt, 1);
sock_hold(sk);
return uarg;
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
+EXPORT_SYMBOL_GPL(msg_zerocopy_alloc);
static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
{
return container_of((void *)uarg, struct sk_buff, cb);
}
-struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg)
{
if (uarg) {
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
@@ -1172,16 +1214,16 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
/* no extra ref when appending to datagram (MSG_MORE) */
if (sk->sk_type == SOCK_STREAM)
- sock_zerocopy_get(uarg);
+ net_zcopy_get(uarg);
return uarg;
}
}
new_alloc:
- return sock_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
@@ -1203,7 +1245,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
return true;
}
-void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+static void __msg_zerocopy_callback(struct ubuf_info *uarg)
{
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
@@ -1231,7 +1273,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
serr->ee.ee_data = hi;
serr->ee.ee_info = lo;
- if (!success)
+ if (!uarg->zerocopy)
serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
q = &sk->sk_error_queue;
@@ -1250,32 +1292,28 @@ release:
consume_skb(skb);
sock_put(sk);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
-void sock_zerocopy_put(struct ubuf_info *uarg)
+void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
{
- if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
- if (uarg->callback)
- uarg->callback(uarg, uarg->zerocopy);
- else
- consume_skb(skb_from_uarg(uarg));
- }
+ uarg->zerocopy = uarg->zerocopy & success;
+
+ if (refcount_dec_and_test(&uarg->refcnt))
+ __msg_zerocopy_callback(uarg);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_put);
+EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
-void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
+void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
- if (uarg) {
- struct sock *sk = skb_from_uarg(uarg)->sk;
+ struct sock *sk = skb_from_uarg(uarg)->sk;
- atomic_dec(&sk->sk_zckey);
- uarg->len--;
+ atomic_dec(&sk->sk_zckey);
+ uarg->len--;
- if (have_uref)
- sock_zerocopy_put(uarg);
- }
+ if (have_uref)
+ msg_zerocopy_callback(NULL, uarg, true);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
@@ -1339,7 +1377,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
* @skb: the skb to modify
* @gfp_mask: allocation priority
*
- * This must be called on SKBTX_DEV_ZEROCOPY skb.
+ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
* It will copy all frags into kernel and drop the reference
* to userspace pages.
*
@@ -3276,8 +3314,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
- skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
- SKBTX_SHARED_FRAG;
+ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
@@ -3292,7 +3329,19 @@ EXPORT_SYMBOL(skb_split);
*/
static int skb_prepare_for_shift(struct sk_buff *skb)
{
- return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ int ret = 0;
+
+ if (skb_cloned(skb)) {
+		/* Save and restore truesize: pskb_expand_head() may reallocate
+		 * memory where the ksize() of one kmalloc(S) allocation differs
+		 * from another's, but we cannot change truesize at this point.
+ */
+ unsigned int save_truesize = skb->truesize;
+
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ skb->truesize = save_truesize;
+ }
+ return ret;
}
/**
@@ -3901,12 +3950,8 @@ normal:
}
hsize = skb_headlen(head_skb) - offset;
- if (hsize < 0)
- hsize = 0;
- if (hsize > len || !sg)
- hsize = len;
- if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
@@ -3949,6 +3994,11 @@ normal:
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len;
+
nskb = __alloc_skb(hsize + doffset + headroom,
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
@@ -4002,8 +4052,8 @@ normal:
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
- SKBTX_SHARED_FRAG;
+ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
+ SKBFL_SHARED_FRAG;
if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
@@ -4723,6 +4773,7 @@ err:
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
@@ -4745,7 +4796,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM) {
- skb = tcp_get_timestamping_opt_stats(sk, orig_skb);
+ skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
+ ack_skb);
opt_stats = true;
} else
#endif
@@ -4774,7 +4826,7 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps)
{
- return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 25cdbb20f3a0..1261512d6807 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -669,14 +669,13 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
kfree(psock);
}
-void sk_psock_destroy(struct rcu_head *rcu)
+static void sk_psock_destroy(struct rcu_head *rcu)
{
struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
schedule_work(&psock->gc);
}
-EXPORT_SYMBOL_GPL(sk_psock_destroy);
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
diff --git a/net/core/sock.c b/net/core/sock.c
index bbcd4b97eddd..0ed98f20448a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -526,11 +526,17 @@ discard_and_relse:
}
EXPORT_SYMBOL(__sk_receive_skb);
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
+ u32));
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = __sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && dst->obsolete &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
@@ -546,7 +552,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && dst->obsolete &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_dst_reset(sk);
dst_release(dst);
return NULL;
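
INDIRECT_CALL_INET() avoids a retpoline-mitigated indirect branch by comparing dst->ops->check against the two expected targets and calling whichever matches directly, only falling back to the indirect call otherwise. A stand-alone model of the pattern; the macro below is a simplified illustration, not the kernel's definition:

	#include <stdio.h>

	struct dst { int obsolete; };

	static struct dst *ipv4_check(struct dst *d, unsigned int cookie)
	{ return cookie ? d : NULL; }
	static struct dst *ipv6_check(struct dst *d, unsigned int cookie)
	{ return cookie ? d : NULL; }

	typedef struct dst *(*check_fn)(struct dst *, unsigned int);

	/* Direct-call fast paths; the raw indirect call is the slow path. */
	#define INDIRECT_CALL_2(f, f1, f2, ...)			\
		((f) == (f1) ? f1(__VA_ARGS__) :		\
		 (f) == (f2) ? f2(__VA_ARGS__) : (f)(__VA_ARGS__))

	int main(void)
	{
		struct dst d = { .obsolete = 1 };
		check_fn check = ipv6_check;

		if (d.obsolete &&
		    !INDIRECT_CALL_2(check, ipv6_check, ipv4_check, &d, 0))
			puts("dst invalid, reset it");
		return 0;
	}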
@@ -1657,6 +1665,16 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
+
+ /* If we move sk_tx_queue_mapping out of the private section,
+ * we must check if sk_tx_queue_clear() is called after
+ * sock_copy() in sk_clone_lock().
+ */
+ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+ offsetof(struct sock, sk_dontcopy_begin) ||
+ offsetof(struct sock, sk_tx_queue_mapping) >=
+ offsetof(struct sock, sk_dontcopy_end));
+
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
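
sock_copy() copies the regions before sk_dontcopy_begin and from sk_dontcopy_end onward, so the BUILD_BUG_ON above pins sk_tx_queue_mapping inside the skipped window; that is what lets the redundant sk_tx_queue_clear() in sk_prot_alloc() go away below. The same compile-time layout check in stand-alone C11, with a toy struct rather than struct sock (zero-size marker arrays are a GNU C extension, as in the kernel):

	#include <assert.h>
	#include <stddef.h>

	struct toy_sock {
		int refcnt;		/* copied */
		char dontcopy_begin[0];
		int tx_queue_mapping;	/* must NOT be memcpy'd on clone */
		char dontcopy_end[0];
		int err;		/* copied */
	};

	/* Fails to compile if the field drifts out of the private window. */
	static_assert(offsetof(struct toy_sock, tx_queue_mapping) >=
		      offsetof(struct toy_sock, dontcopy_begin) &&
		      offsetof(struct toy_sock, tx_queue_mapping) <
		      offsetof(struct toy_sock, dontcopy_end),
		      "tx_queue_mapping left the dontcopy window");

	int main(void) { return 0; }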
@@ -1690,7 +1708,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
- sk_tx_queue_clear(sk);
}
return sk;
@@ -1876,123 +1893,120 @@ static void sk_init_common(struct sock *sk)
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
struct proto *prot = READ_ONCE(sk->sk_prot);
- struct sock *newsk;
+ struct sk_filter *filter;
bool is_charged = true;
+ struct sock *newsk;
newsk = sk_prot_alloc(prot, priority, sk->sk_family);
- if (newsk != NULL) {
- struct sk_filter *filter;
+ if (!newsk)
+ goto out;
- sock_copy(newsk, sk);
+ sock_copy(newsk, sk);
- newsk->sk_prot_creator = prot;
+ newsk->sk_prot_creator = prot;
- /* SANITY */
- if (likely(newsk->sk_net_refcnt))
- get_net(sock_net(newsk));
- sk_node_init(&newsk->sk_node);
- sock_lock_init(newsk);
- bh_lock_sock(newsk);
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_backlog.len = 0;
+ /* SANITY */
+ if (likely(newsk->sk_net_refcnt))
+ get_net(sock_net(newsk));
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
- atomic_set(&newsk->sk_rmem_alloc, 0);
- /*
- * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
- */
- refcount_set(&newsk->sk_wmem_alloc, 1);
- atomic_set(&newsk->sk_omem_alloc, 0);
- sk_init_common(newsk);
+ atomic_set(&newsk->sk_rmem_alloc, 0);
- newsk->sk_dst_cache = NULL;
- newsk->sk_dst_pending_confirm = 0;
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
- atomic_set(&newsk->sk_drops, 0);
- newsk->sk_send_head = NULL;
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- atomic_set(&newsk->sk_zckey, 0);
+ /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
+ refcount_set(&newsk->sk_wmem_alloc, 1);
- sock_reset_flag(newsk, SOCK_DONE);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ sk_init_common(newsk);
- /* sk->sk_memcg will be populated at accept() time */
- newsk->sk_memcg = NULL;
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ atomic_set(&newsk->sk_drops, 0);
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
- cgroup_sk_clone(&newsk->sk_cgrp_data);
+ sock_reset_flag(newsk, SOCK_DONE);
- rcu_read_lock();
- filter = rcu_dereference(sk->sk_filter);
- if (filter != NULL)
- /* though it's an empty new sock, the charging may fail
- * if sysctl_optmem_max was changed between creation of
- * original socket and cloning
- */
- is_charged = sk_filter_charge(newsk, filter);
- RCU_INIT_POINTER(newsk->sk_filter, filter);
- rcu_read_unlock();
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* We need to make sure that we don't uncharge the new
- * socket if we couldn't charge it in the first place
- * as otherwise we uncharge the parent's filter.
- */
- if (!is_charged)
- RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
- RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
- if (bpf_sk_storage_clone(sk, newsk)) {
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
+ */
+ is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
- /* Clear sk_user_data if parent had the pointer tagged
- * as not suitable for copying when cloning.
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
*/
- if (sk_user_data_is_nocopy(newsk))
- newsk->sk_user_data = NULL;
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
- newsk->sk_err = 0;
- newsk->sk_err_soft = 0;
- newsk->sk_priority = 0;
- newsk->sk_incoming_cpu = raw_smp_processor_id();
- if (likely(newsk->sk_net_refcnt))
- sock_inuse_add(sock_net(newsk), 1);
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
- /*
- * Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.rst for details)
- */
- smp_wmb();
- refcount_set(&newsk->sk_refcnt, 2);
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ newsk->sk_user_data = NULL;
- /*
- * Increment the counter in the same struct proto as the master
- * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
- * is the same as sk->sk_prot->socks, as this field was copied
- * with memcpy).
- *
- * This _changes_ the previous behaviour, where
- * tcp_create_openreq_child always was incrementing the
- * equivalent to tcp_prot->socks (inet_sock_nr), so this have
- * to be taken into account in all callers. -acme
- */
- sk_refcnt_debug_inc(newsk);
- sk_set_socket(newsk, NULL);
- sk_tx_queue_clear(newsk);
- RCU_INIT_POINTER(newsk->sk_wq, NULL);
+ newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+ if (likely(newsk->sk_net_refcnt))
+ sock_inuse_add(sock_net(newsk), 1);
- if (newsk->sk_prot->sockets_allocated)
- sk_sockets_allocated_inc(newsk);
+ /* Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.rst for details)
+ */
+ smp_wmb();
+ refcount_set(&newsk->sk_refcnt, 2);
- if (sock_needs_netstamp(sk) &&
- newsk->sk_flags & SK_FLAGS_TIMESTAMP)
- net_enable_timestamp();
- }
+ /* Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
+ RCU_INIT_POINTER(newsk->sk_wq, NULL);
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
out:
return newsk;
}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 64b5ec14ff50..d758fb83c884 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -602,7 +602,7 @@ int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
out:
- fput(sock->file);
+ sockfd_put(sock);
return ret;
}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index d86d8d11cfe4..4567de519603 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -309,7 +309,6 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
#endif
static struct ctl_table net_core_table[] = {
-#ifdef CONFIG_NET
{
.procname = "wmem_max",
.data = &sysctl_wmem_max,
@@ -507,7 +506,6 @@ static struct ctl_table net_core_table[] = {
.proc_handler = set_default_qdisc
},
#endif
-#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
.data = &netdev_budget,
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 3a8c9ab4ecbe..05354976c1fc 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -513,3 +513,73 @@ void xdp_warn(const char *msg, const char *func, const int line)
WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
};
EXPORT_SYMBOL_GPL(xdp_warn);
+
+int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
+{
+ n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp,
+ n_skb, skbs);
+ if (unlikely(!n_skb))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
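
kmem_cache_alloc_bulk() can legitimately return fewer objects than requested, so xdp_alloc_skb_bulk() only reports -ENOMEM when it gets none at all. A user-space analogue of that partial-success contract, with malloc standing in for the slab cache:

	#include <stdio.h>
	#include <stdlib.h>

	/* Bulk-fill an array of objects; returns how many were obtained,
	 * which, like kmem_cache_alloc_bulk(), may be fewer than n. */
	static int alloc_bulk(void **objs, int n, size_t sz)
	{
		int i;

		for (i = 0; i < n; i++) {
			objs[i] = malloc(sz);
			if (!objs[i])
				break;		/* partial success is fine */
		}
		return i;
	}

	int main(void)
	{
		void *skbs[8];
		int got = alloc_bulk(skbs, 8, 256);

		if (!got)
			return 1;		/* the -ENOMEM case */
		printf("got %d/8 heads in one batch\n", got);
		while (got)
			free(skbs[--got]);
		return 0;
	}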
+
+struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct sk_buff *skb,
+ struct net_device *dev)
+{
+ unsigned int headroom, frame_size;
+ void *hard_start;
+
+	/* Part of the headroom was reserved for xdpf */
+ headroom = sizeof(*xdpf) + xdpf->headroom;
+
+	/* Memory size backing xdp_frame data already has reserved
+ * room for build_skb to place skb_shared_info in tailroom.
+ */
+ frame_size = xdpf->frame_sz;
+
+ hard_start = xdpf->data - headroom;
+ skb = build_skb_around(skb, hard_start, frame_size);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, headroom);
+ __skb_put(skb, xdpf->len);
+ if (xdpf->metasize)
+ skb_metadata_set(skb, xdpf->metasize);
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+	/* Until page_pool gets an SKB return path, release DMA here */
+ xdp_release_frame(xdpf);
+
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame);
+
+struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct net_device *dev)
+{
+ struct sk_buff *skb;
+
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+
+ return __xdp_build_skb_from_frame(xdpf, skb, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
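
A small user-space model of the buffer geometry __xdp_build_skb_from_frame() relies on: the xdp_frame struct sits at the head of the headroom, and the tail already reserves room for skb_shared_info. All sizes below are invented placeholders; the real values come from the driver via xdpf->frame_sz, xdpf->headroom and xdpf->len:

	#include <stdio.h>

	/* Illustrative sizes only. */
	#define FRAME_SZ   2048u	/* xdpf->frame_sz */
	#define XDPF_SZ      40u	/* stand-in for sizeof(*xdpf) */
	#define HEADROOM    216u	/* xdpf->headroom */
	#define DATA_LEN    128u	/* xdpf->len */
	#define SHINFO_SZ   320u	/* skb_shared_info in the tail */

	int main(void)
	{
		unsigned int data_off = XDPF_SZ + HEADROOM;
		unsigned int tail_off = data_off + DATA_LEN;

		/* hard_start = xdpf->data - (sizeof(*xdpf) + xdpf->headroom) */
		printf("hard_start .. data  : [0, %u)\n", data_off);
		printf("data .. tail        : [%u, %u)\n", data_off, tail_off);
		printf("shinfo tail region  : [%u, %u)\n",
		       FRAME_SZ - SHINFO_SZ, FRAME_SZ);
		printf("usable tailroom     : %u bytes\n",
		       FRAME_SZ - SHINFO_SZ - tail_off);
		return 0;
	}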