diff options
| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2026-04-20 04:28:57 +0300 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2026-04-20 04:28:57 +0300 |
| commit | f4b369c6fe0ceaba2da2daff8c9eb415f85926dd (patch) | |
| tree | 30465d0a429b2c224685b5d8e804bf053c4d129a /net | |
| parent | ff14dafde15c11403fac61367a34fea08926e9ee (diff) | |
| parent | 2ca45e57ea027fffe3350ae5e21ad9cecb0dce74 (diff) | |
| download | linux-f4b369c6fe0ceaba2da2daff8c9eb415f85926dd.tar.xz | |
Merge branch 'next' into for-linus
Prepare input updates for 7.1 merge window.
Diffstat (limited to 'net')
861 files changed, 19224 insertions, 10956 deletions
diff --git a/net/802/Makefile b/net/802/Makefile index 99abc29d537c..9503ef6b2e06 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_LLC) += psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o -obj-$(CONFIG_HIPPI) += hippi.o obj-$(CONFIG_ATALK) += psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o diff --git a/net/802/garp.c b/net/802/garp.c index 2d1ffc4d9462..6f563b6797d9 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -547,7 +547,7 @@ static int garp_init_port(struct net_device *dev) { struct garp_port *port; - port = kzalloc(sizeof(*port), GFP_KERNEL); + port = kzalloc_obj(*port); if (!port) return -ENOMEM; rcu_assign_pointer(dev->garp_port, port); @@ -581,7 +581,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl) } err = -ENOMEM; - app = kzalloc(sizeof(*app), GFP_KERNEL); + app = kzalloc_obj(*app); if (!app) goto err2; diff --git a/net/802/hippi.c b/net/802/hippi.c deleted file mode 100644 index 1997b7dd265e..000000000000 --- a/net/802/hippi.c +++ /dev/null @@ -1,193 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * HIPPI-type device handling. - * - * Version: @(#)hippi.c 1.0.0 05/29/97 - * - * Authors: Ross Biro - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Mark Evans, <evansmp@uhura.aston.ac.uk> - * Florian La Roche, <rzsfl@rz.uni-sb.de> - * Alan Cox, <gw4pts@gw4pts.ampr.org> - * Jes Sorensen, <Jes.Sorensen@cern.ch> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/hippidevice.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <net/arp.h> -#include <net/sock.h> -#include <linux/uaccess.h> - -/* - * Create the HIPPI MAC header for an arbitrary protocol layer - * - * saddr=NULL means use device source address - * daddr=NULL means leave destination address (eg unresolved arp) - */ - -static int hippi_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) -{ - struct hippi_hdr *hip = skb_push(skb, HIPPI_HLEN); - struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; - - if (!len){ - len = skb->len - HIPPI_HLEN; - printk("hippi_header(): length not supplied\n"); - } - - /* - * Due to the stupidity of the little endian byte-order we - * have to set the fp field this way. - */ - hip->fp.fixed = htonl(0x04800018); - hip->fp.d2_size = htonl(len + 8); - hip->le.fc = 0; - hip->le.double_wide = 0; /* only HIPPI 800 for the time being */ - hip->le.message_type = 0; /* Data PDU */ - - hip->le.dest_addr_type = 2; /* 12 bit SC address */ - hip->le.src_addr_type = 2; /* 12 bit SC address */ - - memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3); - memset_startat(&hip->le, 0, reserved); - - hip->snap.dsap = HIPPI_EXTENDED_SAP; - hip->snap.ssap = HIPPI_EXTENDED_SAP; - hip->snap.ctrl = HIPPI_UI_CMD; - hip->snap.oui[0] = 0x00; - hip->snap.oui[1] = 0x00; - hip->snap.oui[2] = 0x00; - hip->snap.ethertype = htons(type); - - if (daddr) - { - memcpy(hip->le.dest_switch_addr, daddr + 3, 3); - memcpy(&hcb->ifield, daddr + 2, 4); - return HIPPI_HLEN; - } - hcb->ifield = 0; - return -((int)HIPPI_HLEN); -} - - -/* - * Determine the packet's protocol ID. - */ - -__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev) -{ - struct hippi_hdr *hip; - - /* - * This is actually wrong ... question is if we really should - * set the raw address here. - */ - skb->dev = dev; - skb_reset_mac_header(skb); - hip = (struct hippi_hdr *)skb_mac_header(skb); - skb_pull(skb, HIPPI_HLEN); - - /* - * No fancy promisc stuff here now. - */ - - return hip->snap.ethertype; -} - -EXPORT_SYMBOL(hippi_type_trans); - -/* - * For HIPPI we will actually use the lower 4 bytes of the hardware - * address as the I-FIELD rather than the actual hardware address. - */ -int hippi_mac_addr(struct net_device *dev, void *p) -{ - struct sockaddr *addr = p; - if (netif_running(dev)) - return -EBUSY; - dev_addr_set(dev, addr->sa_data); - return 0; -} -EXPORT_SYMBOL(hippi_mac_addr); - -int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p) -{ - /* Never send broadcast/multicast ARP messages */ - NEIGH_VAR_INIT(p, MCAST_PROBES, 0); - - /* In IPv6 unicast probes are valid even on NBMA, - * because they are encapsulated in normal IPv6 protocol. - * Should be a generic flag. - */ - if (p->tbl->family != AF_INET6) - NEIGH_VAR_INIT(p, UCAST_PROBES, 0); - return 0; -} -EXPORT_SYMBOL(hippi_neigh_setup_dev); - -static const struct header_ops hippi_header_ops = { - .create = hippi_header, -}; - - -static void hippi_setup(struct net_device *dev) -{ - dev->header_ops = &hippi_header_ops; - - /* - * We don't support HIPPI `ARP' for the time being, and probably - * never will unless someone else implements it. However we - * still need a fake ARPHRD to make ifconfig and friends play ball. - */ - dev->type = ARPHRD_HIPPI; - dev->hard_header_len = HIPPI_HLEN; - dev->mtu = 65280; - dev->min_mtu = 68; - dev->max_mtu = 65280; - dev->addr_len = HIPPI_ALEN; - dev->tx_queue_len = 25 /* 5 */; - memset(dev->broadcast, 0xFF, HIPPI_ALEN); - - - /* - * HIPPI doesn't support broadcast+multicast and we only use - * static ARP tables. ARP is disabled by hippi_neigh_setup_dev. - */ - dev->flags = 0; -} - -/** - * alloc_hippi_dev - Register HIPPI device - * @sizeof_priv: Size of additional driver-private structure to be allocated - * for this HIPPI device - * - * Fill in the fields of the device structure with HIPPI-generic values. - * - * Constructs a new net device, complete with a private data area of - * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for - * this private data area. - */ - -struct net_device *alloc_hippi_dev(int sizeof_priv) -{ - return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN, - hippi_setup); -} - -EXPORT_SYMBOL(alloc_hippi_dev); diff --git a/net/802/mrp.c b/net/802/mrp.c index 23a88305f900..ff0e80574e6b 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -832,7 +832,7 @@ static int mrp_init_port(struct net_device *dev) { struct mrp_port *port; - port = kzalloc(sizeof(*port), GFP_KERNEL); + port = kzalloc_obj(*port); if (!port) return -ENOMEM; rcu_assign_pointer(dev->mrp_port, port); @@ -866,7 +866,7 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) } err = -ENOMEM; - app = kzalloc(sizeof(*app), GFP_KERNEL); + app = kzalloc_obj(*app); if (!app) goto err2; diff --git a/net/802/psnap.c b/net/802/psnap.c index 389df460c8c4..8ae835e1cbae 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -132,7 +132,7 @@ struct datalink_proto *register_snap_client(const unsigned char *desc, if (find_snap_client(desc)) goto out; - proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + proto = kmalloc_obj(*proto, GFP_ATOMIC); if (proto) { memcpy(proto->type, desc, 5); proto->rcvfunc = rcvfunc; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 9404dd551dfd..d23965e76c16 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -150,7 +150,7 @@ static struct vlan_info *vlan_info_alloc(struct net_device *dev) { struct vlan_info *vlan_info; - vlan_info = kzalloc(sizeof(struct vlan_info), GFP_KERNEL); + vlan_info = kzalloc_obj(struct vlan_info); if (!vlan_info) return NULL; @@ -193,7 +193,7 @@ static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid) { struct vlan_vid_info *vid_info; - vid_info = kzalloc(sizeof(struct vlan_vid_info), GFP_KERNEL); + vid_info = kzalloc_obj(struct vlan_vid_info); if (!vid_info) return NULL; vid_info->proto = proto; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index fbf296137b09..c40f7d5c4fca 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -192,7 +192,7 @@ int vlan_dev_set_egress_priority(const struct net_device *dev, /* Create a new mapping then. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; - np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL); + np = kmalloc_obj(struct vlan_priority_tci_mapping); if (!np) return -ENOBUFS; @@ -708,7 +708,7 @@ static int vlan_dev_netpoll_setup(struct net_device *dev) struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + netpoll = kzalloc_obj(*netpoll); err = -ENOMEM; if (!netpoll) goto out; diff --git a/net/9p/client.c b/net/9p/client.c index 5c1ca57ccd28..f0dcf252af7e 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -20,8 +20,8 @@ #include <linux/uio.h> #include <linux/netfs.h> #include <net/9p/9p.h> -#include <linux/parser.h> #include <linux/seq_file.h> +#include <linux/fs_context.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include "protocol.h" @@ -29,32 +29,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/9p.h> -/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ + - * room for write (16 extra) or read (11 extra) operands. - */ - -#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ) - /* Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options */ -enum { - Opt_msize, - Opt_trans, - Opt_legacy, - Opt_version, - Opt_err, -}; - -static const match_table_t tokens = { - {Opt_msize, "msize=%u"}, - {Opt_legacy, "noextend"}, - {Opt_trans, "trans=%s"}, - {Opt_version, "version=%s"}, - {Opt_err, NULL}, -}; - inline int p9_is_proto_dotl(struct p9_client *clnt) { return clnt->proto_version == p9_proto_2000L; @@ -103,124 +81,16 @@ static int safe_errno(int err) return err; } -/* Interpret mount option for protocol version */ -static int get_protocol_version(char *s) +static int apply_client_options(struct p9_client *clnt, struct fs_context *fc) { - int version = -EINVAL; - - if (!strcmp(s, "9p2000")) { - version = p9_proto_legacy; - p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n"); - } else if (!strcmp(s, "9p2000.u")) { - version = p9_proto_2000u; - p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); - } else if (!strcmp(s, "9p2000.L")) { - version = p9_proto_2000L; - p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); - } else { - pr_info("Unknown protocol version %s\n", s); - } + struct v9fs_context *ctx = fc->fs_private; - return version; -} + clnt->msize = ctx->client_opts.msize; + clnt->trans_mod = ctx->client_opts.trans_mod; + ctx->client_opts.trans_mod = NULL; + clnt->proto_version = ctx->client_opts.proto_version; -/** - * parse_opts - parse mount options into client structure - * @opts: options string passed from mount - * @clnt: existing v9fs client information - * - * Return 0 upon success, -ERRNO upon failure - */ - -static int parse_opts(char *opts, struct p9_client *clnt) -{ - char *options, *tmp_options; - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *s; - int ret = 0; - - clnt->proto_version = p9_proto_2000L; - clnt->msize = DEFAULT_MSIZE; - - if (!opts) - return 0; - - tmp_options = kstrdup(opts, GFP_KERNEL); - if (!tmp_options) - return -ENOMEM; - options = tmp_options; - - while ((p = strsep(&options, ",")) != NULL) { - int token, r; - - if (!*p) - continue; - token = match_token(p, tokens, args); - switch (token) { - case Opt_msize: - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - ret = r; - continue; - } - if (option < 4096) { - p9_debug(P9_DEBUG_ERROR, - "msize should be at least 4k\n"); - ret = -EINVAL; - continue; - } - clnt->msize = option; - break; - case Opt_trans: - s = match_strdup(&args[0]); - if (!s) { - ret = -ENOMEM; - p9_debug(P9_DEBUG_ERROR, - "problem allocating copy of trans arg\n"); - goto free_and_return; - } - - v9fs_put_trans(clnt->trans_mod); - clnt->trans_mod = v9fs_get_trans_by_name(s); - if (!clnt->trans_mod) { - pr_info("Could not find request transport: %s\n", - s); - ret = -EINVAL; - } - kfree(s); - break; - case Opt_legacy: - clnt->proto_version = p9_proto_legacy; - break; - case Opt_version: - s = match_strdup(&args[0]); - if (!s) { - ret = -ENOMEM; - p9_debug(P9_DEBUG_ERROR, - "problem allocating copy of version arg\n"); - goto free_and_return; - } - r = get_protocol_version(s); - if (r < 0) - ret = r; - else - clnt->proto_version = r; - kfree(s); - break; - default: - continue; - } - } - -free_and_return: - if (ret) - v9fs_put_trans(clnt->trans_mod); - kfree(tmp_options); - return ret; + return 0; } static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, @@ -229,8 +99,15 @@ static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, if (likely(c->fcall_cache) && alloc_msize == c->msize) { fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); fc->cache = c->fcall_cache; + if (!fc->sdata && c->trans_mod->supports_vmalloc) { + fc->sdata = kvmalloc(alloc_msize, GFP_NOFS); + fc->cache = NULL; + } } else { - fc->sdata = kmalloc(alloc_msize, GFP_NOFS); + if (c->trans_mod->supports_vmalloc) + fc->sdata = kvmalloc(alloc_msize, GFP_NOFS); + else + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); fc->cache = NULL; } if (!fc->sdata) @@ -252,7 +129,7 @@ void p9_fcall_fini(struct p9_fcall *fc) if (fc->cache) kmem_cache_free(fc->cache, fc->sdata); else - kfree(fc->sdata); + kvfree(fc->sdata); } EXPORT_SYMBOL(p9_fcall_fini); @@ -713,8 +590,8 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) } again: /* Wait for the response */ - err = wait_event_killable(req->wq, - READ_ONCE(req->status) >= REQ_STATUS_RCVD); + err = io_wait_event_killable(req->wq, + READ_ONCE(req->status) >= REQ_STATUS_RCVD); /* Make sure our req is coherent with regard to updates in other * threads - echoes to wmb() in the callback @@ -853,7 +730,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) struct p9_fid *fid; p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); - fid = kzalloc(sizeof(*fid), GFP_KERNEL); + fid = kzalloc_obj(*fid); if (!fid) return NULL; @@ -974,7 +851,7 @@ error: return err; } -struct p9_client *p9_client_create(const char *dev_name, char *options) +struct p9_client *p9_client_create(struct fs_context *fc) { int err; static atomic_t seqno = ATOMIC_INIT(0); @@ -982,7 +859,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) char *client_id; char *cache_name; - clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); + clnt = kmalloc_obj(*clnt); if (!clnt) return ERR_PTR(-ENOMEM); @@ -997,8 +874,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) idr_init(&clnt->fids); idr_init(&clnt->reqs); - err = parse_opts(options, clnt); - if (err < 0) + err = apply_client_options(clnt, fc); + if (err) goto free_client; if (!clnt->trans_mod) @@ -1014,7 +891,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); - err = clnt->trans_mod->create(clnt, dev_name, options); + err = clnt->trans_mod->create(clnt, fc); if (err) goto put_trans; @@ -1738,7 +1615,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid); - ret = kmalloc(sizeof(*ret), GFP_KERNEL); + ret = kmalloc_obj(*ret); if (!ret) return ERR_PTR(-ENOMEM); @@ -1790,7 +1667,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n", fid->fid, request_mask); - ret = kmalloc(sizeof(*ret), GFP_KERNEL); + ret = kmalloc_obj(*ret); if (!ret) return ERR_PTR(-ENOMEM); diff --git a/net/9p/mod.c b/net/9p/mod.c index 55576c1866fa..85160b52da55 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -16,7 +16,6 @@ #include <linux/moduleparam.h> #include <net/9p/9p.h> #include <linux/fs.h> -#include <linux/parser.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/list.h> @@ -171,6 +170,7 @@ void v9fs_put_trans(struct p9_trans_module *m) if (m) module_put(m->owner); } +EXPORT_SYMBOL(v9fs_put_trans); /** * init_p9 - Initialize module diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 0e6603b1ec90..67b0586d807f 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -451,9 +451,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, p9pdu_readf(pdu, proto_version, "w", nwqid); if (!errcode) { *wqids = - kmalloc_array(*nwqid, - sizeof(struct p9_qid), - GFP_NOFS); + kmalloc_objs(struct p9_qid, *nwqid, + GFP_NOFS); if (*wqids == NULL) errcode = -ENOMEM; } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index a516745f732f..dbad3213ba84 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/file.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <net/9p/9p.h> @@ -31,48 +31,12 @@ #include <linux/syscalls.h> /* killme */ -#define P9_PORT 564 #define MAX_SOCK_BUF (1024*1024) #define MAXPOLLWADDR 2 static struct p9_trans_module p9_tcp_trans; static struct p9_trans_module p9_fd_trans; -/** - * struct p9_fd_opts - per-transport options - * @rfd: file descriptor for reading (trans=fd) - * @wfd: file descriptor for writing (trans=fd) - * @port: port to connect to (trans=tcp) - * @privport: port is privileged - */ - -struct p9_fd_opts { - int rfd; - int wfd; - u16 port; - bool privport; -}; - -/* - * Option Parsing (code inspired by NFS code) - * - a little lazy - parse all fd-transport options - */ - -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rfdno, Opt_wfdno, Opt_err, - /* Options that take no arguments */ - Opt_privport, -}; - -static const match_table_t tokens = { - {Opt_port, "port=%u"}, - {Opt_rfdno, "rfdno=%u"}, - {Opt_wfdno, "wfdno=%u"}, - {Opt_privport, "privport"}, - {Opt_err, NULL}, -}; - enum { Rworksched = 1, /* read work scheduled or running */ Rpending = 2, /* can read */ @@ -742,7 +706,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->trans_mod == &p9_tcp_trans) { - if (clnt->trans_opts.tcp.port != P9_PORT) + if (clnt->trans_opts.tcp.port != P9_FD_PORT) seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port); } else if (clnt->trans_mod == &p9_fd_trans) { if (clnt->trans_opts.fd.rfd != ~0) @@ -753,77 +717,9 @@ static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) return 0; } -/** - * parse_opts - parse mount options into p9_fd_opts structure - * @params: options string passed from mount - * @opts: fd transport-specific structure to parse options into - * - * Returns 0 upon success, -ERRNO upon failure - */ - -static int parse_opts(char *params, struct p9_fd_opts *opts) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *options, *tmp_options; - - opts->port = P9_PORT; - opts->rfd = ~0; - opts->wfd = ~0; - opts->privport = false; - - if (!params) - return 0; - - tmp_options = kstrdup(params, GFP_KERNEL); - if (!tmp_options) { - p9_debug(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - return -ENOMEM; - } - options = tmp_options; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - int r; - if (!*p) - continue; - token = match_token(p, tokens, args); - if ((token != Opt_err) && (token != Opt_privport)) { - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - continue; - } - } - switch (token) { - case Opt_port: - opts->port = option; - break; - case Opt_rfdno: - opts->rfd = option; - break; - case Opt_wfdno: - opts->wfd = option; - break; - case Opt_privport: - opts->privport = true; - break; - default: - continue; - } - } - - kfree(tmp_options); - return 0; -} - static int p9_fd_open(struct p9_client *client, int rfd, int wfd) { - struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd), - GFP_KERNEL); + struct p9_trans_fd *ts = kzalloc_obj(struct p9_trans_fd); if (!ts) return -ENOMEM; @@ -867,7 +763,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) struct p9_trans_fd *p; struct file *file; - p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); + p = kzalloc_obj(struct p9_trans_fd); if (!p) { sock_release(csocket); return -ENOMEM; @@ -966,7 +862,7 @@ static int p9_bind_privport(struct socket *sock) ((struct sockaddr_in *)&stor)->sin_port = htons((ushort)port); else ((struct sockaddr_in6 *)&stor)->sin6_port = htons((ushort)port); - err = kernel_bind(sock, (struct sockaddr *)&stor, sizeof(stor)); + err = kernel_bind(sock, (struct sockaddr_unsized *)&stor, sizeof(stor)); if (err != -EADDRINUSE) break; } @@ -974,17 +870,18 @@ static int p9_bind_privport(struct socket *sock) } static int -p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) +p9_fd_create_tcp(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; + struct v9fs_context *ctx = fc->fs_private; int err; char port_str[6]; struct socket *csocket; struct sockaddr_storage stor = { 0 }; struct p9_fd_opts opts; - err = parse_opts(args, &opts); - if (err < 0) - return err; + /* opts are already parsed in context */ + opts = ctx->fd_opts; if (!addr) return -EINVAL; @@ -1018,7 +915,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } err = READ_ONCE(csocket->ops)->connect(csocket, - (struct sockaddr *)&stor, + (struct sockaddr_unsized *)&stor, sizeof(stor), 0); if (err < 0) { pr_err("%s (%d): problem connecting socket to %s\n", @@ -1031,8 +928,9 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } static int -p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) +p9_fd_create_unix(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; int err; struct socket *csocket; struct sockaddr_un sun_server; @@ -1058,8 +956,8 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) return err; } - err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server, - sizeof(struct sockaddr_un) - 1, 0); + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr_unsized *)&sun_server, + sizeof(struct sockaddr_un) - 1, 0); if (err < 0) { pr_err("%s (%d): problem connecting socket: %s: %d\n", __func__, task_pid_nr(current), addr, err); @@ -1071,14 +969,12 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) } static int -p9_fd_create(struct p9_client *client, const char *addr, char *args) +p9_fd_create(struct p9_client *client, struct fs_context *fc) { + struct v9fs_context *ctx = fc->fs_private; + struct p9_fd_opts opts = ctx->fd_opts; int err; - struct p9_fd_opts opts; - err = parse_opts(args, &opts); - if (err < 0) - return err; client->trans_opts.fd.rfd = opts.rfd; client->trans_opts.fd.wfd = opts.wfd; @@ -1100,7 +996,8 @@ static struct p9_trans_module p9_tcp_trans = { .name = "tcp", .maxsize = MAX_SOCK_BUF, .pooled_rbuffers = false, - .def = 0, + .def = false, + .supports_vmalloc = true, .create = p9_fd_create_tcp, .close = p9_fd_close, .request = p9_fd_request, @@ -1114,7 +1011,8 @@ MODULE_ALIAS_9P("tcp"); static struct p9_trans_module p9_unix_trans = { .name = "unix", .maxsize = MAX_SOCK_BUF, - .def = 0, + .def = false, + .supports_vmalloc = true, .create = p9_fd_create_unix, .close = p9_fd_close, .request = p9_fd_request, @@ -1128,7 +1026,8 @@ MODULE_ALIAS_9P("unix"); static struct p9_trans_module p9_fd_trans = { .name = "fd", .maxsize = MAX_SOCK_BUF, - .def = 0, + .def = false, + .supports_vmalloc = true, .create = p9_fd_create, .close = p9_fd_close, .request = p9_fd_request, diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b84748baf9cb..aa5bd74d333f 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/file.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <linux/semaphore.h> #include <linux/slab.h> #include <linux/seq_file.h> @@ -32,14 +32,10 @@ #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> -#define P9_PORT 5640 -#define P9_RDMA_SQ_DEPTH 32 -#define P9_RDMA_RQ_DEPTH 32 #define P9_RDMA_SEND_SGE 4 #define P9_RDMA_RECV_SGE 4 #define P9_RDMA_IRD 0 #define P9_RDMA_ORD 0 -#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ #define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ /** @@ -110,48 +106,11 @@ struct p9_rdma_context { }; }; -/** - * struct p9_rdma_opts - Collection of mount options - * @port: port of connection - * @privport: Whether a privileged port may be used - * @sq_depth: The requested depth of the SQ. This really doesn't need - * to be any deeper than the number of threads used in the client - * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth - * @timeout: Time to wait in msecs for CM events - */ -struct p9_rdma_opts { - short port; - bool privport; - int sq_depth; - int rq_depth; - long timeout; -}; - -/* - * Option Parsing (code inspired by NFS code) - */ -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, - /* Options that take no argument */ - Opt_privport, - Opt_err, -}; - -static match_table_t tokens = { - {Opt_port, "port=%u"}, - {Opt_sq_depth, "sq=%u"}, - {Opt_rq_depth, "rq=%u"}, - {Opt_timeout, "timeout=%u"}, - {Opt_privport, "privport"}, - {Opt_err, NULL}, -}; - static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) { struct p9_trans_rdma *rdma = clnt->trans; - if (rdma->port != P9_PORT) + if (rdma->port != P9_RDMA_PORT) seq_printf(m, ",port=%u", rdma->port); if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) seq_printf(m, ",sq=%u", rdma->sq_depth); @@ -164,77 +123,6 @@ static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) return 0; } -/** - * parse_opts - parse mount options into rdma options structure - * @params: options string passed from mount - * @opts: rdma transport-specific structure to parse options into - * - * Returns 0 upon success, -ERRNO upon failure - */ -static int parse_opts(char *params, struct p9_rdma_opts *opts) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *options, *tmp_options; - - opts->port = P9_PORT; - opts->sq_depth = P9_RDMA_SQ_DEPTH; - opts->rq_depth = P9_RDMA_RQ_DEPTH; - opts->timeout = P9_RDMA_TIMEOUT; - opts->privport = false; - - if (!params) - return 0; - - tmp_options = kstrdup(params, GFP_KERNEL); - if (!tmp_options) { - p9_debug(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - return -ENOMEM; - } - options = tmp_options; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - int r; - if (!*p) - continue; - token = match_token(p, tokens, args); - if ((token != Opt_err) && (token != Opt_privport)) { - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - continue; - } - } - switch (token) { - case Opt_port: - opts->port = option; - break; - case Opt_sq_depth: - opts->sq_depth = option; - break; - case Opt_rq_depth: - opts->rq_depth = option; - break; - case Opt_timeout: - opts->timeout = option; - break; - case Opt_privport: - opts->privport = true; - break; - default: - continue; - } - } - /* RQ must be at least as large as the SQ */ - opts->rq_depth = max(opts->rq_depth, opts->sq_depth); - kfree(tmp_options); - return 0; -} - static int p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -446,7 +334,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) } /* Allocate an fcall for the reply */ - rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); + rpl_context = kmalloc_obj(*rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto recv_error; @@ -475,7 +363,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) dont_need_post_recv: /* Post the request */ - c = kmalloc(sizeof *c, GFP_NOFS); + c = kmalloc_obj(*c, GFP_NOFS); if (!c) { err = -ENOMEM; goto send_error; @@ -572,7 +460,7 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) { struct p9_trans_rdma *rdma; - rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); + rdma = kzalloc_obj(struct p9_trans_rdma); if (!rdma) return NULL; @@ -628,14 +516,15 @@ static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) /** * rdma_create_trans - Transport method for creating a transport instance * @client: client instance - * @addr: IP address string - * @args: Mount options string + * @fc: The filesystem context */ static int -rdma_create_trans(struct p9_client *client, const char *addr, char *args) +rdma_create_trans(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; + struct v9fs_context *ctx = fc->fs_private; + struct p9_rdma_opts opts = ctx->rdma_opts; int err; - struct p9_rdma_opts opts; struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; @@ -643,10 +532,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (addr == NULL) return -EINVAL; - /* Parse the transport specific mount options */ - err = parse_opts(args, &opts); - if (err < 0) - return err; + /* options are already parsed, in the fs context */ + opts = ctx->rdma_opts; /* Create and initialize the RDMA transport structure */ rdma = alloc_rdma(&opts); @@ -748,7 +635,8 @@ static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, .pooled_rbuffers = true, - .def = 0, + .def = false, + .supports_vmalloc = false, .owner = THIS_MODULE, .create = rdma_create_trans, .close = rdma_close, diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c index 468f7e8f0277..1ce70338999c 100644 --- a/net/9p/trans_usbg.c +++ b/net/9p/trans_usbg.c @@ -27,6 +27,7 @@ #include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/fs_context.h> #include <linux/usb/composite.h> #include <linux/usb/func_utils.h> @@ -376,8 +377,9 @@ out: return ret; } -static int p9_usbg_create(struct p9_client *client, const char *devname, char *args) +static int p9_usbg_create(struct p9_client *client, struct fs_context *fc) { + const char *devname = fc->source; struct f_usb9pfs_dev *dev; struct f_usb9pfs *usb9pfs; int ret = -ENOENT; @@ -514,6 +516,7 @@ static struct p9_trans_module p9_usbg_trans = { .close = p9_usbg_close, .request = p9_usbg_request, .cancel = p9_usbg_cancel, + .supports_vmalloc = false, .owner = THIS_MODULE, }; @@ -754,7 +757,7 @@ static struct usb_function *usb9pfs_alloc(struct usb_function_instance *fi) struct f_usb9pfs_opts *usb9pfs_opts; struct f_usb9pfs *usb9pfs; - usb9pfs = kzalloc(sizeof(*usb9pfs), GFP_KERNEL); + usb9pfs = kzalloc_obj(*usb9pfs); if (!usb9pfs) return ERR_PTR(-ENOMEM); @@ -907,7 +910,7 @@ static struct usb_function_instance *usb9pfs_alloc_instance(void) struct f_usb9pfs_opts *usb9pfs_opts; struct f_usb9pfs_dev *dev; - usb9pfs_opts = kzalloc(sizeof(*usb9pfs_opts), GFP_KERNEL); + usb9pfs_opts = kzalloc_obj(*usb9pfs_opts); if (!usb9pfs_opts) return ERR_PTR(-ENOMEM); @@ -918,7 +921,7 @@ static struct usb_function_instance *usb9pfs_alloc_instance(void) usb9pfs_opts->buflen = DEFAULT_BUFLEN; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc_obj(*dev); if (!dev) { kfree(usb9pfs_opts); return ERR_PTR(-ENOMEM); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 0b8086f58ad5..4cdab7094b27 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -26,7 +26,7 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <net/9p/9p.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/scatterlist.h> @@ -284,8 +284,8 @@ req_retry: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_killable(*chan->vc_wq, - chan->ring_bufs_avail); + err = io_wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; @@ -325,7 +325,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, * Other zc request to finish here */ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { - err = wait_event_killable(vp_wq, + err = io_wait_event_killable(vp_wq, (atomic_read(&vp_pinned) < chan->p9_max_pages)); if (err == -ERESTARTSYS) return err; @@ -358,8 +358,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - (unsigned long)p / PAGE_SIZE; - *pages = kmalloc_array(nr_pages, sizeof(struct page *), - GFP_NOFS); + *pages = kmalloc_objs(struct page *, nr_pages, GFP_NOFS); if (!*pages) return -ENOMEM; @@ -512,8 +511,8 @@ req_retry_pinned: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_killable(*chan->vc_wq, - chan->ring_bufs_avail); + err = io_wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) goto err_out; @@ -531,8 +530,8 @@ req_retry_pinned: spin_unlock_irqrestore(&chan->lock, flags); kicked = 1; p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); - err = wait_event_killable(req->wq, - READ_ONCE(req->status) >= REQ_STATUS_RCVD); + err = io_wait_event_killable(req->wq, + READ_ONCE(req->status) >= REQ_STATUS_RCVD); // RERROR needs reply (== error string) in static data if (READ_ONCE(req->status) == REQ_STATUS_RCVD && unlikely(req->rc.sdata[4] == P9_RERROR)) @@ -602,7 +601,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) return -EINVAL; } - chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL); + chan = kmalloc_obj(struct virtio_chan); if (!chan) { pr_err("Failed to allocate virtio 9P channel\n"); err = -ENOMEM; @@ -642,7 +641,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) if (err) { goto out_free_tag; } - chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + chan->vc_wq = kmalloc_obj(wait_queue_head_t); if (!chan->vc_wq) { err = -ENOMEM; goto out_remove_file; @@ -679,8 +678,7 @@ fail: /** * p9_virtio_create - allocate a new virtio channel * @client: client instance invoking this transport - * @devname: string identifying the channel to connect to (unused) - * @args: args passed from sys_mount() for per-transport options (unused) + * @fc: the filesystem context * * This sets up a transport channel for 9p communication. Right now * we only match the first available channel, but eventually we could look up @@ -691,8 +689,9 @@ fail: */ static int -p9_virtio_create(struct p9_client *client, const char *devname, char *args) +p9_virtio_create(struct p9_client *client, struct fs_context *fc) { + const char *devname = fc->source; struct virtio_chan *chan; int ret = -ENOENT; int found = 0; @@ -802,7 +801,8 @@ static struct p9_trans_module p9_virtio_trans = { */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), .pooled_rbuffers = false, - .def = 1, + .def = true, + .supports_vmalloc = false, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index b9ff69c7522a..47af5a10e921 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/spinlock.h> +#include <linux/fs_context.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> @@ -66,8 +67,9 @@ static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req) return 1; } -static int p9_xen_create(struct p9_client *client, const char *addr, char *args) +static int p9_xen_create(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; struct xen_9pfs_front_priv *priv; if (addr == NULL) @@ -134,8 +136,8 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) ring = &priv->rings[num]; again: - while (wait_event_killable(ring->wq, - p9_xen_write_todo(ring, size)) != 0) + while (io_wait_event_killable(ring->wq, + p9_xen_write_todo(ring, size)) != 0) ; spin_lock_irqsave(&ring->lock, flags); @@ -257,7 +259,8 @@ static struct p9_trans_module p9_xen_trans = { .name = "xen", .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2), .pooled_rbuffers = false, - .def = 1, + .def = true, + .supports_vmalloc = false, .create = p9_xen_create, .close = p9_xen_close, .request = p9_xen_request, @@ -274,45 +277,52 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) { int i, j; - write_lock(&xen_9pfs_lock); - list_del(&priv->list); - write_unlock(&xen_9pfs_lock); - - for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) { - struct xen_9pfs_dataring *ring = &priv->rings[i]; - - cancel_work_sync(&ring->work); - - if (!priv->rings[i].intf) - break; - if (priv->rings[i].irq > 0) - unbind_from_irqhandler(priv->rings[i].irq, ring); - if (priv->rings[i].data.in) { - for (j = 0; - j < (1 << priv->rings[i].intf->ring_order); - j++) { - grant_ref_t ref; - - ref = priv->rings[i].intf->ref[j]; - gnttab_end_foreign_access(ref, NULL); - } - free_pages_exact(priv->rings[i].data.in, + if (priv->rings) { + for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) { + struct xen_9pfs_dataring *ring = &priv->rings[i]; + + cancel_work_sync(&ring->work); + + if (!priv->rings[i].intf) + break; + if (priv->rings[i].irq > 0) + unbind_from_irqhandler(priv->rings[i].irq, ring); + if (priv->rings[i].data.in) { + for (j = 0; + j < (1 << priv->rings[i].intf->ring_order); + j++) { + grant_ref_t ref; + + ref = priv->rings[i].intf->ref[j]; + gnttab_end_foreign_access(ref, NULL); + } + free_pages_exact(priv->rings[i].data.in, 1UL << (priv->rings[i].intf->ring_order + XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(priv->rings[i].ref, NULL); + free_page((unsigned long)priv->rings[i].intf); } - gnttab_end_foreign_access(priv->rings[i].ref, NULL); - free_page((unsigned long)priv->rings[i].intf); + kfree(priv->rings); } - kfree(priv->rings); kfree(priv->tag); kfree(priv); } static void xen_9pfs_front_remove(struct xenbus_device *dev) { - struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + struct xen_9pfs_front_priv *priv; + write_lock(&xen_9pfs_lock); + priv = dev_get_drvdata(&dev->dev); + if (priv == NULL) { + write_unlock(&xen_9pfs_lock); + return; + } dev_set_drvdata(&dev->dev, NULL); + list_del(&priv->list); + write_unlock(&xen_9pfs_lock); + xen_9pfs_front_free(priv); } @@ -379,7 +389,7 @@ static int xen_9pfs_front_init(struct xenbus_device *dev) { int ret, i; struct xenbus_transaction xbt; - struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + struct xen_9pfs_front_priv *priv; char *versions, *v; unsigned int max_rings, max_ring_order, len = 0; @@ -407,8 +417,11 @@ static int xen_9pfs_front_init(struct xenbus_device *dev) if (p9_xen_trans.maxsize > XEN_FLEX_RING_SIZE(max_ring_order)) p9_xen_trans.maxsize = XEN_FLEX_RING_SIZE(max_ring_order) / 2; - priv->rings = kcalloc(XEN_9PFS_NUM_RINGS, sizeof(*priv->rings), - GFP_KERNEL); + priv = kzalloc_obj(*priv); + if (!priv) + return -ENOMEM; + priv->dev = dev; + priv->rings = kzalloc_objs(*priv->rings, XEN_9PFS_NUM_RINGS); if (!priv->rings) { kfree(priv); return -ENOMEM; @@ -465,6 +478,11 @@ static int xen_9pfs_front_init(struct xenbus_device *dev) goto error; } + write_lock(&xen_9pfs_lock); + dev_set_drvdata(&dev->dev, priv); + list_add_tail(&priv->list, &xen_9pfs_devs); + write_unlock(&xen_9pfs_lock); + xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -479,19 +497,6 @@ static int xen_9pfs_front_init(struct xenbus_device *dev) static int xen_9pfs_front_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { - struct xen_9pfs_front_priv *priv = NULL; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) - return -ENOMEM; - - priv->dev = dev; - dev_set_drvdata(&dev->dev, priv); - - write_lock(&xen_9pfs_lock); - list_add_tail(&priv->list, &xen_9pfs_devs); - write_unlock(&xen_9pfs_lock); - return 0; } diff --git a/net/Kconfig b/net/Kconfig index 1d3f757d4b07..62266eaf0e95 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -400,15 +400,15 @@ config NET_PKTGEN module will be called pktgen. config NET_DROP_MONITOR - tristate "Network packet drop alerting service" + tristate "Legacy network packet drop alerting service" depends on INET && TRACEPOINTS help This feature provides an alerting service to userspace in the event that packets are discarded in the network stack. Alerts are broadcast via netlink socket to any listening user space - process. If you don't need network drop alerts, or if you are ok - just checking the various proc files and other utilities for - drop statistics, say N here. + process. This feature is NOT related to "perf" based drop monitoring. + Say N here unless you need to support older userspace tools like + "dropwatch". endmenu # Network testing diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 4744e3fd4544..e7315c01a299 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -393,7 +393,7 @@ static void aarp_purge(void) */ static struct aarp_entry *aarp_alloc(void) { - struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC); + struct aarp_entry *a = kmalloc_obj(*a, GFP_ATOMIC); if (!a) return NULL; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 30242fe10341..30a6dc06291c 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -233,7 +233,7 @@ static void atif_drop_device(struct net_device *dev) static struct atalk_iface *atif_add_device(struct net_device *dev, struct atalk_addr *sa) { - struct atalk_iface *iface = kzalloc(sizeof(*iface), GFP_KERNEL); + struct atalk_iface *iface = kzalloc_obj(*iface); if (!iface) goto out; @@ -564,7 +564,7 @@ static int atrtr_create(struct rtentry *r, struct net_device *devhint) } if (!rt) { - rt = kzalloc(sizeof(*rt), GFP_ATOMIC); + rt = kzalloc_obj(*rt, GFP_ATOMIC); retval = -ENOBUFS; if (!rt) @@ -1149,7 +1149,7 @@ out: } /* Set the address 'our end' of the connection */ -static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int atalk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_at *addr = (struct sockaddr_at *)uaddr; struct sock *sk = sock->sk; @@ -1204,7 +1204,7 @@ out: } /* Set the address we talk to */ -static int atalk_connect(struct socket *sock, struct sockaddr *uaddr, +static int atalk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/atm/addr.c b/net/atm/addr.c index 0530b63f509a..938f360ae230 100644 --- a/net/atm/addr.c +++ b/net/atm/addr.c @@ -87,7 +87,7 @@ int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr, return -EEXIST; } } - this = kmalloc(sizeof(struct atm_dev_addr), GFP_ATOMIC); + this = kmalloc_obj(struct atm_dev_addr, GFP_ATOMIC); if (!this) { spin_unlock_irqrestore(&dev->lock, flags); return -ENOMEM; diff --git a/net/atm/br2684.c b/net/atm/br2684.c index f666f2f98ba5..6580d67c3456 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -538,7 +538,7 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) if (copy_from_user(&be, arg, sizeof be)) return -EFAULT; - brvcc = kzalloc(sizeof(struct br2684_vcc), GFP_KERNEL); + brvcc = kzalloc_obj(struct br2684_vcc); if (!brvcc) return -ENOMEM; /* diff --git a/net/atm/clip.c b/net/atm/clip.c index f7a5565e794e..516b2214680b 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -168,10 +168,10 @@ static int neigh_check_cb(struct neighbour *n) static void idle_timer_check(struct timer_list *unused) { - write_lock(&arp_tbl.lock); + spin_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); - write_unlock(&arp_tbl.lock); + spin_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) @@ -431,7 +431,7 @@ static int clip_mkip(struct atm_vcc *vcc, int timeout) return -EBADFD; if (vcc->user_back) return -EINVAL; - clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL); + clip_vcc = kmalloc_obj(struct clip_vcc); if (!clip_vcc) return -ENOMEM; pr_debug("%p vcc %p\n", clip_vcc, vcc); diff --git a/net/atm/common.c b/net/atm/common.c index c4edc1111bf0..fe77f51f6ce1 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -157,7 +157,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family, i memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc)); memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc)); vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */ - refcount_set(&sk->sk_wmem_alloc, 1); + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); atomic_set(&sk->sk_rmem_alloc, 0); vcc->push = NULL; vcc->pop = NULL; diff --git a/net/atm/lec.c b/net/atm/lec.c index afb8d3eb2185..10e260acf602 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -154,10 +154,19 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) /* 0x01 is topology change */ priv = netdev_priv(dev); - atm_force_charge(priv->lecd, skb2->truesize); - sk = sk_atm(priv->lecd); - skb_queue_tail(&sk->sk_receive_queue, skb2); - sk->sk_data_ready(sk); + struct atm_vcc *vcc; + + rcu_read_lock(); + vcc = rcu_dereference(priv->lecd); + if (vcc) { + atm_force_charge(vcc, skb2->truesize); + sk = sk_atm(vcc); + skb_queue_tail(&sk->sk_receive_queue, skb2); + sk->sk_data_ready(sk); + } else { + dev_kfree_skb(skb2); + } + rcu_read_unlock(); } } #endif /* IS_ENABLED(CONFIG_BRIDGE) */ @@ -216,7 +225,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb, int is_rdesc; pr_debug("called\n"); - if (!priv->lecd) { + if (!rcu_access_pointer(priv->lecd)) { pr_info("%s:No lecd attached\n", dev->name); dev->stats.tx_errors++; netif_stop_queue(dev); @@ -449,10 +458,19 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) break; skb2->len = sizeof(struct atmlec_msg); skb_copy_to_linear_data(skb2, mesg, sizeof(*mesg)); - atm_force_charge(priv->lecd, skb2->truesize); - sk = sk_atm(priv->lecd); - skb_queue_tail(&sk->sk_receive_queue, skb2); - sk->sk_data_ready(sk); + struct atm_vcc *vcc; + + rcu_read_lock(); + vcc = rcu_dereference(priv->lecd); + if (vcc) { + atm_force_charge(vcc, skb2->truesize); + sk = sk_atm(vcc); + skb_queue_tail(&sk->sk_receive_queue, skb2); + sk->sk_data_ready(sk); + } else { + dev_kfree_skb(skb2); + } + rcu_read_unlock(); } } #endif /* IS_ENABLED(CONFIG_BRIDGE) */ @@ -468,23 +486,16 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) static void lec_atm_close(struct atm_vcc *vcc) { - struct sk_buff *skb; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); - priv->lecd = NULL; + rcu_assign_pointer(priv->lecd, NULL); + synchronize_rcu(); /* Do something needful? */ netif_stop_queue(dev); lec_arp_destroy(priv); - if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) - pr_info("%s closing with messages pending\n", dev->name); - while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) { - atm_return(vcc, skb->truesize); - dev_kfree_skb(skb); - } - pr_info("%s: Shut down!\n", dev->name); module_put(THIS_MODULE); } @@ -510,12 +521,14 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, const unsigned char *mac_addr, const unsigned char *atm_addr, struct sk_buff *data) { + struct atm_vcc *vcc; struct sock *sk; struct sk_buff *skb; struct atmlec_msg *mesg; - if (!priv || !priv->lecd) + if (!priv || !rcu_access_pointer(priv->lecd)) return -1; + skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); if (!skb) return -1; @@ -532,18 +545,27 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, if (atm_addr) memcpy(&mesg->content.normal.atm_addr, atm_addr, ATM_ESA_LEN); - atm_force_charge(priv->lecd, skb->truesize); - sk = sk_atm(priv->lecd); + rcu_read_lock(); + vcc = rcu_dereference(priv->lecd); + if (!vcc) { + rcu_read_unlock(); + kfree_skb(skb); + return -1; + } + + atm_force_charge(vcc, skb->truesize); + sk = sk_atm(vcc); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); if (data != NULL) { pr_debug("about to send %d bytes of data\n", data->len); - atm_force_charge(priv->lecd, data->truesize); + atm_force_charge(vcc, data->truesize); skb_queue_tail(&sk->sk_receive_queue, data); sk->sk_data_ready(sk); } + rcu_read_unlock(); return 0; } @@ -618,7 +640,7 @@ static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) atm_return(vcc, skb->truesize); if (*(__be16 *) skb->data == htons(priv->lecid) || - !priv->lecd || !(dev->flags & IFF_UP)) { + !rcu_access_pointer(priv->lecd) || !(dev->flags & IFF_UP)) { /* * Probably looping back, or if lecd is missing, * lecd has gone down @@ -696,7 +718,7 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) ioc_data.dev_num = array_index_nospec(ioc_data.dev_num, MAX_LEC_ITF); if (!dev_lec[ioc_data.dev_num]) return -EINVAL; - vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL); + vpriv = kmalloc_obj(struct lec_vcc_priv); if (!vpriv) return -ENOMEM; vpriv->xoff = 0; @@ -753,12 +775,12 @@ static int lecd_attach(struct atm_vcc *vcc, int arg) priv = netdev_priv(dev_lec[i]); } else { priv = netdev_priv(dev_lec[i]); - if (priv->lecd) + if (rcu_access_pointer(priv->lecd)) return -EADDRINUSE; } lec_arp_init(priv); priv->itfnum = i; /* LANE2 addition */ - priv->lecd = vcc; + rcu_assign_pointer(priv->lecd, vcc); vcc->dev = &lecatm_dev; vcc_insert_socket(sk_atm(vcc)); @@ -1260,24 +1282,28 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry) struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); struct net_device *dev = (struct net_device *)vcc->proto_data; - vcc->pop = vpriv->old_pop; - if (vpriv->xoff) - netif_wake_queue(dev); - kfree(vpriv); - vcc->user_back = NULL; - vcc->push = entry->old_push; - vcc_release_async(vcc, -EPIPE); + if (vpriv) { + vcc->pop = vpriv->old_pop; + if (vpriv->xoff) + netif_wake_queue(dev); + kfree(vpriv); + vcc->user_back = NULL; + vcc->push = entry->old_push; + vcc_release_async(vcc, -EPIPE); + } entry->vcc = NULL; } if (entry->recv_vcc) { struct atm_vcc *vcc = entry->recv_vcc; struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); - kfree(vpriv); - vcc->user_back = NULL; + if (vpriv) { + kfree(vpriv); + vcc->user_back = NULL; - entry->recv_vcc->push = entry->old_recv_push; - vcc_release_async(entry->recv_vcc, -EPIPE); + entry->recv_vcc->push = entry->old_recv_push; + vcc_release_async(entry->recv_vcc, -EPIPE); + } entry->recv_vcc = NULL; } } @@ -1541,7 +1567,7 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, { struct lec_arp_table *to_return; - to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC); + to_return = kzalloc_obj(struct lec_arp_table, GFP_ATOMIC); if (!to_return) return NULL; ether_addr_copy(to_return->mac_addr, mac_addr); @@ -2125,7 +2151,7 @@ static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc) struct lec_vcc_priv *vpriv; int err = 0; - vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL); + vpriv = kmalloc_obj(struct lec_vcc_priv); if (!vpriv) return -ENOMEM; vpriv->xoff = 0; diff --git a/net/atm/lec.h b/net/atm/lec.h index be0e2667bd8c..ec85709bf818 100644 --- a/net/atm/lec.h +++ b/net/atm/lec.h @@ -91,7 +91,7 @@ struct lec_priv { */ spinlock_t lec_arp_lock; struct atm_vcc *mcast_vcc; /* Default Multicast Send VCC */ - struct atm_vcc *lecd; + struct atm_vcc __rcu *lecd; struct delayed_work lec_arp_work; /* C10 */ unsigned int maximum_unknown_frame_count; /* diff --git a/net/atm/mpc.c b/net/atm/mpc.c index f6b447bba329..ce8e9780373b 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -184,7 +184,7 @@ struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos) return entry; } - entry = kmalloc(sizeof(struct atm_mpoa_qos), GFP_KERNEL); + entry = kmalloc_obj(struct atm_mpoa_qos); if (entry == NULL) { pr_info("mpoa: out of memory\n"); return entry; @@ -282,7 +282,7 @@ static struct mpoa_client *alloc_mpc(void) { struct mpoa_client *mpc; - mpc = kzalloc(sizeof(struct mpoa_client), GFP_KERNEL); + mpc = kzalloc_obj(struct mpoa_client); if (mpc == NULL) return NULL; rwlock_init(&mpc->ingress_lock); diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c index f7a2f0e41105..c8d4e6f2e831 100644 --- a/net/atm/mpoa_caches.c +++ b/net/atm/mpoa_caches.c @@ -97,7 +97,7 @@ static in_cache_entry *in_cache_get_by_vcc(struct atm_vcc *vcc, static in_cache_entry *in_cache_add_entry(__be32 dst_ip, struct mpoa_client *client) { - in_cache_entry *entry = kzalloc(sizeof(in_cache_entry), GFP_KERNEL); + in_cache_entry *entry = kzalloc_obj(in_cache_entry); if (entry == NULL) { pr_info("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n"); @@ -456,7 +456,7 @@ static void eg_cache_remove_entry(eg_cache_entry *entry, static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, struct mpoa_client *client) { - eg_cache_entry *entry = kzalloc(sizeof(eg_cache_entry), GFP_KERNEL); + eg_cache_entry *entry = kzalloc_obj(eg_cache_entry); if (entry == NULL) { pr_info("out of memory\n"); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 3e4f17d335fe..2574aae3e066 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -397,7 +397,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) if (be.encaps != PPPOATM_ENCAPS_AUTODETECT && be.encaps != PPPOATM_ENCAPS_VC && be.encaps != PPPOATM_ENCAPS_LLC) return -EINVAL; - pvcc = kzalloc(sizeof(*pvcc), GFP_KERNEL); + pvcc = kzalloc_obj(*pvcc); if (pvcc == NULL) return -ENOMEM; pvcc->atmvcc = atmvcc; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 66d9a9bd5896..8f5e76f5dd9e 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -24,7 +24,7 @@ static int pvc_shutdown(struct socket *sock, int how) return 0; } -static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr, +static int pvc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len) { struct sock *sk = sock->sk; @@ -56,7 +56,7 @@ out: return error; } -static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr, +static int pvc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len, int flags) { return pvc_bind(sock, sockaddr, sockaddr_len); diff --git a/net/atm/resources.c b/net/atm/resources.c index 7c6fdedbcf4e..939452a610c0 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -36,7 +36,7 @@ static struct atm_dev *__alloc_atm_dev(const char *type) { struct atm_dev *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc_obj(*dev); if (!dev) return NULL; dev->type = type; diff --git a/net/atm/signaling.c b/net/atm/signaling.c index e70ae2c113f9..358fbe5e4d1d 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -22,6 +22,36 @@ struct atm_vcc *sigd = NULL; +/* + * find_get_vcc - validate and get a reference to a vcc pointer + * @vcc: the vcc pointer to validate + * + * This function validates that @vcc points to a registered VCC in vcc_hash. + * If found, it increments the socket reference count and returns the vcc. + * The caller must call sock_put(sk_atm(vcc)) when done. + * + * Returns the vcc pointer if valid, NULL otherwise. + */ +static struct atm_vcc *find_get_vcc(struct atm_vcc *vcc) +{ + int i; + + read_lock(&vcc_sklist_lock); + for (i = 0; i < VCC_HTABLE_SIZE; i++) { + struct sock *s; + + sk_for_each(s, &vcc_hash[i]) { + if (atm_sk(s) == vcc) { + sock_hold(s); + read_unlock(&vcc_sklist_lock); + return vcc; + } + } + } + read_unlock(&vcc_sklist_lock); + return NULL; +} + static void sigd_put_skb(struct sk_buff *skb) { if (!sigd) { @@ -69,7 +99,14 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) msg = (struct atmsvc_msg *) skb->data; WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc)); - vcc = *(struct atm_vcc **) &msg->vcc; + + vcc = find_get_vcc(*(struct atm_vcc **)&msg->vcc); + if (!vcc) { + pr_debug("invalid vcc pointer in msg\n"); + dev_kfree_skb(skb); + return -EINVAL; + } + pr_debug("%d (0x%lx)\n", (int)msg->type, (unsigned long)vcc); sk = sk_atm(vcc); @@ -100,7 +137,16 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) clear_bit(ATM_VF_WAITING, &vcc->flags); break; case as_indicate: - vcc = *(struct atm_vcc **)&msg->listen_vcc; + /* Release the reference from msg->vcc, we'll use msg->listen_vcc instead */ + sock_put(sk); + + vcc = find_get_vcc(*(struct atm_vcc **)&msg->listen_vcc); + if (!vcc) { + pr_debug("invalid listen_vcc pointer in msg\n"); + dev_kfree_skb(skb); + return -EINVAL; + } + sk = sk_atm(vcc); pr_debug("as_indicate!!!\n"); lock_sock(sk); @@ -115,6 +161,8 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) sk->sk_state_change(sk); as_indicate_complete: release_sock(sk); + /* Paired with find_get_vcc(msg->listen_vcc) above */ + sock_put(sk); return 0; case as_close: set_bit(ATM_VF_RELEASED, &vcc->flags); @@ -131,11 +179,15 @@ as_indicate_complete: break; default: pr_alert("bad message type %d\n", (int)msg->type); + /* Paired with find_get_vcc(msg->vcc) above */ + sock_put(sk); return -EINVAL; } sk->sk_state_change(sk); out: dev_kfree_skb(skb); + /* Paired with find_get_vcc(msg->vcc) above */ + sock_put(sk); return 0; } diff --git a/net/atm/svc.c b/net/atm/svc.c index f8137ae693b0..005964250ecd 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -97,7 +97,7 @@ static int svc_release(struct socket *sock) return 0; } -static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, +static int svc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len) { DEFINE_WAIT(wait); @@ -153,7 +153,7 @@ out: return error; } -static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, +static int svc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index e23a3dc14b93..310169ce1488 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -63,20 +63,6 @@ config AX25_DAMA_SLAVE be enabled at runtime. For more about DAMA see <https://linux-ax25.in-berlin.de>. If unsure, say Y. -# placeholder until implemented -config AX25_DAMA_MASTER - bool 'AX.25 DAMA Master support' - depends on AX25_DAMA_SLAVE && BROKEN - help - DAMA is a mechanism to prevent collisions when doing AX.25 - networking. A DAMA server (called "master") accepts incoming traffic - from clients (called "slaves") and redistributes it to other slaves. - If you say Y here, your Linux box will act as a DAMA master; this is - transparent in that you don't have to do any special DAMA - configuration. Linux cannot yet act as a DAMA server. This option - only compiles DAMA slave support into the kernel. It still needs to - be explicitly enabled, so if unsure, say Y. - config NETROM tristate "Amateur Radio NET/ROM protocol" depends on AX25 diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 6ef8b2a57a9b..a76f4793aed2 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -528,7 +528,7 @@ ax25_cb *ax25_create_cb(void) { ax25_cb *ax25; - if ((ax25 = kzalloc(sizeof(*ax25), GFP_ATOMIC)) == NULL) + if ((ax25 = kzalloc_obj(*ax25, GFP_ATOMIC)) == NULL) return NULL; refcount_set(&ax25->refcount, 1); @@ -1094,7 +1094,7 @@ static int ax25_release(struct socket *sock) * that we've implemented support for SO_BINDTODEVICE. It is however small * and trivially backward compatible. */ -static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int ax25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; @@ -1175,7 +1175,7 @@ out: * FIXME: nonblock behaviour looks like it may have a bug. */ static int __must_check ax25_connect(struct socket *sock, - struct sockaddr *uaddr, int addr_len, int flags) + struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; ax25_cb *ax25 = sk_to_ax25(sk), *ax25t; @@ -1249,7 +1249,7 @@ static int __must_check ax25_connect(struct socket *sock, goto out_release; } - if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) { + if ((digi = kmalloc_obj(ax25_digi)) == NULL) { err = -ENOBUFS; goto out_release; } diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 3733c0254a50..3c0544fc4ad5 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -54,7 +54,7 @@ void ax25_dev_device_up(struct net_device *dev) { ax25_dev *ax25_dev; - ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_KERNEL); + ax25_dev = kzalloc_obj(*ax25_dev); if (!ax25_dev) { printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n"); return; @@ -82,9 +82,7 @@ void ax25_dev_device_up(struct net_device *dev) #ifdef CONFIG_AX25_DAMA_SLAVE ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; -#endif -#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_ds_setup_timer(ax25_dev); #endif diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c index 979bc4b828a0..3ad454416a5c 100644 --- a/net/ax25/ax25_iface.c +++ b/net/ax25/ax25_iface.c @@ -105,7 +105,7 @@ int ax25_listen_register(const ax25_address *callsign, struct net_device *dev) if (ax25_listen_mine(callsign, dev)) return 0; - if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL) + if ((listen = kmalloc_obj(*listen, GFP_ATOMIC)) == NULL) return -ENOMEM; listen->callsign = *callsign; diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index f2d66af86359..d75b3e9ed93d 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -377,7 +377,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, * Sort out any digipeated paths. */ if (dp.ndigi && !ax25->digipeat && - (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + (ax25->digipeat = kmalloc_obj(ax25_digi, GFP_ATOMIC)) == NULL) { kfree_skb(skb); ax25_destroy_socket(ax25); if (sk) diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 10577434f40b..1d5c59ccf142 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -91,7 +91,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) kfree(ax25_rt->digipeat); ax25_rt->digipeat = NULL; if (route->digi_count != 0) { - if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + if ((ax25_rt->digipeat = kmalloc_obj(ax25_digi, GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); ax25_dev_put(ax25_dev); return -ENOMEM; @@ -110,7 +110,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt = ax25_rt->next; } - if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { + if ((ax25_rt = kmalloc_obj(ax25_route, GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); ax25_dev_put(ax25_dev); return -ENOMEM; @@ -121,7 +121,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; if (route->digi_count != 0) { - if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + if ((ax25_rt->digipeat = kmalloc_obj(ax25_digi, GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); ax25_dev_put(ax25_dev); diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index 241e4680ecb1..159ce74273f0 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -101,7 +101,7 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) } if (sax->sax25_uid == 0) return -EINVAL; - if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL) + if ((ax25_uid = kmalloc_obj(*ax25_uid)) == NULL) return -ENOMEM; refcount_set(&ax25_uid->refcount, 1); diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c299e2bc87ed..58c408b7a7d9 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -35,6 +35,7 @@ config BATMAN_ADV_BLA bool "Bridge Loop Avoidance" depends on BATMAN_ADV && INET select CRC16 + select NET_CRC32C default y help This option enables BLA (Bridge Loop Avoidance), a mechanism diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index b75c2228e69a..f28e9cbf8ad5 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -473,6 +473,9 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, if (aggregated_bytes > max_bytes) return false; + if (skb_tailroom(forw_packet->skb) < packet_len) + return false; + if (packet_num >= BATADV_MAX_AGGREGATION_PACKETS) return false; diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index cb16c1ed2a58..fdc2abe96d77 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -111,7 +111,15 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, /* unsupported WiFi driver version */ goto default_throughput; - real_netdev = batadv_get_real_netdev(hard_iface->net_dev); + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + real_netdev = __batadv_get_real_netdev(hard_iface->net_dev); + rtnl_unlock(); if (!real_netdev) goto default_throughput; @@ -355,7 +363,7 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) * context. Therefore add it to metric_queue and process it * outside rcu protected context. */ - metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); + metric_entry = kzalloc_obj(*metric_entry, GFP_ATOMIC); if (!metric_entry) { batadv_hardif_neigh_put(hardif_neigh); continue; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index b992ba12aa24..49ae92b9a152 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -12,7 +12,6 @@ #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/crc16.h> -#include <linux/crc32.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> @@ -506,7 +505,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, "%s(): not found (%pM, %d), creating new entry\n", __func__, orig, batadv_print_vid(vid)); - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) return NULL; @@ -700,7 +699,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, /* create a new claim entry if it does not exist yet. */ if (!claim) { - claim = kzalloc(sizeof(*claim), GFP_ATOMIC); + claim = kzalloc_obj(*claim, GFP_ATOMIC); if (!claim) return; @@ -1586,44 +1585,10 @@ int batadv_bla_init(struct batadv_priv *bat_priv) } /** - * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in - * the header - * @skb: skb pointing to fragmented socket buffers - * @payload_ptr: Pointer to position inside the head buffer of the skb - * marking the start of the data to be CRC'ed - * - * payload_ptr must always point to an address in the skb head buffer and not to - * a fragment. - * - * Return: big endian crc32c of the checksummed data - */ -static __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) -{ - unsigned int to = skb->len; - unsigned int consumed = 0; - struct skb_seq_state st; - unsigned int from; - unsigned int len; - const u8 *data; - u32 crc = 0; - - from = (unsigned int)(payload_ptr - skb->data); - - skb_prepare_seq_read(skb, from, to, &st); - while ((len = skb_seq_read(consumed, &data, &st)) != 0) { - crc = crc32c(crc, data, len); - consumed += len; - } - - return htonl(crc); -} - -/** * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked - * @payload_ptr: pointer to position inside the head buffer of the skb - * marking the start of the data to be CRC'ed + * @payload_offset: offset in the skb, marking the start of the data to be CRC'ed * @orig: originator mac address, NULL if unknown * * Check if it is on our broadcast list. Another gateway might have sent the @@ -1638,16 +1603,18 @@ static __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) * Return: true if a packet is in the duplicate list, false otherwise. */ static bool batadv_bla_check_duplist(struct batadv_priv *bat_priv, - struct sk_buff *skb, u8 *payload_ptr, + struct sk_buff *skb, int payload_offset, const u8 *orig) { struct batadv_bcast_duplist_entry *entry; bool ret = false; + int payload_len; int i, curr; - __be32 crc; + u32 crc; /* calculate the crc ... */ - crc = batadv_skb_crc32(skb, payload_ptr); + payload_len = skb->len - payload_offset; + crc = skb_crc32c(skb, payload_offset, payload_len, 0); spin_lock_bh(&bat_priv->bla.bcast_duplist_lock); @@ -1727,7 +1694,7 @@ out: static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { - return batadv_bla_check_duplist(bat_priv, skb, (u8 *)skb->data, NULL); + return batadv_bla_check_duplist(bat_priv, skb, 0, NULL); } /** @@ -1745,12 +1712,10 @@ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_bcast_packet *bcast_packet; - u8 *payload_ptr; bcast_packet = (struct batadv_bcast_packet *)skb->data; - payload_ptr = (u8 *)(bcast_packet + 1); - return batadv_bla_check_duplist(bat_priv, skb, payload_ptr, + return batadv_bla_check_duplist(bat_priv, skb, sizeof(*bcast_packet), bcast_packet->orig); } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 8b8132eb0a79..3efc4cf50b46 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -381,7 +381,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, goto out; } - dat_entry = kmalloc(sizeof(*dat_entry), GFP_ATOMIC); + dat_entry = kmalloc_obj(*dat_entry, GFP_ATOMIC); if (!dat_entry) goto out; @@ -635,8 +635,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, if (!bat_priv->orig_hash) return NULL; - res = kmalloc_array(BATADV_DAT_CANDIDATES_NUM, sizeof(*res), - GFP_ATOMIC); + res = kmalloc_objs(*res, BATADV_DAT_CANDIDATES_NUM, GFP_ATOMIC); if (!res) return NULL; diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index cc14bc41381e..f4e45cc25816 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -156,7 +156,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, seqno = ntohs(frag_packet->seqno); bucket = seqno % BATADV_FRAG_BUFFER_COUNT; - frag_entry_new = kmalloc(sizeof(*frag_entry_new), GFP_ATOMIC); + frag_entry_new = kmalloc_obj(*frag_entry_new, GFP_ATOMIC); if (!frag_entry_new) goto err; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 7a11b245e9f4..51e9c081a2a4 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -332,7 +332,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; - gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); + gw_node = kzalloc_obj(*gw_node, GFP_ATOMIC); if (!gw_node) return; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 5113f879736b..d6732c34aeaf 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -204,7 +204,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev) } /** - * batadv_get_real_netdevice() - check if the given netdev struct is a virtual + * __batadv_get_real_netdev() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * @@ -214,7 +214,7 @@ static bool batadv_is_valid_iface(const struct net_device *net_dev) * Return: the 'real' net device or the original net device and NULL in case * of an error. */ -static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) +struct net_device *__batadv_get_real_netdev(struct net_device *netdev) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *real_netdev = NULL; @@ -267,7 +267,7 @@ struct net_device *batadv_get_real_netdev(struct net_device *net_device) struct net_device *real_netdev; rtnl_lock(); - real_netdev = batadv_get_real_netdevice(net_device); + real_netdev = __batadv_get_real_netdev(net_device); rtnl_unlock(); return real_netdev; @@ -336,7 +336,7 @@ static u32 batadv_wifi_flags_evaluate(struct net_device *net_device) if (batadv_is_cfg80211_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; - real_netdev = batadv_get_real_netdevice(net_device); + real_netdev = __batadv_get_real_netdev(net_device); if (!real_netdev) return wifi_flags; @@ -871,7 +871,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (!batadv_is_valid_iface(net_dev)) return NULL; - hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC); + hard_iface = kzalloc_obj(*hard_iface, GFP_ATOMIC); if (!hard_iface) return NULL; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 9db8a310961e..9ba8fb2bdceb 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -67,6 +67,7 @@ enum batadv_hard_if_bcast { extern struct notifier_block batadv_hard_if_notifier; +struct net_device *__batadv_get_real_netdev(struct net_device *net_device); struct net_device *batadv_get_real_netdev(struct net_device *net_device); bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface); bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 8016e619787f..759fa29176db 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -45,16 +45,15 @@ struct batadv_hashtable *batadv_hash_new(u32 size) { struct batadv_hashtable *hash; - hash = kmalloc(sizeof(*hash), GFP_ATOMIC); + hash = kmalloc_obj(*hash, GFP_ATOMIC); if (!hash) return NULL; - hash->table = kmalloc_array(size, sizeof(*hash->table), GFP_ATOMIC); + hash->table = kmalloc_objs(*hash->table, size, GFP_ATOMIC); if (!hash->table) goto free_hash; - hash->list_locks = kmalloc_array(size, sizeof(*hash->list_locks), - GFP_ATOMIC); + hash->list_locks = kmalloc_objs(*hash->list_locks, size, GFP_ATOMIC); if (!hash->list_locks) goto free_table; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2be1ac17acaa..af230b017bc1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2025.4" +#define BATADV_SOURCE_VERSION "2025.5" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index df7e95811ef5..56ca1c1b83f2 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -555,7 +555,7 @@ int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) return -EEXIST; } - vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); + vlan = kzalloc_obj(*vlan, GFP_ATOMIC); if (!vlan) { spin_unlock_bh(&bat_priv->meshif_vlan_list_lock); return -ENOMEM; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index e8c6b0bf670f..a3d3efe22d30 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -399,7 +399,7 @@ batadv_mcast_mla_meshif_get_ipv4(struct net_device *dev, if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; - new = kmalloc(sizeof(*new), GFP_ATOMIC); + new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; @@ -472,7 +472,7 @@ batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; - new = kmalloc(sizeof(*new), GFP_ATOMIC); + new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; @@ -632,7 +632,7 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev, if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; - new = kmalloc(sizeof(*new), GFP_ATOMIC); + new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index a662408ad867..b3468ccab535 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -179,7 +179,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, if (vlan) goto out; - vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); + vlan = kzalloc_obj(*vlan, GFP_ATOMIC); if (!vlan) goto out; @@ -417,7 +417,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, if (orig_ifinfo) goto out; - orig_ifinfo = kzalloc(sizeof(*orig_ifinfo), GFP_ATOMIC); + orig_ifinfo = kzalloc_obj(*orig_ifinfo, GFP_ATOMIC); if (!orig_ifinfo) goto out; @@ -495,7 +495,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, if (neigh_ifinfo) goto out; - neigh_ifinfo = kzalloc(sizeof(*neigh_ifinfo), GFP_ATOMIC); + neigh_ifinfo = kzalloc_obj(*neigh_ifinfo, GFP_ATOMIC); if (!neigh_ifinfo) goto out; @@ -575,7 +575,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, if (hardif_neigh) goto out; - hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); + hardif_neigh = kzalloc_obj(*hardif_neigh, GFP_ATOMIC); if (!hardif_neigh) goto out; @@ -683,7 +683,7 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node, if (!hardif_neigh) goto out; - neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); + neigh_node = kzalloc_obj(*neigh_node, GFP_ATOMIC); if (!neigh_node) goto out; @@ -947,7 +947,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Creating new originator: %pM\n", addr); - orig_node = kzalloc(sizeof(*orig_node), GFP_ATOMIC); + orig_node = kzalloc_obj(*orig_node, GFP_ATOMIC); if (!orig_node) return NULL; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 20d85c681064..60cd67ec9cea 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -503,7 +503,7 @@ batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, return NULL; } - forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC); + forw_packet = kmalloc_obj(*forw_packet, GFP_ATOMIC); if (!forw_packet) goto err; diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 350b149e48be..2e42f6b348c8 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -967,7 +967,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, return; } - tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC); + tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) { spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, @@ -1228,7 +1228,7 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_vars *tp_vars, u32 payload_len; bool added = false; - new = kmalloc(sizeof(*new), GFP_ATOMIC); + new = kmalloc_obj(*new, GFP_ATOMIC); if (unlikely(!new)) return false; @@ -1343,7 +1343,7 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv, goto out_unlock; } - tp_vars = kmalloc(sizeof(*tp_vars), GFP_ATOMIC); + tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) goto out_unlock; diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 76dff1f9c559..8129a3f9c44d 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -557,7 +557,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, return; } - tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); + tvlv_handler = kzalloc_obj(*tvlv_handler, GFP_ATOMIC); if (!tvlv_handler) { spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ae1d7a8dc480..8fc5fe0e9b05 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -734,7 +734,7 @@ struct batadv_bcast_duplist_entry { u8 orig[ETH_ALEN]; /** @crc: crc32 checksum of broadcast payload */ - __be32 crc; + u32 crc; /** @entrytime: time when the broadcast packet was received */ unsigned long entrytime; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 2c21ae8abadc..2f03b780b40d 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -645,7 +645,7 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, { struct lowpan_peer *peer; - peer = kzalloc(sizeof(*peer), GFP_ATOMIC); + peer = kzalloc_obj(*peer, GFP_ATOMIC); if (!peer) return NULL; @@ -1107,7 +1107,7 @@ static int lowpan_enable_set(void *data, u64 val) { struct set_enable *set_enable; - set_enable = kzalloc(sizeof(*set_enable), GFP_KERNEL); + set_enable = kzalloc_obj(*set_enable); if (!set_enable) return -ENOMEM; @@ -1245,7 +1245,7 @@ static void disconnect_devices(void) rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { - new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC); + new_dev = kmalloc_obj(*new_dev, GFP_ATOMIC); if (!new_dev) break; diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index 884703fda979..b95413bffa16 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -72,7 +72,7 @@ static struct cmtp_application *cmtp_application_add(struct cmtp_session *session, __u16 appl) { - struct cmtp_application *app = kzalloc(sizeof(*app), GFP_KERNEL); + struct cmtp_application *app = kzalloc_obj(*app); BT_DBG("session %p application %p appl %u", session, app, appl); diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 90d130588a3e..261aeeda3236 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -341,7 +341,7 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) if (req->flags & ~valid_flags) return -EINVAL; - session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL); + session = kzalloc_obj(struct cmtp_session); if (!session) return -ENOMEM; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 6fc0692abf05..6eb59e9f2aa8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -462,7 +462,7 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) struct conn_handle_t *conn_handle; if (enhanced_sync_conn_capable(conn->hdev)) { - conn_handle = kzalloc(sizeof(*conn_handle), GFP_KERNEL); + conn_handle = kzalloc_obj(*conn_handle); if (!conn_handle) return false; @@ -726,7 +726,7 @@ static int hci_le_terminate_big(struct hci_dev *hdev, struct hci_conn *conn) bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", conn->iso_qos.bcast.big, conn->iso_qos.bcast.bis); - d = kzalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc_obj(*d); if (!d) return -ENOMEM; @@ -777,7 +777,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn) bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn, conn->iso_qos.bcast.big, conn->sync_handle); - d = kzalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc_obj(*d); if (!d) return -ENOMEM; @@ -922,10 +922,12 @@ static int hci_conn_hash_alloc_unset(struct hci_dev *hdev) U16_MAX, GFP_ATOMIC); } -static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, +static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, + bdaddr_t *dst, u8 dst_type, u8 role, u16 handle) { struct hci_conn *conn; + struct smp_irk *irk = NULL; switch (type) { case ACL_LINK: @@ -937,12 +939,14 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t case PA_LINK: if (!hdev->iso_mtu) return ERR_PTR(-ECONNREFUSED); + irk = hci_get_irk(hdev, dst, dst_type); break; case LE_LINK: if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); + irk = hci_get_irk(hdev, dst, dst_type); break; case SCO_LINK: case ESCO_LINK: @@ -956,11 +960,19 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle); - conn = kzalloc(sizeof(*conn), GFP_KERNEL); + conn = kzalloc_obj(*conn); if (!conn) return ERR_PTR(-ENOMEM); - bacpy(&conn->dst, dst); + /* If and IRK exists use its identity address */ + if (!irk) { + bacpy(&conn->dst, dst); + conn->dst_type = dst_type; + } else { + bacpy(&conn->dst, &irk->bdaddr); + conn->dst_type = irk->addr_type; + } + bacpy(&conn->src, &hdev->bdaddr); conn->handle = handle; conn->hdev = hdev; @@ -990,12 +1002,18 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t switch (type) { case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; + conn->link_policy = hdev->link_policy; conn->mtu = hdev->acl_mtu; break; case LE_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; + /* Use the controller supported PHYS as default until the + * remote features are resolved. + */ + conn->le_tx_def_phys = hdev->le_tx_def_phys; + conn->le_rx_def_phys = hdev->le_tx_def_phys; break; case CIS_LINK: /* conn->src should reflect the local identity address */ @@ -1059,7 +1077,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t } struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, - bdaddr_t *dst, u8 role) + bdaddr_t *dst, u8 dst_type, u8 role) { int handle; @@ -1069,16 +1087,16 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, if (unlikely(handle < 0)) return ERR_PTR(-ECONNREFUSED); - return __hci_conn_add(hdev, type, dst, role, handle); + return __hci_conn_add(hdev, type, dst, dst_type, role, handle); } struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role, u16 handle) + u8 dst_type, u8 role, u16 handle) { if (handle > HCI_CONN_HANDLE_MAX) return ERR_PTR(-EINVAL); - return __hci_conn_add(hdev, type, dst, role, handle); + return __hci_conn_add(hdev, type, dst, dst_type, role, handle); } static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) @@ -1410,14 +1428,13 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, if (conn) { bacpy(&conn->dst, dst); } else { - conn = hci_conn_add_unset(hdev, LE_LINK, dst, role); + conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type, role); if (IS_ERR(conn)) return conn; hci_conn_hold(conn); conn->pending_sec_level = sec_level; } - conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; conn->le_adv_phy = phy; @@ -1587,7 +1604,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); - conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER); + conn = hci_conn_add_unset(hdev, BIS_LINK, dst, 0, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; @@ -1633,7 +1650,8 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, BT_DBG("requesting refresh of dst_addr"); - conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER); + conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type, + HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; @@ -1644,7 +1662,6 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); - conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; @@ -1681,7 +1698,8 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { - acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); + acl = hci_conn_add_unset(hdev, ACL_LINK, dst, 0, + HCI_ROLE_MASTER); if (IS_ERR(acl)) return acl; } @@ -1721,7 +1739,7 @@ static struct hci_link *hci_conn_link(struct hci_conn *parent, if (conn->parent) return NULL; - link = kzalloc(sizeof(*link), GFP_KERNEL); + link = kzalloc_obj(*link); if (!link) return NULL; @@ -1750,7 +1768,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, sco = hci_conn_hash_lookup_ba(hdev, type, dst); if (!sco) { - sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER); + sco = hci_conn_add_unset(hdev, type, dst, 0, HCI_ROLE_MASTER); if (IS_ERR(sco)) { hci_conn_drop(acl); return sco; @@ -1807,7 +1825,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu); cp.bis.latency = cpu_to_le16(qos->bcast.out.latency); cp.bis.rtn = qos->bcast.out.rtn; - cp.bis.phy = qos->bcast.out.phy; + cp.bis.phy = qos->bcast.out.phys; cp.bis.packing = qos->bcast.packing; cp.bis.framing = qos->bcast.framing; cp.bis.encryption = qos->bcast.encryption; @@ -1857,10 +1875,10 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data) cis->cis_id = cis_id; cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu); cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu); - cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : - qos->ucast.in.phy; - cis->p_phy = qos->ucast.in.phy ? qos->ucast.in.phy : - qos->ucast.out.phy; + cis->c_phys = qos->ucast.out.phys ? qos->ucast.out.phys : + qos->ucast.in.phys; + cis->p_phys = qos->ucast.in.phys ? qos->ucast.in.phys : + qos->ucast.out.phys; cis->c_rtn = qos->ucast.out.rtn; cis->p_rtn = qos->ucast.in.rtn; } @@ -1926,6 +1944,8 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) return false; done: + conn->iso_qos = *qos; + if (hci_cmd_sync_queue(hdev, set_cig_params_sync, UINT_PTR(qos->ucast.cig), NULL) < 0) return false; @@ -1942,7 +1962,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig, qos->ucast.cis); if (!cis) { - cis = hci_conn_add_unset(hdev, CIS_LINK, dst, + cis = hci_conn_add_unset(hdev, CIS_LINK, dst, dst_type, HCI_ROLE_MASTER); if (IS_ERR(cis)) return cis; @@ -1962,8 +1982,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, return cis; /* Update LINK PHYs according to QoS preference */ - cis->le_tx_phy = qos->ucast.out.phy; - cis->le_rx_phy = qos->ucast.in.phy; + cis->le_tx_phy = qos->ucast.out.phys; + cis->le_rx_phy = qos->ucast.in.phys; /* If output interval is not set use the input interval as it cannot be * 0x000000. @@ -1995,8 +2015,6 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, } hci_conn_hold(cis); - - cis->iso_qos = *qos; cis->state = BT_BOUND; return cis; @@ -2078,15 +2096,15 @@ int hci_le_create_cis_pending(struct hci_dev *hdev) } static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, - struct bt_iso_io_qos *qos, __u8 phy) + struct bt_iso_io_qos *qos, __u8 phys) { /* Only set MTU if PHY is enabled */ - if (!qos->sdu && qos->phy) + if (!qos->sdu && qos->phys) qos->sdu = conn->mtu; /* Use the same PHY as ACL if set to any */ - if (qos->phy == BT_ISO_PHY_ANY) - qos->phy = phy; + if (qos->phys == BT_ISO_PHY_ANY) + qos->phys = phys; /* Use LE ACL connection interval if not set */ if (!qos->interval) @@ -2106,7 +2124,7 @@ static int create_big_sync(struct hci_dev *hdev, void *data) u32 flags = 0; int err; - if (qos->bcast.out.phy == 0x02) + if (qos->bcast.out.phys == BIT(1)) flags |= MGMT_ADV_FLAG_SEC_2M; /* Align intervals */ @@ -2133,12 +2151,11 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid); - conn = hci_conn_add_unset(hdev, PA_LINK, dst, HCI_ROLE_SLAVE); + conn = hci_conn_add_unset(hdev, PA_LINK, dst, dst_type, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; conn->iso_qos = *qos; - conn->dst_type = dst_type; conn->sid = sid; conn->state = BT_LISTEN; conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10); @@ -2216,8 +2233,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, return conn; /* Update LINK PHYs according to QoS preference */ - conn->le_tx_phy = qos->bcast.out.phy; - conn->le_tx_phy = qos->bcast.out.phy; + conn->le_tx_def_phys = qos->bcast.out.phys; /* Add Basic Announcement into Peridic Adv Data if BASE is set */ if (base_len && base) { @@ -2226,7 +2242,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, } hci_iso_qos_setup(hdev, conn, &qos->bcast.out, - conn->le_tx_phy ? conn->le_tx_phy : + conn->le_tx_def_phys ? conn->le_tx_def_phys : hdev->le_tx_def_phys); conn->iso_qos = *qos; @@ -2245,6 +2261,18 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, return conn; } +int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type) +{ + struct hci_conn *le; + + /* Lookup existing LE connection to rebind to */ + le = hci_conn_hash_lookup_le(conn->hdev, dst, dst_type); + if (!le) + return -EINVAL; + + return hci_past_sync(conn, le); +} + static void bis_mark_per_adv(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; @@ -2334,9 +2362,11 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, return le; hci_iso_qos_setup(hdev, le, &qos->ucast.out, - le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys); + le->le_tx_def_phys ? le->le_tx_def_phys : + hdev->le_tx_def_phys); hci_iso_qos_setup(hdev, le, &qos->ucast.in, - le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); + le->le_rx_def_phys ? le->le_rx_def_phys : + hdev->le_rx_def_phys); cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { @@ -2591,8 +2621,8 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active) timer: if (hdev->idle_timeout > 0) - queue_delayed_work(hdev->workqueue, &conn->idle_work, - msecs_to_jiffies(hdev->idle_timeout)); + mod_delayed_work(hdev->workqueue, &conn->idle_work, + msecs_to_jiffies(hdev->idle_timeout)); } /* Drop all connection on the device */ @@ -2751,7 +2781,7 @@ struct hci_chan *hci_chan_create(struct hci_conn *conn) return NULL; } - chan = kzalloc(sizeof(*chan), GFP_KERNEL); + chan = kzalloc_obj(*chan); if (!chan) return NULL; @@ -2905,22 +2935,22 @@ u32 hci_conn_get_phy(struct hci_conn *conn) break; case LE_LINK: - if (conn->le_tx_phy & HCI_LE_SET_PHY_1M) + if (conn->le_tx_def_phys & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_TX; - if (conn->le_rx_phy & HCI_LE_SET_PHY_1M) + if (conn->le_rx_def_phys & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_RX; - if (conn->le_tx_phy & HCI_LE_SET_PHY_2M) + if (conn->le_tx_def_phys & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_TX; - if (conn->le_rx_phy & HCI_LE_SET_PHY_2M) + if (conn->le_rx_def_phys & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_RX; - if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED) + if (conn->le_tx_def_phys & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_TX; - if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED) + if (conn->le_rx_def_phys & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_RX; break; @@ -2929,6 +2959,111 @@ u32 hci_conn_get_phy(struct hci_conn *conn) return phys; } +static u16 bt_phy_pkt_type(struct hci_conn *conn, u32 phys) +{ + u16 pkt_type = conn->pkt_type; + + if (phys & BT_PHY_BR_1M_3SLOT) + pkt_type |= HCI_DM3 | HCI_DH3; + else + pkt_type &= ~(HCI_DM3 | HCI_DH3); + + if (phys & BT_PHY_BR_1M_5SLOT) + pkt_type |= HCI_DM5 | HCI_DH5; + else + pkt_type &= ~(HCI_DM5 | HCI_DH5); + + if (phys & BT_PHY_EDR_2M_1SLOT) + pkt_type &= ~HCI_2DH1; + else + pkt_type |= HCI_2DH1; + + if (phys & BT_PHY_EDR_2M_3SLOT) + pkt_type &= ~HCI_2DH3; + else + pkt_type |= HCI_2DH3; + + if (phys & BT_PHY_EDR_2M_5SLOT) + pkt_type &= ~HCI_2DH5; + else + pkt_type |= HCI_2DH5; + + if (phys & BT_PHY_EDR_3M_1SLOT) + pkt_type &= ~HCI_3DH1; + else + pkt_type |= HCI_3DH1; + + if (phys & BT_PHY_EDR_3M_3SLOT) + pkt_type &= ~HCI_3DH3; + else + pkt_type |= HCI_3DH3; + + if (phys & BT_PHY_EDR_3M_5SLOT) + pkt_type &= ~HCI_3DH5; + else + pkt_type |= HCI_3DH5; + + return pkt_type; +} + +static int bt_phy_le_phy(u32 phys, u8 *tx_phys, u8 *rx_phys) +{ + if (!tx_phys || !rx_phys) + return -EINVAL; + + *tx_phys = 0; + *rx_phys = 0; + + if (phys & BT_PHY_LE_1M_TX) + *tx_phys |= HCI_LE_SET_PHY_1M; + + if (phys & BT_PHY_LE_1M_RX) + *rx_phys |= HCI_LE_SET_PHY_1M; + + if (phys & BT_PHY_LE_2M_TX) + *tx_phys |= HCI_LE_SET_PHY_2M; + + if (phys & BT_PHY_LE_2M_RX) + *rx_phys |= HCI_LE_SET_PHY_2M; + + if (phys & BT_PHY_LE_CODED_TX) + *tx_phys |= HCI_LE_SET_PHY_CODED; + + if (phys & BT_PHY_LE_CODED_RX) + *rx_phys |= HCI_LE_SET_PHY_CODED; + + return 0; +} + +int hci_conn_set_phy(struct hci_conn *conn, u32 phys) +{ + u8 tx_phys, rx_phys; + + switch (conn->type) { + case SCO_LINK: + case ESCO_LINK: + return -EINVAL; + case ACL_LINK: + /* Only allow setting BR/EDR PHYs if link type is ACL */ + if (phys & ~BT_PHY_BREDR_MASK) + return -EINVAL; + + return hci_acl_change_pkt_type(conn, + bt_phy_pkt_type(conn, phys)); + case LE_LINK: + /* Only allow setting LE PHYs if link type is LE */ + if (phys & ~BT_PHY_LE_MASK) + return -EINVAL; + + if (bt_phy_le_phy(phys, &tx_phys, &rx_phys)) + return -EINVAL; + + return hci_le_set_phy(conn, tx_phys, rx_phys); + default: + return -EINVAL; + } +} + static int abort_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 8ccec73dce45..31308c1de4ec 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -117,6 +117,7 @@ bool hci_discovery_active(struct hci_dev *hdev) return false; } } +EXPORT_SYMBOL(hci_discovery_active); void hci_discovery_set_state(struct hci_dev *hdev, int state) { @@ -261,7 +262,7 @@ u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, } /* Entry not in the cache. Add new one. */ - ie = kzalloc(sizeof(*ie), GFP_KERNEL); + ie = kzalloc_obj(*ie); if (!ie) { flags |= MGMT_DEV_FOUND_CONFIRM_NAME; goto done; @@ -796,7 +797,7 @@ int hci_get_dev_list(void __user *arg) if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) return -EINVAL; - dl = kzalloc(struct_size(dl, dev_req, dev_num), GFP_KERNEL); + dl = kzalloc_flex(*dl, dev_req, dev_num); if (!dl) return -ENOMEM; @@ -1285,7 +1286,7 @@ struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, key = old_key; } else { old_key_type = conn ? conn->key_type : 0xff; - key = kzalloc(sizeof(*key), GFP_KERNEL); + key = kzalloc_obj(*key); if (!key) return NULL; list_add_rcu(&key->list, &hdev->link_keys); @@ -1330,7 +1331,7 @@ struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, if (old_key) key = old_key; else { - key = kzalloc(sizeof(*key), GFP_KERNEL); + key = kzalloc_obj(*key); if (!key) return NULL; list_add_rcu(&key->list, &hdev->long_term_keys); @@ -1355,7 +1356,7 @@ struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type); if (!irk) { - irk = kzalloc(sizeof(*irk), GFP_KERNEL); + irk = kzalloc_obj(*irk); if (!irk) return NULL; @@ -1549,7 +1550,7 @@ int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) { - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kmalloc_obj(*data); if (!data) return -ENOMEM; @@ -1717,7 +1718,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, instance < 1 || instance > hdev->le_num_of_adv_sets + 1) return ERR_PTR(-EOVERFLOW); - adv = kzalloc(sizeof(*adv), GFP_KERNEL); + adv = kzalloc_obj(*adv); if (!adv) return ERR_PTR(-ENOMEM); @@ -2106,7 +2107,7 @@ int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; - entry = kzalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc_obj(*entry); if (!entry) return -ENOMEM; @@ -2129,7 +2130,7 @@ int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; - entry = kzalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc_obj(*entry); if (!entry) return -ENOMEM; @@ -2158,7 +2159,7 @@ int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; - entry = kzalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc_obj(*entry); if (!entry) return -ENOMEM; @@ -2275,7 +2276,7 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, if (params) return params; - params = kzalloc(sizeof(*params), GFP_KERNEL); + params = kzalloc_obj(*params); if (!params) { bt_dev_err(hdev, "out of memory"); return NULL; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3838b90343d9..286529d2e554 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2267,7 +2267,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) } else { if (!conn) { conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr, - HCI_ROLE_MASTER); + 0, HCI_ROLE_MASTER); if (IS_ERR(conn)) bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); } @@ -2869,6 +2869,31 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } +static void hci_cs_le_set_phy(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_set_phy *cp; + struct hci_conn *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PHY); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + conn->le_tx_def_phys = cp->tx_phys; + conn->le_rx_def_phys = cp->rx_phys; + } + + hci_dev_unlock(hdev); +} + static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status) { struct hci_cp_le_read_remote_features *cp; @@ -2886,12 +2911,8 @@ static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); - if (conn) { - if (conn->state == BT_CONFIG) { - hci_connect_cfm(conn, status); - hci_conn_drop(conn); - } - } + if (conn && conn->state == BT_CONFIG) + hci_connect_cfm(conn, status); hci_dev_unlock(hdev); } @@ -3123,7 +3144,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, &ev->bdaddr, BDADDR_BREDR)) { conn = hci_conn_add_unset(hdev, ev->link_type, - &ev->bdaddr, HCI_ROLE_SLAVE); + &ev->bdaddr, 0, + HCI_ROLE_SLAVE); if (IS_ERR(conn)) { bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); goto unlock; @@ -3299,7 +3321,7 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); if (!conn) { - conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, + conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, 0, HCI_ROLE_SLAVE); if (IS_ERR(conn)) { bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); @@ -3914,11 +3936,49 @@ unlock: return rp->status; } +static u8 hci_cc_le_read_all_local_features(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_rp_le_read_all_local_features *rp = data; + + bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); + + if (rp->status) + return rp->status; + + memcpy(hdev->le_features, rp->features, 248); + + return rp->status; +} + static void hci_cs_le_create_big(struct hci_dev *hdev, u8 status) { bt_dev_dbg(hdev, "status 0x%2.2x", status); } +static void hci_cs_le_read_all_remote_features(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_read_remote_features *cp; + struct hci_conn *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn && conn->state == BT_CONFIG) + hci_connect_cfm(conn, status); + + hci_dev_unlock(hdev); +} + static u8 hci_cc_set_per_adv_param(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -4170,6 +4230,9 @@ static const struct hci_cc { sizeof(struct hci_rp_le_set_cig_params), HCI_MAX_EVENT_SIZE), HCI_CC(HCI_OP_LE_SETUP_ISO_PATH, hci_cc_le_setup_iso_path, sizeof(struct hci_rp_le_setup_iso_path)), + HCI_CC(HCI_OP_LE_READ_ALL_LOCAL_FEATURES, + hci_cc_le_read_all_local_features, + sizeof(struct hci_rp_le_read_all_local_features)), }; static u8 hci_cc_func(struct hci_dev *hdev, const struct hci_cc *cc, @@ -4321,9 +4384,12 @@ static const struct hci_cs { HCI_CS(HCI_OP_LE_CREATE_CONN, hci_cs_le_create_conn), HCI_CS(HCI_OP_LE_READ_REMOTE_FEATURES, hci_cs_le_read_remote_features), HCI_CS(HCI_OP_LE_START_ENC, hci_cs_le_start_enc), + HCI_CS(HCI_OP_LE_SET_PHY, hci_cs_le_set_phy), HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn), HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis), HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big), + HCI_CS(HCI_OP_LE_READ_ALL_REMOTE_FEATURES, + hci_cs_le_read_all_remote_features), }; static void hci_cmd_status_evt(struct hci_dev *hdev, void *data, @@ -5644,6 +5710,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, struct hci_conn *conn; struct smp_irk *irk; u8 addr_type; + int err; hci_dev_lock(hdev); @@ -5670,14 +5737,13 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, if (status) goto unlock; - conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, role); + conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, bdaddr_type, + role); if (IS_ERR(conn)) { bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); goto unlock; } - conn->dst_type = bdaddr_type; - /* If we didn't have a hci_conn object previously * but we're in central role this must be something * initiated using an accept list. Since accept list based @@ -5775,26 +5841,8 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); - /* The remote features procedure is defined for central - * role only. So only in case of an initiated connection - * request the remote features. - * - * If the local controller supports peripheral-initiated features - * exchange, then requesting the remote features in peripheral - * role is possible. Otherwise just transition into the - * connected state without requesting the remote features. - */ - if (conn->out || - (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) { - struct hci_cp_le_read_remote_features cp; - - cp.handle = __cpu_to_le16(conn->handle); - - hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, - sizeof(cp), &cp); - - hci_conn_hold(conn); - } else { + err = hci_le_read_remote_features(conn); + if (err) { conn->state = BT_CONNECTED; hci_connect_cfm(conn, status); } @@ -5936,6 +5984,71 @@ unlock: hci_dev_unlock(hdev); } +static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle) +{ + struct hci_cp_le_pa_term_sync cp; + + memset(&cp, 0, sizeof(cp)); + cp.handle = handle; + + return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp); +} + +static void hci_le_past_received_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_ev_le_past_received *ev = data; + int mask = hdev->link_mode; + __u8 flags = 0; + struct hci_conn *pa_sync, *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + + hci_dev_lock(hdev); + + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + conn = hci_conn_hash_lookup_create_pa_sync(hdev); + if (!conn) { + bt_dev_err(hdev, + "Unable to find connection for dst %pMR sid 0x%2.2x", + &ev->bdaddr, ev->sid); + goto unlock; + } + + conn->sync_handle = le16_to_cpu(ev->sync_handle); + conn->sid = HCI_SID_INVALID; + + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK, + &flags); + if (!(mask & HCI_LM_ACCEPT)) { + hci_le_pa_term_sync(hdev, ev->sync_handle); + goto unlock; + } + + if (!(flags & HCI_PROTO_DEFER)) + goto unlock; + + /* Add connection to indicate PA sync event */ + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0, + HCI_ROLE_SLAVE); + + if (IS_ERR(pa_sync)) + goto unlock; + + pa_sync->sync_handle = le16_to_cpu(ev->sync_handle); + + if (ev->status) { + set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags); + + /* Notify iso layer */ + hci_connect_cfm(pa_sync, ev->status); + } + +unlock: + hci_dev_unlock(hdev); +} + static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -6412,16 +6525,6 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_unlock(hdev); } -static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle) -{ - struct hci_cp_le_pa_term_sync cp; - - memset(&cp, 0, sizeof(cp)); - cp.handle = handle; - - return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp); -} - static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -6460,7 +6563,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, goto unlock; /* Add connection to indicate PA sync event */ - pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -6530,8 +6633,20 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn) { - if (!ev->status) - memcpy(conn->features[0], ev->features, 8); + if (!ev->status) { + memcpy(conn->le_features, ev->features, 8); + + /* Update supported PHYs */ + if (!(conn->le_features[1] & HCI_LE_PHY_2M)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_2M; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_2M; + } + + if (!(conn->le_features[1] & HCI_LE_PHY_CODED)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_CODED; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_CODED; + } + } if (conn->state == BT_CONFIG) { __u8 status; @@ -6553,7 +6668,6 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, conn->state = BT_CONNECTED; hci_connect_cfm(conn, status); - hci_conn_drop(conn); } } @@ -6753,6 +6867,21 @@ unlock: hci_dev_unlock(hdev); } +/* Convert LE PHY to QoS PHYs */ +static u8 le_phy_qos(u8 phy) +{ + switch (phy) { + case 0x01: + return HCI_LE_SET_PHY_1M; + case 0x02: + return HCI_LE_SET_PHY_2M; + case 0x03: + return HCI_LE_SET_PHY_CODED; + } + + return 0; +} + static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -6814,8 +6943,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, 1000); qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; - qos->ucast.in.phy = ev->c_phy; - qos->ucast.out.phy = ev->p_phy; + qos->ucast.in.phys = le_phy_qos(ev->c_phy); + qos->ucast.out.phys = le_phy_qos(ev->p_phy); break; case HCI_ROLE_MASTER: qos->ucast.in.interval = p_sdu_interval; @@ -6829,8 +6958,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, 1000); qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; - qos->ucast.out.phy = ev->c_phy; - qos->ucast.in.phy = ev->p_phy; + qos->ucast.out.phys = le_phy_qos(ev->c_phy); + qos->ucast.in.phys = le_phy_qos(ev->p_phy); break; } @@ -6901,7 +7030,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, cis = hci_conn_hash_lookup_handle(hdev, cis_handle); if (!cis) { - cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, + cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, acl->dst_type, HCI_ROLE_SLAVE, cis_handle); if (IS_ERR(cis)) { hci_le_reject_cis(hdev, ev->cis_handle); @@ -7018,7 +7147,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "ignore too large handle %u", handle); continue; } - bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, + bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, 0, HCI_ROLE_SLAVE, handle); if (IS_ERR(bis)) continue; @@ -7131,6 +7260,62 @@ unlock: hci_dev_unlock(hdev); } +static void hci_le_read_all_remote_features_evt(struct hci_dev *hdev, + void *data, struct sk_buff *skb) +{ + struct hci_evt_le_read_all_remote_features_complete *ev = data; + struct hci_conn *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + if (!ev->status) { + memcpy(conn->le_features, ev->features, 248); + + /* Update supported PHYs */ + if (!(conn->le_features[1] & HCI_LE_PHY_2M)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_2M; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_2M; + } + + if (!(conn->le_features[1] & HCI_LE_PHY_CODED)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_CODED; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_CODED; + } + } + + if (conn->state == BT_CONFIG) { + __u8 status; + + /* If the local controller supports peripheral-initiated + * features exchange, but the remote controller does + * not, then it is possible that the error code 0x1a + * for unsupported remote feature gets returned. + * + * In this specific case, allow the connection to + * transition into connected state and mark it as + * successful. + */ + if (!conn->out && + ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE && + (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) + status = 0x00; + else + status = ev->status; + + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, status); + } + +unlock: + hci_dev_unlock(hdev); +} + #define HCI_LE_EV_VL(_op, _func, _min_len, _max_len) \ [_op] = { \ .func = _func, \ @@ -7206,6 +7391,10 @@ static const struct hci_le_ev { /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), + /* [0x18 = HCI_EVT_LE_PAST_RECEIVED] */ + HCI_LE_EV(HCI_EV_LE_PAST_RECEIVED, + hci_le_past_received_evt, + sizeof(struct hci_ev_le_past_received)), /* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */ HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt, sizeof(struct hci_evt_le_cis_established)), @@ -7232,6 +7421,12 @@ static const struct hci_le_ev { hci_le_big_info_adv_report_evt, sizeof(struct hci_evt_le_big_info_adv_report), HCI_MAX_EVENT_SIZE), + /* [0x2b = HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE] */ + HCI_LE_EV_VL(HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE, + hci_le_read_all_remote_features_evt, + sizeof(struct + hci_evt_le_read_all_remote_features_complete), + HCI_MAX_EVENT_SIZE), }; static void hci_le_meta_evt(struct hci_dev *hdev, void *data, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index ad19022ae127..0290dea081f6 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1185,7 +1185,7 @@ static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd, } #endif -static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, +static int hci_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_hci haddr; @@ -2166,6 +2166,7 @@ static void hci_sock_destruct(struct sock *sk) mgmt_cleanup(sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&sk->sk_error_queue); } static const struct proto_ops hci_sock_ops = { diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 6e76798ec786..3166914b0d6c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -711,7 +711,7 @@ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, goto unlock; } - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kmalloc_obj(*entry); if (!entry) { err = -ENOMEM; goto unlock; @@ -2685,7 +2685,7 @@ static struct conn_params *conn_params_copy(struct list_head *list, size_t *n) rcu_read_unlock(); - p = kvcalloc(*n, sizeof(struct conn_params), GFP_KERNEL); + p = kvzalloc_objs(struct conn_params, *n); if (!p) return NULL; @@ -2948,8 +2948,8 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; - if (qos->bcast.in.phy & BT_ISO_PHY_1M || - qos->bcast.in.phy & BT_ISO_PHY_2M) { + if (qos->bcast.in.phys & BT_ISO_PHY_1M || + qos->bcast.in.phys & BT_ISO_PHY_2M) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, @@ -2958,7 +2958,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, phy++; } - if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { + if (qos->bcast.in.phys & BT_ISO_PHY_CODED) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, @@ -4011,8 +4011,19 @@ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) /* Read LE Local Supported Features */ static int hci_le_read_local_features_sync(struct hci_dev *hdev) { - return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES, - 0, NULL, HCI_CMD_TIMEOUT); + int err; + + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES, + 0, NULL, HCI_CMD_TIMEOUT); + if (err) + return err; + + if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(2)) + return __hci_cmd_sync_status(hdev, + HCI_OP_LE_READ_ALL_LOCAL_FEATURES, + 0, NULL, HCI_CMD_TIMEOUT); + + return err; } /* Read LE Supported States */ @@ -4324,6 +4335,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (ll_privacy_capable(hdev)) hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION; + /* Mark PAST if supported */ + if (past_capable(hdev)) + hdev->conn_flags |= HCI_CONN_FLAG_PAST; + /* If the controller supports Extended Scanner Filter * Policies, enable the corresponding event. */ @@ -4393,6 +4408,9 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (ext_adv_capable(hdev)) events[2] |= 0x02; /* LE Advertising Set Terminated */ + if (past_receiver_capable(hdev)) + events[2] |= 0x80; /* LE PAST Received */ + if (cis_capable(hdev)) { events[3] |= 0x01; /* LE CIS Established */ if (cis_peripheral_capable(hdev)) @@ -4402,6 +4420,7 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (bis_capable(hdev)) { events[1] |= 0x20; /* LE PA Report */ events[1] |= 0x40; /* LE PA Sync Established */ + events[1] |= 0x80; /* LE PA Sync Lost */ events[3] |= 0x04; /* LE Create BIG Complete */ events[3] |= 0x08; /* LE Terminate BIG Complete */ events[3] |= 0x10; /* LE BIG Sync Established */ @@ -4409,6 +4428,17 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) events[4] |= 0x02; /* LE BIG Info Advertising Report */ } + if (le_cs_capable(hdev)) { + /* Channel Sounding events */ + events[5] |= 0x08; /* LE CS Read Remote Supported Cap Complete event */ + events[5] |= 0x10; /* LE CS Read Remote FAE Table Complete event */ + events[5] |= 0x20; /* LE CS Security Enable Complete event */ + events[5] |= 0x40; /* LE CS Config Complete event */ + events[5] |= 0x80; /* LE CS Procedure Enable Complete event */ + events[6] |= 0x01; /* LE CS Subevent Result event */ + events[6] |= 0x02; /* LE CS Subevent Result Continue event */ + events[6] |= 0x04; /* LE CS Test End Complete event */ + } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } @@ -4541,23 +4571,43 @@ static int hci_set_le_support_sync(struct hci_dev *hdev) } /* LE Set Host Feature */ -static int hci_le_set_host_feature_sync(struct hci_dev *hdev) +static int hci_le_set_host_feature_sync(struct hci_dev *hdev, u8 bit, u8 value) { struct hci_cp_le_set_host_feature cp; - if (!iso_capable(hdev)) - return 0; - memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ - cp.bit_number = 32; - cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00; + cp.bit_number = bit; + cp.bit_value = value; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } +/* Set Host Features, each feature needs to be sent separately since + * HCI_OP_LE_SET_HOST_FEATURE doesn't support setting all of them at once. + */ +static int hci_le_set_host_features_sync(struct hci_dev *hdev) +{ + int err; + + if (cis_capable(hdev)) { + /* Connected Isochronous Channels (Host Support) */ + err = hci_le_set_host_feature_sync(hdev, 32, + (iso_enabled(hdev) ? 0x01 : + 0x00)); + if (err) + return err; + } + + if (le_cs_capable(hdev)) + /* Channel Sounding (Host Support) */ + err = hci_le_set_host_feature_sync(hdev, 47, 0x01); + + return err; +} + /* LE Controller init stage 3 command sequence */ static const struct hci_init_stage le_init3[] = { /* HCI_OP_LE_SET_EVENT_MASK */ @@ -4585,7 +4635,7 @@ static const struct hci_init_stage le_init3[] = { /* HCI_OP_WRITE_LE_HOST_SUPPORTED */ HCI_INIT(hci_set_le_support_sync), /* HCI_OP_LE_SET_HOST_FEATURE */ - HCI_INIT(hci_le_set_host_feature_sync), + HCI_INIT(hci_le_set_host_features_sync), {} }; @@ -6577,8 +6627,8 @@ static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data) * state. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - hci_scan_disable_sync(hdev); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); + hci_scan_disable_sync(hdev); } /* Update random address, but set require_privacy to false so @@ -6878,8 +6928,6 @@ static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) conn->attempt++; - conn->link_policy = hdev->link_policy; - memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; @@ -7006,7 +7054,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) goto unlock; /* Add connection to indicate PA sync error */ - pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -7021,10 +7069,41 @@ unlock: hci_dev_unlock(hdev); } +static int hci_le_past_params_sync(struct hci_dev *hdev, struct hci_conn *conn, + struct hci_conn *acl, struct bt_iso_qos *qos) +{ + struct hci_cp_le_past_params cp; + int err; + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(acl->handle); + /* An HCI_LE_Periodic_Advertising_Sync_Transfer_Received event is sent + * to the Host. HCI_LE_Periodic_Advertising_Report events will be + * enabled with duplicate filtering enabled. + */ + cp.mode = 0x03; + cp.skip = cpu_to_le16(qos->bcast.skip); + cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); + cp.cte_type = qos->bcast.sync_cte_type; + + /* HCI_LE_PAST_PARAMS command returns a command complete event so it + * cannot wait for HCI_EV_LE_PAST_RECEIVED. + */ + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_PARAMS, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); + if (err) + return err; + + /* Wait for HCI_EV_LE_PAST_RECEIVED event */ + return __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_PAST_RECEIVED, + conn->conn_timeout, NULL); +} + static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_pa_create_sync cp; - struct hci_conn *conn = data; + struct hci_conn *conn = data, *le; struct bt_iso_qos *qos = &conn->iso_qos; int err; @@ -7056,6 +7135,24 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) hci_update_passive_scan_sync(hdev); + /* Check if PAST is possible: + * + * 1. Check if an ACL connection with the destination address exists + * 2. Check if that HCI_CONN_FLAG_PAST has been set which indicates that + * user really intended to use PAST. + */ + le = hci_conn_hash_lookup_le(hdev, &conn->dst, conn->dst_type); + if (le) { + struct hci_conn_params *params; + + params = hci_conn_params_lookup(hdev, &le->dst, le->dst_type); + if (params && params->flags & HCI_CONN_FLAG_PAST) { + err = hci_le_past_params_sync(hdev, conn, le, qos); + if (!err) + goto done; + } + } + /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update * it. */ @@ -7172,3 +7269,254 @@ int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn) return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn, create_big_complete); } + +struct past_data { + struct hci_conn *conn; + struct hci_conn *le; +}; + +static void past_complete(struct hci_dev *hdev, void *data, int err) +{ + struct past_data *past = data; + + bt_dev_dbg(hdev, "err %d", err); + + kfree(past); +} + +static int hci_le_past_set_info_sync(struct hci_dev *hdev, void *data) +{ + struct past_data *past = data; + struct hci_cp_le_past_set_info cp; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, past->conn) || + !hci_conn_valid(hdev, past->le)) { + hci_dev_unlock(hdev); + return -ECANCELED; + } + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(past->le->handle); + cp.adv_handle = past->conn->iso_qos.bcast.bis; + + hci_dev_unlock(hdev); + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_SET_INFO, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + +static int hci_le_past_sync(struct hci_dev *hdev, void *data) +{ + struct past_data *past = data; + struct hci_cp_le_past cp; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, past->conn) || + !hci_conn_valid(hdev, past->le)) { + hci_dev_unlock(hdev); + return -ECANCELED; + } + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(past->le->handle); + cp.sync_handle = cpu_to_le16(past->conn->sync_handle); + + hci_dev_unlock(hdev); + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + +int hci_past_sync(struct hci_conn *conn, struct hci_conn *le) +{ + struct past_data *data; + int err; + + if (conn->type != BIS_LINK && conn->type != PA_LINK) + return -EINVAL; + + if (!past_sender_capable(conn->hdev)) + return -EOPNOTSUPP; + + data = kmalloc_obj(*data); + if (!data) + return -ENOMEM; + + data->conn = conn; + data->le = le; + + if (conn->role == HCI_ROLE_MASTER) + err = hci_cmd_sync_queue_once(conn->hdev, + hci_le_past_set_info_sync, data, + past_complete); + else + err = hci_cmd_sync_queue_once(conn->hdev, hci_le_past_sync, + data, past_complete); + + if (err) + kfree(data); + + return err; +} + +static void le_read_features_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_conn *conn = data; + + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + + hci_conn_drop(conn); +} + +static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev, + void *data) +{ + struct hci_conn *conn = data; + struct hci_cp_le_read_all_remote_features cp; + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + cp.pages = 10; /* Attempt to read all pages */ + + /* Wait for HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE event otherwise + * hci_conn_drop may run prematurely causing a disconnection. + */ + return __hci_cmd_sync_status_sk(hdev, + HCI_OP_LE_READ_ALL_REMOTE_FEATURES, + sizeof(cp), &cp, + HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE, + HCI_CMD_TIMEOUT, NULL); + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + +static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data) +{ + struct hci_conn *conn = data; + struct hci_cp_le_read_remote_features cp; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + /* Check if LL Extended Feature Set is supported and + * HCI_OP_LE_READ_ALL_REMOTE_FEATURES is supported then use that to read + * all features. + */ + if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(3)) + return hci_le_read_all_remote_features_sync(hdev, data); + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + + /* Wait for HCI_EV_LE_REMOTE_FEAT_COMPLETE event otherwise + * hci_conn_drop may run prematurely causing a disconnection. + */ + return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, + sizeof(cp), &cp, + HCI_EV_LE_REMOTE_FEAT_COMPLETE, + HCI_CMD_TIMEOUT, NULL); +} + +int hci_le_read_remote_features(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + int err; + + /* The remote features procedure is defined for central + * role only. So only in case of an initiated connection + * request the remote features. + * + * If the local controller supports peripheral-initiated features + * exchange, then requesting the remote features in peripheral + * role is possible. Otherwise just transition into the + * connected state without requesting the remote features. + */ + if (conn->out || (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) + err = hci_cmd_sync_queue_once(hdev, + hci_le_read_remote_features_sync, + hci_conn_hold(conn), + le_read_features_complete); + else + err = -EOPNOTSUPP; + + return err; +} + +static void pkt_type_changed(struct hci_dev *hdev, void *data, int err) +{ + struct hci_cp_change_conn_ptype *cp = data; + + bt_dev_dbg(hdev, "err %d", err); + + kfree(cp); +} + +static int hci_change_conn_ptype_sync(struct hci_dev *hdev, void *data) +{ + struct hci_cp_change_conn_ptype *cp = data; + + return __hci_cmd_sync_status_sk(hdev, HCI_OP_CHANGE_CONN_PTYPE, + sizeof(*cp), cp, + HCI_EV_PKT_TYPE_CHANGE, + HCI_CMD_TIMEOUT, NULL); +} + +int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_change_conn_ptype *cp; + + cp = kmalloc_obj(*cp); + if (!cp) + return -ENOMEM; + + cp->handle = cpu_to_le16(conn->handle); + cp->pkt_type = cpu_to_le16(pkt_type); + + return hci_cmd_sync_queue_once(hdev, hci_change_conn_ptype_sync, cp, + pkt_type_changed); +} + +static void le_phy_update_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_cp_le_set_phy *cp = data; + + bt_dev_dbg(hdev, "err %d", err); + + kfree(cp); +} + +static int hci_le_set_phy_sync(struct hci_dev *hdev, void *data) +{ + struct hci_cp_le_set_phy *cp = data; + + return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_SET_PHY, + sizeof(*cp), cp, + HCI_EV_LE_PHY_UPDATE_COMPLETE, + HCI_CMD_TIMEOUT, NULL); +} + +int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_le_set_phy *cp; + + cp = kmalloc_obj(*cp); + if (!cp) + return -ENOMEM; + + memset(cp, 0, sizeof(*cp)); + cp->handle = cpu_to_le16(conn->handle); + cp->tx_phys = tx_phys; + cp->rx_phys = rx_phys; + + return hci_cmd_sync_queue_once(hdev, hci_le_set_phy_sync, cp, + le_phy_update_complete); +} diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 6724adce615b..7bcf8c5ceaee 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -920,7 +920,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, ctrl = bt_sk(ctrl_sock->sk); intr = bt_sk(intr_sock->sk); - session = kzalloc(sizeof(*session), GFP_KERNEL); + session = kzalloc_obj(*session); if (!session) return -ENOMEM; @@ -986,7 +986,8 @@ static void session_free(struct kref *ref) skb_queue_purge(&session->intr_transmit); fput(session->intr_sock->file); fput(session->ctrl_sock->file); - l2cap_conn_put(session->conn); + if (session->conn) + l2cap_conn_put(session->conn); kfree(session); } @@ -1164,6 +1165,15 @@ static void hidp_session_remove(struct l2cap_conn *conn, down_write(&hidp_session_sem); + /* Drop L2CAP reference immediately to indicate that + * l2cap_unregister_user() shall not be called as it is already + * considered removed. + */ + if (session->conn) { + l2cap_conn_put(session->conn); + session->conn = NULL; + } + hidp_session_terminate(session); cancel_work_sync(&session->dev_init); @@ -1301,7 +1311,9 @@ static int hidp_session_thread(void *arg) * Instead, this call has the same semantics as if user-space tried to * delete the session. */ - l2cap_unregister_user(session->conn, &session->user); + if (session->conn) + l2cap_unregister_user(session->conn, &session->user); + hidp_session_put(session); module_put_and_kthread_exit(0); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 616c2fef91d2..be145e2736b7 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -80,14 +80,15 @@ static struct bt_iso_qos default_qos; static bool check_ucast_qos(struct bt_iso_qos *qos); static bool check_bcast_qos(struct bt_iso_qos *qos); static bool iso_match_sid(struct sock *sk, void *data); +static bool iso_match_sid_past(struct sock *sk, void *data); static bool iso_match_sync_handle(struct sock *sk, void *data); static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data); static void iso_sock_disconn(struct sock *sk); typedef bool (*iso_sock_match_t)(struct sock *sk, void *data); -static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, - enum bt_sock_state state, +static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src, + bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ @@ -211,7 +212,7 @@ static struct iso_conn *iso_conn_add(struct hci_conn *hcon) return conn; } - conn = kzalloc(sizeof(*conn), GFP_KERNEL); + conn = kzalloc_obj(*conn); if (!conn) return NULL; @@ -360,7 +361,7 @@ static int iso_connect_bis(struct sock *sk) } /* Fail if out PHYs are marked as disabled */ - if (!iso_pi(sk)->qos.bcast.out.phy) { + if (!iso_pi(sk)->qos.bcast.out.phys) { err = -EINVAL; goto unlock; } @@ -457,7 +458,7 @@ static int iso_connect_cis(struct sock *sk) } /* Fail if either PHYs are marked as disabled */ - if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) { + if (!iso_pi(sk)->qos.ucast.in.phys && !iso_pi(sk)->qos.ucast.out.phys) { err = -EINVAL; goto unlock; } @@ -637,8 +638,8 @@ static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc, * match func data - pass -1 to ignore * Returns closest match. */ -static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, - enum bt_sock_state state, +static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src, + bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data) { struct sock *sk = NULL, *sk1 = NULL; @@ -650,8 +651,25 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, continue; /* Match Broadcast destination */ - if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) - continue; + if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) { + struct smp_irk *irk1, *irk2; + + /* Check if destination is an RPA that we can resolve */ + irk1 = hci_find_irk_by_rpa(hdev, dst); + if (!irk1) + continue; + + /* Match with identity address */ + if (bacmp(&iso_pi(sk)->dst, &irk1->bdaddr)) { + /* Check if socket destination address is also + * an RPA and if the IRK matches. + */ + irk2 = hci_find_irk_by_rpa(hdev, + &iso_pi(sk)->dst); + if (!irk2 || irk1 != irk2) + continue; + } + } /* Use Match function if provided */ if (match && !match(sk, data)) @@ -728,6 +746,7 @@ static void iso_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&sk->sk_error_queue); } static void iso_sock_cleanup_listen(struct sock *parent) @@ -876,7 +895,7 @@ static struct proto iso_proto = { .interval = 10000u, \ .latency = 10u, \ .sdu = 40u, \ - .phy = BT_ISO_PHY_2M, \ + .phys = BT_ISO_PHY_2M, \ .rtn = 2u, \ } @@ -944,7 +963,7 @@ static int iso_sock_create(struct net *net, struct socket *sock, int protocol, return 0; } -static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, +static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; @@ -986,20 +1005,14 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, return 0; } -static int iso_sock_bind_pa_sk(struct sock *sk, struct sockaddr_iso *sa, +/* Must be called on the locked socket. */ +static int iso_sock_rebind_bis(struct sock *sk, struct sockaddr_iso *sa, int addr_len) { int err = 0; - if (sk->sk_type != SOCK_SEQPACKET) { - err = -EINVAL; - goto done; - } - - if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) { - err = -EINVAL; - goto done; - } + if (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) + return -EBADFD; if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) { err = -EINVAL; @@ -1022,7 +1035,78 @@ done: return err; } -static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, +static struct hci_dev *iso_conn_get_hdev(struct iso_conn *conn) +{ + struct hci_dev *hdev = NULL; + + iso_conn_lock(conn); + if (conn->hcon) + hdev = hci_dev_hold(conn->hcon->hdev); + iso_conn_unlock(conn); + + return hdev; +} + +/* Must be called on the locked socket. */ +static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa, + int addr_len) +{ + struct hci_dev *hdev; + struct hci_conn *bis; + int err; + + if (sk->sk_type != SOCK_SEQPACKET || !iso_pi(sk)->conn) + return -EINVAL; + + /* Check if it is really a Broadcast address being requested */ + if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) + return -EINVAL; + + /* Check if the address hasn't changed then perhaps only the number of + * bis has changed. + */ + if (!bacmp(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr) || + !bacmp(&sa->iso_bc->bc_bdaddr, BDADDR_ANY)) + return iso_sock_rebind_bis(sk, sa, addr_len); + + /* Check if the address type is of LE type */ + if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) + return -EINVAL; + + hdev = iso_conn_get_hdev(iso_pi(sk)->conn); + if (!hdev) + return -EINVAL; + + bis = iso_pi(sk)->conn->hcon; + + /* Release the socket before lookups since that requires hci_dev_lock + * which shall not be acquired while holding sock_lock for proper + * ordering. + */ + release_sock(sk); + hci_dev_lock(bis->hdev); + lock_sock(sk); + + if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) { + /* raced with iso_conn_del() or iso_disconn_sock() */ + err = -ENOTCONN; + goto unlock; + } + + BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bc->bc_bdaddr, + sa->iso_bc->bc_bdaddr_type); + + err = hci_past_bis(bis, &sa->iso_bc->bc_bdaddr, + le_addr_type(sa->iso_bc->bc_bdaddr_type)); + +unlock: + hci_dev_unlock(hdev); + hci_dev_put(hdev); + + return err; +} + +static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; @@ -1037,13 +1121,12 @@ static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, lock_sock(sk); - /* Allow the user to bind a PA sync socket to a number - * of BISes to sync to. - */ - if ((sk->sk_state == BT_CONNECT2 || - sk->sk_state == BT_CONNECTED) && - test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { - err = iso_sock_bind_pa_sk(sk, sa, addr_len); + if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) && + addr_len > sizeof(*sa)) { + /* Allow the user to rebind to a different address using + * PAST procedures. + */ + err = iso_sock_rebind_bc(sk, sa, addr_len); goto done; } @@ -1080,7 +1163,7 @@ done: return err; } -static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, +static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; @@ -1579,7 +1662,7 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, static bool check_io_qos(struct bt_iso_io_qos *qos) { /* If no PHY is enable SDU must be 0 */ - if (!qos->phy && qos->sdu) + if (!qos->phys && qos->sdu) return false; if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff)) @@ -1588,7 +1671,7 @@ static bool check_io_qos(struct bt_iso_io_qos *qos) if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0)) return false; - if (qos->phy > BT_ISO_PHY_ANY) + if (qos->phys > BT_ISO_PHY_ANY) return false; return true; @@ -1939,6 +2022,11 @@ static bool iso_match_pa_sync_flag(struct sock *sk, void *data) return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } +static bool iso_match_dst(struct sock *sk, void *data) +{ + return !bacmp(&iso_pi(sk)->dst, (bdaddr_t *)data); +} + static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; @@ -1947,23 +2035,45 @@ static void iso_conn_ready(struct iso_conn *conn) struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; + struct hci_dev *hdev; BT_DBG("conn %p", conn); if (sk) { + /* Attempt to update source address in case of BIS Sender if + * the advertisement is using a random address. + */ + if (conn->hcon->type == BIS_LINK && + conn->hcon->role == HCI_ROLE_MASTER && + !bacmp(&conn->hcon->dst, BDADDR_ANY)) { + struct hci_conn *bis = conn->hcon; + struct adv_info *adv; + + adv = hci_find_adv_instance(bis->hdev, + bis->iso_qos.bcast.bis); + if (adv && bacmp(&adv->random_addr, BDADDR_ANY)) { + lock_sock(sk); + iso_pi(sk)->src_type = BDADDR_LE_RANDOM; + bacpy(&iso_pi(sk)->src, &adv->random_addr); + release_sock(sk); + } + } + iso_sock_ready(conn->sk); } else { hcon = conn->hcon; if (!hcon) return; + hdev = hcon->hdev; + if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) { /* A BIS slave hcon is notified to the ISO layer * after the Command Complete for the LE Setup * ISO Data Path command is received. Get the * parent socket that matches the hcon BIG handle. */ - parent = iso_get_sock(&hcon->src, &hcon->dst, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_big_hcon, hcon); } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { @@ -1971,12 +2081,12 @@ static void iso_conn_ready(struct iso_conn *conn) HCI_EVT_LE_BIG_SYNC_ESTABLISHED); /* Get reference to PA sync parent socket, if it exists */ - parent = iso_get_sock(&hcon->src, &hcon->dst, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_pa_sync_flag, NULL); if (!parent && ev) - parent = iso_get_sock(&hcon->src, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_big, ev); @@ -1984,7 +2094,7 @@ static void iso_conn_ready(struct iso_conn *conn) ev2 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev2) - parent = iso_get_sock(&hcon->src, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_sid, ev2); @@ -1992,7 +2102,7 @@ static void iso_conn_ready(struct iso_conn *conn) ev3 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) - parent = iso_get_sock(&hcon->src, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_sync_handle_pa_report, @@ -2000,8 +2110,8 @@ static void iso_conn_ready(struct iso_conn *conn) } if (!parent) - parent = iso_get_sock(&hcon->src, BDADDR_ANY, - BT_LISTEN, NULL, NULL); + parent = iso_get_sock(hdev, &hcon->src, BDADDR_ANY, + BT_LISTEN, iso_match_dst, BDADDR_ANY); if (!parent) return; @@ -2090,6 +2200,16 @@ static bool iso_match_sid(struct sock *sk, void *data) return ev->sid == iso_pi(sk)->bc_sid; } +static bool iso_match_sid_past(struct sock *sk, void *data) +{ + struct hci_ev_le_past_received *ev = data; + + if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) + return true; + + return ev->sid == iso_pi(sk)->bc_sid; +} + static bool iso_match_sync_handle(struct sock *sk, void *data) { struct hci_evt_le_big_info_adv_report *ev = data; @@ -2109,6 +2229,7 @@ static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct hci_ev_le_pa_sync_established *ev1; + struct hci_ev_le_past_received *ev1a; struct hci_evt_le_big_info_adv_report *ev2; struct hci_ev_le_per_adv_report *ev3; struct sock *sk; @@ -2122,6 +2243,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) * SID to listen to and once sync is established its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. + * 1a. HCI_EV_LE_PAST_RECEIVED: alternative to 1. * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a * a BIG Info it attempts to check if there any listening socket with * the same sync_handle and if it does then attempt to create a sync. @@ -2131,7 +2253,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) { iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); @@ -2141,10 +2263,22 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) goto done; } + ev1a = hci_recv_event_data(hdev, HCI_EV_LE_PAST_RECEIVED); + if (ev1a) { + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, + iso_match_sid_past, ev1a); + if (sk && !ev1a->status) { + iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle); + iso_pi(sk)->bc_sid = ev1a->sid; + } + + goto done; + } + ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { /* Check if BIGInfo report has already been handled */ - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECTED, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECTED, iso_match_sync_handle, ev2); if (sk) { sock_put(sk); @@ -2153,10 +2287,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) } /* Try to get PA sync socket, if it exists */ - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECT2, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECT2, iso_match_sync_handle, ev2); if (!sk) - sk = iso_get_sock(&hdev->bdaddr, bdaddr, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle, ev2); @@ -2195,7 +2329,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) u8 *base; struct hci_conn *hcon; - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); if (!sk) goto done; @@ -2245,8 +2379,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) hcon->le_per_adv_data_len = 0; } } else { - sk = iso_get_sock(&hdev->bdaddr, BDADDR_ANY, - BT_LISTEN, NULL, NULL); + sk = iso_get_sock(hdev, &hdev->bdaddr, BDADDR_ANY, + BT_LISTEN, iso_match_dst, BDADDR_ANY); } done: diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 07b493331fd7..5deb6c4f1e41 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -442,7 +442,7 @@ struct l2cap_chan *l2cap_chan_create(void) { struct l2cap_chan *chan; - chan = kzalloc(sizeof(*chan), GFP_ATOMIC); + chan = kzalloc_obj(*chan, GFP_ATOMIC); if (!chan) return NULL; @@ -924,26 +924,18 @@ int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator) initiator); } -static u8 l2cap_get_ident(struct l2cap_conn *conn) +static int l2cap_get_ident(struct l2cap_conn *conn) { - u8 id; + /* LE link does not support tools like l2ping so use the full range */ + if (conn->hcon->type == LE_LINK) + return ida_alloc_range(&conn->tx_ida, 1, 255, GFP_ATOMIC); /* Get next available identificator. * 1 - 128 are used by kernel. * 129 - 199 are reserved. * 200 - 254 are used by utilities like l2ping, etc. */ - - mutex_lock(&conn->ident_lock); - - if (++conn->tx_ident > 128) - conn->tx_ident = 1; - - id = conn->tx_ident; - - mutex_unlock(&conn->ident_lock); - - return id; + return ida_alloc_range(&conn->tx_ida, 1, 128, GFP_ATOMIC); } static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, @@ -1686,17 +1678,15 @@ static void l2cap_info_timeout(struct work_struct *work) int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user) { - struct hci_dev *hdev = conn->hcon->hdev; int ret; /* We need to check whether l2cap_conn is registered. If it is not, we - * must not register the l2cap_user. l2cap_conn_del() is unregisters - * l2cap_conn objects, but doesn't provide its own locking. Instead, it - * relies on the parent hci_conn object to be locked. This itself relies - * on the hci_dev object to be locked. So we must lock the hci device - * here, too. */ + * must not register the l2cap_user. l2cap_conn_del() unregisters + * l2cap_conn objects under conn->lock, and we use the same lock here + * to protect access to conn->users and conn->hchan. + */ - hci_dev_lock(hdev); + mutex_lock(&conn->lock); if (!list_empty(&user->list)) { ret = -EINVAL; @@ -1717,16 +1707,14 @@ int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user) ret = 0; out_unlock: - hci_dev_unlock(hdev); + mutex_unlock(&conn->lock); return ret; } EXPORT_SYMBOL(l2cap_register_user); void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user) { - struct hci_dev *hdev = conn->hcon->hdev; - - hci_dev_lock(hdev); + mutex_lock(&conn->lock); if (list_empty(&user->list)) goto out_unlock; @@ -1735,7 +1723,7 @@ void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user) user->remove(conn, user); out_unlock: - hci_dev_unlock(hdev); + mutex_unlock(&conn->lock); } EXPORT_SYMBOL(l2cap_unregister_user); @@ -1773,6 +1761,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); + ida_destroy(&conn->tx_ida); + cancel_delayed_work_sync(&conn->id_addr_timer); l2cap_unregister_all_users(conn); @@ -4622,7 +4612,8 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, switch (type) { case L2CAP_IT_FEAT_MASK: - conn->feat_mask = get_unaligned_le32(rsp->data); + if (cmd_len >= sizeof(*rsp) + sizeof(u32)) + conn->feat_mask = get_unaligned_le32(rsp->data); if (conn->feat_mask & L2CAP_FEAT_FIXED_CHAN) { struct l2cap_info_req req; @@ -4641,7 +4632,8 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, break; case L2CAP_IT_FIXED_CHAN: - conn->remote_fixed_chan = rsp->data[0]; + if (cmd_len >= sizeof(*rsp) + sizeof(rsp->data[0])) + conn->remote_fixed_chan = rsp->data[0]; conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; @@ -4782,12 +4774,34 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, return err; } +static void l2cap_put_ident(struct l2cap_conn *conn, u8 code, u8 id) +{ + switch (code) { + case L2CAP_COMMAND_REJ: + case L2CAP_CONN_RSP: + case L2CAP_CONF_RSP: + case L2CAP_DISCONN_RSP: + case L2CAP_ECHO_RSP: + case L2CAP_INFO_RSP: + case L2CAP_CONN_PARAM_UPDATE_RSP: + case L2CAP_ECRED_CONN_RSP: + case L2CAP_ECRED_RECONF_RSP: + /* First do a lookup since the remote may send bogus ids that + * would make ida_free to generate warnings. + */ + if (ida_find_first_range(&conn->tx_ida, id, id) >= 0) + ida_free(&conn->tx_ida, id); + } +} + static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) { int err = 0; + l2cap_put_ident(conn, cmd->code, cmd->ident); + switch (cmd->code) { case L2CAP_COMMAND_REJ: l2cap_command_rej(conn, cmd, cmd_len, data); @@ -4900,6 +4914,13 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response_unlock; } + /* Check if Key Size is sufficient for the security level */ + if (!l2cap_check_enc_key_size(conn->hcon, pchan)) { + result = L2CAP_CR_LE_BAD_KEY_SIZE; + chan = NULL; + goto response_unlock; + } + /* Check for valid dynamic CID range */ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { result = L2CAP_CR_LE_INVALID_SCID; @@ -5035,21 +5056,34 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, struct l2cap_chan *chan, *pchan; u16 mtu, mps; __le16 psm; - u8 result, len = 0; - int i, num_scid; + u8 result, rsp_len = 0; + int i, num_scid = 0; bool defer = false; if (!enable_ecred) return -EINVAL; + memset(pdu, 0, sizeof(*pdu)); + if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) { result = L2CAP_CR_LE_INVALID_PARAMS; goto response; } + /* Check if there are no pending channels with the same ident */ + __l2cap_chan_list_id(conn, cmd->ident, l2cap_ecred_list_defer, + &num_scid); + if (num_scid) { + result = L2CAP_CR_LE_INVALID_PARAMS; + goto response; + } + cmd_len -= sizeof(*req); num_scid = cmd_len / sizeof(u16); + /* Always respond with the same number of scids as in the request */ + rsp_len = cmd_len; + if (num_scid > L2CAP_ECRED_MAX_CID) { result = L2CAP_CR_LE_INVALID_PARAMS; goto response; @@ -5059,7 +5093,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, mps = __le16_to_cpu(req->mps); if (mtu < L2CAP_ECRED_MIN_MTU || mps < L2CAP_ECRED_MIN_MPS) { - result = L2CAP_CR_LE_UNACCEPT_PARAMS; + result = L2CAP_CR_LE_INVALID_PARAMS; goto response; } @@ -5079,8 +5113,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps); - memset(pdu, 0, sizeof(*pdu)); - /* Check if we have socket listening on psm */ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, &conn->hcon->dst, LE_LINK); @@ -5093,7 +5125,16 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, if (!smp_sufficient_security(conn->hcon, pchan->sec_level, SMP_ALLOW_STK)) { - result = L2CAP_CR_LE_AUTHENTICATION; + result = pchan->sec_level == BT_SECURITY_MEDIUM ? + L2CAP_CR_LE_ENCRYPTION : L2CAP_CR_LE_AUTHENTICATION; + goto unlock; + } + + /* Check if the listening channel has set an output MTU then the + * requested MTU shall be less than or equal to that value. + */ + if (pchan->omtu && mtu < pchan->omtu) { + result = L2CAP_CR_LE_UNACCEPT_PARAMS; goto unlock; } @@ -5105,7 +5146,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, BT_DBG("scid[%d] 0x%4.4x", i, scid); pdu->dcid[i] = 0x0000; - len += sizeof(*pdu->dcid); /* Check for valid dynamic CID range */ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { @@ -5172,7 +5212,7 @@ response: return 0; l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_CONN_RSP, - sizeof(*pdu) + len, pdu); + sizeof(*pdu) + rsp_len, pdu); return 0; } @@ -5294,14 +5334,14 @@ static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn, struct l2cap_ecred_reconf_req *req = (void *) data; struct l2cap_ecred_reconf_rsp rsp; u16 mtu, mps, result; - struct l2cap_chan *chan; + struct l2cap_chan *chan[L2CAP_ECRED_MAX_CID] = {}; int i, num_scid; if (!enable_ecred) return -EINVAL; - if (cmd_len < sizeof(*req) || cmd_len - sizeof(*req) % sizeof(u16)) { - result = L2CAP_CR_LE_INVALID_PARAMS; + if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) { + result = L2CAP_RECONF_INVALID_CID; goto respond; } @@ -5311,42 +5351,69 @@ static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn, BT_DBG("mtu %u mps %u", mtu, mps); if (mtu < L2CAP_ECRED_MIN_MTU) { - result = L2CAP_RECONF_INVALID_MTU; + result = L2CAP_RECONF_INVALID_PARAMS; goto respond; } if (mps < L2CAP_ECRED_MIN_MPS) { - result = L2CAP_RECONF_INVALID_MPS; + result = L2CAP_RECONF_INVALID_PARAMS; goto respond; } cmd_len -= sizeof(*req); num_scid = cmd_len / sizeof(u16); + + if (num_scid > L2CAP_ECRED_MAX_CID) { + result = L2CAP_RECONF_INVALID_PARAMS; + goto respond; + } + result = L2CAP_RECONF_SUCCESS; + /* Check if each SCID, MTU and MPS are valid */ for (i = 0; i < num_scid; i++) { u16 scid; scid = __le16_to_cpu(req->scid[i]); - if (!scid) - return -EPROTO; + if (!scid) { + result = L2CAP_RECONF_INVALID_CID; + goto respond; + } - chan = __l2cap_get_chan_by_dcid(conn, scid); - if (!chan) - continue; + chan[i] = __l2cap_get_chan_by_dcid(conn, scid); + if (!chan[i]) { + result = L2CAP_RECONF_INVALID_CID; + goto respond; + } - /* If the MTU value is decreased for any of the included - * channels, then the receiver shall disconnect all - * included channels. + /* The MTU field shall be greater than or equal to the greatest + * current MTU size of these channels. */ - if (chan->omtu > mtu) { - BT_ERR("chan %p decreased MTU %u -> %u", chan, - chan->omtu, mtu); + if (chan[i]->omtu > mtu) { + BT_ERR("chan %p decreased MTU %u -> %u", chan[i], + chan[i]->omtu, mtu); result = L2CAP_RECONF_INVALID_MTU; + goto respond; } - chan->omtu = mtu; - chan->remote_mps = mps; + /* If more than one channel is being configured, the MPS field + * shall be greater than or equal to the current MPS size of + * each of these channels. If only one channel is being + * configured, the MPS field may be less than the current MPS + * of that channel. + */ + if (chan[i]->remote_mps >= mps && i) { + BT_ERR("chan %p decreased MPS %u -> %u", chan[i], + chan[i]->remote_mps, mps); + result = L2CAP_RECONF_INVALID_MPS; + goto respond; + } + } + + /* Commit the new MTU and MPS values after checking they are valid */ + for (i = 0; i < num_scid; i++) { + chan[i]->omtu = mtu; + chan[i]->remote_mps = mps; } respond: @@ -5363,7 +5430,7 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, u8 *data) { struct l2cap_chan *chan, *tmp; - struct l2cap_ecred_conn_rsp *rsp = (void *) data; + struct l2cap_ecred_reconf_rsp *rsp = (void *)data; u16 result; if (cmd_len < sizeof(*rsp)) @@ -5371,7 +5438,7 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, result = __le16_to_cpu(rsp->result); - BT_DBG("result 0x%4.4x", rsp->result); + BT_DBG("result 0x%4.4x", result); if (!result) return 0; @@ -5419,6 +5486,8 @@ static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn, { int err = 0; + l2cap_put_ident(conn, cmd->code, cmd->ident); + switch (cmd->code) { case L2CAP_COMMAND_REJ: l2cap_le_command_rej(conn, cmd, cmd_len, data); @@ -6599,8 +6668,10 @@ static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) return -ENOBUFS; } - if (chan->imtu < skb->len) { - BT_ERR("Too big LE L2CAP PDU"); + if (skb->len > chan->imtu) { + BT_ERR("Too big LE L2CAP PDU: len %u > %u", skb->len, + chan->imtu); + l2cap_send_disconn_req(chan, ECONNRESET); return -ENOBUFS; } @@ -6626,7 +6697,9 @@ static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) sdu_len, skb->len, chan->imtu); if (sdu_len > chan->imtu) { - BT_ERR("Too big LE L2CAP SDU length received"); + BT_ERR("Too big LE L2CAP SDU length: len %u > %u", + skb->len, sdu_len); + l2cap_send_disconn_req(chan, ECONNRESET); err = -EMSGSIZE; goto failed; } @@ -6662,6 +6735,7 @@ static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) if (chan->sdu->len + skb->len > chan->sdu_len) { BT_ERR("Too much LE L2CAP data received"); + l2cap_send_disconn_req(chan, ECONNRESET); err = -EINVAL; goto failed; } @@ -6884,7 +6958,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) if (!hchan) return NULL; - conn = kzalloc(sizeof(*conn), GFP_KERNEL); + conn = kzalloc_obj(*conn); if (!conn) { hci_chan_del(hchan); return NULL; @@ -6907,13 +6981,13 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP))) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; - mutex_init(&conn->ident_lock); mutex_init(&conn->lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + ida_init(&conn->tx_ida); skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 814fb8610ac4..597686790371 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -80,7 +80,7 @@ static int l2cap_validate_le_psm(u16 psm) return 0; } -static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +static int l2cap_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -178,7 +178,7 @@ done: return err; } -static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, +static int l2cap_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sock *sk = sock->sk; @@ -885,7 +885,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, struct bt_power pwr; struct l2cap_conn *conn; int err = 0; - u32 opt; + u32 opt, phys; u16 mtu; u8 mode; @@ -1029,10 +1029,17 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - /* Setting is not supported as it's the remote side that - * decides this. - */ - err = -EPERM; + /* Only allow setting output MTU when not connected */ + if (sk->sk_state == BT_CONNECTED) { + err = -EISCONN; + break; + } + + err = copy_safe_from_sockptr(&mtu, sizeof(mtu), optval, optlen); + if (err) + break; + + chan->omtu = mtu; break; case BT_RCVMTU: @@ -1059,6 +1066,24 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + case BT_PHY: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + err = copy_safe_from_sockptr(&phys, sizeof(phys), optval, + optlen); + if (err) + break; + + if (!chan->conn) + break; + + conn = chan->conn; + err = hci_conn_set_phy(conn->hcon, phys); + break; + case BT_MODE: if (!enable_ecred) { err = -ENOPROTOOPT; @@ -1546,8 +1571,7 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) (chan->mode == L2CAP_MODE_ERTM || chan->mode == L2CAP_MODE_LE_FLOWCTL || chan->mode == L2CAP_MODE_EXT_FLOWCTL)) { - struct l2cap_rx_busy *rx_busy = - kmalloc(sizeof(*rx_busy), GFP_KERNEL); + struct l2cap_rx_busy *rx_busy = kmalloc_obj(*rx_busy); if (!rx_busy) { err = -ENOMEM; goto done; @@ -1799,6 +1823,7 @@ static void l2cap_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&sk->sk_error_queue); } static void l2cap_skb_msg_name(struct sk_buff *skb, void *msg_name, diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 262bf984d2aa..d52238ce6a9a 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -560,7 +560,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, list_for_each_entry(d, &hci_dev_list, list) count++; - rp = kmalloc(struct_size(rp, entry, count), GFP_ATOMIC); + rp = kmalloc_flex(*rp, entry, count, GFP_ATOMIC); if (!rp) { read_unlock(&hci_dev_list_lock); return -ENOMEM; @@ -849,9 +849,21 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (cis_peripheral_capable(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; + if (bis_capable(hdev)) + settings |= MGMT_SETTING_ISO_BROADCASTER; + + if (sync_recv_capable(hdev)) + settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; + if (ll_privacy_capable(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; + if (past_sender_capable(hdev)) + settings |= MGMT_SETTING_PAST_SENDER; + + if (past_receiver_capable(hdev)) + settings |= MGMT_SETTING_PAST_RECEIVER; + settings |= MGMT_SETTING_PHY_CONFIGURATION; return settings; @@ -937,6 +949,12 @@ static u32 get_current_settings(struct hci_dev *hdev) if (ll_privacy_enabled(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; + if (past_sender_enabled(hdev)) + settings |= MGMT_SETTING_PAST_SENDER; + + if (past_receiver_enabled(hdev)) + settings |= MGMT_SETTING_PAST_RECEIVER; + return settings; } @@ -1948,6 +1966,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) } mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); + mgmt_pending_free(cmd); return; } @@ -1966,6 +1985,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) sock_put(match.sk); hci_update_eir_sync(hdev); + mgmt_pending_free(cmd); } static int set_ssp_sync(struct hci_dev *hdev, void *data) @@ -2175,10 +2195,7 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) sk = cmd->sk; if (status) { - mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, - status); - mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, - cmd_status_rsp, &status); + mgmt_cmd_status(cmd->sk, hdev->id, cmd->opcode, status); goto done; } @@ -2747,7 +2764,7 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) goto failed; } - uuid = kmalloc(sizeof(*uuid), GFP_KERNEL); + uuid = kmalloc_obj(*uuid); if (!uuid) { err = -ENOMEM; goto failed; @@ -3340,7 +3357,7 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, i++; } - rp = kmalloc(struct_size(rp, addr, i), GFP_KERNEL); + rp = kmalloc_flex(*rp, addr, i); if (!rp) { err = -ENOMEM; goto unlock; @@ -4357,7 +4374,7 @@ static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, hci_blocked_keys_clear(hdev); for (i = 0; i < key_count; ++i) { - struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL); + struct blocked_key *b = kzalloc_obj(*b); if (!b) { err = MGMT_STATUS_NO_RESOURCES; @@ -5110,6 +5127,69 @@ static void device_flags_changed(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_FLAGS_CHANGED, hdev, &ev, sizeof(ev), sk); } +static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) +{ + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); + if (!conn) + return false; + + if (conn->dst_type != type) + return false; + + if (conn->state != BT_CONNECTED) + return false; + + return true; +} + +/* This function requires the caller holds hdev->lock */ +static struct hci_conn_params *hci_conn_params_set(struct hci_dev *hdev, + bdaddr_t *addr, u8 addr_type, + u8 auto_connect) +{ + struct hci_conn_params *params; + + params = hci_conn_params_add(hdev, addr, addr_type); + if (!params) + return NULL; + + if (params->auto_connect == auto_connect) + return params; + + hci_pend_le_list_del_init(params); + + switch (auto_connect) { + case HCI_AUTO_CONN_DISABLED: + case HCI_AUTO_CONN_LINK_LOSS: + /* If auto connect is being disabled when we're trying to + * connect to device, keep connecting. + */ + if (params->explicit_connect) + hci_pend_le_list_add(params, &hdev->pend_le_conns); + break; + case HCI_AUTO_CONN_REPORT: + if (params->explicit_connect) + hci_pend_le_list_add(params, &hdev->pend_le_conns); + else + hci_pend_le_list_add(params, &hdev->pend_le_reports); + break; + case HCI_AUTO_CONN_DIRECT: + case HCI_AUTO_CONN_ALWAYS: + if (!is_connected(hdev, addr, addr_type)) + hci_pend_le_list_add(params, &hdev->pend_le_conns); + break; + } + + params->auto_connect = auto_connect; + + bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", + addr, addr_type, auto_connect); + + return params; +} + static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { @@ -5153,9 +5233,16 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); if (!params) { - bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", - &cp->addr.bdaddr, le_addr_type(cp->addr.type)); - goto unlock; + /* Create a new hci_conn_params if it doesn't exist */ + params = hci_conn_params_set(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type), + HCI_AUTO_CONN_DISABLED); + if (!params) { + bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", + &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + goto unlock; + } } supported_flags = hdev->conn_flags; @@ -5287,7 +5374,7 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(status), &rp, sizeof(rp)); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); bt_dev_dbg(hdev, "add monitor %d complete, status %d", @@ -5400,7 +5487,7 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, (offset + length) > HCI_MAX_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kmalloc_obj(*p); if (!p) return MGMT_STATUS_NO_RESOURCES; @@ -5437,7 +5524,7 @@ static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, goto done; } - m = kzalloc(sizeof(*m), GFP_KERNEL); + m = kzalloc_obj(*m); if (!m) { status = MGMT_STATUS_NO_RESOURCES; goto done; @@ -5474,7 +5561,7 @@ static int add_adv_patterns_monitor_rssi(struct sock *sk, struct hci_dev *hdev, goto done; } - m = kzalloc(sizeof(*m), GFP_KERNEL); + m = kzalloc_obj(*m); if (!m) { status = MGMT_STATUS_NO_RESOURCES; goto done; @@ -6350,6 +6437,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) hci_dev_clear_flag(hdev, HCI_ADVERTISING); settings_rsp(cmd, &match); + mgmt_pending_free(cmd); new_settings(hdev, match.sk); @@ -7542,68 +7630,6 @@ unlock: return err; } -static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) -{ - struct hci_conn *conn; - - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); - if (!conn) - return false; - - if (conn->dst_type != type) - return false; - - if (conn->state != BT_CONNECTED) - return false; - - return true; -} - -/* This function requires the caller holds hdev->lock */ -static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, - u8 addr_type, u8 auto_connect) -{ - struct hci_conn_params *params; - - params = hci_conn_params_add(hdev, addr, addr_type); - if (!params) - return -EIO; - - if (params->auto_connect == auto_connect) - return 0; - - hci_pend_le_list_del_init(params); - - switch (auto_connect) { - case HCI_AUTO_CONN_DISABLED: - case HCI_AUTO_CONN_LINK_LOSS: - /* If auto connect is being disabled when we're trying to - * connect to device, keep connecting. - */ - if (params->explicit_connect) - hci_pend_le_list_add(params, &hdev->pend_le_conns); - break; - case HCI_AUTO_CONN_REPORT: - if (params->explicit_connect) - hci_pend_le_list_add(params, &hdev->pend_le_conns); - else - hci_pend_le_list_add(params, &hdev->pend_le_reports); - break; - case HCI_AUTO_CONN_DIRECT: - case HCI_AUTO_CONN_ALWAYS: - if (!is_connected(hdev, addr, addr_type)) - hci_pend_le_list_add(params, &hdev->pend_le_conns); - break; - } - - params->auto_connect = auto_connect; - - bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", - addr, addr_type, auto_connect); - - return 0; -} - static void device_added(struct sock *sk, struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type, u8 action) { @@ -7715,17 +7741,13 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, /* If the connection parameters don't exist for this device, * they will be created and configured with defaults. */ - if (hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type, - auto_conn) < 0) { + params = hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type, + auto_conn); + if (!params) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_FAILED, &cp->addr, sizeof(cp->addr)); goto unlock; - } else { - params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, - addr_type); - if (params) - current_flags = params->flags; } cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index c4063d200c0a..fdcc752c6f13 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -11,6 +11,12 @@ #include "mgmt_util.h" #include "mgmt_config.h" +#define HDEV_PARAM_U32(_param_name_) \ + struct {\ + struct mgmt_tlv_hdr entry; \ + __le32 value; \ + } __packed _param_name_ + #define HDEV_PARAM_U16(_param_name_) \ struct {\ struct mgmt_tlv_hdr entry; \ @@ -29,6 +35,12 @@ cpu_to_le16(hdev->_param_name_) \ } +#define TLV_SET_U32(_param_code_, _param_name_) \ + { \ + { cpu_to_le16(_param_code_), sizeof(__u32) }, \ + cpu_to_le32(hdev->_param_name_) \ + } + #define TLV_SET_U8(_param_code_, _param_name_) \ { \ { cpu_to_le16(_param_code_), sizeof(__u8) }, \ @@ -78,6 +90,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, HDEV_PARAM_U16(advmon_allowlist_duration); HDEV_PARAM_U16(advmon_no_filter_duration); HDEV_PARAM_U8(enable_advmon_interleave_scan); + HDEV_PARAM_U32(idle_timeout); } __packed rp = { TLV_SET_U16(0x0000, def_page_scan_type), TLV_SET_U16(0x0001, def_page_scan_int), @@ -111,6 +124,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, TLV_SET_U16(0x001d, advmon_allowlist_duration), TLV_SET_U16(0x001e, advmon_no_filter_duration), TLV_SET_U8(0x001f, enable_advmon_interleave_scan), + TLV_SET_U32(0x0020, idle_timeout), }; bt_dev_dbg(hdev, "sock %p", sk); @@ -122,6 +136,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, } #define TO_TLV(x) ((struct mgmt_tlv *)(x)) +#define TLV_GET_LE32(tlv) le32_to_cpu(*((__le32 *)(TO_TLV(tlv)->value))) #define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value))) #define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value))) @@ -191,6 +206,9 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, case 0x001f: exp_type_len = sizeof(u8); break; + case 0x0020: + exp_type_len = sizeof(u32); + break; default: exp_type_len = 0; bt_dev_warn(hdev, "unsupported parameter %u", type); @@ -314,6 +332,9 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, case 0x0001f: hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer); break; + case 0x00020: + hdev->idle_timeout = TLV_GET_LE32(buffer); + break; default: bt_dev_warn(hdev, "unsupported parameter %u", type); break; diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index aa7b5585cb26..4f19654d41a9 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -266,7 +266,7 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, { struct mgmt_pending_cmd *cmd; - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + cmd = kzalloc_obj(*cmd); if (!cmd) return NULL; @@ -413,7 +413,7 @@ struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev, { struct mgmt_mesh_tx *mesh_tx; - mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL); + mesh_tx = kzalloc_obj(*mesh_tx); if (!mesh_tx) return NULL; diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index c560d8467669..2f008167cbaa 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -276,7 +276,7 @@ static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, if (status) goto unlock; - handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL); + handle_data = kmalloc_obj(*handle_data); if (!handle_data) { status = HCI_ERROR_UNSPECIFIED; goto unlock; @@ -756,7 +756,7 @@ void msft_register(struct hci_dev *hdev) bt_dev_dbg(hdev, "Register MSFT extension"); - msft = kzalloc(sizeof(*msft), GFP_KERNEL); + msft = kzalloc_obj(*msft); if (!msft) { bt_dev_err(hdev, "Failed to register MSFT extension"); return; @@ -790,7 +790,7 @@ static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct monitored_device *dev; - dev = kmalloc(sizeof(*dev), GFP_KERNEL); + dev = kmalloc_obj(*dev); if (!dev) { bt_dev_err(hdev, "MSFT vendor event %u: no memory", MSFT_EV_LE_MONITOR_DEVICE); @@ -932,7 +932,7 @@ static struct msft_monitor_addr_filter_data *msft_add_address_filter struct msft_data *msft = hdev->msft_data; int err; - address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL); + address_filter = kzalloc_obj(*address_filter); if (!address_filter) return NULL; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 96250807b32b..611a9a94151e 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -302,7 +302,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d) struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio) { - struct rfcomm_dlc *d = kzalloc(sizeof(*d), prio); + struct rfcomm_dlc *d = kzalloc_obj(*d, prio); if (!d) return NULL; @@ -680,7 +680,7 @@ int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig) /* ---- RFCOMM sessions ---- */ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state) { - struct rfcomm_session *s = kzalloc(sizeof(*s), GFP_KERNEL); + struct rfcomm_session *s = kzalloc_obj(*s); if (!s) return NULL; @@ -781,7 +781,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, addr.l2_psm = 0; addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + *err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); if (*err < 0) goto failed; @@ -808,7 +808,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); + *err = kernel_connect(sock, (struct sockaddr_unsized *)&addr, sizeof(addr), O_NONBLOCK); if (*err == 0 || *err == -EINPROGRESS) return s; @@ -2068,7 +2068,7 @@ static int rfcomm_add_listener(bdaddr_t *ba) addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); if (err < 0) { BT_ERR("Bind failed %d", err); goto failed; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 913402806fa0..be6639cd6f59 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, return 0; } -static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int rfcomm_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_rc sa; struct sock *sk = sock->sk; @@ -371,7 +371,8 @@ done: return err; } -static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +static int rfcomm_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, + int alen, int flags) { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index b783526ab588..91bf5274262e 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -221,7 +221,7 @@ static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, struct list_head *head = &rfcomm_dev_list; int err = 0; - dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL); + dev = kzalloc_obj(struct rfcomm_dev); if (!dev) return ERR_PTR(-ENOMEM); @@ -510,7 +510,7 @@ static int rfcomm_get_dev_list(void __user *arg) if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di)) return -EINVAL; - dl = kzalloc(struct_size(dl, dev_info, dev_num), GFP_KERNEL); + dl = kzalloc_flex(*dl, dev_info, dev_num); if (!dl) return -ENOMEM; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 298c2a9ab4df..e7db50165879 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -204,7 +204,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) return conn; } - conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL); + conn = kzalloc_obj(struct sco_conn); if (!conn) return NULL; @@ -470,6 +470,7 @@ static void sco_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&sk->sk_error_queue); } static void sco_sock_cleanup_listen(struct sock *parent) @@ -605,7 +606,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, return 0; } -static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, +static int sco_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; @@ -639,7 +640,7 @@ done: return err; } -static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3a1ce04a7a53..485e3468bd26 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -374,7 +374,7 @@ static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16], static int smp_e(const u8 *k, u8 *r) { - struct crypto_aes_ctx ctx; + struct aes_enckey aes; uint8_t tmp[16], data[16]; int err; @@ -383,7 +383,7 @@ static int smp_e(const u8 *k, u8 *r) /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); - err = aes_expandkey(&ctx, tmp, 16); + err = aes_prepareenckey(&aes, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; @@ -392,14 +392,14 @@ static int smp_e(const u8 *k, u8 *r) /* Most significant octet of plaintextData corresponds to data[0] */ swap_buf(r, data, 16); - aes_encrypt(&ctx, data, data); + aes_encrypt(&aes, data, data); /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); SMP_DBG("r %16phN", r); - memzero_explicit(&ctx, sizeof(ctx)); + memzero_explicit(&aes, sizeof(aes)); return err; } @@ -1344,7 +1344,7 @@ static void smp_distribute_keys(struct smp_chan *smp) /* Generate a new random key */ get_random_bytes(sign.csrk, sizeof(sign.csrk)); - csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); + csrk = kzalloc_obj(*csrk); if (csrk) { if (hcon->sec_level > BT_SECURITY_MEDIUM) csrk->type = MGMT_CSRK_LOCAL_AUTHENTICATED; @@ -1388,7 +1388,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; - smp = kzalloc(sizeof(*smp), GFP_ATOMIC); + smp = kzalloc_obj(*smp, GFP_ATOMIC); if (!smp) return NULL; @@ -2664,7 +2664,7 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) skb_pull(skb, sizeof(*rp)); - csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); + csrk = kzalloc_obj(*csrk); if (csrk) { if (conn->hcon->sec_level > BT_SECURITY_MEDIUM) csrk->type = MGMT_CSRK_REMOTE_AUTHENTICATED; @@ -2743,7 +2743,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) && !crypto_memneq(key, smp->local_pk, 64)) { bt_dev_err(hdev, "Remote and local public keys are identical"); - return SMP_UNSPECIFIED; + return SMP_DHKEY_CHECK_FAILED; } memcpy(smp->remote_pk, key, 64); @@ -3293,7 +3293,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) goto create_chan; } - smp = kzalloc(sizeof(*smp), GFP_KERNEL); + smp = kzalloc_obj(*smp); if (!smp) return ERR_PTR(-ENOMEM); diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 812457819b5a..ae5a54c350b9 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -36,7 +36,7 @@ dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr) if (size_in != sizeof(u64) * nr) return ERR_PTR(-EINVAL); - args = kzalloc(sizeof(*args), GFP_KERNEL); + args = kzalloc_obj(*args); if (!args) return ERR_PTR(-ENOMEM); @@ -158,13 +158,13 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err) goto out; - tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); if (!tlinks) { err = -ENOMEM; goto out; } - link = kzalloc(sizeof(*link), GFP_USER); + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8b7d0b90fea7..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -436,7 +436,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, static int bpf_test_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, - struct skb_shared_info *sinfo, u32 size, + struct skb_shared_info *sinfo, u32 size, u32 frag_size, u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); @@ -453,7 +453,7 @@ static int bpf_test_finish(const union bpf_attr *kattr, } if (data_out) { - int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; + int len = sinfo ? copy_size - frag_size : copy_size; if (len < 0) { err = -ENOSPC; @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || @@ -899,6 +900,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* cb is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), + offsetof(struct __sk_buff, data_end))) + return -EINVAL; + + /* data_end is allowed, but not copied to skb */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, data_end), offsetof(struct __sk_buff, tstamp))) return -EINVAL; @@ -939,6 +946,11 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; + + /* Currently GSO type is zero/unset. If this gets extended with + * a small list of accepted GSO types in future, the filter for + * an unset GSO type in bpf_clone_redirect() can be lifted. + */ skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; @@ -973,46 +985,39 @@ static struct proto bpf_dummy_proto = { int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { - bool is_l2 = false, is_direct_pkt_access = false; + bool is_l2 = false, is_direct_pkt_access = false, is_lwt = false; + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; - u32 size = kattr->test.data_size_in; + u32 headroom = NET_SKB_PAD + NET_IP_ALIGN; + u32 linear_sz = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; + struct sk_buff *skb = NULL; + struct sock *sk = NULL; u32 retval, duration; int hh_len = ETH_HLEN; - struct sk_buff *skb; - struct sock *sk; - void *data; + void *data = NULL; int ret; if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; - if (size < ETH_HLEN) + if (kattr->test.data_size_in < ETH_HLEN) return -EINVAL; - data = bpf_test_init(kattr, kattr->test.data_size_in, - size, NET_SKB_PAD + NET_IP_ALIGN, - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); - if (IS_ERR(data)) - return PTR_ERR(data); - - ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); - if (IS_ERR(ctx)) { - kfree(data); - return PTR_ERR(ctx); - } - switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + is_direct_pkt_access = true; is_l2 = true; - fallthrough; + break; case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: + is_lwt = true; + fallthrough; case BPF_PROG_TYPE_CGROUP_SKB: is_direct_pkt_access = true; break; @@ -1020,25 +1025,88 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx) { + if (ctx->data_end > kattr->test.data_size_in || ctx->data || ctx->data_meta) { + ret = -EINVAL; + goto out; + } + if (ctx->data_end) { + /* Non-linear LWT test_run is unsupported for now. */ + if (is_lwt) { + ret = -EINVAL; + goto out; + } + linear_sz = max(ETH_HLEN, ctx->data_end); + } + } + + linear_sz = min_t(u32, linear_sz, PAGE_SIZE - headroom - tailroom); + + data = bpf_test_init(kattr, linear_sz, linear_sz, headroom, tailroom); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + data = NULL; + goto out; + } + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { - kfree(data); - kfree(ctx); - return -ENOMEM; + ret = -ENOMEM; + goto out; } sock_init_data(NULL, sk); skb = slab_build_skb(data); if (!skb) { - kfree(data); - kfree(ctx); - sk_free(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } skb->sk = sk; + data = NULL; /* data released via kfree_skb */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - __skb_put(skb, size); + __skb_put(skb, linear_sz); + + if (unlikely(kattr->test.data_size_in > linear_sz)) { + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + struct skb_shared_info *sinfo = skb_shinfo(skb); + u32 copied = linear_sz; + + while (copied < kattr->test.data_size_in) { + struct page *page; + u32 data_len; + + if (sinfo->nr_frags == MAX_SKB_FRAGS) { + ret = -ENOMEM; + goto out; + } + + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + + data_len = min_t(u32, kattr->test.data_size_in - copied, + PAGE_SIZE); + skb_fill_page_desc(skb, sinfo->nr_frags, page, 0, data_len); + + if (copy_from_user(page_address(page), data_in + copied, + data_len)) { + ret = -EFAULT; + goto out; + } + skb->data_len += data_len; + skb->truesize += PAGE_SIZE; + skb->len += data_len; + copied += data_len; + } + } if (ctx && ctx->ifindex > 1) { dev = dev_get_by_index(net, ctx->ifindex); @@ -1118,12 +1186,11 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, convert_skb_to___skb(skb, ctx); - size = skb->len; - /* bpf program can never convert linear skb to non-linear */ - if (WARN_ON_ONCE(skb_is_nonlinear(skb))) - size = skb_headlen(skb); - ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, - duration); + if (skb_is_nonlinear(skb)) + /* bpf program can never convert linear skb to non-linear */ + WARN_ON_ONCE(linear_sz == kattr->test.data_size_in); + ret = bpf_test_finish(kattr, uattr, skb->data, skb_shinfo(skb), skb->len, + skb->data_len, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); @@ -1131,7 +1198,9 @@ out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); - sk_free(sk); + kfree(data); + if (sk) + sk_free(sk); kfree(ctx); return ret; } @@ -1226,8 +1295,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, batch_size = NAPI_POLL_WEIGHT; else if (batch_size > TEST_XDP_MAX_BATCH) return -E2BIG; - - headroom += sizeof(struct xdp_page_head); } else if (batch_size) { return -EINVAL; } @@ -1240,16 +1307,26 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, /* There can't be user provided data before the meta data */ if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in || ctx->data > ctx->data_end || - unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; - /* Meta data is allocated from the headroom */ - headroom -= ctx->data; meta_sz = ctx->data; + if (xdp_metalen_invalid(meta_sz) || meta_sz > headroom - sizeof(struct xdp_frame)) + goto free_ctx; + + /* Meta data is allocated from the headroom */ + headroom -= meta_sz; linear_sz = ctx->data_end; } + /* The xdp_page_head structure takes up space in each page, limiting the + * size of the packet data; add the extra size to headroom here to make + * sure it's accounted in the length checks below, but not in the + * metadata size check above. + */ + if (do_live) + headroom += sizeof(struct xdp_page_head); + max_linear_sz = PAGE_SIZE - headroom - tailroom; linear_sz = min_t(u32, linear_sz, max_linear_sz); @@ -1287,13 +1364,13 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (sinfo->nr_frags == MAX_SKB_FRAGS) { ret = -ENOMEM; - goto out; + goto out_put_dev; } page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; - goto out; + goto out_put_dev; } frag = &sinfo->frags[sinfo->nr_frags++]; @@ -1305,7 +1382,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (copy_from_user(page_address(page), data_in + size, data_len)) { ret = -EFAULT; - goto out; + goto out_put_dev; } sinfo->xdp_frags_size += data_len; size += data_len; @@ -1320,6 +1397,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration); else ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); +out_put_dev: /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented * even if the test run failed. @@ -1329,7 +1407,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, goto out; size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; - ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, sinfo->xdp_frags_size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, @@ -1420,7 +1498,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, - sizeof(flow_keys), retval, duration); + sizeof(flow_keys), 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); @@ -1521,7 +1599,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } - ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); + ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); @@ -1721,7 +1799,7 @@ int bpf_prog_test_run_nf(struct bpf_prog *prog, if (ret) goto out; - ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); + ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration); out: kfree(user_ctx); diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c index c2c1c7d44c61..118c7ea48c35 100644 --- a/net/bridge/br_cfm.c +++ b/net/bridge/br_cfm.c @@ -547,7 +547,7 @@ int br_cfm_mep_create(struct net_bridge *br, } } - mep = kzalloc(sizeof(*mep), GFP_KERNEL); + mep = kzalloc_obj(*mep); if (!mep) return -ENOMEM; @@ -576,7 +576,7 @@ static void mep_delete_implementation(struct net_bridge *br, /* Empty and free peer MEP list */ hlist_for_each_entry_safe(peer_mep, n_store, &mep->peer_mep_list, head) { - cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); + disable_delayed_work_sync(&peer_mep->ccm_rx_dwork); hlist_del_rcu(&peer_mep->head); kfree_rcu(peer_mep, rcu); } @@ -693,7 +693,7 @@ int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance, return -EEXIST; } - peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL); + peer_mep = kzalloc_obj(*peer_mep); if (!peer_mep) return -ENOMEM; @@ -732,7 +732,7 @@ int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance, return -ENOENT; } - cc_peer_disable(peer_mep); + disable_delayed_work_sync(&peer_mep->ccm_rx_dwork); hlist_del_rcu(&peer_mep->head); kfree_rcu(peer_mep, rcu); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index a818fdc22da9..f7502e62dd35 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -74,7 +74,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { br_do_proxy_suppress_arp(skb, br, vid, NULL); - } else if (IS_ENABLED(CONFIG_IPV6) && + } else if (ipv6_mod_enabled() && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + @@ -308,7 +308,7 @@ static int __br_netpoll_enable(struct net_bridge_port *p) struct netpoll *np; int err; - np = kzalloc(sizeof(*p->np), GFP_KERNEL); + np = kzalloc_obj(*p->np); if (!np) return -ENOMEM; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 58d22e2b85fc..0501ffcb8a3d 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -70,7 +70,7 @@ static inline int has_expired(const struct net_bridge *br, { return !test_bit(BR_FDB_STATIC, &fdb->flags) && !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && - time_before_eq(fdb->updated + hold_time(br), jiffies); + time_before_eq(READ_ONCE(fdb->updated) + hold_time(br), jiffies); } static int fdb_to_nud(const struct net_bridge *br, @@ -126,9 +126,9 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags)) goto nla_put_failure; - ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); ci.ndm_confirmed = 0; - ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_updated = jiffies_to_clock_t(now - READ_ONCE(fdb->updated)); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; @@ -551,7 +551,7 @@ void br_fdb_cleanup(struct work_struct *work) */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - unsigned long this_timer = f->updated + delay; + unsigned long this_timer = READ_ONCE(f->updated) + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { @@ -924,6 +924,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, { struct net_bridge_fdb_entry *f; struct __fdb_entry *fe = buf; + unsigned long delta; int num = 0; memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); @@ -953,8 +954,11 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, fe->port_hi = f->dst->port_no >> 8; fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); - if (!test_bit(BR_FDB_STATIC, &f->flags)) - fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); + if (!test_bit(BR_FDB_STATIC, &f->flags)) { + delta = jiffies - READ_ONCE(f->updated); + fe->ageing_timer_value = + jiffies_delta_to_clock_t(delta); + } ++fe; ++num; } @@ -1002,8 +1006,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; bool fdb_modified = false; - if (now != fdb->updated) { - fdb->updated = now; + if (now != READ_ONCE(fdb->updated)) { + WRITE_ONCE(fdb->updated, now); fdb_modified = __fdb_mark_active(fdb); } @@ -1242,10 +1246,10 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (fdb_handle_notify(fdb, notify)) modified = true; - fdb->used = jiffies; + WRITE_ONCE(fdb->used, jiffies); if (modified) { if (refresh) - fdb->updated = jiffies; + WRITE_ONCE(fdb->updated, jiffies); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } @@ -1556,7 +1560,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, goto err_unlock; } - fdb->updated = jiffies; + WRITE_ONCE(fdb->updated, jiffies); if (READ_ONCE(fdb->dst) != p) { WRITE_ONCE(fdb->dst, p); @@ -1565,7 +1569,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (test_and_set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ - fdb->used = jiffies; + WRITE_ONCE(fdb->used, jiffies); } else { modified = true; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ca3a637d7cca..d39571e13744 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -429,7 +429,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, if (index < 0) return ERR_PTR(index); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc_obj(*p); if (p == NULL) return ERR_PTR(-ENOMEM); @@ -526,20 +526,6 @@ void br_mtu_auto_adjust(struct net_bridge *br) br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false); } -static void br_set_gso_limits(struct net_bridge *br) -{ - unsigned int tso_max_size = TSO_MAX_SIZE; - const struct net_bridge_port *p; - u16 tso_max_segs = TSO_MAX_SEGS; - - list_for_each_entry(p, &br->port_list, list) { - tso_max_size = min(tso_max_size, p->dev->tso_max_size); - tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs); - } - netif_set_tso_max_size(br->dev, tso_max_size); - netif_set_tso_max_segs(br->dev, tso_max_segs); -} - /* * Recomputes features using slave's features */ @@ -653,8 +639,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n"); } - netdev_update_features(br->dev); - br_hr = br->dev->needed_headroom; dev_hr = netdev_get_fwd_headroom(dev); if (br_hr < dev_hr) @@ -695,7 +679,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); br_mtu_auto_adjust(br); - br_set_gso_limits(br); + + netdev_compute_master_upper_features(br->dev, false); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -741,7 +726,6 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) del_nbp(p); br_mtu_auto_adjust(br); - br_set_gso_limits(br); spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); @@ -750,7 +734,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - netdev_update_features(br->dev); + netdev_compute_master_upper_features(br->dev, false); return 0; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 777fa869c1a1..2cbae0f9ae1f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -170,7 +170,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb (skb->protocol == htons(ETH_P_ARP) || skb->protocol == htons(ETH_P_RARP))) { br_do_proxy_suppress_arp(skb, br, vid, p); - } else if (IS_ENABLED(CONFIG_IPV6) && + } else if (ipv6_mod_enabled() && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + @@ -221,8 +221,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (test_bit(BR_FDB_LOCAL, &dst->flags)) return br_pass_frame_up(skb, false); - if (now != dst->used) - dst->used = now; + if (now != READ_ONCE(dst->used)) + WRITE_ONCE(dst->used, now); br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) @@ -274,7 +274,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) int ret; net = dev_net(skb->dev); -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) goto frame_finish; #endif diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 6bc0a11f2ed3..766c43b327af 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -204,7 +204,7 @@ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, if (num > BR_MAX_PORTS) num = BR_MAX_PORTS; - indices = kcalloc(num, sizeof(int), GFP_KERNEL); + indices = kzalloc_objs(int, num); if (indices == NULL) return -ENOMEM; @@ -357,7 +357,7 @@ static int old_deviceless(struct net *net, void __user *data) if (args[2] >= 2048) return -ENOMEM; - indices = kcalloc(args[2], sizeof(int), GFP_KERNEL); + indices = kzalloc_objs(int, args[2]); if (indices == NULL) return -ENOMEM; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 400eb872b403..e0c7020b12f5 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1133,8 +1133,8 @@ static int br_mdb_config_src_list_init(struct nlattr *src_list, return -EINVAL; } - cfg->src_entries = kcalloc(cfg->num_src_entries, - sizeof(struct br_mdb_src_entry), GFP_KERNEL); + cfg->src_entries = kzalloc_objs(struct br_mdb_src_entry, + cfg->num_src_entries); if (!cfg->src_entries) return -ENOMEM; diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 3c36fa24bc05..f1aa67f7a051 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -516,7 +516,7 @@ int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) !br_mrp_unique_ifindex(br, instance->s_ifindex)) return -EINVAL; - mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); + mrp = kzalloc_obj(*mrp); if (!mrp) return -ENOMEM; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 22d12e545966..881d866d687a 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -244,14 +244,11 @@ br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid) lockdep_assert_held_once(&port->br->multicast_lock); - if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) - return NULL; - /* Take RCU to access the vlan. */ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid); - if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) + if (vlan) pmctx = &vlan->port_mcast_ctx; rcu_read_unlock(); @@ -701,7 +698,10 @@ br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx, u32 max = READ_ONCE(pmctx->mdb_max_entries); u32 n = READ_ONCE(pmctx->mdb_n_entries); - if (max && n >= max) { + /* enforce the max limit when it's a port pmctx or a port-vlan pmctx + * with snooping enabled + */ + if (!br_multicast_port_ctx_vlan_disabled(pmctx) && max && n >= max) { NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u", what, n, max); return -E2BIG; @@ -736,9 +736,7 @@ static int br_multicast_port_ngroups_inc(struct net_bridge_port *port, return err; } - /* Only count on the VLAN context if VID is given, and if snooping on - * that VLAN is enabled. - */ + /* Only count on the VLAN context if VID is given */ if (!group->vid) return 0; @@ -1292,7 +1290,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, return ERR_PTR(-E2BIG); } - mp = kzalloc(sizeof(*mp), GFP_ATOMIC); + mp = kzalloc_obj(*mp, GFP_ATOMIC); if (unlikely(!mp)) return ERR_PTR(-ENOMEM); @@ -1383,7 +1381,7 @@ br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_i #endif } - grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC); + grp_src = kzalloc_obj(*grp_src, GFP_ATOMIC); if (unlikely(!grp_src)) return NULL; @@ -1416,7 +1414,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( if (err) return NULL; - p = kzalloc(sizeof(*p), GFP_ATOMIC); + p = kzalloc_obj(*p, GFP_ATOMIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); goto dec_out; @@ -2011,6 +2009,18 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, timer_setup(&pmctx->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif + /* initialize mdb_n_entries if a new port vlan is being created */ + if (vlan) { + struct net_bridge_port_group *pg; + u32 n = 0; + + spin_lock_bh(&port->br->multicast_lock); + hlist_for_each_entry(pg, &port->mglist, mglist) + if (pg->key.addr.vid == vlan->vid) + n++; + WRITE_ONCE(pmctx->mdb_n_entries, n); + spin_unlock_bh(&port->br->multicast_lock); + } } void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) @@ -2094,25 +2104,6 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) br_ip4_multicast_add_router(brmctx, pmctx); br_ip6_multicast_add_router(brmctx, pmctx); } - - if (br_multicast_port_ctx_is_vlan(pmctx)) { - struct net_bridge_port_group *pg; - u32 n = 0; - - /* The mcast_n_groups counter might be wrong. First, - * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries - * are flushed, thus mcast_n_groups after the toggle does not - * reflect the true values. And second, permanent entries added - * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected - * either. Thus we have to refresh the counter. - */ - - hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) { - if (pg->key.addr.vid == pmctx->vlan->vid) - n++; - } - WRITE_ONCE(pmctx->mdb_n_entries, n); - } } static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) @@ -4649,6 +4640,14 @@ static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, rcu_read_unlock(); } +static void br_multicast_del_grps(struct net_bridge *br) +{ + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) + __br_multicast_disable_port_ctx(&port->multicast_ctx); +} + int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { @@ -4669,6 +4668,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; + br_multicast_del_grps(br); goto unlock; } @@ -4891,7 +4891,7 @@ int br_multicast_list_adjacent(struct net_device *dev, continue; hlist_for_each_entry_rcu(group, &port->mglist, mglist) { - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + entry = kmalloc_obj(*entry, GFP_ATOMIC); if (!entry) goto unlock; @@ -5192,7 +5192,7 @@ void br_multicast_get_stats(const struct net_bridge *br, do { start = u64_stats_fetch_begin(&cpu_stats->syncp); - memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); + u64_stats_copy(&temp, &cpu_stats->mstats, sizeof(temp)); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index adfd74102019..18eaf9c0dada 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -266,7 +266,7 @@ __eht_lookup_create_host(struct net_bridge_port_group *pg, if (br_multicast_eht_hosts_over_limit(pg)) return NULL; - eht_host = kzalloc(sizeof(*eht_host), GFP_ATOMIC); + eht_host = kzalloc_obj(*eht_host, GFP_ATOMIC); if (!eht_host) return NULL; @@ -313,7 +313,7 @@ __eht_lookup_create_set_entry(struct net_bridge *br, if (!allow_zero_src && eht_host->num_entries >= PG_SRC_ENT_LIMIT) return NULL; - set_h = kzalloc(sizeof(*set_h), GFP_ATOMIC); + set_h = kzalloc_obj(*set_h, GFP_ATOMIC); if (!set_h) return NULL; @@ -360,7 +360,7 @@ __eht_lookup_create_set(struct net_bridge_port_group *pg, return this; } - eht_set = kzalloc(sizeof(*eht_set), GFP_ATOMIC); + eht_set = kzalloc_obj(*eht_set, GFP_ATOMIC); if (!eht_set) return NULL; diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index e0421eaa3abc..76ce70b4e7f3 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -58,7 +58,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) if (hdr->version != 6) goto inhdr_error; - pkt_len = ntohs(hdr->payload_len); + pkt_len = ipv6_payload_len(skb, hdr); if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len)) goto drop; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4e2d53b27221..0264730938f4 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -467,7 +467,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, else br = netdev_priv(dev); - br_debug(br, "br_fill_info event %d port %s master %s\n", + br_debug(br, "br_fill_ifinfo event %d port %s master %s\n", event, dev->name, br->dev->name); nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7280c4e9305f..9b55d38ea9ed 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -247,6 +247,7 @@ struct net_bridge_vlan { * struct net_bridge_vlan_group * * @vlan_hash: VLAN entry rhashtable + * @tunnel_hash: Hash table to map from tunnel key ID (e.g. VXLAN VNI) to VLAN * @vlan_list: sorted VLAN entry list * @num_vlans: number of total VLAN entries * @pvid: PVID VLAN id @@ -1344,6 +1345,16 @@ br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, } static inline bool +br_multicast_port_ctx_options_equal(const struct net_bridge_mcast_port *pmctx1, + const struct net_bridge_mcast_port *pmctx2) +{ + return br_multicast_ngroups_get(pmctx1) == + br_multicast_ngroups_get(pmctx2) && + br_multicast_ngroups_get_max(pmctx1) == + br_multicast_ngroups_get_max(pmctx2); +} + +static inline bool br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) { bool vlan_snooping_enabled; diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index c20a41bf253b..cc4b27ff1b08 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -344,8 +344,8 @@ int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost) ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) { - return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", - id->prio[0], id->prio[1], - id->addr[0], id->addr[1], id->addr[2], - id->addr[3], id->addr[4], id->addr[5]); + return sysfs_emit(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", + id->prio[0], id->prio[1], + id->addr[0], id->addr[1], id->addr[2], + id->addr[3], id->addr[4], id->addr[5]); } diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index fe3f7bbe86ee..4fac002922d2 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -661,7 +661,7 @@ void br_switchdev_mdb_notify(struct net_device *dev, mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: - complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); + complete_info = kmalloc_obj(*complete_info, GFP_ATOMIC); if (!complete_info) break; complete_info->port = pg->key.port; diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index cb4855ed9500..d2993f187f57 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/hex.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> @@ -67,7 +68,7 @@ static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); + return sysfs_emit(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } static int set_forward_delay(struct net_bridge *br, unsigned long val, @@ -87,8 +88,8 @@ static DEVICE_ATTR_RW(forward_delay); static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(to_bridge(d)->hello_time)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->hello_time)); } static int set_hello_time(struct net_bridge *br, unsigned long val, @@ -108,8 +109,8 @@ static DEVICE_ATTR_RW(hello_time); static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(to_bridge(d)->max_age)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->max_age)); } static int set_max_age(struct net_bridge *br, unsigned long val, @@ -129,7 +130,7 @@ static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); + return sysfs_emit(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } static int set_ageing_time(struct net_bridge *br, unsigned long val, @@ -150,7 +151,7 @@ static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->stp_enabled); + return sysfs_emit(buf, "%d\n", br->stp_enabled); } @@ -173,7 +174,7 @@ static ssize_t group_fwd_mask_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%#x\n", br->group_fwd_mask); + return sysfs_emit(buf, "%#x\n", br->group_fwd_mask); } static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, @@ -200,8 +201,8 @@ static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", - (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); + return sysfs_emit(buf, "%d\n", + (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } static int set_priority(struct net_bridge *br, unsigned long val, @@ -235,21 +236,21 @@ static DEVICE_ATTR_RO(bridge_id); static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", to_bridge(d)->root_port); + return sysfs_emit(buf, "%d\n", to_bridge(d)->root_port); } static DEVICE_ATTR_RO(root_port); static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); + return sysfs_emit(buf, "%d\n", to_bridge(d)->root_path_cost); } static DEVICE_ATTR_RO(root_path_cost); static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", to_bridge(d)->topology_change); + return sysfs_emit(buf, "%d\n", to_bridge(d)->topology_change); } static DEVICE_ATTR_RO(topology_change); @@ -258,7 +259,7 @@ static ssize_t topology_change_detected_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->topology_change_detected); + return sysfs_emit(buf, "%d\n", br->topology_change_detected); } static DEVICE_ATTR_RO(topology_change_detected); @@ -266,7 +267,7 @@ static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&br->hello_timer)); } static DEVICE_ATTR_RO(hello_timer); @@ -274,7 +275,7 @@ static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } static DEVICE_ATTR_RO(tcn_timer); @@ -283,7 +284,7 @@ static ssize_t topology_change_timer_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); } static DEVICE_ATTR_RO(topology_change_timer); @@ -291,7 +292,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); @@ -299,7 +300,7 @@ static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%pM\n", br->group_addr); + return sysfs_emit(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, @@ -365,7 +366,7 @@ static ssize_t no_linklocal_learn_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); + return sysfs_emit(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, @@ -387,7 +388,7 @@ static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); + return sysfs_emit(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, @@ -409,7 +410,7 @@ static ssize_t multicast_snooping_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); + return sysfs_emit(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, @@ -425,8 +426,8 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", - br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); + return sysfs_emit(buf, "%d\n", + br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, @@ -450,7 +451,7 @@ static ssize_t multicast_querier_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); + return sysfs_emit(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, @@ -470,7 +471,7 @@ static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", RHT_ELASTICITY); + return sysfs_emit(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val, @@ -494,7 +495,7 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->hash_max); + return sysfs_emit(buf, "%u\n", br->hash_max); } static int set_hash_max(struct net_bridge *br, unsigned long val, @@ -517,7 +518,7 @@ static ssize_t multicast_igmp_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, @@ -539,7 +540,7 @@ static ssize_t multicast_last_member_count_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, @@ -561,7 +562,7 @@ static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, @@ -583,8 +584,8 @@ static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, @@ -606,8 +607,8 @@ static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, @@ -630,8 +631,8 @@ static ssize_t multicast_querier_interval_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, @@ -654,8 +655,8 @@ static ssize_t multicast_query_interval_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, @@ -677,9 +678,8 @@ static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf( - buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, @@ -701,9 +701,8 @@ static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf( - buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, @@ -727,8 +726,8 @@ static ssize_t multicast_stats_enabled_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", - br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); + return sysfs_emit(buf, "%d\n", + br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val, @@ -754,7 +753,7 @@ static ssize_t multicast_mld_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, @@ -777,7 +776,7 @@ static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, @@ -799,7 +798,7 @@ static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, @@ -821,7 +820,7 @@ static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, @@ -845,7 +844,7 @@ static ssize_t vlan_filtering_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); + return sysfs_emit(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, @@ -861,7 +860,7 @@ static ssize_t vlan_protocol_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); + return sysfs_emit(buf, "%#06x\n", ntohs(br->vlan_proto)); } static ssize_t vlan_protocol_store(struct device *d, @@ -877,7 +876,7 @@ static ssize_t default_pvid_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->default_pvid); + return sysfs_emit(buf, "%d\n", br->default_pvid); } static ssize_t default_pvid_store(struct device *d, @@ -893,7 +892,7 @@ static ssize_t vlan_stats_enabled_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, @@ -915,7 +914,7 @@ static ssize_t vlan_stats_per_port_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 74fdd8105dca..1f57c36a7fc0 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -47,7 +47,7 @@ const struct brport_attribute brport_attr_##_name = { \ #define BRPORT_ATTR_FLAG(_name, _mask) \ static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \ { \ - return sprintf(buf, "%d\n", !!(p->flags & _mask)); \ + return sysfs_emit(buf, "%d\n", !!(p->flags & _mask)); \ } \ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ @@ -83,7 +83,7 @@ static int store_flag(struct net_bridge_port *p, unsigned long v, static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->path_cost); + return sysfs_emit(buf, "%d\n", p->path_cost); } static BRPORT_ATTR(path_cost, 0644, @@ -91,7 +91,7 @@ static BRPORT_ATTR(path_cost, 0644, static ssize_t show_priority(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->priority); + return sysfs_emit(buf, "%d\n", p->priority); } static BRPORT_ATTR(priority, 0644, @@ -111,65 +111,65 @@ static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL); static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->designated_port); + return sysfs_emit(buf, "%d\n", p->designated_port); } static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL); static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->designated_cost); + return sysfs_emit(buf, "%d\n", p->designated_cost); } static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL); static ssize_t show_port_id(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "0x%x\n", p->port_id); + return sysfs_emit(buf, "0x%x\n", p->port_id); } static BRPORT_ATTR(port_id, 0444, show_port_id, NULL); static ssize_t show_port_no(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "0x%x\n", p->port_no); + return sysfs_emit(buf, "0x%x\n", p->port_no); } static BRPORT_ATTR(port_no, 0444, show_port_no, NULL); static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->topology_change_ack); + return sysfs_emit(buf, "%d\n", p->topology_change_ack); } static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL); static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->config_pending); + return sysfs_emit(buf, "%d\n", p->config_pending); } static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL); static ssize_t show_port_state(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->state); + return sysfs_emit(buf, "%d\n", p->state); } static BRPORT_ATTR(state, 0444, show_port_state, NULL); static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&p->message_age_timer)); } static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL); static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); } static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL); static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&p->hold_timer)); } static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL); @@ -182,7 +182,7 @@ static BRPORT_ATTR(flush, 0200, NULL, store_flush); static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%#x\n", p->group_fwd_mask); + return sysfs_emit(buf, "%#x\n", p->group_fwd_mask); } static int store_group_fwd_mask(struct net_bridge_port *p, @@ -205,7 +205,7 @@ static ssize_t show_backup_port(struct net_bridge_port *p, char *buf) rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) - ret = sprintf(buf, "%s\n", backup_p->dev->name); + ret = sysfs_emit(buf, "%s\n", backup_p->dev->name); rcu_read_unlock(); return ret; @@ -244,7 +244,7 @@ BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); + return sysfs_emit(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index ce72b837ff8e..326933b455b3 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -787,7 +787,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, return br_vlan_add_existing(br, vg, vlan, flags, changed, extack); - vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + vlan = kzalloc_obj(*vlan); if (!vlan) return -ENOMEM; @@ -1224,7 +1224,7 @@ int br_vlan_init(struct net_bridge *br) struct net_bridge_vlan_group *vg; int ret = -ENOMEM; - vg = kzalloc(sizeof(*vg), GFP_KERNEL); + vg = kzalloc_obj(*vg); if (!vg) goto out; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); @@ -1260,7 +1260,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) struct net_bridge_vlan_group *vg; int ret = -ENOMEM; - vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); + vg = kzalloc_obj(struct net_bridge_vlan_group); if (!vg) goto out; @@ -1334,7 +1334,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, return 0; } - vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + vlan = kzalloc_obj(*vlan); if (!vlan) return -ENOMEM; diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index 8fa89b04ee94..5514e1fc8d1f 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -43,9 +43,29 @@ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, u8 range_mc_rtr = br_vlan_multicast_router(range_end); u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); - return v_curr->state == range_end->state && - __vlan_tun_can_enter_range(v_curr, range_end) && - curr_mc_rtr == range_mc_rtr; + if (v_curr->state != range_end->state) + return false; + + if (!__vlan_tun_can_enter_range(v_curr, range_end)) + return false; + + if (curr_mc_rtr != range_mc_rtr) + return false; + + /* Check user-visible priv_flags that affect output */ + if ((v_curr->priv_flags ^ range_end->priv_flags) & + (BR_VLFLAG_NEIGH_SUPPRESS_ENABLED | BR_VLFLAG_MCAST_ENABLED)) + return false; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (!br_vlan_is_master(v_curr) && + !br_multicast_port_ctx_vlan_disabled(&v_curr->port_mcast_ctx) && + !br_multicast_port_ctx_options_equal(&v_curr->port_mcast_ctx, + &range_end->port_mcast_ctx)) + return false; +#endif + + return true; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index a966a6ec8263..257cae9f1569 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -189,7 +189,6 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tunnel_dst; __be64 tunnel_id; - int err; if (!vlan) return 0; @@ -199,9 +198,13 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, return 0; skb_dst_drop(skb); - err = skb_vlan_pop(skb); - if (err) - return err; + /* For 802.1ad (QinQ), skb_vlan_pop() incorrectly moves the C-VLAN + * from payload to hwaccel after clearing S-VLAN. We only need to + * clear the hwaccel S-VLAN; the C-VLAN must stay in payload for + * correct VXLAN encapsulation. This is also correct for 802.1Q + * where no C-VLAN exists in payload. + */ + __vlan_hwaccel_clear_tag(skb); if (BR_INPUT_SKB_CB(skb)->backup_nhid) { __set_bit(IP_TUNNEL_KEY_BIT, flags); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5697e3949a36..aea3e19875c6 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1299,11 +1299,11 @@ int ebt_register_template(const struct ebt_table *t, int (*table_init)(struct ne list_for_each_entry(tmpl, &template_tables, list) { if (WARN_ON_ONCE(strcmp(t->name, tmpl->name) == 0)) { mutex_unlock(&ebt_mutex); - return -EEXIST; + return -EBUSY; } } - tmpl = kzalloc(sizeof(*tmpl), GFP_KERNEL); + tmpl = kzalloc_obj(*tmpl); if (!tmpl) { mutex_unlock(&ebt_mutex); return -ENOMEM; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 6482de4d8750..58a33d0380b0 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -16,8 +16,7 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_bridge.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <linux/netfilter_ipv4.h> #include "../br_private.h" @@ -230,7 +229,7 @@ static int nf_ct_br_ipv6_check(const struct sk_buff *skb) if (hdr->version != 6) return -1; - len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff; + len = ipv6_payload_len(skb, hdr) + sizeof(struct ipv6hdr) + nhoff; if (skb->len < len) return -1; @@ -270,7 +269,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return NF_ACCEPT; - len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + len = sizeof(struct ipv6hdr) + skb_ipv6_payload_len(skb); if (pskb_trim_rcsum(skb, len)) return NF_ACCEPT; diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 24e85c5487ef..922de3d611c0 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -94,7 +94,7 @@ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) { struct caif_device_entry *caifd; - caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); + caifd = kzalloc_obj(*caifd); if (!caifd) return NULL; caifd->pcpu_refcnt = alloc_percpu(int); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 039dfbd367c9..af218742af5a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -734,7 +734,7 @@ bad_sol: * o sock->state: holds the SS_* socket state and is updated by connect and * disconnect. */ -static int caif_connect(struct socket *sock, struct sockaddr *uaddr, +static int caif_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 5dc05a1e3178..4d44960d4c2f 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -85,7 +85,7 @@ static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, static struct cflayer *cfusbl_create(int phyid, const u8 ethaddr[ETH_ALEN], u8 braddr[ETH_ALEN]) { - struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC); + struct cfusbl *this = kmalloc_obj(struct cfusbl, GFP_ATOMIC); if (!this) return NULL; diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 52509e185960..8a80914783e8 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -77,7 +77,7 @@ struct cfcnfg *cfcnfg_create(void) might_sleep(); /* Initiate this layer */ - this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); + this = kzalloc_obj(struct cfcnfg, GFP_ATOMIC); if (!this) return NULL; this->mux = cfmuxl_create(); @@ -477,7 +477,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, goto out; got_phyid: - phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); + phyinfo = kzalloc_obj(struct cfcnfg_phyinfo, GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; goto out; diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 2aa1e7d46eb2..c6cc2bfed65d 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -36,7 +36,7 @@ struct cflayer *cfctrl_create(void) { struct dev_info dev_info; struct cfctrl *this = - kzalloc(sizeof(struct cfctrl), GFP_ATOMIC); + kzalloc_obj(struct cfctrl, GFP_ATOMIC); if (!this) return NULL; caif_assert(offsetof(struct cfctrl, serv.layer) == 0); @@ -270,7 +270,7 @@ int cfctrl_linkup_request(struct cflayer *layer, cfpkt_destroy(pkt); return -EINVAL; } - req = kzalloc(sizeof(*req), GFP_KERNEL); + req = kzalloc_obj(*req); if (!req) { cfpkt_destroy(pkt); return -ENOMEM; diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c index 77f428428b47..57ad3f82e004 100644 --- a/net/caif/cfdbgl.c +++ b/net/caif/cfdbgl.c @@ -19,7 +19,7 @@ static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt); struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info) { - struct cfsrvl *dbg = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + struct cfsrvl *dbg = kzalloc_obj(struct cfsrvl, GFP_ATOMIC); if (!dbg) return NULL; caif_assert(offsetof(struct cfsrvl, layer) == 0); diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c index eb6f8ef47a79..c451ddd155a7 100644 --- a/net/caif/cfdgml.c +++ b/net/caif/cfdgml.c @@ -26,7 +26,7 @@ static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt); struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info) { - struct cfsrvl *dgm = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + struct cfsrvl *dgm = kzalloc_obj(struct cfsrvl, GFP_ATOMIC); if (!dgm) return NULL; caif_assert(offsetof(struct cfsrvl, layer) == 0); diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c index 6651a8dc62e0..0f4979d89fcb 100644 --- a/net/caif/cffrml.c +++ b/net/caif/cffrml.c @@ -34,7 +34,7 @@ static u32 cffrml_rcv_error; static u32 cffrml_rcv_checsum_error; struct cflayer *cffrml_create(u16 phyid, bool use_fcs) { - struct cffrml *this = kzalloc(sizeof(struct cffrml), GFP_ATOMIC); + struct cffrml *this = kzalloc_obj(struct cffrml, GFP_ATOMIC); if (!this) return NULL; this->pcpu_refcnt = alloc_percpu(int); @@ -92,8 +92,15 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) len = le16_to_cpu(tmp); /* Subtract for FCS on length if FCS is not used. */ - if (!this->dofcs) + if (!this->dofcs) { + if (len < 2) { + ++cffrml_rcv_error; + pr_err("Invalid frame length (%d)\n", len); + cfpkt_destroy(pkt); + return -EPROTO; + } len -= 2; + } if (cfpkt_setlen(pkt, len) < 0) { ++cffrml_rcv_error; diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index 4172b0d0db63..77a1f31639b7 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -47,7 +47,7 @@ static struct cflayer *get_up(struct cfmuxl *muxl, u16 id); struct cflayer *cfmuxl_create(void) { - struct cfmuxl *this = kzalloc(sizeof(struct cfmuxl), GFP_ATOMIC); + struct cfmuxl *this = kzalloc_obj(struct cfmuxl, GFP_ATOMIC); if (!this) return NULL; diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index 3c335057f255..93732ebbd1e2 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -46,7 +46,7 @@ struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info, int mtu_size) { int tmp; - struct cfrfml *this = kzalloc(sizeof(struct cfrfml), GFP_ATOMIC); + struct cfrfml *this = kzalloc_obj(struct cfrfml, GFP_ATOMIC); if (!this) return NULL; diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c index aee11c74d3c8..faf78fb754e2 100644 --- a/net/caif/cfserl.c +++ b/net/caif/cfserl.c @@ -38,7 +38,7 @@ void cfserl_release(struct cflayer *layer) struct cflayer *cfserl_create(int instance, bool use_stx) { - struct cfserl *this = kzalloc(sizeof(struct cfserl), GFP_ATOMIC); + struct cfserl *this = kzalloc_obj(struct cfserl, GFP_ATOMIC); if (!this) return NULL; caif_assert(offsetof(struct cfserl, layer) == 0); diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c index b2e47ede912f..5111090bb2c0 100644 --- a/net/caif/cfutill.c +++ b/net/caif/cfutill.c @@ -26,7 +26,7 @@ static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt); struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info) { - struct cfsrvl *util = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + struct cfsrvl *util = kzalloc_obj(struct cfsrvl, GFP_ATOMIC); if (!util) return NULL; caif_assert(offsetof(struct cfsrvl, layer) == 0); diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c index db2274b94a5d..53f844c49bbb 100644 --- a/net/caif/cfveil.c +++ b/net/caif/cfveil.c @@ -25,7 +25,7 @@ static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt); struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info) { - struct cfsrvl *vei = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + struct cfsrvl *vei = kzalloc_obj(struct cfsrvl, GFP_ATOMIC); if (!vei) return NULL; caif_assert(offsetof(struct cfsrvl, layer) == 0); diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c index 134bad43196c..39e075b0a259 100644 --- a/net/caif/cfvidl.c +++ b/net/caif/cfvidl.c @@ -21,7 +21,7 @@ static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt); struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info) { - struct cfsrvl *vid = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC); + struct cfsrvl *vid = kzalloc_obj(struct cfsrvl, GFP_ATOMIC); if (!vid) return NULL; caif_assert(offsetof(struct cfsrvl, layer) == 0); diff --git a/net/can/Kconfig b/net/can/Kconfig index af64a6f76458..abbb4be7ad21 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -5,6 +5,7 @@ menuconfig CAN tristate "CAN bus subsystem support" + select SKB_EXTENSIONS help Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial communications protocol. Development of the CAN bus started in diff --git a/net/can/af_can.c b/net/can/af_can.c index 770173d8db42..f70e2ba0aadc 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -641,6 +641,16 @@ static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buf return matches; } +void can_set_skb_uid(struct sk_buff *skb) +{ + /* create non-zero unique skb identifier together with *skb */ + while (!(skb->hash)) + skb->hash = atomic_inc_return(&skbcounter); + + skb->sw_hash = 1; +} +EXPORT_SYMBOL(can_set_skb_uid); + static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct can_dev_rcv_lists *dev_rcv_lists; @@ -652,9 +662,7 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) atomic_long_inc(&pkg_stats->rx_frames); atomic_long_inc(&pkg_stats->rx_frames_delta); - /* create non-zero unique skb identifier together with *skb */ - while (!(can_skb_prv(skb)->skbcnt)) - can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); + can_set_skb_uid(skb); rcu_read_lock(); @@ -679,7 +687,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || + !can_skb_ext_find(skb) || !can_is_can_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); @@ -694,7 +703,8 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || + !can_skb_ext_find(skb) || !can_is_canfd_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); @@ -709,7 +719,8 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || + !can_skb_ext_find(skb) || !can_is_canxl_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); @@ -787,14 +798,13 @@ EXPORT_SYMBOL(can_proto_unregister); static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); - net->can.rx_alldev_list = - kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL); + net->can.rx_alldev_list = kzalloc_obj(*net->can.rx_alldev_list); if (!net->can.rx_alldev_list) goto out; - net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL); + net->can.pkg_stats = kzalloc_obj(*net->can.pkg_stats); if (!net->can.pkg_stats) goto out_free_rx_alldev_list; - net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL); + net->can.rcv_lists_stats = kzalloc_obj(*net->can.rcv_lists_stats); if (!net->can.rcv_lists_stats) goto out_free_pkg_stats; diff --git a/net/can/bcm.c b/net/can/bcm.c index 5e690a2377e4..fd9fa072881e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -59,6 +59,7 @@ #include <linux/can/bcm.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <net/can.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -291,6 +292,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; struct canfd_frame *cf; int err; @@ -310,13 +312,17 @@ static void bcm_can_tx(struct bcm_op *op) return; } - skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); + skb = alloc_skb(op->cfsiz, gfp_any()); if (!skb) goto out; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + goto out; + } + + csx->can_iif = dev->ifindex; skb_put_data(skb, cf, op->cfsiz); @@ -1170,6 +1176,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; + spin_lock_init(&op->bcm_tx_lock); op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); @@ -1318,6 +1325,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, int cfsiz) { struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; int err; @@ -1325,11 +1333,15 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, if (!ifindex) return -ENODEV; - skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); + skb = alloc_skb(cfsiz, GFP_KERNEL); if (!skb) return -ENOMEM; - can_skb_reserve(skb); + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + return -ENOMEM; + } err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { @@ -1343,8 +1355,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, return -ENODEV; } - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx->can_iif = dev->ifindex; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ @@ -1657,7 +1668,7 @@ static int bcm_release(struct socket *sock) return 0; } -static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, +static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; diff --git a/net/can/gw.c b/net/can/gw.c index 55eccb1c7620..8ee4d67a07d3 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -55,6 +55,7 @@ #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> +#include <net/can.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -70,8 +71,8 @@ MODULE_ALIAS(CAN_GW_NAME); #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 -static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; -module_param(max_hops, uint, 0444); +static unsigned char max_hops __read_mostly = CGW_DEFAULT_HOPS; +module_param(max_hops, byte, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" @@ -459,6 +460,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; + struct can_skb_ext *csx, *ncsx; struct cf_mod *mod; int modidx = 0; @@ -471,22 +473,15 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) return; } + csx = can_skb_ext_find(skb); + if (!csx) + return; + /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). - * - * The Controller Area Network controllers only accept CAN frames with - * correct CRCs - which are not visible in the controller registers. - * According to skbuff.h documentation the csum_start element for IP - * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. - * Only CAN skbs can be processed here which already have this property. */ - -#define cgw_hops(skb) ((skb)->csum_start) - - BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); - - if (cgw_hops(skb) >= max_hops) { + if (csx->can_gw_hops >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; @@ -499,7 +494,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && - can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) + csx->can_iif == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() @@ -518,12 +513,23 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) return; } + /* the cloned/copied nskb points to the skb extension of the original + * skb with an increased refcount. skb_ext_add() creates a copy to + * separate the skb extension data to modify the can_gw_hops. + */ + ncsx = skb_ext_add(nskb, SKB_EXT_CAN); + if (!ncsx) { + kfree_skb(nskb); + gwj->dropped_frames++; + return; + } + /* put the incremented hop counter in the cloned skb */ - cgw_hops(nskb) = cgw_hops(skb) + 1; + ncsx->can_gw_hops = csx->can_gw_hops + 1; /* first processing of this CAN frame -> adjust to private hop limit */ - if (gwj->limit_hops && cgw_hops(nskb) == 1) - cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; + if (gwj->limit_hops && ncsx->can_gw_hops == 1) + ncsx->can_gw_hops = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; @@ -1093,7 +1099,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; - mod = kmalloc(sizeof(*mod), GFP_KERNEL); + mod = kmalloc_obj(*mod); if (!mod) return -ENOMEM; diff --git a/net/can/isotp.c b/net/can/isotp.c index 74ee1e52249b..da3b72e7afcc 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -69,6 +69,7 @@ #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> +#include <net/can.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -214,24 +215,28 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; + struct can_skb_ext *csx; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; - nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); + nskb = alloc_skb(so->ll.mtu, gfp_any()); if (!nskb) return 1; + csx = can_skb_ext_add(nskb); + if (!csx) { + kfree_skb(nskb); + return 1; + } + dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } - can_skb_reserve(nskb); - can_skb_prv(nskb)->ifindex = dev->ifindex; - can_skb_prv(nskb)->skbcnt = 0; - + csx->can_iif = dev->ifindex; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; @@ -763,6 +768,7 @@ static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; @@ -772,15 +778,20 @@ static void isotp_send_cframe(struct isotp_sock *so) if (!dev) return; - skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); + skb = alloc_skb(so->ll.mtu, GFP_ATOMIC); if (!skb) { dev_put(dev); return; } - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + netdev_put(dev, NULL); + return; + } + + csx->can_iif = dev->ifindex; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); @@ -940,6 +951,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; @@ -1000,16 +1012,22 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto err_out_drop; } - skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), - msg->msg_flags & MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, so->ll.mtu, msg->msg_flags & MSG_DONTWAIT, + &err); if (!skb) { dev_put(dev); goto err_out_drop; } - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + netdev_put(dev, NULL); + err = -ENOMEM; + goto err_out_drop; + } + + csx->can_iif = dev->ifindex; so->tx.len = size; so->tx.idx = 0; @@ -1246,7 +1264,7 @@ static int isotp_release(struct socket *sock) return 0; } -static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int isotp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 797719cb227e..dc374286eeb6 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -151,7 +151,7 @@ struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) lockdep_assert_held(&priv->lock); - ecu = kzalloc(sizeof(*ecu), gfp_any()); + ecu = kzalloc_obj(*ecu, gfp_any()); if (!ecu) return ERR_PTR(-ENOMEM); kref_init(&ecu->kref); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index a93af55df5fd..9937c04241bc 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -128,7 +128,7 @@ static struct j1939_priv *j1939_priv_create(struct net_device *ndev) { struct j1939_priv *priv; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (!priv) return NULL; diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 88e7160d4248..0502b030d238 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -17,6 +17,7 @@ #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> +#include <net/can.h> #include "j1939-priv.h" @@ -440,7 +441,7 @@ static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) return 0; } -static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); @@ -482,6 +483,12 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) goto out_release_sock; } + if (ndev->reg_state != NETREG_REGISTERED) { + dev_put(ndev); + ret = -ENODEV; + goto out_release_sock; + } + can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); @@ -535,7 +542,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) return ret; } -static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, +static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; @@ -878,20 +885,25 @@ static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; + struct can_skb_ext *csx; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - - sizeof(((struct can_frame *)NULL)->data) + - sizeof(struct can_skb_priv), + sizeof(((struct can_frame *)NULL)->data), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = ndev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + ret = -ENOMEM; + goto failure; + } + + csx->can_iif = ndev->ifindex; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index fbf5c8001c9d..df93d57907da 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -9,6 +9,7 @@ // Oleksij Rempel <kernel@pengutronix.de> #include <linux/can/skb.h> +#include <net/can.h> #include "j1939-priv.h" @@ -591,17 +592,21 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, bool swap_src_dst) { struct sk_buff *skb; + struct can_skb_ext *csx; struct j1939_sk_buff_cb *skcb; - skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv), - GFP_ATOMIC); + skb = alloc_skb(sizeof(struct can_frame), GFP_ATOMIC); if (unlikely(!skb)) return ERR_PTR(-ENOMEM); + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + skb->dev = priv->ndev; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = priv->ndev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx->can_iif = priv->ndev->ifindex; /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); @@ -1052,6 +1057,17 @@ static int j1939_simple_txnext(struct j1939_session *session) goto out_free; } + /* the cloned skb points to the skb extension of the original se_skb + * with an increased refcount. skb_ext_add() creates a copy to + * separate the skb extension data which is needed to modify the + * can_framelen in can_put_echo_skb(). + */ + if (!skb_ext_add(skb, SKB_EXT_CAN)) { + kfree_skb(skb); + ret = -ENOMEM; + goto out_free; + } + can_skb_set_owner(skb, se_skb->sk); j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS); @@ -1492,7 +1508,7 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv, struct j1939_session *session; struct j1939_sk_buff_cb *skcb; - session = kzalloc(sizeof(*session), gfp_any()); + session = kzalloc_obj(*session, gfp_any()); if (!session) return NULL; @@ -1526,17 +1542,22 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, const struct j1939_sk_buff_cb *rel_skcb) { struct sk_buff *skb; + struct can_skb_ext *csx; struct j1939_sk_buff_cb *skcb; struct j1939_session *session; - skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC); + skb = alloc_skb(size, GFP_ATOMIC); if (unlikely(!skb)) return NULL; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + return NULL; + } + skb->dev = priv->ndev; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = priv->ndev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx->can_iif = priv->ndev->ifindex; skcb = j1939_skb_to_cb(skb); memcpy(skcb, rel_skcb, sizeof(*skcb)); @@ -1567,6 +1588,8 @@ int j1939_session_activate(struct j1939_session *session) if (active) { j1939_session_put(active); ret = -EAGAIN; + } else if (priv->ndev->reg_state != NETREG_REGISTERED) { + ret = -ENODEV; } else { WARN_ON_ONCE(session->state != J1939_SESSION_NEW); list_add_tail(&session->active_session_list_entry, @@ -1693,8 +1716,16 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, j1939_session_timers_cancel(session); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); - if (session->transmission) + if (session->transmission) { + j1939_session_deactivate_activate_next(session); + } else if (session->state == J1939_SESSION_WAITING_ABORT) { + /* Force deactivation for the receiver. + * If we rely on the timer starting in j1939_session_cancel, + * a second RTS call here will cancel that timer and fail + * to restart it because the state is already WAITING_ABORT. + */ j1939_session_deactivate_activate_next(session); + } return -EBUSY; } diff --git a/net/can/raw.c b/net/can/raw.c index a53853f5e9af..eee244ffc31e 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -49,10 +49,11 @@ #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> +#include <linux/can/can-ml.h> #include <linux/can/core.h> -#include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */ #include <linux/can/skb.h> #include <linux/can/raw.h> +#include <net/can.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -76,7 +77,7 @@ MODULE_ALIAS("can-proto-1"); struct uniqframe { const struct sk_buff *skb; - int skbcnt; + u32 hash; unsigned int join_rx_count; }; @@ -164,7 +165,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data) /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && - this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { + this_cpu_ptr(ro->uniq)->hash == oskb->hash) { if (!ro->join_filters) return; @@ -174,7 +175,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; } else { this_cpu_ptr(ro->uniq)->skb = oskb; - this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt; + this_cpu_ptr(ro->uniq)->hash = oskb->hash; this_cpu_ptr(ro->uniq)->join_rx_count = 1; /* drop first frame to check all enabled filters? */ if (ro->join_filters && ro->count > 1) @@ -449,7 +450,7 @@ static int raw_release(struct socket *sock) return 0; } -static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int raw_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; @@ -892,20 +893,21 @@ static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) } } -static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, + struct net_device *dev) { - /* Classical CAN -> no checks for flags and device capabilities */ - if (can_is_can_skb(skb)) + /* Classical CAN */ + if (can_is_can_skb(skb) && can_cap_enabled(dev, CAN_CAP_CC)) return CAN_MTU; - /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ + /* CAN FD */ if (ro->fd_frames && can_is_canfd_skb(skb) && - (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) + can_cap_enabled(dev, CAN_CAP_FD)) return CANFD_MTU; - /* CAN XL -> needs to be enabled and a CAN XL device */ + /* CAN XL */ if (ro->xl_frames && can_is_canxl_skb(skb) && - can_is_canxl_dev_mtu(mtu)) + can_cap_enabled(dev, CAN_CAP_XL)) return CANXL_MTU; return 0; @@ -917,6 +919,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct raw_sock *ro = raw_sk(sk); struct sockcm_cookie sockc; struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; unsigned int txmtu; int ifindex; @@ -944,14 +947,25 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (!dev) return -ENXIO; - skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), - msg->msg_flags & MSG_DONTWAIT, &err); + /* no sending on a CAN device in read-only mode */ + if (can_cap_enabled(dev, CAN_CAP_RO)) { + err = -EACCES; + goto put_dev; + } + + skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, + &err); if (!skb) goto put_dev; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + err = -ENOMEM; + goto put_dev; + } + + csx->can_iif = dev->ifindex; /* fill the skb before testing for valid CAN frames */ err = memcpy_from_msg(skb_put(skb, size), msg, size); @@ -961,7 +975,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ - txmtu = raw_check_txframe(ro, skb, READ_ONCE(dev->mtu)); + txmtu = raw_check_txframe(ro, skb, dev); if (!txmtu) goto free_skb; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index ea60e3ef0834..7e2528cde4b9 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -6,6 +6,7 @@ config CEPH_LIB select CRYPTO_AES select CRYPTO_CBC select CRYPTO_GCM + select CRYPTO_KRB5 select CRYPTO_LIB_SHA256 select CRYPTO select KEYS diff --git a/net/ceph/auth.c b/net/ceph/auth.c index d38c9eadbe2f..901b93530b21 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -59,7 +59,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, { struct ceph_auth_client *ac; - ac = kzalloc(sizeof(*ac), GFP_NOFS); + ac = kzalloc_obj(*ac, GFP_NOFS); if (!ac) return ERR_PTR(-ENOMEM); @@ -205,9 +205,9 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, s32 result; u64 global_id; void *payload, *payload_end; - int payload_len; + u32 payload_len; char *result_msg; - int result_msg_len; + u32 result_msg_len; int ret = -EINVAL; mutex_lock(&ac->mutex); @@ -217,10 +217,12 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, result = ceph_decode_32(&p); global_id = ceph_decode_64(&p); payload_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, payload_len, bad); payload = p; p += payload_len; ceph_decode_need(&p, end, sizeof(u32), bad); result_msg_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, result_msg_len, bad); result_msg = p; p += result_msg_len; if (p != end) diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 77b5519bc45f..99e1f3e10a4c 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -97,7 +97,7 @@ static int ceph_auth_none_create_authorizer( struct ceph_none_authorizer *au; int ret; - au = kmalloc(sizeof(*au), GFP_NOFS); + au = kmalloc_obj(*au, GFP_NOFS); if (!au) return -ENOMEM; @@ -133,7 +133,7 @@ int ceph_auth_none_init(struct ceph_auth_client *ac) struct ceph_auth_none_info *xi; dout("ceph_auth_none_init %p\n", ac); - xi = kzalloc(sizeof(*xi), GFP_NOFS); + xi = kzalloc_obj(*xi, GFP_NOFS); if (!xi) return -ENOMEM; diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a21c157daf7d..692e0b868822 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -17,6 +17,22 @@ #include "auth_x.h" #include "auth_x_protocol.h" +static const u32 ticket_key_usages[] = { + CEPHX_KEY_USAGE_TICKET_SESSION_KEY, + CEPHX_KEY_USAGE_TICKET_BLOB, + CEPHX_KEY_USAGE_AUTH_CONNECTION_SECRET +}; + +static const u32 authorizer_key_usages[] = { + CEPHX_KEY_USAGE_AUTHORIZE, + CEPHX_KEY_USAGE_AUTHORIZE_CHALLENGE, + CEPHX_KEY_USAGE_AUTHORIZE_REPLY +}; + +static const u32 client_key_usages[] = { + CEPHX_KEY_USAGE_TICKET_SESSION_KEY +}; + static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) @@ -44,28 +60,41 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac) return !!need; } -static int ceph_x_encrypt_offset(void) +static int __ceph_x_encrypt_offset(const struct ceph_crypto_key *key) +{ + return ceph_crypt_data_offset(key) + + sizeof(struct ceph_x_encrypt_header); +} + +static int ceph_x_encrypt_offset(const struct ceph_crypto_key *key) { - return sizeof(u32) + sizeof(struct ceph_x_encrypt_header); + return sizeof(u32) + __ceph_x_encrypt_offset(key); } -static int ceph_x_encrypt_buflen(int ilen) +/* + * AES: ciphertext_len | hdr | data... | padding + * AES256KRB5: ciphertext_len | confounder | hdr | data... | hmac + */ +static int ceph_x_encrypt_buflen(const struct ceph_crypto_key *key, + int data_len) { - return ceph_x_encrypt_offset() + ilen + 16; + int encrypt_len = sizeof(struct ceph_x_encrypt_header) + data_len; + return sizeof(u32) + ceph_crypt_buflen(key, encrypt_len); } -static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, - int buf_len, int plaintext_len) +static int ceph_x_encrypt(const struct ceph_crypto_key *key, int usage_slot, + void *buf, int buf_len, int plaintext_len) { - struct ceph_x_encrypt_header *hdr = buf + sizeof(u32); + struct ceph_x_encrypt_header *hdr; int ciphertext_len; int ret; + hdr = buf + sizeof(u32) + ceph_crypt_data_offset(key); hdr->struct_v = 1; hdr->magic = cpu_to_le64(CEPHX_ENC_MAGIC); - ret = ceph_crypt(secret, true, buf + sizeof(u32), buf_len - sizeof(u32), - plaintext_len + sizeof(struct ceph_x_encrypt_header), + ret = ceph_crypt(key, usage_slot, true, buf + sizeof(u32), + buf_len - sizeof(u32), plaintext_len + sizeof(*hdr), &ciphertext_len); if (ret) return ret; @@ -74,18 +103,19 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, return sizeof(u32) + ciphertext_len; } -static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, - int ciphertext_len) +static int __ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, + void *p, int ciphertext_len) { - struct ceph_x_encrypt_header *hdr = p; + struct ceph_x_encrypt_header *hdr; int plaintext_len; int ret; - ret = ceph_crypt(secret, false, p, ciphertext_len, ciphertext_len, - &plaintext_len); + ret = ceph_crypt(key, usage_slot, false, p, ciphertext_len, + ciphertext_len, &plaintext_len); if (ret) return ret; + hdr = p + ceph_crypt_data_offset(key); if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { pr_err("%s bad magic\n", __func__); return -EINVAL; @@ -94,7 +124,8 @@ static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, return plaintext_len - sizeof(*hdr); } -static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end) +static int ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, + void **p, void *end) { int ciphertext_len; int ret; @@ -102,7 +133,7 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end) ceph_decode_32_safe(p, end, ciphertext_len, e_inval); ceph_decode_need(p, end, ciphertext_len, e_inval); - ret = __ceph_x_decrypt(secret, *p, ciphertext_len); + ret = __ceph_x_decrypt(key, usage_slot, *p, ciphertext_len); if (ret < 0) return ret; @@ -135,7 +166,7 @@ get_ticket_handler(struct ceph_auth_client *ac, int service) } /* add it */ - th = kzalloc(sizeof(*th), GFP_NOFS); + th = kzalloc_obj(*th, GFP_NOFS); if (!th) return ERR_PTR(-ENOMEM); th->service = service; @@ -193,8 +224,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, } /* blob for me */ - dp = *p + ceph_x_encrypt_offset(); - ret = ceph_x_decrypt(secret, p, end); + dp = *p + ceph_x_encrypt_offset(secret); + ret = ceph_x_decrypt(secret, + 0 /* CEPHX_KEY_USAGE_TICKET_SESSION_KEY */, + p, end); if (ret < 0) goto out; dout(" decrypted %d bytes\n", ret); @@ -208,6 +241,11 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; + ret = ceph_crypto_key_prepare(&new_session_key, ticket_key_usages, + ARRAY_SIZE(ticket_key_usages)); + if (ret) + goto out; + ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&validity, dp); dp += sizeof(struct ceph_timespec); @@ -220,8 +258,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_decode_8_safe(p, end, is_enc, bad); if (is_enc) { /* encrypted */ - tp = *p + ceph_x_encrypt_offset(); - ret = ceph_x_decrypt(&th->session_key, p, end); + tp = *p + ceph_x_encrypt_offset(&th->session_key); + ret = ceph_x_decrypt(&th->session_key, + 1 /* CEPHX_KEY_USAGE_TICKET_BLOB */, + p, end); if (ret < 0) goto out; dout(" encrypted ticket, decrypted %d bytes\n", ret); @@ -312,7 +352,7 @@ static int encrypt_authorizer(struct ceph_x_authorizer *au, p = (void *)(msg_a + 1) + le32_to_cpu(msg_a->ticket_blob.blob_len); end = au->buf->vec.iov_base + au->buf->vec.iov_len; - msg_b = p + ceph_x_encrypt_offset(); + msg_b = p + ceph_x_encrypt_offset(&au->session_key); msg_b->struct_v = 2; msg_b->nonce = cpu_to_le64(au->nonce); if (server_challenge) { @@ -324,7 +364,9 @@ static int encrypt_authorizer(struct ceph_x_authorizer *au, msg_b->server_challenge_plus_one = 0; } - ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); + ret = ceph_x_encrypt(&au->session_key, + 0 /* CEPHX_KEY_USAGE_AUTHORIZE */, + p, end - p, sizeof(*msg_b)); if (ret < 0) return ret; @@ -367,8 +409,13 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, if (ret) goto out_au; + ret = ceph_crypto_key_prepare(&au->session_key, authorizer_key_usages, + ARRAY_SIZE(authorizer_key_usages)); + if (ret) + goto out_au; + maxlen = sizeof(*msg_a) + ticket_blob_len + - ceph_x_encrypt_buflen(sizeof(*msg_b)); + ceph_x_encrypt_buflen(&au->session_key, sizeof(*msg_b)); dout(" need len %d\n", maxlen); if (au->buf && au->buf->alloc_len < maxlen) { ceph_buffer_put(au->buf); @@ -506,8 +553,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (need & CEPH_ENTITY_TYPE_AUTH) { struct ceph_x_authenticate *auth = (void *)(head + 1); void *enc_buf = xi->auth_authorizer.enc_buf; - struct ceph_x_challenge_blob *blob = enc_buf + - ceph_x_encrypt_offset(); + struct ceph_x_challenge_blob *blob; u64 *u; p = auth + 1; @@ -517,14 +563,29 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, dout(" get_auth_session_key\n"); head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); - /* encrypt and hash */ + if (xi->secret.type == CEPH_CRYPTO_AES) { + blob = enc_buf + ceph_x_encrypt_offset(&xi->secret); + } else { + BUILD_BUG_ON(SHA256_DIGEST_SIZE + sizeof(*blob) > + CEPHX_AU_ENC_BUF_LEN); + blob = enc_buf + SHA256_DIGEST_SIZE; + } + get_random_bytes(&auth->client_challenge, sizeof(u64)); blob->client_challenge = auth->client_challenge; blob->server_challenge = cpu_to_le64(xi->server_challenge); - ret = ceph_x_encrypt(&xi->secret, enc_buf, CEPHX_AU_ENC_BUF_LEN, - sizeof(*blob)); - if (ret < 0) - return ret; + + if (xi->secret.type == CEPH_CRYPTO_AES) { + ret = ceph_x_encrypt(&xi->secret, 0 /* dummy */, + enc_buf, CEPHX_AU_ENC_BUF_LEN, + sizeof(*blob)); + if (ret < 0) + return ret; + } else { + ceph_hmac_sha256(&xi->secret, blob, sizeof(*blob), + enc_buf); + ret = SHA256_DIGEST_SIZE; + } auth->struct_v = 3; /* nautilus+ */ auth->key = 0; @@ -634,8 +695,10 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id, ceph_decode_need(p, end, len, e_inval); dout("%s connection secret blob len %d\n", __func__, len); if (len > 0) { - dp = *p + ceph_x_encrypt_offset(); - ret = ceph_x_decrypt(&th->session_key, p, *p + len); + dp = *p + ceph_x_encrypt_offset(&th->session_key); + ret = ceph_x_decrypt(&th->session_key, + 2 /* CEPHX_KEY_USAGE_AUTH_CONNECTION_SECRET */, + p, *p + len); if (ret < 0) return ret; @@ -745,7 +808,7 @@ static int ceph_x_create_authorizer( if (IS_ERR(th)) return PTR_ERR(th); - au = kzalloc(sizeof(*au), GFP_NOFS); + au = kzalloc_obj(*au, GFP_NOFS); if (!au) return -ENOMEM; @@ -799,12 +862,14 @@ static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, int ret; /* no leading len */ - ret = __ceph_x_decrypt(secret, challenge, challenge_len); + ret = __ceph_x_decrypt(secret, + 1 /* CEPHX_KEY_USAGE_AUTHORIZE_CHALLENGE */, + challenge, challenge_len); if (ret < 0) return ret; dout("%s decrypted %d bytes\n", __func__, ret); - dp = challenge + sizeof(struct ceph_x_encrypt_header); + dp = challenge + __ceph_x_encrypt_offset(secret); dend = dp + ret; ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ @@ -851,8 +916,9 @@ static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, u8 struct_v; int ret; - dp = *p + ceph_x_encrypt_offset(); - ret = ceph_x_decrypt(secret, p, end); + dp = *p + ceph_x_encrypt_offset(secret); + ret = ceph_x_decrypt(secret, 2 /* CEPHX_KEY_USAGE_AUTHORIZE_REPLY */, + p, end); if (ret < 0) return ret; @@ -974,7 +1040,8 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, __le32 front_crc; __le32 middle_crc; __le32 data_crc; - } __packed *sigblock = enc_buf + ceph_x_encrypt_offset(); + } __packed *sigblock = enc_buf + + ceph_x_encrypt_offset(&au->session_key); sigblock->len = cpu_to_le32(4*sizeof(u32)); sigblock->header_crc = msg->hdr.crc; @@ -982,8 +1049,9 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, sigblock->middle_crc = msg->footer.middle_crc; sigblock->data_crc = msg->footer.data_crc; - ret = ceph_x_encrypt(&au->session_key, enc_buf, - CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock)); + ret = ceph_x_encrypt(&au->session_key, 0 /* dummy */, + enc_buf, CEPHX_AU_ENC_BUF_LEN, + sizeof(*sigblock)); if (ret < 0) return ret; @@ -998,11 +1066,19 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, __le32 data_crc; __le32 data_len; __le32 seq_lower_word; - } __packed *sigblock = enc_buf; + } __packed *sigblock; struct { __le64 a, b, c, d; } __packed *penc = enc_buf; - int ciphertext_len; + + if (au->session_key.type == CEPH_CRYPTO_AES) { + /* no leading len, no ceph_x_encrypt_header */ + sigblock = enc_buf; + } else { + BUILD_BUG_ON(SHA256_DIGEST_SIZE + sizeof(*sigblock) > + CEPHX_AU_ENC_BUF_LEN); + sigblock = enc_buf + SHA256_DIGEST_SIZE; + } sigblock->header_crc = msg->hdr.crc; sigblock->front_crc = msg->footer.front_crc; @@ -1013,12 +1089,18 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, sigblock->data_len = msg->hdr.data_len; sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq; - /* no leading len, no ceph_x_encrypt_header */ - ret = ceph_crypt(&au->session_key, true, enc_buf, - CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock), - &ciphertext_len); - if (ret) - return ret; + if (au->session_key.type == CEPH_CRYPTO_AES) { + int ciphertext_len; /* unused */ + + ret = ceph_crypt(&au->session_key, 0 /* dummy */, + true, enc_buf, CEPHX_AU_ENC_BUF_LEN, + sizeof(*sigblock), &ciphertext_len); + if (ret) + return ret; + } else { + ceph_hmac_sha256(&au->session_key, sigblock, + sizeof(*sigblock), enc_buf); + } *psig = penc->a ^ penc->b ^ penc->c ^ penc->d; } @@ -1092,21 +1174,27 @@ int ceph_x_init(struct ceph_auth_client *ac) int ret; dout("ceph_x_init %p\n", ac); - ret = -ENOMEM; - xi = kzalloc(sizeof(*xi), GFP_NOFS); + xi = kzalloc_obj(*xi, GFP_NOFS); if (!xi) - goto out; + return -ENOMEM; ret = -EINVAL; if (!ac->key) { pr_err("no secret set (for auth_x protocol)\n"); - goto out_nomem; + goto err_xi; } ret = ceph_crypto_key_clone(&xi->secret, ac->key); if (ret < 0) { pr_err("cannot clone key: %d\n", ret); - goto out_nomem; + goto err_xi; + } + + ret = ceph_crypto_key_prepare(&xi->secret, client_key_usages, + ARRAY_SIZE(client_key_usages)); + if (ret) { + pr_err("cannot prepare key: %d\n", ret); + goto err_secret; } xi->starting = true; @@ -1117,8 +1205,9 @@ int ceph_x_init(struct ceph_auth_client *ac) ac->ops = &ceph_x_ops; return 0; -out_nomem: +err_secret: + ceph_crypto_key_destroy(&xi->secret); +err_xi: kfree(xi); -out: return ret; } diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 9c60feeb1bcb..d097b3651c99 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -6,6 +6,44 @@ #define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200 #define CEPHX_GET_ROTATING_KEY 0x0400 +/* Client <-> AuthMonitor */ +/* + * The AUTH session's connection secret: encrypted with the AUTH + * ticket session key + */ +#define CEPHX_KEY_USAGE_AUTH_CONNECTION_SECRET 0x03 +/* + * The ticket's blob for the client ("blob for me", contains the + * session key): encrypted with the client's secret key in case of + * the AUTH ticket and the AUTH ticket session key in case of other + * service tickets + */ +#define CEPHX_KEY_USAGE_TICKET_SESSION_KEY 0x04 +/* + * The ticket's blob for the service (ceph_x_ticket_blob): possibly + * encrypted with the old AUTH ticket session key in case of the AUTH + * ticket and not encrypted in case of other service tickets + */ +#define CEPHX_KEY_USAGE_TICKET_BLOB 0x05 + +/* Client <-> Service */ +/* + * The client's authorization request (ceph_x_authorize_b): + * encrypted with the service ticket session key + */ +#define CEPHX_KEY_USAGE_AUTHORIZE 0x10 +/* + * The service's challenge (ceph_x_authorize_challenge): + * encrypted with the service ticket session key + */ +#define CEPHX_KEY_USAGE_AUTHORIZE_CHALLENGE 0x11 +/* + * The service's final reply (ceph_x_authorize_reply + the service + * session's connection secret): encrypted with the service ticket + * session key + */ +#define CEPHX_KEY_USAGE_AUTHORIZE_REPLY 0x12 + /* common bits */ struct ceph_x_ticket_blob { __u8 struct_v; diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 7e51f128045d..98c2ac4387ae 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -13,7 +13,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { struct ceph_buffer *b; - b = kmalloc(sizeof(*b), gfp); + b = kmalloc_obj(*b, gfp); if (!b) return NULL; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index e734e57be083..952121849180 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -309,13 +309,12 @@ struct ceph_options *ceph_alloc_options(void) { struct ceph_options *opt; - opt = kzalloc(sizeof(*opt), GFP_KERNEL); + opt = kzalloc_obj(*opt); if (!opt) return NULL; opt->crush_locs = RB_ROOT; - opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), - GFP_KERNEL); + opt->mon_addr = kzalloc_objs(*opt->mon_addr, CEPH_MAX_MON); if (!opt->mon_addr) { kfree(opt); return NULL; @@ -456,7 +455,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, ceph_crypto_key_destroy(opt->key); kfree(opt->key); - opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + opt->key = kzalloc_obj(*opt->key); if (!opt->key) return -ENOMEM; err = ceph_crypto_key_unarmor(opt->key, param->string); @@ -469,7 +468,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, ceph_crypto_key_destroy(opt->key); kfree(opt->key); - opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + opt->key = kzalloc_obj(*opt->key); if (!opt->key) return -ENOMEM; return get_secret(opt->key, param->string, &log); @@ -714,7 +713,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) if (err < 0) return ERR_PTR(err); - client = kzalloc(sizeof(*client), GFP_KERNEL); + client = kzalloc_obj(*client); if (client == NULL) return ERR_PTR(-ENOMEM); diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 66136a4c1ce7..c6956f1df333 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -300,7 +300,7 @@ static int decode_lockers(void **p, void *end, u8 *type, char **tag, return ret; *num_lockers = ceph_decode_32(p); - *lockers = kcalloc(*num_lockers, sizeof(**lockers), GFP_NOIO); + *lockers = kzalloc_objs(**lockers, *num_lockers, GFP_NOIO); if (!*lockers) return -ENOMEM; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 01b2ce1e8fc0..de7496791c91 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -7,6 +7,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <crypto/aes.h> +#include <crypto/krb5.h> #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/sched/mm.h> @@ -16,77 +17,119 @@ #include <linux/ceph/decode.h> #include "crypto.h" -/* - * Set ->key and ->tfm. The rest of the key should be filled in before - * this function is called. - */ -static int set_secret(struct ceph_crypto_key *key, void *buf) +static int set_aes_tfm(struct ceph_crypto_key *key) { unsigned int noio_flag; int ret; - key->key = NULL; - key->tfm = NULL; - - switch (key->type) { - case CEPH_CRYPTO_NONE: - return 0; /* nothing to do */ - case CEPH_CRYPTO_AES: - break; - default: - return -ENOTSUPP; - } - - if (!key->len) - return -EINVAL; - - key->key = kmemdup(buf, key->len, GFP_NOIO); - if (!key->key) { - ret = -ENOMEM; - goto fail; - } - - /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); - key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); + key->aes_tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0); memalloc_noio_restore(noio_flag); - if (IS_ERR(key->tfm)) { - ret = PTR_ERR(key->tfm); - key->tfm = NULL; - goto fail; + if (IS_ERR(key->aes_tfm)) { + ret = PTR_ERR(key->aes_tfm); + key->aes_tfm = NULL; + return ret; } - ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len); + ret = crypto_sync_skcipher_setkey(key->aes_tfm, key->key, key->len); if (ret) - goto fail; + return ret; return 0; +} + +static int set_krb5_tfms(struct ceph_crypto_key *key, const u32 *key_usages, + int key_usage_cnt) +{ + struct krb5_buffer TK = { .len = key->len, .data = key->key }; + unsigned int noio_flag; + int ret = 0; + int i; + + if (WARN_ON_ONCE(key_usage_cnt > ARRAY_SIZE(key->krb5_tfms))) + return -EINVAL; -fail: - ceph_crypto_key_destroy(key); + key->krb5_type = crypto_krb5_find_enctype( + KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192); + if (!key->krb5_type) + return -ENOPKG; + + /* + * Despite crypto_krb5_prepare_encryption() taking a gfp mask, + * crypto_alloc_aead() inside of it allocates with GFP_KERNEL. + */ + noio_flag = memalloc_noio_save(); + for (i = 0; i < key_usage_cnt; i++) { + key->krb5_tfms[i] = crypto_krb5_prepare_encryption( + key->krb5_type, &TK, key_usages[i], + GFP_NOIO); + if (IS_ERR(key->krb5_tfms[i])) { + ret = PTR_ERR(key->krb5_tfms[i]); + key->krb5_tfms[i] = NULL; + goto out_flag; + } + } + +out_flag: + memalloc_noio_restore(noio_flag); return ret; } +int ceph_crypto_key_prepare(struct ceph_crypto_key *key, + const u32 *key_usages, int key_usage_cnt) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + return 0; /* nothing to do */ + case CEPH_CRYPTO_AES: + return set_aes_tfm(key); + case CEPH_CRYPTO_AES256KRB5: + hmac_sha256_preparekey(&key->hmac_key, key->key, key->len); + return set_krb5_tfms(key, key_usages, key_usage_cnt); + default: + return -ENOTSUPP; + } +} + +/* + * @dst should be zeroed before this function is called. + */ int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src) { - memcpy(dst, src, sizeof(struct ceph_crypto_key)); - return set_secret(dst, src->key); + dst->type = src->type; + dst->created = src->created; + dst->len = src->len; + + dst->key = kmemdup(src->key, src->len, GFP_NOIO); + if (!dst->key) + return -ENOMEM; + + return 0; } +/* + * @key should be zeroed before this function is called. + */ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) { - int ret; - ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); key->type = ceph_decode_16(p); ceph_decode_copy(p, &key->created, sizeof(key->created)); key->len = ceph_decode_16(p); ceph_decode_need(p, end, key->len, bad); - ret = set_secret(key, *p); + if (key->len > CEPH_MAX_KEY_LEN) { + pr_err("secret too big %d\n", key->len); + return -EINVAL; + } + + key->key = kmemdup(*p, key->len, GFP_NOIO); + if (!key->key) + return -ENOMEM; + memzero_explicit(*p, key->len); *p += key->len; - return ret; + return 0; bad: dout("failed to decode crypto key\n"); @@ -122,12 +165,26 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { - if (key) { - kfree_sensitive(key->key); - key->key = NULL; - if (key->tfm) { - crypto_free_sync_skcipher(key->tfm); - key->tfm = NULL; + int i; + + if (!key) + return; + + kfree_sensitive(key->key); + key->key = NULL; + + if (key->type == CEPH_CRYPTO_AES) { + if (key->aes_tfm) { + crypto_free_sync_skcipher(key->aes_tfm); + key->aes_tfm = NULL; + } + } else if (key->type == CEPH_CRYPTO_AES256KRB5) { + memzero_explicit(&key->hmac_key, sizeof(key->hmac_key)); + for (i = 0; i < ARRAY_SIZE(key->krb5_tfms); i++) { + if (key->krb5_tfms[i]) { + crypto_free_aead(key->krb5_tfms[i]); + key->krb5_tfms[i] = NULL; + } } } } @@ -207,7 +264,7 @@ static void teardown_sgtable(struct sg_table *sgt) static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->aes_tfm); struct sg_table sgt; struct scatterlist prealloc_sg; char iv[AES_BLOCK_SIZE] __aligned(8); @@ -223,7 +280,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, return ret; memcpy(iv, aes_iv, AES_BLOCK_SIZE); - skcipher_request_set_sync_tfm(req, key->tfm); + skcipher_request_set_sync_tfm(req, key->aes_tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv); @@ -268,7 +325,68 @@ out_sgt: return ret; } -int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, +static int ceph_krb5_encrypt(const struct ceph_crypto_key *key, int usage_slot, + void *buf, int buf_len, int in_len, int *pout_len) +{ + struct sg_table sgt; + struct scatterlist prealloc_sg; + int ret; + + if (WARN_ON_ONCE(usage_slot >= ARRAY_SIZE(key->krb5_tfms))) + return -EINVAL; + + ret = setup_sgtable(&sgt, &prealloc_sg, buf, buf_len); + if (ret) + return ret; + + ret = crypto_krb5_encrypt(key->krb5_type, key->krb5_tfms[usage_slot], + sgt.sgl, sgt.nents, buf_len, AES_BLOCK_SIZE, + in_len, false); + if (ret < 0) { + pr_err("%s encrypt failed: %d\n", __func__, ret); + goto out_sgt; + } + + *pout_len = ret; + ret = 0; + +out_sgt: + teardown_sgtable(&sgt); + return ret; +} + +static int ceph_krb5_decrypt(const struct ceph_crypto_key *key, int usage_slot, + void *buf, int buf_len, int in_len, int *pout_len) +{ + struct sg_table sgt; + struct scatterlist prealloc_sg; + size_t data_off = 0; + size_t data_len = in_len; + int ret; + + if (WARN_ON_ONCE(usage_slot >= ARRAY_SIZE(key->krb5_tfms))) + return -EINVAL; + + ret = setup_sgtable(&sgt, &prealloc_sg, buf, in_len); + if (ret) + return ret; + + ret = crypto_krb5_decrypt(key->krb5_type, key->krb5_tfms[usage_slot], + sgt.sgl, sgt.nents, &data_off, &data_len); + if (ret) { + pr_err("%s decrypt failed: %d\n", __func__, ret); + goto out_sgt; + } + + WARN_ON(data_off != AES_BLOCK_SIZE); + *pout_len = data_len; + +out_sgt: + teardown_sgtable(&sgt); + return ret; +} + +int ceph_crypt(const struct ceph_crypto_key *key, int usage_slot, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len) { switch (key->type) { @@ -278,11 +396,64 @@ int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, case CEPH_CRYPTO_AES: return ceph_aes_crypt(key, encrypt, buf, buf_len, in_len, pout_len); + case CEPH_CRYPTO_AES256KRB5: + return encrypt ? + ceph_krb5_encrypt(key, usage_slot, buf, buf_len, in_len, + pout_len) : + ceph_krb5_decrypt(key, usage_slot, buf, buf_len, in_len, + pout_len); default: return -ENOTSUPP; } } +int ceph_crypt_data_offset(const struct ceph_crypto_key *key) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + case CEPH_CRYPTO_AES: + return 0; + case CEPH_CRYPTO_AES256KRB5: + /* confounder */ + return AES_BLOCK_SIZE; + default: + BUG(); + } +} + +int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + return data_len; + case CEPH_CRYPTO_AES: + /* PKCS#7 padding at the end */ + return data_len + AES_BLOCK_SIZE - + (data_len & (AES_BLOCK_SIZE - 1)); + case CEPH_CRYPTO_AES256KRB5: + /* confounder at the beginning and 192-bit HMAC at the end */ + return AES_BLOCK_SIZE + data_len + 24; + default: + BUG(); + } +} + +void ceph_hmac_sha256(const struct ceph_crypto_key *key, const void *buf, + int buf_len, u8 hmac[SHA256_DIGEST_SIZE]) +{ + switch (key->type) { + case CEPH_CRYPTO_NONE: + case CEPH_CRYPTO_AES: + memset(hmac, 0, SHA256_DIGEST_SIZE); + return; + case CEPH_CRYPTO_AES256KRB5: + hmac_sha256(&key->hmac_key, buf, buf_len, hmac); + return; + default: + BUG(); + } +} + static int ceph_key_preparse(struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; @@ -295,7 +466,7 @@ static int ceph_key_preparse(struct key_preparsed_payload *prep) goto err; ret = -ENOMEM; - ckey = kmalloc(sizeof(*ckey), GFP_KERNEL); + ckey = kzalloc_obj(*ckey); if (!ckey) goto err; diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 23de29fc613c..3a2ade15abbc 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -2,10 +2,11 @@ #ifndef _FS_CEPH_CRYPTO_H #define _FS_CEPH_CRYPTO_H +#include <crypto/sha2.h> #include <linux/ceph/types.h> #include <linux/ceph/buffer.h> -#define CEPH_KEY_LEN 16 +#define CEPH_MAX_KEY_LEN 32 #define CEPH_MAX_CON_SECRET_LEN 64 /* @@ -16,9 +17,19 @@ struct ceph_crypto_key { struct ceph_timespec created; int len; void *key; - struct crypto_sync_skcipher *tfm; + + union { + struct crypto_sync_skcipher *aes_tfm; + struct { + struct hmac_sha256_key hmac_key; + const struct krb5_enctype *krb5_type; + struct crypto_aead *krb5_tfms[3]; + }; + }; }; +int ceph_crypto_key_prepare(struct ceph_crypto_key *key, + const u32 *key_usages, int key_usage_cnt); int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src); int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end); @@ -26,8 +37,12 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); void ceph_crypto_key_destroy(struct ceph_crypto_key *key); /* crypto.c */ -int ceph_crypt(const struct ceph_crypto_key *key, bool encrypt, +int ceph_crypt(const struct ceph_crypto_key *key, int usage_slot, bool encrypt, void *buf, int buf_len, int in_len, int *pout_len); +int ceph_crypt_data_offset(const struct ceph_crypto_key *key); +int ceph_crypt_buflen(const struct ceph_crypto_key *key, int data_len); +void ceph_hmac_sha256(const struct ceph_crypto_key *key, const void *buf, + int buf_len, u8 hmac[SHA256_DIGEST_SIZE]); int ceph_crypto_init(void); void ceph_crypto_shutdown(void); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8181acaf870..108adb583744 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -460,7 +460,7 @@ int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); con_sock_state_connecting(con); - ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss), + ret = kernel_connect(sock, (struct sockaddr_unsized *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", @@ -1993,8 +1993,7 @@ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, m->front_alloc_len = m->front.iov_len = front_len; if (max_data_items) { - m->data = kmalloc_array(max_data_items, sizeof(*m->data), - flags); + m->data = kmalloc_objs(*m->data, max_data_items, flags); if (!m->data) goto out2; diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 9e48623018a3..50f65820f623 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -392,7 +392,7 @@ static int head_onwire_len(int ctrl_len, bool secure) int head_len; int rem_len; - BUG_ON(ctrl_len < 0 || ctrl_len > CEPH_MSG_MAX_CONTROL_LEN); + BUG_ON(ctrl_len < 1 || ctrl_len > CEPH_MSG_MAX_CONTROL_LEN); if (secure) { head_len = CEPH_PREAMBLE_SECURE_LEN; @@ -401,9 +401,7 @@ static int head_onwire_len(int ctrl_len, bool secure) head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; } } else { - head_len = CEPH_PREAMBLE_PLAIN_LEN; - if (ctrl_len) - head_len += ctrl_len + CEPH_CRC_LEN; + head_len = CEPH_PREAMBLE_PLAIN_LEN + ctrl_len + CEPH_CRC_LEN; } return head_len; } @@ -528,11 +526,16 @@ static int decode_preamble(void *p, struct ceph_frame_desc *desc) desc->fd_aligns[i] = ceph_decode_16(&p); } - if (desc->fd_lens[0] < 0 || + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (desc->fd_lens[0] < 1 || desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { pr_err("bad control segment length %d\n", desc->fd_lens[0]); return -EINVAL; } + if (desc->fd_lens[1] < 0 || desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { pr_err("bad front segment length %d\n", desc->fd_lens[1]); @@ -549,10 +552,6 @@ static int decode_preamble(void *p, struct ceph_frame_desc *desc) return -EINVAL; } - /* - * This would fire for FRAME_TAG_WAIT (it has one empty - * segment), but we should never get it as client. - */ if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { pr_err("last segment empty, segment count %d\n", desc->fd_seg_cnt); @@ -779,9 +778,9 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static void ceph_hmac_sha256(struct ceph_connection *con, - const struct kvec *kvecs, int kvec_cnt, - u8 hmac[SHA256_DIGEST_SIZE]) +static void con_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, + u8 hmac[SHA256_DIGEST_SIZE]) { struct hmac_sha256_ctx ctx; int i; @@ -1438,8 +1437,8 @@ static int prepare_auth_signature(struct ceph_connection *con) if (!buf) return -ENOMEM; - ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, - CTRL_BODY(buf)); + con_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, SHA256_DIGEST_SIZE); @@ -1538,8 +1537,7 @@ static int prepare_keepalive2(struct ceph_connection *con) struct timespec64 now; ktime_get_real_ts64(&now); - dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, - now.tv_nsec); + dout("%s con %p timestamp %ptSp\n", __func__, con, &now); ceph_encode_timespec64(ts, &now); @@ -2361,7 +2359,7 @@ bad: */ static int process_auth_done(struct ceph_connection *con, void *p, void *end) { - u8 session_key_buf[CEPH_KEY_LEN + 16]; + u8 session_key_buf[CEPH_MAX_KEY_LEN + 16]; u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); @@ -2377,7 +2375,9 @@ static int process_auth_done(struct ceph_connection *con, void *p, void *end) ceph_decode_64_safe(&p, end, global_id, bad); ceph_decode_32_safe(&p, end, con->v2.con_mode, bad); + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); dout("%s con %p global_id %llu con_mode %d payload_len %d\n", __func__, con, global_id, con->v2.con_mode, payload_len); @@ -2435,8 +2435,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, - hmac); + con_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, + hmac); ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { @@ -2732,8 +2732,7 @@ static int process_keepalive2_ack(struct ceph_connection *con, ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&con->last_keepalive_ack, p); - dout("%s con %p timestamp %lld.%09ld\n", __func__, con, - con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + dout("%s con %p timestamp %ptSp\n", __func__, con, &con->last_keepalive_ack); return 0; @@ -2833,12 +2832,15 @@ static int process_message_header(struct ceph_connection *con, void *p, void *end) { struct ceph_frame_desc *desc = &con->v2.in_desc; - struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header2 *hdr2; struct ceph_msg_header hdr; int skip; int ret; u64 seq; + ceph_decode_need(&p, end, sizeof(*hdr2), bad); + hdr2 = p; + /* verify seq# */ seq = le64_to_cpu(hdr2->seq); if ((s64)seq - (s64)con->in_seq < 1) { @@ -2869,6 +2871,10 @@ static int process_message_header(struct ceph_connection *con, WARN_ON(!con->in_msg); WARN_ON(con->in_msg->con != con); return 1; + +bad: + pr_err("failed to decode message header\n"); + return -EINVAL; } static int process_message(struct ceph_connection *con) @@ -2898,6 +2904,11 @@ static int __handle_control(struct ceph_connection *con, void *p) if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) return process_control(con, p, end); + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected message"; + return -EINVAL; + } + ret = process_message_header(con, p, end); if (ret < 0) return ret; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index c227ececa925..d5080530ce0c 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -72,8 +72,8 @@ static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; u32 struct_len; - int blob_len; - int num_mon; + u32 blob_len; + u32 num_mon; u8 struct_v; u32 epoch; int ret; @@ -112,12 +112,12 @@ static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) } ceph_decode_32_safe(p, end, num_mon, e_inval); - dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + dout("%s fsid %pU epoch %u num_mon %u\n", __func__, &fsid, epoch, num_mon); if (num_mon > CEPH_MAX_MON) goto e_inval; - monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO); + monmap = kmalloc_flex(*monmap, mon_inst, num_mon, GFP_NOIO); if (!monmap) { ret = -ENOMEM; goto fail; @@ -611,7 +611,7 @@ alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp) { struct ceph_mon_generic_request *req; - req = kzalloc(sizeof(*req), gfp); + req = kzalloc_obj(*req, gfp); if (!req) return NULL; @@ -1140,8 +1140,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc) int i; /* build initial monmap */ - monc->monmap = kzalloc(struct_size(monc->monmap, mon_inst, num_mon), - GFP_KERNEL); + monc->monmap = kzalloc_flex(*monc->monmap, mon_inst, num_mon); if (!monc->monmap) return -ENOMEM; monc->monmap->num_mon = num_mon; @@ -1417,7 +1416,7 @@ static int mon_handle_auth_done(struct ceph_connection *con, if (!ret) finish_hunting(monc); mutex_unlock(&monc->mutex); - return 0; + return ret; } static int mon_handle_auth_bad_method(struct ceph_connection *con, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6664ea73ccf8..2ff00070c181 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -556,7 +556,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); } else { BUG_ON(num_ops > CEPH_OSD_MAX_OPS); - req = kmalloc(struct_size(req, r_ops, num_ops), gfp_flags); + req = kmalloc_flex(*req, r_ops, num_ops, gfp_flags); } if (unlikely(!req)) return NULL; @@ -1153,9 +1153,8 @@ int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); op->extent.sparse_ext_cnt = cnt; - op->extent.sparse_ext = kmalloc_array(cnt, - sizeof(*op->extent.sparse_ext), - GFP_NOFS); + op->extent.sparse_ext = kmalloc_objs(*op->extent.sparse_ext, cnt, + GFP_NOFS); if (!op->extent.sparse_ext) return -ENOMEM; return 0; @@ -1264,7 +1263,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) WARN_ON(onum == CEPH_HOMELESS_OSD); - osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL); + osd = kzalloc_obj(*osd, GFP_NOIO | __GFP_NOFAIL); osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; @@ -1280,8 +1279,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) static struct ceph_osd *get_osd(struct ceph_osd *osd) { if (refcount_inc_not_zero(&osd->o_ref)) { - dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, - refcount_read(&osd->o_ref)); + dout("get_osd %p -> %d\n", osd, refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); @@ -1291,8 +1289,7 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd) static void put_osd(struct ceph_osd *osd) { - dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), - refcount_read(&osd->o_ref) - 1); + dout("put_osd %p -> %d\n", osd, refcount_read(&osd->o_ref) - 1); if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd); kfree(osd); @@ -1588,6 +1585,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_pg_pool_info *pi; struct ceph_pg pgid, last_pgid; struct ceph_osds up, acting; + bool should_be_paused; bool is_read = t->flags & CEPH_OSD_FLAG_READ; bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; bool force_resend = false; @@ -1656,10 +1654,16 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, &last_pgid)) force_resend = true; - if (t->paused && !target_should_be_paused(osdc, t, pi)) { - t->paused = false; + should_be_paused = target_should_be_paused(osdc, t, pi); + if (t->paused && !should_be_paused) { unpaused = true; } + if (t->paused != should_be_paused) { + dout("%s t %p paused %d -> %d\n", __func__, t, t->paused, + should_be_paused); + t->paused = should_be_paused; + } + legacy_change = ceph_pg_compare(&t->pgid, &pgid) || ceph_osds_changed(&t->acting, &acting, t->used_replica || any_change); @@ -1713,7 +1717,7 @@ static struct ceph_spg_mapping *alloc_spg_mapping(void) { struct ceph_spg_mapping *spg; - spg = kmalloc(sizeof(*spg), GFP_NOIO); + spg = kmalloc_obj(*spg, GFP_NOIO); if (!spg) return NULL; @@ -1908,7 +1912,7 @@ static struct ceph_osd_backoff *alloc_backoff(void) { struct ceph_osd_backoff *backoff; - backoff = kzalloc(sizeof(*backoff), GFP_NOIO); + backoff = kzalloc_obj(*backoff, GFP_NOIO); if (!backoff) return NULL; @@ -2801,7 +2805,7 @@ linger_alloc(struct ceph_osd_client *osdc) { struct ceph_osd_linger_request *lreq; - lreq = kzalloc(sizeof(*lreq), GFP_NOIO); + lreq = kzalloc_obj(*lreq, GFP_NOIO); if (!lreq) return NULL; @@ -2943,7 +2947,7 @@ static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, { struct linger_work *lwork; - lwork = kzalloc(sizeof(*lwork), GFP_NOIO); + lwork = kzalloc_obj(*lwork, GFP_NOIO); if (!lwork) return NULL; @@ -4283,6 +4287,9 @@ static void osd_fault(struct ceph_connection *con) goto out_unlock; } + osd->o_sparse_op_idx = -1; + ceph_init_sparse_read(&osd->o_sparse_read); + if (!reopen_osd(osd)) kick_osd_requests(osd); maybe_request_map(osdc); @@ -4321,7 +4328,7 @@ static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m) ceph_decode_8_safe(&p, end, m->op, e_inval); ceph_decode_64_safe(&p, end, m->id, e_inval); - m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO); + m->begin = kzalloc_obj(*m->begin, GFP_NOIO); if (!m->begin) return -ENOMEM; @@ -4331,7 +4338,7 @@ static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m) return ret; } - m->end = kzalloc(sizeof(*m->end), GFP_NOIO); + m->end = kzalloc_obj(*m->end, GFP_NOIO); if (!m->end) { free_hoid(m->begin); return -ENOMEM; @@ -5024,7 +5031,7 @@ static int decode_watchers(void **p, void *end, return ret; *num_watchers = ceph_decode_32(p); - *watchers = kcalloc(*num_watchers, sizeof(**watchers), GFP_NOIO); + *watchers = kzalloc_objs(**watchers, *num_watchers, GFP_NOIO); if (!*watchers) return -ENOMEM; @@ -5824,9 +5831,8 @@ next_op: if (!sr->sr_extent || count > sr->sr_ext_len) { /* no extent array provided, or too short */ kfree(sr->sr_extent); - sr->sr_extent = kmalloc_array(count, - sizeof(*sr->sr_extent), - GFP_NOIO); + sr->sr_extent = kmalloc_objs(*sr->sr_extent, + count, GFP_NOIO); if (!sr->sr_extent) { pr_err("%s: failed to allocate %u extents\n", __func__, count); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d245fa508e1c..c89e66d4fcb7 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -231,7 +231,7 @@ static struct crush_choose_arg_map *alloc_choose_arg_map(void) { struct crush_choose_arg_map *arg_map; - arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO); + arg_map = kzalloc_obj(*arg_map, GFP_NOIO); if (!arg_map) return NULL; @@ -241,22 +241,26 @@ static struct crush_choose_arg_map *alloc_choose_arg_map(void) static void free_choose_arg_map(struct crush_choose_arg_map *arg_map) { - if (arg_map) { - int i, j; + int i, j; - WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); + if (!arg_map) + return; + WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); + + if (arg_map->args) { for (i = 0; i < arg_map->size; i++) { struct crush_choose_arg *arg = &arg_map->args[i]; - - for (j = 0; j < arg->weight_set_size; j++) - kfree(arg->weight_set[j].weights); - kfree(arg->weight_set); + if (arg->weight_set) { + for (j = 0; j < arg->weight_set_size; j++) + kfree(arg->weight_set[j].weights); + kfree(arg->weight_set); + } kfree(arg->ids); } kfree(arg_map->args); - kfree(arg_map); } + kfree(arg_map); } DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index, @@ -316,9 +320,8 @@ static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg) if (arg->weight_set_size) { u32 i; - arg->weight_set = kmalloc_array(arg->weight_set_size, - sizeof(*arg->weight_set), - GFP_NOIO); + arg->weight_set = kmalloc_objs(*arg->weight_set, + arg->weight_set_size, GFP_NOIO); if (!arg->weight_set) return -ENOMEM; @@ -364,8 +367,8 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) ceph_decode_64_safe(p, end, arg_map->choose_args_index, e_inval); arg_map->size = c->max_buckets; - arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args), - GFP_NOIO); + arg_map->args = kzalloc_objs(*arg_map->args, arg_map->size, + GFP_NOIO); if (!arg_map->args) { ret = -ENOMEM; goto fail; @@ -439,7 +442,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); - c = kzalloc(sizeof(*c), GFP_NOFS); + c = kzalloc_obj(*c, GFP_NOFS); if (c == NULL) return ERR_PTR(-ENOMEM); @@ -464,10 +467,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->max_rules = ceph_decode_32(p); c->max_devices = ceph_decode_32(p); - c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); + c->buckets = kzalloc_objs(*c->buckets, c->max_buckets, GFP_NOFS); if (c->buckets == NULL) goto badmem; - c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); + c->rules = kzalloc_objs(*c->rules, c->max_rules, GFP_NOFS); if (c->rules == NULL) goto badmem; @@ -520,7 +523,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush_decode bucket size %d off %x %p to %p\n", b->size, (int)(*p-start), *p, end); - b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); + b->items = kzalloc_objs(__s32, b->size, GFP_NOFS); if (b->items == NULL) goto badmem; @@ -586,7 +589,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) / sizeof(struct crush_rule_step)) goto bad; #endif - r = kmalloc(struct_size(r, steps, yes), GFP_NOFS); + r = kmalloc_flex(*r, steps, yes, GFP_NOFS); if (r == NULL) goto badmem; dout(" rule %d is at %p\n", i, r); @@ -806,51 +809,49 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) ceph_decode_need(p, end, len, bad); pool_end = *p + len; + ceph_decode_need(p, end, 4 + 4 + 4, bad); pi->type = ceph_decode_8(p); pi->size = ceph_decode_8(p); pi->crush_ruleset = ceph_decode_8(p); pi->object_hash = ceph_decode_8(p); - pi->pg_num = ceph_decode_32(p); pi->pgp_num = ceph_decode_32(p); - *p += 4 + 4; /* skip lpg* */ - *p += 4; /* skip last_change */ - *p += 8 + 4; /* skip snap_seq, snap_epoch */ + /* lpg*, last_change, snap_seq, snap_epoch */ + ceph_decode_skip_n(p, end, 8 + 4 + 8 + 4, bad); /* skip snaps */ - num = ceph_decode_32(p); + ceph_decode_32_safe(p, end, num, bad); while (num--) { - *p += 8; /* snapid key */ - *p += 1 + 1; /* versions */ - len = ceph_decode_32(p); - *p += len; + /* snapid key, pool snap (with versions) */ + ceph_decode_skip_n(p, end, 8 + 2, bad); + ceph_decode_skip_string(p, end, bad); } - /* skip removed_snaps */ - num = ceph_decode_32(p); - *p += num * (8 + 8); + /* removed_snaps */ + ceph_decode_skip_map(p, end, 64, 64, bad); + ceph_decode_need(p, end, 8 + 8 + 4, bad); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); *p += 4; /* skip crash_replay_interval */ if (ev >= 7) - pi->min_size = ceph_decode_8(p); + ceph_decode_8_safe(p, end, pi->min_size, bad); else pi->min_size = pi->size - pi->size / 2; if (ev >= 8) - *p += 8 + 8; /* skip quota_max_* */ + /* quota_max_* */ + ceph_decode_skip_n(p, end, 8 + 8, bad); if (ev >= 9) { - /* skip tiers */ - num = ceph_decode_32(p); - *p += num * 8; + /* tiers */ + ceph_decode_skip_set(p, end, 64, bad); + ceph_decode_need(p, end, 8 + 1 + 8 + 8, bad); *p += 8; /* skip tier_of */ *p += 1; /* skip cache_mode */ - pi->read_tier = ceph_decode_64(p); pi->write_tier = ceph_decode_64(p); } else { @@ -858,86 +859,76 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pi->write_tier = -1; } - if (ev >= 10) { - /* skip properties */ - num = ceph_decode_32(p); - while (num--) { - len = ceph_decode_32(p); - *p += len; /* key */ - len = ceph_decode_32(p); - *p += len; /* val */ - } - } + if (ev >= 10) + /* properties */ + ceph_decode_skip_map(p, end, string, string, bad); if (ev >= 11) { - /* skip hit_set_params */ - *p += 1 + 1; /* versions */ - len = ceph_decode_32(p); - *p += len; + /* hit_set_params (with versions) */ + ceph_decode_skip_n(p, end, 2, bad); + ceph_decode_skip_string(p, end, bad); - *p += 4; /* skip hit_set_period */ - *p += 4; /* skip hit_set_count */ + /* hit_set_period, hit_set_count */ + ceph_decode_skip_n(p, end, 4 + 4, bad); } if (ev >= 12) - *p += 4; /* skip stripe_width */ + /* stripe_width */ + ceph_decode_skip_32(p, end, bad); - if (ev >= 13) { - *p += 8; /* skip target_max_bytes */ - *p += 8; /* skip target_max_objects */ - *p += 4; /* skip cache_target_dirty_ratio_micro */ - *p += 4; /* skip cache_target_full_ratio_micro */ - *p += 4; /* skip cache_min_flush_age */ - *p += 4; /* skip cache_min_evict_age */ - } + if (ev >= 13) + /* target_max_*, cache_target_*, cache_min_* */ + ceph_decode_skip_n(p, end, 16 + 8 + 8, bad); - if (ev >= 14) { - /* skip erasure_code_profile */ - len = ceph_decode_32(p); - *p += len; - } + if (ev >= 14) + /* erasure_code_profile */ + ceph_decode_skip_string(p, end, bad); /* * last_force_op_resend_preluminous, will be overridden if the * map was encoded with RESEND_ON_SPLIT */ if (ev >= 15) - pi->last_force_request_resend = ceph_decode_32(p); + ceph_decode_32_safe(p, end, pi->last_force_request_resend, bad); else pi->last_force_request_resend = 0; if (ev >= 16) - *p += 4; /* skip min_read_recency_for_promote */ + /* min_read_recency_for_promote */ + ceph_decode_skip_32(p, end, bad); if (ev >= 17) - *p += 8; /* skip expected_num_objects */ + /* expected_num_objects */ + ceph_decode_skip_64(p, end, bad); if (ev >= 19) - *p += 4; /* skip cache_target_dirty_high_ratio_micro */ + /* cache_target_dirty_high_ratio_micro */ + ceph_decode_skip_32(p, end, bad); if (ev >= 20) - *p += 4; /* skip min_write_recency_for_promote */ + /* min_write_recency_for_promote */ + ceph_decode_skip_32(p, end, bad); if (ev >= 21) - *p += 1; /* skip use_gmt_hitset */ + /* use_gmt_hitset */ + ceph_decode_skip_8(p, end, bad); if (ev >= 22) - *p += 1; /* skip fast_read */ + /* fast_read */ + ceph_decode_skip_8(p, end, bad); - if (ev >= 23) { - *p += 4; /* skip hit_set_grade_decay_rate */ - *p += 4; /* skip hit_set_search_last_n */ - } + if (ev >= 23) + /* hit_set_grade_decay_rate, hit_set_search_last_n */ + ceph_decode_skip_n(p, end, 4 + 4, bad); if (ev >= 24) { - /* skip opts */ - *p += 1 + 1; /* versions */ - len = ceph_decode_32(p); - *p += len; + /* opts (with versions) */ + ceph_decode_skip_n(p, end, 2, bad); + ceph_decode_skip_string(p, end, bad); } if (ev >= 25) - pi->last_force_request_resend = ceph_decode_32(p); + ceph_decode_32_safe(p, end, pi->last_force_request_resend, bad); /* ignore the rest */ @@ -1124,7 +1115,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) { struct ceph_osdmap *map; - map = kzalloc(sizeof(*map), GFP_NOIO); + map = kzalloc_obj(*map, GFP_NOIO); if (!map) return NULL; @@ -1351,7 +1342,7 @@ static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, pi = lookup_pg_pool(&map->pg_pools, pool); if (!incremental || !pi) { - pi = kzalloc(sizeof(*pi), GFP_NOFS); + pi = kzalloc_obj(*pi, GFP_NOFS); if (!pi) return -ENOMEM; @@ -1438,7 +1429,7 @@ static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0 && incremental) return NULL; /* new_pg_temp: [] to remove */ - if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) + if ((size_t)len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, len * sizeof(u32), e_inval); @@ -1619,7 +1610,7 @@ static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); - if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) + if ((size_t)len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); @@ -1991,11 +1982,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, sizeof(u64) + sizeof(u32), e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); - BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); new_pool_max = ceph_decode_64(p); new_flags = ceph_decode_32(p); + if (epoch != map->epoch + 1) + goto e_inval; + /* full map? */ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 5a9c4be5f222..40847d873cff 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -10,7 +10,7 @@ struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags) { struct ceph_pagelist *pl; - pl = kmalloc(sizeof(*pl), gfp_flags); + pl = kmalloc_obj(*pl, gfp_flags); if (!pl) return NULL; diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 4509757d8b3b..858359873c4d 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -41,7 +41,7 @@ struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) struct page **pages; int i; - pages = kmalloc_array(num_pages, sizeof(*pages), flags); + pages = kmalloc_objs(*pages, num_pages, flags); if (!pages) return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++) { diff --git a/net/ceph/striper.c b/net/ceph/striper.c index 3b3fa75d1189..2b8288858079 100644 --- a/net/ceph/striper.c +++ b/net/ceph/striper.c @@ -230,8 +230,8 @@ int ceph_extent_to_file(struct ceph_file_layout *l, *num_file_extents = DIV_ROUND_UP_ULL(objoff + objlen, l->stripe_unit) - DIV_ROUND_DOWN_ULL(objoff, l->stripe_unit); - *file_extents = kmalloc_array(*num_file_extents, sizeof(**file_extents), - GFP_NOIO); + *file_extents = kmalloc_objs(**file_extents, *num_file_extents, + GFP_NOIO); if (!*file_extents) return -ENOMEM; diff --git a/net/compat.c b/net/compat.c index 485db8ee9b28..2c9bd0edac99 100644 --- a/net/compat.c +++ b/net/compat.c @@ -460,10 +460,10 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETSOCKNAME: - ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETPEERNAME: - ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1); break; case SYS_SOCKETPAIR: ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3])); diff --git a/net/core/Makefile b/net/core/Makefile index 9ef2099c5426..dc17c5a61e9a 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux networking core. # -obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ +obj-y := sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ flow_dissector.o @@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o +obj-y += netdev_config.o obj-y += netdev_rx_queue.o obj-y += netdev_queues.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 2e538399757f..f8338acebf07 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,31 +40,30 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata)); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + u32 uncharge; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; - bpf_local_storage_destroy(sk_storage); + uncharge = bpf_local_storage_destroy(sk_storage); + if (uncharge) + atomic_sub(uncharge, &sk->sk_omem_alloc); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void bpf_sk_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &sk_cache, NULL); + bpf_local_storage_map_free(map, &sk_cache); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) @@ -138,7 +137,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -161,8 +160,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) @@ -194,12 +192,19 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, copy_selem); + ret = bpf_selem_link_map(smap, new_sk_storage, copy_selem); + if (ret) { + bpf_selem_free(copy_selem, true); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { - bpf_selem_free(copy_selem, smap, true); + bpf_selem_free(copy_selem, true); atomic_sub(smap->elem_size, &newsk->sk_omem_alloc); bpf_map_put(map); @@ -213,8 +218,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. @@ -369,6 +373,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: @@ -495,7 +500,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) nr_maps++; } - diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); + diag = kzalloc_flex(*diag, maps, nr_maps); if (!diag) return ERR_PTR(-ENOMEM); diff --git a/net/core/dev.c b/net/core/dev.c index 2acfa44927da..14a83f2035b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -231,10 +231,13 @@ static bool use_backlog_threads(void) static inline void backlog_lock_irq_save(struct softnet_data *sd, unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); - else + } else { local_irq_save(*flags); + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_lock(&sd->input_pkt_queue.lock); + } } static inline void backlog_lock_irq_disable(struct softnet_data *sd) @@ -246,12 +249,15 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd) } static inline void backlog_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) + unsigned long flags) { - if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) - spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); - else - local_irq_restore(*flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, flags); + } else { + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) + spin_unlock(&sd->input_pkt_queue.lock); + local_irq_restore(flags); + } } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) @@ -267,7 +273,7 @@ static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, { struct netdev_name_node *name_node; - name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + name_node = kmalloc_obj(*name_node); if (!name_node) return NULL; INIT_HLIST_NODE(&name_node->hlist); @@ -478,15 +484,21 @@ static const unsigned short netdev_lock_type[] = { ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_CAN, ARPHRD_MCTP, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, - ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_RAWHDLC, ARPHRD_RAWIP, + ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, - ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, - ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + ARPHRD_IEEE80211_RADIOTAP, + ARPHRD_IEEE802154, ARPHRD_IEEE802154_MONITOR, + ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_CAIF, ARPHRD_IP6GRE, ARPHRD_NETLINK, ARPHRD_6LOWPAN, + ARPHRD_VSOCKMON, + ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", @@ -495,15 +507,21 @@ static const char *const netdev_lock_name[] = { "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_CAN", "_xmit_MCTP", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_RAWHDLC", "_xmit_RAWIP", + "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", - "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", - "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + "_xmit_IEEE80211_RADIOTAP", + "_xmit_IEEE802154", "_xmit_IEEE802154_MONITOR", + "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_CAIF", "_xmit_IP6GRE", "_xmit_NETLINK", "_xmit_6LOWPAN", + "_xmit_VSOCKMON", + "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -516,6 +534,7 @@ static inline unsigned short netdev_lock_pos(unsigned short dev_type) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ + WARN_ONCE(1, "netdev_lock_pos() could not find dev_type=%u\n", dev_type); return ARRAY_SIZE(netdev_lock_type) - 1; } @@ -725,7 +744,7 @@ static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; - if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + if (k >= NET_DEVICE_PATH_STACK_MAX) return NULL; return &stack->path[k]; @@ -1163,6 +1182,7 @@ void netdev_copy_name(struct net_device *dev, char *name) strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } +EXPORT_IPV6_MOD_GPL(netdev_copy_name); /** * netdev_get_name - get a netdevice name, knowing its ifindex. @@ -2462,9 +2482,9 @@ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) +static int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; @@ -2483,7 +2503,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb, list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; - if (pt_prev) + if (unlikely(pt_prev)) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } @@ -2544,7 +2564,7 @@ again: if (skb_loop_sk(ptype, skb)) continue; - if (pt_prev) { + if (unlikely(pt_prev)) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; @@ -3373,6 +3393,13 @@ static void __netif_reschedule(struct Qdisc *q) void __netif_schedule(struct Qdisc *q) { + /* If q->defer_list is not empty, at least one thread is + * in __dev_xmit_skb() before llist_del_all(&q->defer_list). + * This thread will attempt to run the queue. + */ + if (!llist_empty(&q->defer_list)) + return; + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } @@ -3782,7 +3809,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) - features &= ~NETIF_F_TSO_MANGLEID; + features &= ~dev->mangleid_features; } /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, @@ -3793,8 +3820,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && skb_transport_header_was_set(skb) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr) && - !ipv6_has_hopopt_jumbo(skb)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr)) features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); return features; @@ -3897,8 +3923,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr) && - !ipv6_has_hopopt_jumbo(skb)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr)) goto sw_checksum; switch (skb->csum_offset) { @@ -3962,7 +3987,7 @@ static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, if (shinfo->nr_frags > 0) { niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); if (net_is_devmem_iov(niov) && - net_devmem_iov_binding(niov)->dev != dev) + READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev) goto out_free; } @@ -4061,17 +4086,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff *skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size && skb_transport_header_was_set(skb)) { - u16 gso_segs = shinfo->gso_segs; + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; /* mac layer + network layer */ @@ -4104,6 +4135,8 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } @@ -4125,9 +4158,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); - struct sk_buff *to_free = NULL; - bool contended; + struct llist_node *ll_list, *first_n; + unsigned long defer_count = 0; int rc; qdisc_calculate_pkt_len(skb, q); @@ -4143,9 +4177,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); - goto no_lock_out; + goto free_skbs; } qdisc_bstats_cpu_update(q, skb); @@ -4153,81 +4187,95 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, !nolock_qdisc_is_empty(q)) __qdisc_run(q); - qdisc_run_end(q); - return NET_XMIT_SUCCESS; + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - qdisc_run(q); - -no_lock_out: - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - return rc; + to_free2 = qdisc_run(q); + goto free_skbs; } - if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { - kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); - return NET_XMIT_DROP; - } - /* - * Heuristic to force contended enqueues to serialize on a - * separate lock before trying to get qdisc main lock. - * This permits qdisc->running owner to get the lock more - * often and dequeue packets faster. - * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit - * and then other tasks will only enqueue packets. The packets will be - * sent after the qdisc owner is scheduled again. To prevent this - * scenario the task always serialize on the lock. + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. + * In the try_cmpxchg() loop, we want to increment q->defer_count + * at most once to limit the number of skbs in defer_list. + * We perform the defer_count increment only if the list is not empty, + * because some arches have slow atomic_long_inc_return(). + */ + first_n = READ_ONCE(q->defer_list.first); + do { + if (first_n && !defer_count) { + defer_count = atomic_long_inc_return(&q->defer_count); + if (unlikely(defer_count > READ_ONCE(net_hotdata.qdisc_max_burst))) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_BURST_DROP); + return NET_XMIT_DROP; + } + } + skb->ll_node.next = first_n; + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); + + /* If defer_list was not empty, we know the cpu which queued + * the first skb will process the whole list for us. */ - contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); - if (unlikely(contended)) - spin_lock(&q->busylock); + if (first_n) + return NET_XMIT_SUCCESS; spin_lock(root_lock); + + ll_list = llist_del_all(&q->defer_list); + /* There is a small race because we clear defer_count not atomically + * with the prior llist_del_all(). This means defer_list could grow + * over qdisc_max_burst. + */ + atomic_long_set(&q->defer_count, 0); + + ll_list = llist_reverse_order(ll_list); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - __qdisc_drop(skb, &to_free); + llist_for_each_entry_safe(skb, next, ll_list, ll_node) + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - qdisc_run_begin(q)) { + goto unlock; + } + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, + struct sk_buff, + ll_node)); qdisc_bstats_update(q, skb); - - if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - } - - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - WRITE_ONCE(q->owner, smp_processor_id()); - rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - WRITE_ONCE(q->owner, -1); - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; + int count = 0; + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + if (next) { + prefetch(next); + prefetch(&next->priority); + skb_mark_not_on_list(skb); } - __qdisc_run(q); - qdisc_run_end(q); + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + count++; } + to_free2 = qdisc_run(q); + if (count != 1) + rc = NET_XMIT_SUCCESS; } +unlock: spin_unlock(root_lock); - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - if (unlikely(contended)) - spin_unlock(&q->busylock); + +free_skbs: + tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } @@ -4332,7 +4380,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, return ret; tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); @@ -4398,12 +4446,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - if (*pt_prev) { + if (unlikely(*pt_prev)) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { @@ -4591,6 +4639,32 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_zero); +int sk_tx_queue_get(const struct sock *sk) +{ + int resel, val; + + if (!sk) + return -1; + /* Paired with WRITE_ONCE() in sk_tx_queue_clear() + * and sk_tx_queue_set(). + */ + val = READ_ONCE(sk->sk_tx_queue_mapping); + + if (val == NO_QUEUE_MAPPING) + return -1; + + if (!sk_fullsock(sk)) + return val; + + resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); + if (resel && time_is_before_jiffies( + READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) + return -1; + + return val; +} +EXPORT_SYMBOL(sk_tx_queue_get); + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -4606,8 +4680,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); - if (queue_index != new_index && sk && - sk_fullsock(sk) && + if (sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); @@ -4689,7 +4762,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); - qdisc_pkt_len_init(skb); + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { @@ -4745,10 +4818,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - /* Other cpus might concurrently change txq->xmit_lock_owner - * to -1 or to their cpu id, but not to our id. - */ - if (READ_ONCE(txq->xmit_lock_owner) != cpu) { + if (!netif_tx_owned(txq, cpu)) { + bool is_list = false; + if (dev_xmit_recursion()) goto recursion_alert; @@ -4759,17 +4831,28 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { + is_list = !!skb->next; + dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); - if (dev_xmit_complete(rc)) { - HARD_TX_UNLOCK(dev, txq); - goto out; - } + + /* GSO segments a single SKB into + * a list of frames. TCP expects error + * to mean none of the data was sent. + */ + if (is_list) + rc = NETDEV_TX_OK; } HARD_TX_UNLOCK(dev, txq); + if (!skb) /* xmit completed */ + goto out; + net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); + /* NETDEV_TX_BUSY or queue was stopped */ + if (!is_list) + rc = -ENETDOWN; } else { /* Recursion is detected! It is possible, * unfortunately @@ -4777,10 +4860,10 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); + rc = -ENETDOWN; } } - rc = -ENETDOWN; rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); @@ -4919,8 +5002,7 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow, static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, - u32 flow_id) + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash) { if (next_cpu < nr_cpu_ids) { u32 head; @@ -4931,6 +5013,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *tmp_rflow; unsigned int tmp_cpu; u16 rxq_index; + u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ @@ -4946,6 +5029,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!flow_table) goto out; + flow_id = rfs_slot(hash, flow_table); tmp_rflow = &flow_table->flows[flow_id]; tmp_cpu = READ_ONCE(tmp_rflow->cpu); @@ -4993,7 +5077,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; - u32 flow_id; u32 tcpu; u32 hash; @@ -5040,8 +5123,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - flow_id = rfs_slot(hash, flow_table); - rflow = &flow_table->flows[flow_id]; + rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; tcpu = rflow->cpu; /* @@ -5060,8 +5142,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; - rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, - flow_id); + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { @@ -5191,7 +5272,7 @@ void kick_defer_list_purge(unsigned int cpu) if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); @@ -5202,14 +5283,15 @@ void kick_defer_list_purge(unsigned int cpu) int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif -static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, + int max_backlog) { #ifdef CONFIG_NET_FLOW_LIMIT - struct sd_flow_limit *fl; - struct softnet_data *sd; unsigned int old_flow, new_flow; + const struct softnet_data *sd; + struct sd_flow_limit *fl; - if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) + if (likely(qlen < (max_backlog >> 1))) return false; sd = this_cpu_ptr(&softnet_data); @@ -5254,19 +5336,19 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, u32 tail; reason = SKB_DROP_REASON_DEV_READY; - if (!netif_running(skb->dev)) + if (unlikely(!netif_running(skb->dev))) goto bad_dev; - reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); - if (unlikely(qlen > max_backlog)) + if (unlikely(qlen > max_backlog) || + skb_flow_limit(skb, qlen, max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { + if (likely(qlen <= max_backlog)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. @@ -5277,16 +5359,17 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); cpu_backlog_drop: + reason = SKB_DROP_REASON_CPU_BACKLOG; numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); @@ -5693,8 +5776,9 @@ static __latent_entropy void net_tx_action(void) rcu_read_lock(); while (head) { - struct Qdisc *q = head; spinlock_t *root_lock = NULL; + struct sk_buff *to_free; + struct Qdisc *q = head; head = head->next_sched; @@ -5721,9 +5805,10 @@ static __latent_entropy void net_tx_action(void) } clear_bit(__QDISC_STATE_SCHED, &q->state); - qdisc_run(q); + to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } rcu_read_unlock(); @@ -5833,7 +5918,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, if (nf_hook_ingress_active(skb)) { int ingress_retval; - if (*pt_prev) { + if (unlikely(*pt_prev)) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } @@ -5910,13 +5995,13 @@ another_round: list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, list) { - if (pt_prev) + if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { - if (pt_prev) + if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } @@ -5947,7 +6032,7 @@ skip_classify: } if (skb_vlan_tag_present(skb)) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } @@ -5959,7 +6044,7 @@ skip_classify: rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } @@ -6433,8 +6518,7 @@ struct flush_backlogs { static struct flush_backlogs *flush_backlogs_alloc(void) { - return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), - GFP_KERNEL); + return kmalloc_flex(struct flush_backlogs, w, nr_cpu_ids); } static struct flush_backlogs *flush_backlogs_fallback; @@ -6732,6 +6816,7 @@ static void skb_defer_free_flush(void) free_list = llist_del_all(&sdn->defer_list); llist_for_each_entry_safe(skb, next, free_list, ll_node) { + prefetch(next); napi_consume_skb(skb, 1); } } @@ -7039,7 +7124,8 @@ static void napi_stop_kthread(struct napi_struct *napi) */ if ((val & NAPIF_STATE_SCHED_THREADED) || !(val & NAPIF_STATE_SCHED)) { - new = val & (~NAPIF_STATE_THREADED); + new = val & (~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL)); } else { msleep(20); continue; @@ -7063,6 +7149,16 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded_mode) +{ + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; + + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); +} + int napi_set_threaded(struct napi_struct *napi, enum netdev_napi_threaded threaded) { @@ -7089,7 +7185,7 @@ int napi_set_threaded(struct napi_struct *napi, } else { /* Make sure kthread is created before THREADED bit is set. */ smp_mb__before_atomic(); - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + napi_set_threaded_state(napi, threaded); } return 0; @@ -7481,7 +7577,9 @@ void napi_disable_locked(struct napi_struct *n) } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; - new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + new &= ~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL | + NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); @@ -7693,11 +7791,12 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void napi_threaded_poll_loop(struct napi_struct *napi) +static void napi_threaded_poll_loop(struct napi_struct *napi, + unsigned long *busy_poll_last_qs) { + unsigned long last_qs = busy_poll_last_qs ? *busy_poll_last_qs : jiffies; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; - unsigned long last_qs = jiffies; for (;;) { bool repoll = false; @@ -7722,22 +7821,51 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) } skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); + + /* When busy poll is enabled, the old packets are not flushed in + * napi_complete_done. So flush them here. + */ + if (busy_poll_last_qs) + gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); + /* Call cond_resched here to avoid watchdog warnings. */ + if (repoll || busy_poll_last_qs) { + rcu_softirq_qs_periodic(last_qs); + cond_resched(); + } + if (!repoll) break; - - rcu_softirq_qs_periodic(last_qs); - cond_resched(); } + + if (busy_poll_last_qs) + *busy_poll_last_qs = last_qs; } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + unsigned long last_qs = jiffies; + bool want_busy_poll; + bool in_busy_poll; + unsigned long val; + + while (!napi_thread_wait(napi)) { + val = READ_ONCE(napi->state); - while (!napi_thread_wait(napi)) - napi_threaded_poll_loop(napi); + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; + + if (unlikely(val & NAPIF_STATE_DISABLE)) + want_busy_poll = false; + + if (want_busy_poll != in_busy_poll) + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, + want_busy_poll); + + napi_threaded_poll_loop(napi, want_busy_poll ? &last_qs : NULL); + } return 0; } @@ -8578,7 +8706,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; } - adj = kmalloc(sizeof(*adj), GFP_KERNEL); + adj = kmalloc_obj(*adj); if (!adj) return -ENOMEM; @@ -9018,8 +9146,7 @@ static int netdev_offload_xstats_enable_l3(struct net_device *dev, int err; int rc; - dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), - GFP_KERNEL); + dev->offload_xstats_l3 = kzalloc_obj(*dev->offload_xstats_l3); if (!dev->offload_xstats_l3) return -ENOMEM; @@ -9885,7 +10012,7 @@ DECLARE_RWSEM(dev_addr_sem); /* "sa" is a true struct sockaddr with limited "sa_data" member. */ int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { - size_t size = sizeof(sa->sa_data_min); + size_t size = sizeof(sa->sa_data); struct net_device *dev; int ret = 0; @@ -10544,7 +10671,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return -EINVAL; } - link = kzalloc(sizeof(*link), GFP_USER); + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto unlock; @@ -11274,6 +11401,9 @@ int register_netdevice(struct net_device *dev) if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; + /* TSO_MANGLEID belongs in mangleid_features by definition */ + dev->mangleid_features |= NETIF_F_TSO_MANGLEID; + /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; @@ -11822,7 +11952,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) #ifdef CONFIG_NET_CLS_ACT if (queue) return queue; - queue = kzalloc(sizeof(*queue), GFP_KERNEL); + queue = kzalloc_obj(*queue); if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); @@ -11897,8 +12027,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, maxqs = max(txqs, rxqs); - dev = kvzalloc(struct_size(dev, priv, sizeof_priv), - GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + dev = kvzalloc_flex(*dev, priv, sizeof_priv, + GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) return NULL; @@ -11969,11 +12099,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; - dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); + dev->ethtool = kzalloc_obj(*dev->ethtool, GFP_KERNEL_ACCOUNT); if (!dev->ethtool) goto free_all; - dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + dev->cfg = kzalloc_obj(*dev->cfg, GFP_KERNEL_ACCOUNT); if (!dev->cfg) goto free_all; dev->cfg_pending = dev->cfg; @@ -12646,12 +12776,100 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); +/** + * netdev_compute_master_upper_features - compute feature from lowers + * @dev: the upper device + * @update_header: whether to update upper device's header_len/headroom/tailroom + * + * Recompute the upper device's feature based on all lower devices. + */ +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) +{ + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; + netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; + netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; + netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; + netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; + unsigned short max_header_len = ETH_HLEN; + unsigned int tso_max_size = TSO_MAX_SIZE; + unsigned short max_headroom = 0; + unsigned short max_tailroom = 0; + u16 tso_max_segs = TSO_MAX_SEGS; + struct net_device *lower_dev; + struct list_head *iter; + + mpls_features = netdev_base_features(mpls_features); + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + gso_partial_features = netdev_increment_features(gso_partial_features, + lower_dev->gso_partial_features, + MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); + + vlan_features = netdev_increment_features(vlan_features, + lower_dev->vlan_features, + MASTER_UPPER_DEV_VLAN_FEATURES); + + enc_features = netdev_increment_features(enc_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_ENC_FEATURES); + + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + xfrm_features = netdev_increment_features(xfrm_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_XFRM_FEATURES); + + mpls_features = netdev_increment_features(mpls_features, + lower_dev->mpls_features, + MASTER_UPPER_DEV_MPLS_FEATURES); + + dst_release_flag &= lower_dev->priv_flags; + + if (update_header) { + max_header_len = max(max_header_len, lower_dev->hard_header_len); + max_headroom = max(max_headroom, lower_dev->needed_headroom); + max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); + } + + tso_max_size = min(tso_max_size, lower_dev->tso_max_size); + tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); + } + + dev->gso_partial_features = gso_partial_features; + dev->vlan_features = vlan_features; + dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + dev->hw_enc_features |= xfrm_features; + dev->mpls_features = mpls_features; + + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + dev->priv_flags |= IFF_XMIT_DST_RELEASE; + + if (update_header) { + dev->hard_header_len = max_header_len; + dev->needed_headroom = max_headroom; + dev->needed_tailroom = max_tailroom; + } + + netif_set_tso_max_segs(dev, tso_max_segs); + netif_set_tso_max_size(dev, tso_max_size); + + netdev_change_features(dev); +} +EXPORT_SYMBOL(netdev_compute_master_upper_features); + static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; - hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); + hash = kmalloc_objs(*hash, NETDEV_HASHENTRIES); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); @@ -12959,7 +13177,7 @@ static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); - napi_threaded_poll_loop(&sd->backlog); + napi_threaded_poll_loop(&sd->backlog, NULL); } static void backlog_napi_setup(unsigned int cpu) diff --git a/net/core/dev.h b/net/core/dev.h index 900880e8b5b4..781619e76b3e 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -10,6 +10,7 @@ struct net; struct netlink_ext_ack; +struct netdev_queue_config; struct cpumask; /* Random bits of netdevice that don't need to be exposed */ @@ -29,7 +30,6 @@ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, @@ -92,6 +92,10 @@ extern struct rw_semaphore dev_addr_sem; extern struct list_head net_todo_list; void netdev_run_todo(void); +int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack); + /* netdev management, shared between various uAPI entry points */ struct netdev_name_node { struct hlist_node hlist; @@ -317,6 +321,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) { + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) + return NETDEV_NAPI_THREADED_BUSY_POLL; + if (test_bit(NAPI_STATE_THREADED, &n->state)) return NETDEV_NAPI_THREADED_ENABLED; @@ -359,41 +366,6 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) void kick_defer_list_purge(unsigned int cpu); -#define XMIT_RECURSION_LIMIT 8 - -#ifndef CONFIG_PREEMPT_RT -static inline bool dev_xmit_recursion(void) -{ - return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > - XMIT_RECURSION_LIMIT); -} - -static inline void dev_xmit_recursion_inc(void) -{ - __this_cpu_inc(softnet_data.xmit.recursion); -} - -static inline void dev_xmit_recursion_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.recursion); -} -#else -static inline bool dev_xmit_recursion(void) -{ - return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); -} - -static inline void dev_xmit_recursion_inc(void) -{ - current->net_xmit.recursion++; -} - -static inline void dev_xmit_recursion_dec(void) -{ - current->net_xmit.recursion--; -} -#endif - int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 8bb71a10dba0..7a8966544c9d 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -249,10 +249,11 @@ int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) * * Helper for calling the default hardware provider timestamping. * - * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and - * there only exists a phydev->mii_ts->hwtstamp() method. So this will return - * -EOPNOTSUPP for phylib for now, which is still more accurate than letting - * the netdev handle the GET request. + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but + * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this + * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not + * implemented for now, which is still more accurate than letting the netdev + * handle the GET request. */ int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) @@ -286,7 +287,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) int err; if (!ops->ndo_hwtstamp_get) - return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -413,7 +414,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) } if (!ops->ndo_hwtstamp_set) - return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -437,48 +438,23 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return 0; } -static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, - struct kernel_hwtstamp_config *kernel_cfg) -{ - struct ifreq ifrr; - int err; - - if (!kernel_cfg->ifr) - return -EINVAL; - - strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); - ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; - - err = dev_eth_ioctl(dev, &ifrr, cmd); - if (err) - return err; - - kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; - kernel_cfg->copied_to_user = true; - - return 0; -} - int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg) { const struct net_device_ops *ops = dev->netdev_ops; + int err; if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) { - int err; - - netdev_lock_ops(dev); - err = dev_get_hwtstamp_phylib(dev, kernel_cfg); - netdev_unlock_ops(dev); + if (!ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; - return err; - } + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); - /* Legacy path: unconverted lower driver */ - return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); + return err; } EXPORT_SYMBOL(generic_hwtstamp_get_lower); @@ -487,22 +463,19 @@ int generic_hwtstamp_set_lower(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; + int err; if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) { - int err; - - netdev_lock_ops(dev); - err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); - netdev_unlock_ops(dev); + if (!ops->ndo_hwtstamp_set) + return -EOPNOTSUPP; - return err; - } + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); - /* Legacy path: unconverted lower driver */ - return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); + return err; } EXPORT_SYMBOL(generic_hwtstamp_set_lower); @@ -599,7 +572,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof(ifr->ifr_hwaddr.sa_data_min), + min(sizeof(ifr->ifr_hwaddr.sa_data), (size_t)dev->addr_len)); netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); diff --git a/net/core/devmem.c b/net/core/devmem.c index 1d04754bc756..69d79aee07ef 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -30,11 +30,6 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); static const struct memory_provider_ops dmabuf_devmem_ops; -bool net_is_devmem_iov(struct net_iov *niov) -{ - return niov->type == NET_IOV_DMABUF; -} - static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) @@ -54,6 +49,15 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } +static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref) +{ + struct net_devmem_dmabuf_binding *binding = + container_of(ref, struct net_devmem_dmabuf_binding, ref); + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); +} + void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); @@ -75,6 +79,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + percpu_ref_exit(&binding->ref); kvfree(binding->tx_vec); kfree(binding); } @@ -97,9 +102,9 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) index = offset / PAGE_SIZE; niov = &owner->area.niovs[index]; - niov->pp_magic = 0; - niov->pp = NULL; - atomic_long_set(&niov->pp_ref_count, 0); + niov->desc.pp_magic = 0; + niov->desc.pp = NULL; + atomic_long_set(&niov->desc.pp_ref_count, 0); return niov; } @@ -143,7 +148,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - net_devmem_dmabuf_binding_put(binding); + percpu_ref_kill(&binding->ref); } int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, @@ -209,7 +214,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, binding->dev = dev; xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); - refcount_set(&binding->ref, 1); + err = percpu_ref_init(&binding->ref, + net_devmem_dmabuf_binding_release, + 0, GFP_KERNEL); + if (err < 0) + goto err_free_binding; mutex_init(&binding->lock); @@ -220,7 +229,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_binding; + goto err_exit_ref; } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, @@ -232,9 +241,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, } if (direction == DMA_TO_DEVICE) { - binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, - sizeof(struct net_iov *), - GFP_KERNEL); + binding->tx_vec = kvmalloc_objs(struct net_iov *, + dmabuf->size / PAGE_SIZE); if (!binding->tx_vec) { err = -ENOMEM; goto err_unmap; @@ -280,9 +288,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, goto err_free_chunks; } - owner->area.niovs = kvmalloc_array(owner->area.num_niovs, - sizeof(*owner->area.niovs), - GFP_KERNEL); + owner->area.niovs = kvmalloc_objs(*owner->area.niovs, + owner->area.num_niovs); if (!owner->area.niovs) { err = -ENOMEM; goto err_free_chunks; @@ -322,6 +329,8 @@ err_unmap: direction); err_detach: dma_buf_detach(dmabuf, binding->attachment); +err_exit_ref: + percpu_ref_exit(&binding->ref); err_free_binding: kfree(binding); err_put_dmabuf: @@ -387,7 +396,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, * net_device. */ dst_dev = dst_dev_rcu(dst); - if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { + if (unlikely(!dst_dev) || + unlikely(dst_dev != READ_ONCE(binding->dev))) { err = -ENODEV; goto out_unlock; } @@ -504,7 +514,8 @@ static void mp_dmabuf_devmem_uninstall(void *mp_priv, xa_erase(&binding->bound_rxqs, xa_idx); if (xa_empty(&binding->bound_rxqs)) { mutex_lock(&binding->lock); - binding->dev = NULL; + ASSERT_EXCLUSIVE_WRITER(binding->dev); + WRITE_ONCE(binding->dev, NULL); mutex_unlock(&binding->lock); } break; diff --git a/net/core/devmem.h b/net/core/devmem.h index 101150d761af..1c5c18581fcb 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -41,7 +41,7 @@ struct net_devmem_dmabuf_binding { * retransmits) hold a reference to the binding until the skb holding * them is freed. */ - refcount_t ref; + struct percpu_ref ref; /* The list of bindings currently active. Used for netlink to notify us * of the user dropping the bind. @@ -94,7 +94,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -126,17 +125,13 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - return refcount_inc_not_zero(&binding->ref); + return percpu_ref_tryget(&binding->ref); } static inline void net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) { - if (!refcount_dec_and_test(&binding->ref)) - return; - - INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); - schedule_work(&binding->unbind_w); + percpu_ref_put(&binding->ref); } void net_devmem_get_net_iov(struct net_iov *niov); @@ -146,7 +141,7 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); -bool net_is_devmem_iov(struct net_iov *niov); + struct net_devmem_dmabuf_binding * net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); struct net_iov * @@ -219,11 +214,6 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) return 0; } -static inline bool net_is_devmem_iov(struct net_iov *niov) -{ - return false; -} - static inline struct net_devmem_dmabuf_binding * net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 60d31c2feed3..f23cea9e1aaf 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -306,8 +306,7 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) struct net_dm_hw_entries *hw_entries; unsigned long flags; - hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), - GFP_KERNEL); + hw_entries = kzalloc_flex(*hw_entries, entries, dm_hit_limit); if (!hw_entries) { /* If the memory allocation failed, we try to perform another * allocation in 1/10 second. Otherwise, the probe function @@ -856,7 +855,7 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) const char *trap_group_name; const char *trap_name; - hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); + hw_metadata = kzalloc_obj(*hw_metadata, GFP_ATOMIC); if (!hw_metadata) return NULL; @@ -1583,7 +1582,7 @@ static int dropmon_net_event(struct notifier_block *ev_block, case NETDEV_REGISTER: if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) break; - stat = kzalloc(sizeof(*stat), GFP_KERNEL); + stat = kzalloc_obj(*stat); if (!stat) break; diff --git a/net/core/dst.c b/net/core/dst.c index e9d35f49c9e7..092861133023 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -68,6 +68,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->lwtstate = NULL; rcuref_init(&dst->__rcuref, 1); INIT_LIST_HEAD(&dst->rt_uncached); + dst->rt_uncached_list = NULL; dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; @@ -190,7 +191,7 @@ EXPORT_SYMBOL(dst_release_immediate); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); + struct dst_metrics *p = kmalloc_obj(*p, GFP_ATOMIC); if (p) { struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); @@ -294,8 +295,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, { struct metadata_dst *md_dst; - md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen), - flags); + md_dst = kmalloc_flex(*md_dst, u.tun_info.options, optslen, flags); if (!md_dst) return NULL; diff --git a/net/core/failover.c b/net/core/failover.c index 2a140b3ea669..0eb2e0ec875b 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -247,7 +247,7 @@ struct failover *failover_register(struct net_device *dev, if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); - failover = kzalloc(sizeof(*failover), GFP_KERNEL); + failover = kzalloc_obj(*failover); if (!failover) return ERR_PTR(-ENOMEM); diff --git a/net/core/filter.c b/net/core/filter.c index fa06c5a08e22..78b548158fb0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -600,8 +600,7 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, if (new_prog) { first_insn = new_prog->insnsi; - addrs = kcalloc(len, sizeof(*addrs), - GFP_KERNEL | __GFP_NOWARN); + addrs = kzalloc_objs(*addrs, len, GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } @@ -1162,7 +1161,7 @@ static int bpf_prog_store_orig_filter(struct bpf_prog *fp, unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; - fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); + fp->orig_prog = kmalloc_obj(*fkprog); if (!fp->orig_prog) return -ENOMEM; @@ -1482,7 +1481,7 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; - fp = kmalloc(sizeof(*fp), GFP_KERNEL); + fp = kmalloc_obj(*fp); if (!fp) return -ENOMEM; @@ -2229,6 +2228,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, return -ENOMEM; } + if (unlikely(!ipv6_mod_enabled())) + goto out_drop; + rcu_read_lock(); if (!nh) { dst = skb_dst(skb); @@ -2289,12 +2291,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - DEV_STATS_INC(dev, tx_errors); + dev_core_stats_tx_dropped_inc(dev); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - DEV_STATS_INC(dev, tx_errors); + dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); out_xmit: return ret; @@ -2336,6 +2338,10 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { + if (unlikely(!ipv6_mod_enabled())) { + rcu_read_unlock(); + goto out_drop; + } neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); is_v6gw = true; } else if (nh->nh_family == AF_INET) { @@ -2396,12 +2402,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - DEV_STATS_INC(dev, tx_errors); + dev_core_stats_tx_dropped_inc(dev); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - DEV_STATS_INC(dev, tx_errors); + dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); out_xmit: return ret; @@ -2458,6 +2464,13 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; + /* BPF test infra's convert___skb_to_skb() can create type-less + * GSO packets. gso_features_check() will detect this as a bad + * offload. However, lets not leak them out in the first place. + */ + if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) + return -EBADMSG; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; @@ -3253,11 +3266,11 @@ static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto) static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { - /* Caller already did skb_cow() with len as headroom, + /* Caller already did skb_cow() with meta_len+len as headroom, * so no need to do it here. */ skb_push(skb, len); - memmove(skb->data, skb->data + len, off); + skb_postpush_data_move(skb, len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) @@ -3281,7 +3294,7 @@ static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); - memmove(skb->data, old_data, off); + skb_postpull_data_move(skb, len, off); return 0; } @@ -3326,10 +3339,11 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + const u8 meta_len = skb_metadata_len(skb); u32 off = skb_mac_header_len(skb); int ret; - ret = skb_cow(skb, len_diff); + ret = skb_cow(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; @@ -3345,6 +3359,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } + shinfo->gso_type |= SKB_GSO_DODGY; } bpf_skb_change_protocol(skb, ETH_P_IPV6); @@ -3375,6 +3390,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } + shinfo->gso_type |= SKB_GSO_DODGY; } bpf_skb_change_protocol(skb, ETH_P_IP); @@ -3489,6 +3505,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; + const u8 meta_len = skb_metadata_len(skb); unsigned int gso_type = SKB_GSO_DODGY; int ret; @@ -3499,7 +3516,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return -ENOTSUPP; } - ret = skb_cow_head(skb, len_diff); + ret = skb_cow_head(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; @@ -3873,6 +3890,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = { static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { + const u8 meta_len = skb_metadata_len(skb); u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; @@ -3882,7 +3900,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, new_len < skb->len)) return -EINVAL; - ret = skb_cow(skb, head_room); + ret = skb_cow(skb, meta_len + head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that @@ -3894,6 +3912,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, * for redirection into L2 device. */ __skb_push(skb, head_room); + skb_postpush_data_move(skb, head_room, 0); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); @@ -4124,7 +4143,7 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, }; @@ -4138,12 +4157,14 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; - unsigned int tailroom; + int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; - tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + tailroom = rxq->frag_size - skb_frag_size(frag) - + skb_frag_off(frag) % rxq->frag_size; + WARN_ON_ONCE(tailroom < 0); if (unlikely(offset > tailroom)) return -EINVAL; @@ -5734,6 +5755,77 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, + char *optval, int optlen, + bool getopt) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + if (!sk_has_account(sk)) + return -EOPNOTSUPP; + + if (getopt) { + *(int *)optval = sk->sk_bypass_prot_mem; + return 0; + } + + val = *(int *)optval; + if (val < 0 || val > 1) + return -EINVAL; + + sk->sk_bypass_prot_mem = val; + return 0; +} + +BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) + return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); + + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { + .func = bpf_sock_create_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { + int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); + + if (err) + memset(optval, 0, optlen); + + return err; + } + + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { + .func = bpf_sock_create_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5908,7 +6000,7 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; - return __inet_bind(sk, addr, addr_len, flags); + return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) @@ -5918,7 +6010,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); + return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr, + addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ @@ -6316,7 +6409,7 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -6371,7 +6464,7 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -6422,9 +6515,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; - if (flags & BPF_MTU_CHK_SEGS && - !skb_gso_validate_network_len(skb, mtu)) - ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + if (flags & BPF_MTU_CHK_SEGS) { + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + if (!skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } } out: *mtu_len = mtu; @@ -7922,9 +8018,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -7954,9 +8050,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -7974,9 +8070,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; @@ -7998,9 +8094,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ @@ -8063,6 +8159,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id, prog); } @@ -11921,7 +12031,7 @@ BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct unix_sock); - if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) + if (sk && sk_is_unix(sk)) return (unsigned long)sk; return (unsigned long)NULL; @@ -12016,6 +12126,18 @@ void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; } +int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (unlikely(bpf_try_make_writable(skb, 0))) + return -EFAULT; + + memmove(bpf_skb_meta_pointer(skb, offset), from, len); + return 0; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12043,9 +12165,6 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to * &__sk_buff->data_meta. * - * If passed @skb_ is a clone which shares the data with the original, the - * dynptr will be read-only. This limitation may be lifted in the future. - * * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null @@ -12063,9 +12182,6 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); - if (skb_cloned(skb)) - bpf_dynptr_set_rdonly(ptr); - return 0; } @@ -12332,11 +12448,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12349,11 +12465,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) -BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) -BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { @@ -12448,7 +12564,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_destroy) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index bc5169482710..5071d7fe6ce2 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -12,8 +12,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) struct flow_rule *rule; int i; - rule = kzalloc(struct_size(rule, action.entries, num_actions), - GFP_KERNEL); + rule = kzalloc_flex(*rule, action.entries, num_actions); if (!rule) return NULL; @@ -33,8 +32,7 @@ struct flow_offload_action *offload_action_alloc(unsigned int num_actions) struct flow_offload_action *fl_action; int i; - fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions), - GFP_KERNEL); + fl_action = kzalloc_flex(*fl_action, action.entries, num_actions); if (!fl_action) return NULL; @@ -264,7 +262,7 @@ struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, { struct flow_block_cb *block_cb; - block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + block_cb = kzalloc_obj(*block_cb); if (!block_cb) return ERR_PTR(-ENOMEM); @@ -391,7 +389,7 @@ static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, { struct flow_indr_dev *indr_dev; - indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); + indr_dev = kmalloc_obj(*indr_dev); if (!indr_dev) return NULL; @@ -571,7 +569,7 @@ static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch, if (info) return -EEXIST; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc_obj(*info); if (!info) return -ENOMEM; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index f112156db587..c34e58c6c3e6 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -154,7 +154,7 @@ int gen_new_estimator(struct gnet_stats_basic_sync *bstats, if (parm->ewma_log == 0 || parm->ewma_log >= 31) return -EINVAL; - est = kzalloc(sizeof(*est), GFP_KERNEL); + est = kzalloc_obj(*est); if (!est) return -ENOBUFS; diff --git a/net/core/gro.c b/net/core/gro.c index 76f9c3712422..31d21de5b15a 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -115,8 +115,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || - (p->protocol == htons(ETH_P_IPV6) && - skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } @@ -265,6 +263,8 @@ static void gro_complete(struct gro_node *gro, struct sk_buff *skb) goto out; } + /* NICs can feed encapsulated packets into GRO */ + skb->encapsulation = 0; rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) @@ -415,7 +415,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) { struct skb_shared_info *pinfo = skb_shinfo(skb); - BUG_ON(skb->end - skb->tail < grow); + DEBUG_NET_WARN_ON_ONCE(skb->end - skb->tail < grow); memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index a725d21159a6..1b84385c04bd 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -132,7 +132,7 @@ void gro_cells_destroy(struct gro_cells *gcells) * because we might be called from cleanup_net(), and we * definitely do not want to block this critical task. */ - defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); + defer = kmalloc_obj(*defer, GFP_KERNEL | __GFP_NOWARN); if (likely(defer)) { defer->ptr = gcells->cells; call_rcu(&defer->rcu, percpu_free_defer_callback); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 95d0a4df1006..a6db36580817 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -17,10 +17,11 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, + .qdisc_max_burst = 1000, .dev_tx_weight = 64, .dev_rx_weight = 64, .sysctl_max_skb_frags = MAX_SKB_FRAGS, - .sysctl_skb_defer_max = 64, + .sysctl_skb_defer_max = 128, .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 212cde35affa..25c455c10a01 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -185,10 +185,6 @@ static void linkwatch_do_dev(struct net_device *dev) netif_state_change(dev); } - /* Note: our callers are responsible for calling netdev_tracker_free(). - * This is the reason we use __dev_put() instead of dev_put(). - */ - __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) @@ -243,6 +239,11 @@ static void __linkwatch_run_queue(int urgent_only) netdev_lock_ops(dev); linkwatch_do_dev(dev); netdev_unlock_ops(dev); + /* Use __dev_put() because netdev_tracker_free() was already + * called above. Must be after netdev_unlock_ops() to prevent + * netdev_run_todo() from freeing the device while still in use. + */ + __dev_put(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } @@ -278,8 +279,13 @@ void __linkwatch_sync_dev(struct net_device *dev) { netdev_ops_assert_locked(dev); - if (linkwatch_clean_dev(dev)) + if (linkwatch_clean_dev(dev)) { linkwatch_do_dev(dev); + /* Use __dev_put() because netdev_tracker_free() was already + * called inside linkwatch_clean_dev(). + */ + __dev_put(dev); + } } void linkwatch_sync_dev(struct net_device *dev) @@ -288,6 +294,10 @@ void linkwatch_sync_dev(struct net_device *dev) netdev_lock_ops(dev); linkwatch_do_dev(dev); netdev_unlock_ops(dev); + /* Use __dev_put() because netdev_tracker_free() was already + * called inside linkwatch_clean_dev(). + */ + __dev_put(dev); } } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bddfa389effa..c56a4e7bf790 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -51,9 +51,8 @@ do { \ #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); -static void __neigh_notify(struct neighbour *n, int type, int flags, - u32 pid); -static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); +static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid); +static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm); @@ -81,7 +80,7 @@ static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family } /* - Neighbour hash table buckets are protected with rwlock tbl->lock. + Neighbour hash table buckets are protected with tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks @@ -117,7 +116,7 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); - __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); + neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } @@ -149,7 +148,7 @@ static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; @@ -172,14 +171,14 @@ static void neigh_update_gc_list(struct neighbour *n) } out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; @@ -193,7 +192,7 @@ static void neigh_update_managed_list(struct neighbour *n) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, @@ -263,7 +262,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { @@ -292,7 +291,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) WRITE_ONCE(tbl->last_flush, jiffies); unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); return shrunk; } @@ -454,23 +453,23 @@ static void neigh_flush_table(struct neigh_table *tbl) void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (likely(dev)) { neigh_flush_dev(tbl, dev, skip_perm); } else { DEBUG_NET_WARN_ON_ONCE(skip_perm); neigh_flush_table(tbl); } - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, @@ -563,7 +562,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) struct neigh_hash_table *ret; int i; - ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + ret = kmalloc_obj(*ret, GFP_ATOMIC); if (!ret) return NULL; @@ -687,7 +686,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -722,13 +721,13 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); @@ -821,7 +820,8 @@ int pneigh_create(struct neigh_table *tbl, struct net *net, update: WRITE_ONCE(n->flags, flags); n->permanent = permanent; - WRITE_ONCE(n->protocol, protocol); + if (protocol) + WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); return err; @@ -982,7 +982,7 @@ static void neigh_periodic_work(struct work_struct *work) NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -995,8 +995,7 @@ static void neigh_periodic_work(struct work_struct *work) WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) @@ -1037,9 +1036,9 @@ static void neigh_periodic_work(struct work_struct *work) * It's fine to release lock here, even if hash table * grows while we are preempted. */ - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); cond_resched(); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } @@ -1050,7 +1049,7 @@ out: */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) @@ -1106,6 +1105,7 @@ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); + bool skip_probe = false; unsigned int state; int notify = 0; @@ -1173,9 +1173,15 @@ static void neigh_timer_handler(struct timer_list *t) neigh_invalidate(neigh); } notify = 1; - goto out; + skip_probe = true; } + if (notify) + __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0); + + if (skip_probe) + goto out; + if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100; @@ -1190,7 +1196,7 @@ out: } if (notify) - neigh_update_notify(neigh, 0); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); trace_neigh_timer_handler(neigh, 0); @@ -1304,6 +1310,47 @@ static void neigh_update_hhs(struct neighbour *neigh) } } +static void neigh_update_process_arp_queue(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + struct sk_buff *skb; + + /* Again: avoid deadlock if something went wrong. */ + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; + + write_unlock_bh(&neigh->lock); + + rcu_read_lock(); + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. + */ + n2 = NULL; + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } + READ_ONCE(n1->output)(n1, skb); + if (n2) + neigh_release(n2); + rcu_read_unlock(); + + write_lock_bh(&neigh->lock); + } + __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; +} + /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. @@ -1330,6 +1377,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; + bool process_arp_queue = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; @@ -1463,53 +1511,30 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_connect(neigh); else neigh_suspect(neigh); - if (!(old & NUD_VALID)) { - struct sk_buff *skb; - - /* Again: avoid dead loop if something went wrong */ - while (neigh->nud_state & NUD_VALID && - (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { - struct dst_entry *dst = skb_dst(skb); - struct neighbour *n2, *n1 = neigh; - write_unlock_bh(&neigh->lock); - - rcu_read_lock(); - - /* Why not just use 'neigh' as-is? The problem is that - * things such as shaper, eql, and sch_teql can end up - * using alternative, different, neigh objects to output - * the packet in the output path. So what we need to do - * here is re-lookup the top-level neigh in the path so - * we can reinject the packet there. - */ - n2 = NULL; - if (dst && - READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { - n2 = dst_neigh_lookup_skb(dst, skb); - if (n2) - n1 = n2; - } - READ_ONCE(n1->output)(n1, skb); - if (n2) - neigh_release(n2); - rcu_read_unlock(); + if (!(old & NUD_VALID)) + process_arp_queue = true; - write_lock_bh(&neigh->lock); - } - __skb_queue_purge(&neigh->arp_queue); - neigh->arp_queue_len_bytes = 0; - } out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); + + if (notify) + __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); + + if (process_arp_queue) + neigh_update_process_arp_queue(neigh); + write_unlock_bh(&neigh->lock); + if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); + if (notify) - neigh_update_notify(neigh, nlmsg_pid); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + trace_neigh_update_done(neigh, err); return err; } @@ -1642,12 +1667,12 @@ static void neigh_managed_work(struct work_struct *work) managed_work.work); struct neighbour *neigh; - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) @@ -1749,8 +1774,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; @@ -1763,9 +1787,9 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, return NULL; } - write_lock_bh(&tbl->lock); - list_add(&p->list, &tbl->parms.list); - write_unlock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); + list_add_rcu(&p->list, &tbl->parms.list); + spin_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } @@ -1785,10 +1809,12 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; - write_lock_bh(&tbl->lock); - list_del(&parms->list); + + spin_lock_bh(&tbl->lock); + list_del_rcu(&parms->list); parms->dead = 1; - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } @@ -1810,8 +1836,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); - tbl->parms.reachable_time = - neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); + neigh_set_reach_time(&tbl->parms); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); @@ -1838,7 +1863,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); - rwlock_init(&tbl->lock); + spin_lock_init(&tbl->lock); mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); @@ -1981,10 +2006,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out: return err; @@ -2179,7 +2204,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) return -ENOBUFS; if ((parms->dev && - nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || + nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || @@ -2194,7 +2219,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || - nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || @@ -2231,8 +2256,6 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -2258,11 +2281,9 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); - rcu_read_unlock(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; @@ -2300,12 +2321,10 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; - read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2324,8 +2343,6 @@ static int neightbl_fill_param_info(struct sk_buff *skb, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -2334,11 +2351,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb, neightbl_fill_parms(skb, parms) < 0) goto errout; - read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; errout: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2375,9 +2390,9 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; - struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; @@ -2393,26 +2408,33 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = rcu_dereference_rtnl(neigh_tables[tidx]); + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } - if (!found) - return -ENOENT; + if (!found) { + rcu_read_unlock(); + err = -ENOENT; + goto errout; + } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; @@ -2475,8 +2497,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, @@ -2532,7 +2553,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, err = 0; errout_tbl_lock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); errout: return err; } @@ -2579,10 +2601,12 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = rcu_dereference_rtnl(neigh_tables[tidx]); + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; @@ -2596,7 +2620,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) nidx = 0; p = list_next_entry(&tbl->parms, list); - list_for_each_entry_from(p, &tbl->parms_list, list) { + list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; @@ -2616,14 +2640,16 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: + rcu_read_unlock(); + cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } -static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, - u32 pid, u32 seq, int type, unsigned int flags) +static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; @@ -2649,23 +2675,19 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; - read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); - if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { - read_unlock_bh(&neigh->lock); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) goto nla_put_failure; - } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; - read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) @@ -2684,6 +2706,20 @@ nla_put_failure: return -EMSGSIZE; } +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + int err; + + read_lock_bh(&neigh->lock); + err = __neigh_fill_info(skb, neigh, pid, seq, type, flags); + read_unlock_bh(&neigh->lock); + + return err; +} + static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) @@ -2727,12 +2763,6 @@ nla_put_failure: return -EMSGSIZE; } -static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) -{ - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); - __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); -} - static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; @@ -3127,14 +3157,14 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void rcu_read_lock(); nht = rcu_dereference(tbl->nht); - read_lock_bh(&tbl->lock); /* avoid resizes */ + spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); @@ -3404,7 +3434,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); - read_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } @@ -3444,7 +3474,7 @@ void neigh_seq_stop(struct seq_file *seq, void *v) struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; - read_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); @@ -3545,7 +3575,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, if (skb == NULL) goto errout; - err = neigh_fill_info(skb, n, pid, 0, type, flags); + err = __neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3560,9 +3590,16 @@ out: rcu_read_unlock(); } +static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid) +{ + read_lock_bh(&neigh->lock); + __neigh_notify(neigh, type, flags, pid); + read_unlock_bh(&neigh->lock); +} + void neigh_app_ns(struct neighbour *n) { - __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); + neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); @@ -3721,8 +3758,7 @@ static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write * only be effective after the next time neigh_periodic_work * decides to recompute it */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } return ret; } @@ -3918,8 +3954,10 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, - {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, - {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; static int __init neigh_init(void) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 70e0e9a3b650..7dbfa6109f0b 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -170,8 +170,14 @@ static const struct seq_operations softnet_seq_ops = { .show = softnet_seq_show, }; +struct ptype_iter_state { + struct seq_net_private p; + struct net_device *dev; +}; + static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { + struct ptype_iter_state *iter = seq->private; struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; struct net_device *dev; @@ -181,12 +187,16 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) for_each_netdev_rcu(seq_file_net(seq), dev) { ptype_list = &dev->ptype_all; list_for_each_entry_rcu(pt, ptype_list, list) { - if (i == pos) + if (i == pos) { + iter->dev = dev; return pt; + } ++i; } } + iter->dev = NULL; + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { if (i == pos) return pt; @@ -218,6 +228,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct ptype_iter_state *iter = seq->private; struct net *net = seq_file_net(seq); struct net_device *dev; struct packet_type *pt; @@ -229,19 +240,21 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ptype_get_idx(seq, 0); pt = v; - nxt = pt->list.next; - if (pt->dev) { - if (nxt != &pt->dev->ptype_all) + nxt = READ_ONCE(pt->list.next); + dev = iter->dev; + if (dev) { + if (nxt != &dev->ptype_all) goto found; - dev = pt->dev; for_each_netdev_continue_rcu(seq_file_net(seq), dev) { - if (!list_empty(&dev->ptype_all)) { - nxt = dev->ptype_all.next; + nxt = READ_ONCE(dev->ptype_all.next); + if (nxt != &dev->ptype_all) { + iter->dev = dev; goto found; } } - nxt = net->ptype_all.next; + iter->dev = NULL; + nxt = READ_ONCE(net->ptype_all.next); goto net_ptype_all; } @@ -252,20 +265,20 @@ net_ptype_all: if (nxt == &net->ptype_all) { /* continue with ->ptype_specific if it's not empty */ - nxt = net->ptype_specific.next; + nxt = READ_ONCE(net->ptype_specific.next); if (nxt != &net->ptype_specific) goto found; } hash = 0; - nxt = ptype_base[0].next; + nxt = READ_ONCE(ptype_base[0].next); } else hash = ntohs(pt->type) & PTYPE_HASH_MASK; while (nxt == &ptype_base[hash]) { if (++hash >= PTYPE_HASH_SIZE) return NULL; - nxt = ptype_base[hash].next; + nxt = READ_ONCE(ptype_base[hash].next); } found: return list_entry(nxt, struct packet_type, list); @@ -279,19 +292,24 @@ static void ptype_seq_stop(struct seq_file *seq, void *v) static int ptype_seq_show(struct seq_file *seq, void *v) { + struct ptype_iter_state *iter = seq->private; struct packet_type *pt = v; + struct net_device *dev; - if (v == SEQ_START_TOKEN) + if (v == SEQ_START_TOKEN) { seq_puts(seq, "Type Device Function\n"); - else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && - (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { + return 0; + } + dev = iter->dev; + if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!dev || net_eq(dev_net(dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else seq_printf(seq, "%04x", ntohs(pt->type)); seq_printf(seq, " %-8s %ps\n", - pt->dev ? pt->dev->name : "", pt->func); + dev ? dev->name : "", pt->func); } return 0; @@ -315,7 +333,7 @@ static int __net_init dev_proc_net_init(struct net *net) &softnet_seq_ops)) goto out_dev; if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops, - sizeof(struct seq_net_private))) + sizeof(struct ptype_iter_state))) goto out_softnet; if (wext_proc_init(net)) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ca878525ad7c..07624b682b08 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1022,7 +1022,7 @@ static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, int rps_cpumask_housekeeping(struct cpumask *mask) { if (!cpumask_empty(mask)) { - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)); cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); if (cpumask_empty(mask)) return -EINVAL; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0e0f22d7b21..1057d16d5dd2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -395,6 +395,7 @@ static __net_init void preinit_net_sysctl(struct net *net) net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; net->core.sysctl_tstamp_allow_data = 1; + net->core.sysctl_txq_reselection = msecs_to_jiffies(1000); } /* init code that must occur even if setup_net() is not called. */ @@ -439,7 +440,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = ns_tree_gen_id(&net->ns); + net->net_cookie = ns_tree_gen_id(net); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -491,7 +492,7 @@ static struct net *net_alloc(void) goto out_free; #ifdef CONFIG_KEYS - net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL); + net->key_domain = kzalloc_obj(struct key_tag); if (!net->key_domain) goto out_free_2; refcount_set(&net->key_domain->usage, 1); @@ -623,9 +624,10 @@ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) } EXPORT_SYMBOL_GPL(net_ns_get_ownership); -static void unhash_nsid(struct net *net, struct net *last) +static void unhash_nsid(struct net *last) { - struct net *tmp; + struct net *tmp, *peer; + /* This function is only called from cleanup_net() work, * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below @@ -633,22 +635,26 @@ static void unhash_nsid(struct net *net, struct net *last) * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { - int id; + int id = 0; spin_lock(&tmp->nsid_lock); - id = __peernet2id(tmp, net); - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - spin_unlock(&tmp->nsid_lock); - if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, + while ((peer = idr_get_next(&tmp->netns_ids, &id))) { + int curr_id = id; + + id++; + if (!peer->is_dying) + continue; + + idr_remove(&tmp->netns_ids, curr_id); + spin_unlock(&tmp->nsid_lock); + rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL, GFP_KERNEL); + spin_lock(&tmp->nsid_lock); + } + spin_unlock(&tmp->nsid_lock); if (tmp == last) break; } - spin_lock(&net->nsid_lock); - idr_destroy(&net->netns_ids); - spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -673,6 +679,7 @@ static void cleanup_net(struct work_struct *work) llist_for_each_entry(net, net_kill_list, cleanup_list) { ns_tree_remove(net); list_del_rcu(&net->list); + net->is_dying = true; } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer @@ -687,8 +694,10 @@ static void cleanup_net(struct work_struct *work) last = list_last_entry(&net_namespace_list, struct net, list); up_write(&net_rwsem); + unhash_nsid(last); + llist_for_each_entry(net, net_kill_list, cleanup_list) { - unhash_nsid(net, last); + idr_destroy(&net->netns_ids); list_add_tail(&net->exit_list, &net_exit_list); } @@ -1222,15 +1231,13 @@ static void __init netns_ipv4_struct_check(void) sysctl_tcp_wmem); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_ip_fwd_use_pmtu); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); - - /* TXRX readonly hotpath cache lines */ - CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, - sysctl_tcp_moderate_rcvbuf); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); /* RX readonly hotpath cache line */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rcvbuf_low_rtt); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); @@ -1240,7 +1247,6 @@ static void __init netns_ipv4_struct_check(void) sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index dff66d8fb325..e1e30f0b60cd 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -32,7 +32,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = kzalloc_obj(*cs); if (!cs) return ERR_PTR(-ENOMEM); @@ -93,7 +93,7 @@ static void update_classid_task(struct task_struct *p, u32 classid) /* Only update the leader task, when many threads in this task, * so it can avoid the useless traversal. */ - if (p != p->group_leader) + if (!thread_group_leader(p)) return; do { diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index e9a2a6f26cb7..ba673e81716f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -97,7 +98,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), }; /* NETDEV_CMD_BIND_TX - do */ diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index cf3fad74511f..cffc08517a41 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NETDEV_GEN_H #define _LINUX_NETDEV_GEN_H diff --git a/net/core/netdev_config.c b/net/core/netdev_config.c new file mode 100644 index 000000000000..f14af365d5cd --- /dev/null +++ b/net/core/netdev_config.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/netdevice.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> + +#include "dev.h" + +static int netdev_nop_validate_qcfg(struct net_device *dev, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static int __netdev_queue_config(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack, + bool validate) +{ + int (*validate_cb)(struct net_device *dev, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack); + struct pp_memory_provider_params *mpp; + int err; + + validate_cb = netdev_nop_validate_qcfg; + if (validate && dev->queue_mgmt_ops->ndo_validate_qcfg) + validate_cb = dev->queue_mgmt_ops->ndo_validate_qcfg; + + memset(qcfg, 0, sizeof(*qcfg)); + + /* Get defaults from the driver, in case user config not set */ + if (dev->queue_mgmt_ops->ndo_default_qcfg) + dev->queue_mgmt_ops->ndo_default_qcfg(dev, qcfg); + err = validate_cb(dev, qcfg, extack); + if (err) + return err; + + /* Apply MP overrides */ + mpp = &__netif_get_rx_queue(dev, rxq_idx)->mp_params; + if (mpp->rx_page_size) + qcfg->rx_page_size = mpp->rx_page_size; + err = validate_cb(dev, qcfg, extack); + if (err) + return err; + + return 0; +} + +/** + * netdev_queue_config() - get configuration for a given queue + * @dev: net_device instance + * @rxq_idx: index of the queue of interest + * @qcfg: queue configuration struct (output) + * + * Render the configuration for a given queue. This helper should be used + * by drivers which support queue configuration to retrieve config for + * a particular queue. + * + * @qcfg is an output parameter and is always fully initialized by this + * function. Some values may not be set by the user, drivers may either + * deal with the "unset" values in @qcfg, or provide the callback + * to populate defaults in queue_management_ops. + */ +void netdev_queue_config(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg) +{ + __netdev_queue_config(dev, rxq_idx, qcfg, NULL, false); +} +EXPORT_SYMBOL(netdev_queue_config); + +int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack) +{ + return __netdev_queue_config(dev, rxq_idx, qcfg, extack, true); +} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index c7d9341b7630..668a90658f25 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -7,6 +7,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/memory_provider.h> +#include "dev.h" #include "page_pool_priv.h" /* See also page_pool_is_unreadable() */ @@ -18,7 +19,10 @@ bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) } EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); -int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +static int netdev_rx_queue_reconfig(struct net_device *dev, + unsigned int rxq_idx, + struct netdev_queue_config *qcfg_old, + struct netdev_queue_config *qcfg_new) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; @@ -41,7 +45,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) goto err_free_new_mem; } - err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + err = qops->ndo_queue_mem_alloc(dev, qcfg_new, new_mem, rxq_idx); if (err) goto err_free_old_mem; @@ -54,7 +58,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) if (err) goto err_free_new_queue_mem; - err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + err = qops->ndo_queue_start(dev, qcfg_new, new_mem, rxq_idx); if (err) goto err_start_queue; } else { @@ -76,7 +80,7 @@ err_start_queue: * WARN if we fail to recover the old rx queue, and at least free * old_mem so we don't also leak that. */ - if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { + if (qops->ndo_queue_start(dev, qcfg_old, old_mem, rxq_idx)) { WARN(1, "Failed to restart old queue in error path. RX queue %d may be unhealthy.", rxq_idx); @@ -94,12 +98,22 @@ err_free_new_mem: return err; } + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +{ + struct netdev_queue_config qcfg; + + netdev_queue_config(dev, rxq_idx, &qcfg); + return netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg, &qcfg); +} EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; + struct netdev_queue_config qcfg[2]; struct netdev_rx_queue *rxq; int ret; @@ -124,6 +138,10 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); return -EEXIST; } + if (p->rx_page_size && !(qops->supported_params & QCFG_RX_PAGE_SIZE)) { + NL_SET_ERR_MSG(extack, "device does not support: rx_page_size"); + return -EOPNOTSUPP; + } rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { @@ -137,12 +155,20 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, } #endif + netdev_queue_config(dev, rxq_idx, &qcfg[0]); rxq->mp_params = *p; - ret = netdev_rx_queue_restart(dev, rxq_idx); - if (ret) { - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; - } + ret = netdev_queue_config_validate(dev, rxq_idx, &qcfg[1], extack); + if (ret) + goto err_clear_mp; + + ret = netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg[0], &qcfg[1]); + if (ret) + goto err_clear_mp; + + return 0; + +err_clear_mp: + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); return ret; } @@ -160,6 +186,7 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { + struct netdev_queue_config qcfg[2]; struct netdev_rx_queue *rxq; int err; @@ -179,9 +206,11 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, rxq->mp_params.mp_priv != old_p->mp_priv)) return; - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, ifq_idx); + netdev_queue_config(dev, ifq_idx, &qcfg[0]); + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); + netdev_queue_config(dev, ifq_idx, &qcfg[1]); + + err = netdev_rx_queue_reconfig(dev, ifq_idx, &qcfg[0], &qcfg[1]); WARN_ON(err && err != -ENETDOWN); } diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index cd95394399b4..23175cb2bd86 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,19 +5,19 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; + return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) { - __netmem_clear_lsb(netmem)->pp_magic |= pp_magic; + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; } static inline void netmem_clear_pp_magic(netmem_ref netmem) { - WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - __netmem_clear_lsb(netmem)->pp_magic = 0; + netmem_to_nmdesc(netmem)->pp_magic = 0; } static inline bool netmem_is_pp(netmem_ref netmem) @@ -27,13 +27,13 @@ static inline bool netmem_is_pp(netmem_ref netmem) static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { - __netmem_clear_lsb(netmem)->pp = pool; + netmem_to_nmdesc(netmem)->pp = pool; } static inline void netmem_set_dma_addr(netmem_ref netmem, unsigned long dma_addr) { - __netmem_clear_lsb(netmem)->dma_addr = dma_addr; + netmem_to_nmdesc(netmem)->dma_addr = dma_addr; } static inline unsigned long netmem_get_dma_index(netmem_ref netmem) @@ -43,7 +43,7 @@ static inline unsigned long netmem_get_dma_index(netmem_ref netmem) if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return 0; - magic = __netmem_clear_lsb(netmem)->pp_magic; + magic = netmem_to_nmdesc(netmem)->pp_magic; return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; } @@ -57,6 +57,6 @@ static inline void netmem_set_dma_index(netmem_ref netmem, return; magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); - __netmem_clear_lsb(netmem)->pp_magic = magic; + netmem_to_nmdesc(netmem)->pp_magic = magic; } #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 331764845e8f..cd74beffd209 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -132,7 +132,7 @@ static int netif_local_xmit_active(struct net_device *dev) for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + if (netif_tx_owned(txq, smp_processor_id())) return 1; } @@ -554,6 +554,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) int err; skb_queue_head_init(&np->skb_pool); + INIT_WORK(&np->refill_wq, refill_skbs_work_handler); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", @@ -564,7 +565,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) npinfo = rtnl_dereference(ndev->npinfo); if (!npinfo) { - npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + npinfo = kmalloc_obj(*npinfo); if (!npinfo) { err = -ENOMEM; goto out; @@ -591,7 +592,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) /* fill up the skb queue */ refill_skbs(np); - INIT_WORK(&np->refill_wq, refill_skbs_work_handler); /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 8456dfbe2eb4..76c33ab44761 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -135,7 +135,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; - css = kzalloc(sizeof(*css), GFP_KERNEL); + css = kzalloc_obj(*css); if (!css) return ERR_PTR(-ENOMEM); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a5edec485f1..265a729431bb 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -301,12 +301,16 @@ static int page_pool_init(struct page_pool *pool, } static_branch_inc(&page_pool_mem_providers); + } else if (pool->p.order > MAX_PAGE_ORDER) { + err = -EINVAL; + goto free_ptr_ring; } return 0; free_ptr_ring: ptr_ring_cleanup(&pool->ring, NULL); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index c82a95beceff..ee5060d8eec0 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -245,7 +245,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, goto err_cancel; if (pool->user.detach_time && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, - pool->user.detach_time)) + ktime_divns(pool->user.detach_time, NSEC_PER_SEC))) goto err_cancel; if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) @@ -337,7 +337,7 @@ err_unlock: void page_pool_detached(struct page_pool *pool) { mutex_lock(&page_pools_lock); - pool->user.detach_time = ktime_get_boottime_seconds(); + pool->user.detach_time = ktime_get_boottime(); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); mutex_unlock(&page_pools_lock); } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d41b03fd1f63..8e185b318288 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -126,6 +126,7 @@ #include <linux/string.h> #include <linux/ptrace.h> #include <linux/errno.h> +#include <linux/hex.h> #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/capability.h> diff --git a/net/core/request_sock.c b/net/core/request_sock.c deleted file mode 100644 index 897a8f01a67b..000000000000 --- a/net/core/request_sock.c +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET Generic infrastructure for Network protocols. - * - * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * - * From code originally in include/net/tcp.h - */ - -#include <linux/module.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/tcp.h> -#include <linux/vmalloc.h> - -#include <net/request_sock.h> - -/* - * Maximum number of SYN_RECV sockets in queue per LISTEN socket. - * One SYN_RECV socket costs about 80bytes on a 32bit machine. - * It would be better to replace it with a global counter for all sockets - * but then some measure against one socket starving all other sockets - * would be needed. - * - * The minimum value of it is 128. Experiments with real servers show that - * it is absolutely not enough even at 100conn/sec. 256 cures most - * of problems. - * This value is adjusted to 128 for low memory machines, - * and it will increase in proportion to the memory of machine. - * Note : Dont forget somaxconn that may limit backlog too. - */ - -void reqsk_queue_alloc(struct request_sock_queue *queue) -{ - queue->fastopenq.rskq_rst_head = NULL; - queue->fastopenq.rskq_rst_tail = NULL; - queue->fastopenq.qlen = 0; - - queue->rskq_accept_head = NULL; -} - -/* - * This function is called to set a Fast Open socket's "fastopen_rsk" field - * to NULL when a TFO socket no longer needs to access the request_sock. - * This happens only after 3WHS has been either completed or aborted (e.g., - * RST is received). - * - * Before TFO, a child socket is created only after 3WHS is completed, - * hence it never needs to access the request_sock. things get a lot more - * complex with TFO. A child socket, accepted or not, has to access its - * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, - * until 3WHS is either completed or aborted. Afterwards the req will stay - * until either the child socket is accepted, or in the rare case when the - * listener is closed before the child is accepted. - * - * In short, a request socket is only freed after BOTH 3WHS has completed - * (or aborted) and the child socket has been accepted (or listener closed). - * When a child socket is accepted, its corresponding req->sk is set to - * NULL since it's no longer needed. More importantly, "req->sk == NULL" - * will be used by the code below to determine if a child socket has been - * accepted or not, and the check is protected by the fastopenq->lock - * described below. - * - * Note that fastopen_rsk is only accessed from the child socket's context - * with its socket lock held. But a request_sock (req) can be accessed by - * both its child socket through fastopen_rsk, and a listener socket through - * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin - * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. - * only in the rare case when both the listener and the child locks are held, - * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. - * The lock also protects other fields such as fastopenq->qlen, which is - * decremented by this function when fastopen_rsk is no longer needed. - * - * Note that another solution was to simply use the existing socket lock - * from the listener. But first socket lock is difficult to use. It is not - * a simple spin lock - one must consider sock_owned_by_user() and arrange - * to use sk_add_backlog() stuff. But what really makes it infeasible is the - * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. - * - * This function also sets "treq->tfo_listener" to false. - * treq->tfo_listener is used by the listener so it is protected by the - * fastopenq->lock in this function. - */ -void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, - bool reset) -{ - struct sock *lsk = req->rsk_listener; - struct fastopen_queue *fastopenq; - - fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; - - RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); - spin_lock_bh(&fastopenq->lock); - fastopenq->qlen--; - tcp_rsk(req)->tfo_listener = false; - if (req->sk) /* the child socket hasn't been accepted yet */ - goto out; - - if (!reset || lsk->sk_state != TCP_LISTEN) { - /* If the listener has been closed don't bother with the - * special RST handling below. - */ - spin_unlock_bh(&fastopenq->lock); - reqsk_put(req); - return; - } - /* Wait for 60secs before removing a req that has triggered RST. - * This is a simple defense against TFO spoofing attack - by - * counting the req against fastopen.max_qlen, and disabling - * TFO when the qlen exceeds max_qlen. - * - * For more details see CoNext'11 "TCP Fast Open" paper. - */ - req->rsk_timer.expires = jiffies + 60*HZ; - if (fastopenq->rskq_rst_head == NULL) - fastopenq->rskq_rst_head = req; - else - fastopenq->rskq_rst_tail->dl_next = req; - - req->dl_next = NULL; - fastopenq->rskq_rst_tail = req; - fastopenq->qlen++; -out: - spin_unlock_bh(&fastopenq->lock); -} diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 576d5ec3bb36..dad4b1054955 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -414,7 +414,7 @@ static int rtnl_register_internal(struct module *owner, if (!link) goto unlock; } else { - link = kzalloc(sizeof(*link), GFP_KERNEL); + link = kzalloc_obj(*link); if (!link) goto unlock; } @@ -1270,13 +1270,13 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev) static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { - return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + size_t size; + + size = NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) - + nla_total_size(sizeof(struct rtnl_link_stats)) - + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ @@ -1329,6 +1329,12 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(2) /* IFLA_HEADROOM */ + nla_total_size(2) /* IFLA_TAILROOM */ + 0; + + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) + size += nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + return size; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -2123,7 +2129,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; - if (rtnl_fill_stats(skb, dev)) + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS) && + rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) @@ -3962,7 +3969,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, int ops_srcu_index; int ret; - tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + tbs = kmalloc_obj(*tbs); if (!tbs) return -ENOMEM; diff --git a/net/core/scm.c b/net/core/scm.c index 66eaee783e8b..a29aa8fb8065 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -83,7 +83,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); + fpl = kmalloc_obj(struct scm_fp_list, GFP_KERNEL_ACCOUNT); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -273,17 +273,13 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) check_object_size(data, cmlen - sizeof(*cm), true); - if (can_do_masked_user_access()) - cm = masked_user_access_begin(cm); - else if (!user_write_access_begin(cm, cmlen)) - goto efault; - - unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); - unsafe_put_user(level, &cm->cmsg_level, efault_end); - unsafe_put_user(type, &cm->cmsg_type, efault_end); - unsafe_copy_to_user(CMSG_USER_DATA(cm), data, - cmlen - sizeof(*cm), efault_end); - user_write_access_end(); + scoped_user_write_access_size(cm, cmlen, efault) { + unsafe_put_user(cmlen, &cm->cmsg_len, efault); + unsafe_put_user(level, &cm->cmsg_level, efault); + unsafe_put_user(type, &cm->cmsg_type, efault); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault); + } } else { struct cmsghdr *cm = msg->msg_control; @@ -301,8 +297,6 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_controllen -= cmlen; return 0; -efault_end: - user_write_access_end(); efault: return -EFAULT; } diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 9a3965680451..6a6f2cda5aae 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -20,7 +20,6 @@ #include <net/tcp.h> static siphash_aligned_key_t net_secret; -static siphash_aligned_key_t ts_secret; #define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) @@ -28,11 +27,6 @@ static __always_inline void net_secret_init(void) { net_get_random_once(&net_secret, sizeof(net_secret)); } - -static __always_inline void ts_secret_init(void) -{ - net_get_random_once(&ts_secret, sizeof(ts_secret)); -} #endif #ifdef CONFIG_INET @@ -53,28 +47,9 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -u32 secure_tcpv6_ts_off(const struct net *net, - const __be32 *saddr, const __be32 *daddr) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - }; - - if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) - return 0; - - ts_secret_init(); - return siphash(&combined, offsetofend(typeof(combined), daddr), - &ts_secret); -} -EXPORT_IPV6_MOD(secure_tcpv6_ts_off); - -u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport) +union tcp_seq_and_ts_off +secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr, + const __be32 *daddr, __be16 sport, __be16 dport) { const struct { struct in6_addr saddr; @@ -87,14 +62,20 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, .sport = sport, .dport = dport }; - u32 hash; + union tcp_seq_and_ts_off st; net_secret_init(); - hash = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - return seq_scale(hash); + + st.hash64 = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + st.ts_off = 0; + + st.seq = seq_scale(st.seq); + return st; } -EXPORT_SYMBOL(secure_tcpv6_seq); +EXPORT_SYMBOL(secure_tcpv6_seq_and_ts_off); u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) @@ -118,33 +99,30 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) -{ - if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) - return 0; - - ts_secret_init(); - return siphash_2u32((__force u32)saddr, (__force u32)daddr, - &ts_secret); -} - /* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, * it would be easy enough to have the former function use siphash_4u32, passing * the arguments as separate u32. */ -u32 secure_tcp_seq(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +union tcp_seq_and_ts_off +secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) { - u32 hash; + u32 ports = (__force u32)sport << 16 | (__force u32)dport; + union tcp_seq_and_ts_off st; net_secret_init(); - hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - return seq_scale(hash); + + st.hash64 = siphash_3u32((__force u32)saddr, (__force u32)daddr, + ports, &net_secret); + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + st.ts_off = 0; + + st.seq = seq_scale(st.seq); + return st; } -EXPORT_SYMBOL_GPL(secure_tcp_seq); +EXPORT_SYMBOL_GPL(secure_tcp_seq_and_ts_off); u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { diff --git a/net/core/selftests.c b/net/core/selftests.c index 3d79133a91a6..0a203d3fb9dc 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -14,46 +14,10 @@ #include <net/tcp.h> #include <net/udp.h> -struct net_packet_attrs { - const unsigned char *src; - const unsigned char *dst; - u32 ip_src; - u32 ip_dst; - bool tcp; - u16 sport; - u16 dport; - int timeout; - int size; - int max_size; - u8 id; - u16 queue_mapping; - bool bad_csum; -}; - -struct net_test_priv { - struct net_packet_attrs *packet; - struct packet_type pt; - struct completion comp; - int double_vlan; - int vlan_id; - int ok; -}; - -struct netsfhdr { - __be32 version; - __be64 magic; - u8 id; -} __packed; - static u8 net_test_next_id; -#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ - sizeof(struct netsfhdr)) -#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL -#define NET_LB_TIMEOUT msecs_to_jiffies(200) - -static struct sk_buff *net_test_get_skb(struct net_device *ndev, - struct net_packet_attrs *attr) +struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr) { struct sk_buff *skb = NULL; struct udphdr *uhdr = NULL; @@ -142,8 +106,8 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, shdr = skb_put(skb, sizeof(*shdr)); shdr->version = 0; shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); - attr->id = net_test_next_id; - shdr->id = net_test_next_id++; + attr->id = id; + shdr->id = id; if (attr->size) { void *payload = skb_put(skb, attr->size); @@ -190,6 +154,7 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, return skb; } +EXPORT_SYMBOL_GPL(net_test_get_skb); static int net_test_loopback_validate(struct sk_buff *skb, struct net_device *ndev, @@ -272,7 +237,7 @@ static int __net_test_loopback(struct net_device *ndev, struct sk_buff *skb = NULL; int ret = 0; - tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + tpriv = kzalloc_obj(*tpriv); if (!tpriv) return -ENOMEM; @@ -286,12 +251,13 @@ static int __net_test_loopback(struct net_device *ndev, tpriv->packet = attr; dev_add_pack(&tpriv->pt); - skb = net_test_get_skb(ndev, attr); + skb = net_test_get_skb(ndev, net_test_next_id, attr); if (!skb) { ret = -ENOMEM; goto cleanup; } + net_test_next_id++; ret = dev_direct_xmit(skb, attr->queue_mapping); if (ret < 0) { goto cleanup; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6be01454f262..0e217041958a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -78,9 +78,11 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> +#include <net/can.h> #include <net/page_pool/helpers.h> #include <net/psp/types.h> #include <net/dropreason.h> +#include <net/xdp_sock.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -222,9 +224,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) skb_panic(skb, sz, addr, __func__); } -#define NAPI_SKB_CACHE_SIZE 64 -#define NAPI_SKB_CACHE_BULK 16 -#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +#define NAPI_SKB_CACHE_SIZE 128 +#define NAPI_SKB_CACHE_BULK 32 +#define NAPI_SKB_CACHE_FREE 32 struct napi_alloc_cache { local_lock_t bh_lock; @@ -274,17 +276,23 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) } EXPORT_SYMBOL(__netdev_alloc_frag_align); -static struct sk_buff *napi_skb_cache_get(void) +/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler + * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset. + */ +static u32 skbuff_cache_size __read_mostly; + +static inline struct sk_buff *napi_skb_cache_get(bool alloc) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { - nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - GFP_ATOMIC | __GFP_NOWARN, - NAPI_SKB_CACHE_BULK, - nc->skb_cache); + if (alloc) + nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN, + NAPI_SKB_CACHE_BULK, + nc->skb_cache); if (unlikely(!nc->skb_count)) { local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; @@ -292,12 +300,31 @@ static struct sk_buff *napi_skb_cache_get(void) } skb = nc->skb_cache[--nc->skb_count]; + if (nc->skb_count) + prefetch(nc->skb_cache[nc->skb_count - 1]); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); - kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); + kasan_mempool_unpoison_object(skb, skbuff_cache_size); return skb; } +/* + * Only clear those fields we need to clear, not those that we will + * actually initialise later. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ +static inline void skbuff_clear(struct sk_buff *skb) +{ + /* Replace memset(skb, 0, offsetof(struct sk_buff, tail)) + * with two smaller memset(), with a barrier() between them. + * This forces the compiler to inline both calls. + */ + BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128); + memset(skb, 0, 128); + barrier(); + memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128); +} + /** * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache * @skbs: pointer to an at least @n-sized array to fill with skb pointers @@ -345,12 +372,10 @@ u32 napi_skb_cache_get_bulk(void **skbs, u32 n) get: for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { - u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); - skbs[i] = nc->skb_cache[base + i]; - kasan_mempool_unpoison_object(skbs[i], cache_size); - memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); + skbuff_clear(skbs[i]); } nc->skb_count -= n; @@ -417,7 +442,7 @@ struct sk_buff *slab_build_skb(void *data) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); @@ -469,7 +494,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; @@ -526,11 +551,11 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = napi_skb_cache_get(); + skb = napi_skb_cache_get(true); if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; @@ -559,6 +584,16 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(napi_build_skb); +static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node) +{ + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + if (!obj_size) + return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, + flags, node); + return kmalloc_node_track_caller(obj_size, flags, node); +} + /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. If it is and @@ -567,9 +602,8 @@ EXPORT_SYMBOL(napi_build_skb); * memory is free */ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, - bool *pfmemalloc) + struct sk_buff *skb) { - bool ret_pfmemalloc = false; size_t obj_size; void *obj; @@ -580,12 +614,12 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; - if (obj || !(gfp_pfmemalloc_allowed(flags))) + if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); - goto out; + if (skb) + skb->pfmemalloc = true; + return kmalloc_pfmemalloc(0, flags, node); } obj_size = kmalloc_size_roundup(obj_size); @@ -601,17 +635,14 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); - if (obj || !(gfp_pfmemalloc_allowed(flags))) + if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(obj_size, flags, node); - + if (skb) + skb->pfmemalloc = true; + obj = kmalloc_pfmemalloc(obj_size, flags, node); out: - if (pfmemalloc) - *pfmemalloc = ret_pfmemalloc; - return obj; } @@ -641,56 +672,66 @@ out: struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { + struct sk_buff *skb = NULL; struct kmem_cache *cache; - struct sk_buff *skb; - bool pfmemalloc; u8 *data; - cache = (flags & SKB_ALLOC_FCLONE) - ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache; - if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; - /* Get the HEAD */ - if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && - likely(node == NUMA_NO_NODE || node == numa_mem_id())) - skb = napi_skb_cache_get(); - else + if (flags & SKB_ALLOC_FCLONE) { + cache = net_hotdata.skbuff_fclone_cache; + goto fallback; + } + cache = net_hotdata.skbuff_cache; + if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id())) + goto fallback; + + if (flags & SKB_ALLOC_NAPI) { + skb = napi_skb_cache_get(true); + if (unlikely(!skb)) + return NULL; + } else if (!in_hardirq() && !irqs_disabled()) { + local_bh_disable(); + skb = napi_skb_cache_get(false); + local_bh_enable(); + } + + if (!skb) { +fallback: skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); - if (unlikely(!skb)) - return NULL; - prefetchw(skb); + if (unlikely(!skb)) + return NULL; + } + skbuff_clear(skb); /* We do our best to align skb_shared_info on a separate cache * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ - data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); + data = kmalloc_reserve(&size, gfp_mask, node, skb); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - prefetchw(data + SKB_WITH_OVERHEAD(size)); - - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! - */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - __build_skb_around(skb, data, size); - skb->pfmemalloc = pfmemalloc; + __finalize_skb_around(skb, data, size); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - skb->fclone = SKB_FCLONE_ORIG; + /* skb->fclone is a 2bits field. + * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG) + * with a single OR. + */ + BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0); + DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE); + skb->fclone |= SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); } @@ -1136,12 +1177,22 @@ void skb_release_head_state(struct sk_buff *skb) skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); - skb->destructor(skb); - } -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb_nfct(skb)); +#ifdef CONFIG_INET + INDIRECT_CALL_4(skb->destructor, + tcp_wfree, __sock_wfree, sock_wfree, + xsk_destruct_skb, + skb); +#else + INDIRECT_CALL_2(skb->destructor, + sock_wfree, xsk_destruct_skb, + skb); + #endif - skb_ext_put(skb); + skb->destructor = NULL; + skb->sk = NULL; + } + nf_reset_ct(skb); + skb_ext_reset(skb); } /* Free everything but the sk_buff shell. */ @@ -1282,14 +1333,15 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) has_mac = skb_mac_header_was_set(skb); has_trans = skb_transport_header_was_set(skb); - printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" - "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" + printk("%sskb len=%u data_len=%u headroom=%u headlen=%u tailroom=%u\n" + "end-tail=%u mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", - level, skb->len, headroom, skb_headlen(skb), tailroom, + level, skb->len, skb->data_len, headroom, skb_headlen(skb), + tailroom, skb->end - skb->tail, has_mac ? skb->mac_header : -1, has_mac ? skb_mac_header_len(skb) : -1, skb->mac_len, @@ -1417,7 +1469,6 @@ void __consume_stateless_skb(struct sk_buff *skb) static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - u32 i; if (!kasan_mempool_poison_object(skb)) return; @@ -1426,13 +1477,16 @@ static void napi_skb_cache_put(struct sk_buff *skb) nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { - for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) + u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE; + + for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++) kasan_mempool_unpoison_object(nc->skb_cache[i], - kmem_cache_size(net_hotdata.skbuff_cache)); + skbuff_cache_size); - kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, - nc->skb_cache + NAPI_SKB_CACHE_HALF); - nc->skb_count = NAPI_SKB_CACHE_HALF; + kmem_cache_free_bulk(net_hotdata.skbuff_cache, + NAPI_SKB_CACHE_FREE, + nc->skb_cache + remaining); + nc->skb_count = remaining; } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } @@ -1455,16 +1509,32 @@ void napi_skb_free_stolen_head(struct sk_buff *skb) napi_skb_cache_put(skb); } +/** + * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache + * @skb: buffer to free + * @budget: NAPI budget + * + * Non-zero @budget must come from the @budget argument passed by the core + * to a NAPI poll function. Note that core may pass budget of 0 to NAPI poll + * for example when polling for netpoll / netconsole. + * + * Passing @budget of 0 is safe from any context, it turns this function + * into dev_consume_skb_any(). + */ void napi_consume_skb(struct sk_buff *skb, int budget) { - /* Zero budget indicate non-NAPI context called us, like netpoll */ - if (unlikely(!budget)) { + if (unlikely(!budget || !skb)) { dev_consume_skb_any(skb); return; } DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) { + skb_release_head_state(skb); + return skb_attempt_defer_free(skb); + } + if (!skb_unref(skb)) return; @@ -2218,6 +2288,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone); * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. + * + * Note: If you skb_push() the start of the buffer after reallocating the + * header, call skb_postpush_data_move() first to move the metadata out of + * the way before writing to &sk_buff->data. */ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, @@ -2289,8 +2363,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); - skb_metadata_clear(skb); - /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). @@ -4597,12 +4669,14 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, { struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; unsigned int tnl_hlen = skb_tnl_header_len(skb); - unsigned int delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int len_diff, err; + /* Only skb_gro_receive_list generated skbs arrive here */ + DEBUG_NET_WARN_ON_ONCE(!(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)); + skb_push(skb, -skb_network_offset(skb) + offset); /* Ensure the head is writeable before touching the shared info */ @@ -4616,8 +4690,9 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, nskb = list_skb; list_skb = list_skb->next; + DEBUG_NET_WARN_ON_ONCE(nskb->sk); + err = 0; - delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { @@ -4660,7 +4735,6 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, goto err_linearize; } - skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; skb->len = skb->len - delta_len; @@ -5066,6 +5140,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_INET_PSP) [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), #endif +#if IS_ENABLED(CONFIG_CAN) + [SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -5081,7 +5158,7 @@ static __always_inline unsigned int skb_ext_total_length(void) static void skb_extensions_init(void) { - BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(SKB_EXT_NUM > 8); #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) BUILD_BUG_ON(skb_ext_total_length() > 255); #endif @@ -5116,6 +5193,8 @@ void __init skb_init(void) offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); + skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, @@ -5511,15 +5590,28 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { - bool ret; + struct socket *sock; + struct file *file; + bool ret = false; if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; - read_lock_bh(&sk->sk_callback_lock); - ret = sk->sk_socket && sk->sk_socket->file && - file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); - read_unlock_bh(&sk->sk_callback_lock); + /* The sk pointer remains valid as long as the skb is. The sk_socket and + * file pointer may become NULL if the socket is closed. Both structures + * (including file->cred) are RCU freed which means they can be accessed + * within a RCU read section. + */ + rcu_read_lock(); + sock = READ_ONCE(sk->sk_socket); + if (!sock) + goto out; + file = READ_ONCE(sock->file); + if (!file) + goto out; + ret = file_ns_capable(file, &init_user_ns, CAP_NET_RAW); +out: + rcu_read_unlock(); return ret; } @@ -7187,10 +7279,15 @@ void skb_attempt_defer_free(struct sk_buff *skb) { struct skb_defer_node *sdn; unsigned long defer_count; - int cpu = skb->alloc_cpu; unsigned int defer_max; bool kick; + int cpu; + /* zero copy notifications should not be delayed. */ + if (skb_zcopy(skb)) + goto nodefer; + + cpu = skb->alloc_cpu; if (cpu == raw_smp_processor_id() || WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu)) { @@ -7348,31 +7445,56 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, } EXPORT_SYMBOL(csum_and_copy_from_iter_full); -void get_netmem(netmem_ref netmem) +void __get_netmem(netmem_ref netmem) { - struct net_iov *niov; + struct net_iov *niov = netmem_to_net_iov(netmem); - if (netmem_is_net_iov(netmem)) { - niov = netmem_to_net_iov(netmem); - if (net_is_devmem_iov(niov)) - net_devmem_get_net_iov(netmem_to_net_iov(netmem)); - return; - } - get_page(netmem_to_page(netmem)); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); } -EXPORT_SYMBOL(get_netmem); +EXPORT_SYMBOL(__get_netmem); -void put_netmem(netmem_ref netmem) +void __put_netmem(netmem_ref netmem) { - struct net_iov *niov; + struct net_iov *niov = netmem_to_net_iov(netmem); - if (netmem_is_net_iov(netmem)) { - niov = netmem_to_net_iov(netmem); - if (net_is_devmem_iov(niov)) - net_devmem_put_net_iov(netmem_to_net_iov(netmem)); - return; + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); +} +EXPORT_SYMBOL(__put_netmem); + +struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset) +{ + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; + + /* if type is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (vlan_depth) { + if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN)) + return (struct vlan_type_depth) { 0 }; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; } + do { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) + return (struct vlan_type_depth) { 0 }; - put_page(netmem_to_page(netmem)); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (eth_type_vlan(type)); + + return (struct vlan_type_depth) { + .type = type, + .depth = vlan_depth + }; } -EXPORT_SYMBOL(put_netmem); +EXPORT_SYMBOL(__vlan_get_protocol_offset); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2ac7731e1e0a..3261793abe83 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -409,22 +409,26 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -/* Receive sk_msg from psock->ingress_msg to @msg. */ -int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, - int len, int flags) +int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags, int *copied_from_self) { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; + bool from_self; msg_rx = sk_psock_peek_msg(psock); + if (copied_from_self) + *copied_from_self = 0; + while (copied != len) { struct scatterlist *sge; if (unlikely(!msg_rx)) break; + from_self = msg_rx->sk == sk; i = msg_rx->sg.start; do { struct page *page; @@ -443,6 +447,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } copied += copy; + if (from_self && copied_from_self) + *copied_from_self += copy; + if (likely(!peek)) { sge->offset += copy; sge->length -= copy; @@ -451,6 +458,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, atomic_sub(copy, &sk->sk_rmem_alloc); } msg_rx->sg.size -= copy; + sk_psock_msg_len_add(psock, -copy); if (!sge->length) { sk_msg_iter_var_next(i); @@ -487,6 +495,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, out: return copied; } + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); +} EXPORT_SYMBOL_GPL(sk_msg_recvmsg); bool sk_msg_is_readable(struct sock *sk) @@ -507,7 +522,7 @@ static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; - msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); + msg = kzalloc_obj(*msg, gfp | __GFP_NOWARN); if (unlikely(!msg)) return NULL; sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); @@ -616,6 +631,12 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); + + /* This is used in tcp_bpf_recvmsg_parser() to determine whether the + * data originates from the socket's own protocol stack. No need to + * refcount sk because msg's lifetime is bound to sk via the ingress_msg. + */ + msg->sk = sk; err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); @@ -801,9 +822,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_del(&msg->list); if (!msg->skb) atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); + sk_psock_msg_len_add(psock, -msg->sg.size); sk_msg_free(psock->sk, msg); kfree(msg); } + WARN_ON_ONCE(psock->msg_tot_len); } static void __sk_psock_zap_ingress(struct sk_psock *psock) @@ -909,6 +932,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, sk_msg_compute_data_pointers(msg); msg->sk = sk; ret = bpf_prog_run_pin_on_cpu(prog, msg); + msg->sk = NULL; ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { @@ -1181,8 +1205,8 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) return; psock->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; + WRITE_ONCE(sk->sk_data_ready, sk_psock_strp_data_ready); + WRITE_ONCE(sk->sk_write_space, sk_psock_write_space); } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) @@ -1192,8 +1216,8 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) if (!psock->saved_data_ready) return; - sk->sk_data_ready = psock->saved_data_ready; - psock->saved_data_ready = NULL; + WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready); + WRITE_ONCE(psock->saved_data_ready, NULL); strp_stop(&psock->strp); } @@ -1272,8 +1296,8 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) return; psock->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_verdict_data_ready; - sk->sk_write_space = sk_psock_write_space; + WRITE_ONCE(sk->sk_data_ready, sk_psock_verdict_data_ready); + WRITE_ONCE(sk->sk_write_space, sk_psock_write_space); } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) @@ -1284,6 +1308,6 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) if (!psock->saved_data_ready) return; - sk->sk_data_ready = psock->saved_data_ready; + WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready); psock->saved_data_ready = NULL; } diff --git a/net/core/sock.c b/net/core/sock.c index dc03d4b5909a..5976100a9d55 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -155,7 +155,7 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -static void sock_def_write_space_wfree(struct sock *sk); +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); static void sock_def_write_space(struct sock *sk); /** @@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes) if (!charged) return -ENOMEM; + if (sk->sk_bypass_prot_mem) + goto success; + /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); + /* If the system goes into memory pressure with this * precharge, give up and return error. */ @@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } + +success: sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, @@ -1091,7 +1097,7 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) return -EINVAL; num_tokens = optlen / sizeof(*tokens); - tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); + tokens = kvmalloc_objs(*tokens, num_tokens); if (!tokens) return -ENOMEM; @@ -2300,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + + if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) + sk->sk_bypass_prot_mem = 1; + sk->sk_kern_sock = kern; sock_lock_init(sk); + sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); @@ -2313,7 +2324,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } sock_net_set(sk, net); - refcount_set(&sk->sk_wmem_alloc, 1); + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); @@ -2451,13 +2462,16 @@ static void sk_init_common(struct sock *sk) } /** - * sk_clone_lock - clone a socket, and lock its clone - * @sk: the socket to clone - * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * sk_clone - clone a socket + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @lock: if true, lock the cloned sk * - * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + * If @lock is true, the clone is locked by bh_lock_sock(), and + * caller must unlock socket even in error path by bh_unlock_sock(). */ -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, + bool lock) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; @@ -2486,16 +2500,19 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } + sk_node_init(&newsk->sk_node); sock_lock_init(newsk); - bh_lock_sock(newsk); + + if (lock) + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); - /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ - refcount_set(&newsk->sk_wmem_alloc, 1); + refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); @@ -2580,12 +2597,13 @@ free: * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); + if (lock) + bh_unlock_sock(newsk); sk_free(newsk); newsk = NULL; goto out; } -EXPORT_SYMBOL_GPL(sk_clone_lock); +EXPORT_SYMBOL_GPL(sk_clone); static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { @@ -2649,16 +2667,18 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); */ void sock_wfree(struct sk_buff *skb) { - struct sock *sk = skb->sk; unsigned int len = skb->truesize; + struct sock *sk = skb->sk; bool free; + int old; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); - free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); - sock_def_write_space_wfree(sk); + free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, + &old); + sock_def_write_space_wfree(sk, old - len); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); @@ -2695,6 +2715,8 @@ void __sock_wfree(struct sk_buff *skb) void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + int old_wmem; + skb_orphan(skb); #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) @@ -2708,7 +2730,15 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ - refcount_add(skb->truesize, &sk->sk_wmem_alloc); + __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); + + /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket + * is in a host queue (qdisc, NIC queue). + * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue + * based on XPS for better performance. + * Otherwise clear ooo_okay to not risk Out Of Order delivery. + */ + skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); } EXPORT_SYMBOL(skb_set_owner_w); @@ -3136,8 +3166,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; - sk_enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; } EXPORT_SYMBOL(sk_page_frag_refill); @@ -3254,10 +3287,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - long allocated; + long allocated = 0; - sk_memory_allocated_add(sk, amt); - allocated = sk_memory_allocated(sk); + if (!sk->sk_bypass_prot_mem) { + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); + } if (mem_cgroup_sk_enabled(sk)) { memcg_enabled = true; @@ -3266,6 +3301,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) goto suppress_allocation; } + if (!allocated) + return 1; + /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); @@ -3344,7 +3382,8 @@ suppress_allocation: trace_sock_exceed_buf_limit(sk, prot, allocated, kind); - sk_memory_allocated_sub(sk, amt); + if (allocated) + sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_sk_uncharge(sk, amt); @@ -3383,11 +3422,14 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, amount); - if (mem_cgroup_sk_enabled(sk)) mem_cgroup_sk_uncharge(sk, amount); + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_sub(sk, amount); + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); @@ -3406,6 +3448,24 @@ void __sk_mem_reclaim(struct sock *sk, int amount) } EXPORT_SYMBOL(__sk_mem_reclaim); +void __sk_charge(struct sock *sk, gfp_t gfp) +{ + int amt; + + gfp |= __GFP_NOFAIL; + if (mem_cgroup_from_sk(sk)) { + /* The socket has not been accepted yet, no need + * to look at newsk->sk_wmem_queued. + */ + amt = sk_mem_pages(sk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + if (amt) + mem_cgroup_sk_charge(sk, amt, gfp); + } + + kmem_cache_charge(sk, gfp); +} + int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); @@ -3420,13 +3480,13 @@ EXPORT_SYMBOL_GPL(sk_set_peek_off); * function, some default processing is provided. */ -int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); -int sock_no_connect(struct socket *sock, struct sockaddr *saddr, +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { return -EOPNOTSUPP; @@ -3580,12 +3640,12 @@ static void sock_def_write_space(struct sock *sk) * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ -static void sock_def_write_space_wfree(struct sock *sk) +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) { /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if (sock_writeable(sk)) { + if (__sock_writeable(sk, wmem_alloc)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ @@ -3836,7 +3896,7 @@ void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { - struct sock_exterr_skb *serr; + struct sock_extended_err ee; struct sk_buff *skb; int copied, err; @@ -3856,8 +3916,9 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, sock_recv_timestamp(msg, sk, skb); - serr = SKB_EXT_ERR(skb); - put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ + ee = SKB_EXT_ERR(skb)->ee; + put_cmsg(msg, level, type, sizeof(ee), &ee); msg->msg_flags |= MSG_ERRQUEUE; err = copied; @@ -4132,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab) return -EINVAL; } if (alloc_slab) { - prot->slab = kmem_cache_create_usercopy(prot->name, - prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | - prot->slab_flags, - prot->useroffset, prot->usersize, - NULL); + struct kmem_cache_args args = { + .useroffset = prot->useroffset, + .usersize = prot->usersize, + .freeptr_offset = prot->freeptr_offset, + .use_freeptr_offset = !!prot->freeptr_offset, + }; + prot->slab = kmem_cache_create(prot->name, prot->obj_size, + &args, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); @@ -4353,7 +4418,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ -int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; @@ -4459,14 +4524,14 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 026ce9bd9e5e..c83335c62360 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -177,7 +177,7 @@ void sock_diag_broadcast_destroy(struct sock *sk) { /* Note, this function is often called from an interrupt context. */ struct broadcast_sk *bsk = - kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); + kmalloc_obj(struct broadcast_sk, GFP_ATOMIC); if (!bsk) return sk_destruct(sk); bsk->sk = sk; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 5947b38e4f8b..b0e96337a269 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1858,7 +1858,7 @@ int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) goto out; } - sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + sockmap_link = kzalloc_obj(*sockmap_link, GFP_USER); if (!sockmap_link) { ret = -ENOMEM; goto out; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 4211710393a8..29948cb44b7d 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -175,7 +175,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { struct sock_reuseport *reuse; - reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC); + reuse = kzalloc_flex(*reuse, socks, max_socks, GFP_ATOMIC); if (!reuse) return NULL; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8cf04b57ade1..03aea10073f0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/isolation.h> +#include <linux/hex.h> #include <net/ip.h> #include <net/sock.h> @@ -325,10 +326,16 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write, static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; + struct ctl_table fake_table; + char *pos = buf; + + for (int i = 0; i < NETDEV_RSS_KEY_LEN; i++) { + pos = hex_byte_pack(pos, netdev_rss_key[i]); + *pos++ = ':'; + } + *(--pos) = 0; - snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); @@ -430,6 +437,13 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec }, { + .procname = "qdisc_max_burst", + .data = &net_hotdata.qdisc_max_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "netdev_rss_key", .data = &netdev_rss_key, .maxlen = sizeof(int), @@ -668,6 +682,13 @@ static struct ctl_table netns_core_table[] = { .proc_handler = proc_dou8vec_minmax, }, { + .procname = "txq_reselection_ms", + .data = &init_net.core.sysctl_txq_reselection, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { .procname = "tstamp_allow_data", .data = &init_net.core.sysctl_tstamp_allow_data, .maxlen = sizeof(u8), @@ -676,6 +697,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ diff --git a/net/core/utils.c b/net/core/utils.c index 5e63b0ea21f3..dd86913988f4 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -11,6 +11,7 @@ */ #include <linux/module.h> +#include <linux/hex.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/ctype.h> diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a..9890a30584ba 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -214,7 +214,7 @@ static int __mem_id_init_hash_table(void) if (unlikely(mem_id_init)) return 0; - rht = kzalloc(sizeof(*rht), GFP_KERNEL); + rht = kzalloc_obj(*rht); if (!rht) return -ENOMEM; @@ -297,7 +297,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, return ERR_PTR(ret); } - xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + xdp_alloc = kzalloc_obj(*xdp_alloc, gfp); if (!xdp_alloc) return ERR_PTR(-ENOMEM); @@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 03eb1d941fca..35164bfa563d 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1021,8 +1021,7 @@ static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { - table = kmalloc_array(app_count, sizeof(struct dcb_app), - GFP_KERNEL); + table = kmalloc_objs(struct dcb_app, app_count); if (!table) return -ENOMEM; @@ -2004,7 +2003,7 @@ static int dcb_app_add(struct list_head *list, const struct dcb_app *app, { struct dcb_app_type *entry; - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + entry = kmalloc_obj(*entry, GFP_ATOMIC); if (!entry) return -ENOMEM; diff --git a/net/devlink/core.c b/net/devlink/core.c index 58093f49c090..d8e509a669bf 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -111,7 +111,7 @@ static struct devlink_rel *devlink_rel_alloc(void) static u32 next; int err; - rel = kzalloc(sizeof(*rel), GFP_KERNEL); + rel = kzalloc_obj(*rel); if (!rel) return ERR_PTR(-ENOMEM); @@ -178,9 +178,7 @@ int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index, * a notification of a change of this object should be sent * over netlink. The parent devlink instance lock needs to be * taken during the notification preparation. - * However, since the devlink lock of nested instance is held here, - * we would end with wrong devlink instance lock ordering and - * deadlock. Therefore the work is utilized to avoid that. + * Since the parent may or may not be locked, 'work' is utilized. */ void devlink_rel_nested_in_notify(struct devlink *devlink) { @@ -420,7 +418,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, if (!devlink_reload_actions_valid(ops)) return NULL; - devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL); + devlink = kvzalloc_flex(*devlink, priv, priv_size); if (!devlink) return NULL; @@ -477,7 +475,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(devlink_rates_check(devlink, NULL, NULL)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 02602704bdea..e3a36de4f4ae 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -434,7 +434,7 @@ static void devlink_reload_reinit_sanity_check(struct devlink *devlink) WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(devlink_rates_check(devlink, NULL, NULL)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); } @@ -713,10 +713,11 @@ int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { if (!ops->eswitch_mode_set) return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); - err = devlink_rate_nodes_check(devlink, mode, info->extack); + err = devlink_rates_check(devlink, devlink_rate_is_node, + info->extack); if (err) return err; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 14eaad9cfe35..1377864383bc 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -297,8 +297,10 @@ int devlink_resources_validate(struct devlink *devlink, struct genl_info *info); /* Rates */ -int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack); +bool devlink_rate_is_node(const struct devlink_rate *devlink_rate); +int devlink_rates_check(struct devlink *devlink, + bool (*rate_filter)(const struct devlink_rate *), + struct netlink_ext_ack *extack); /* Linecards */ unsigned int devlink_linecard_index(struct devlink_linecard *linecard); diff --git a/net/devlink/dpipe.c b/net/devlink/dpipe.c index e55701b007f0..c8d4a4374ae1 100644 --- a/net/devlink/dpipe.c +++ b/net/devlink/dpipe.c @@ -851,7 +851,7 @@ int devl_dpipe_table_register(struct devlink *devlink, devlink)) return -EEXIST; - table = kzalloc(sizeof(*table), GFP_KERNEL); + table = kzalloc_obj(*table); if (!table) return -ENOMEM; diff --git a/net/devlink/health.c b/net/devlink/health.c index 136a67c36a20..449c7611c640 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -32,7 +32,7 @@ static struct devlink_fmsg *devlink_fmsg_alloc(void) { struct devlink_fmsg *fmsg; - fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); + fmsg = kzalloc_obj(*fmsg); if (!fmsg) return NULL; @@ -119,7 +119,7 @@ __devlink_health_reporter_create(struct devlink *devlink, if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period)) return ERR_PTR(-EINVAL); - reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); + reporter = kzalloc_obj(*reporter); if (!reporter) return ERR_PTR(-ENOMEM); @@ -738,7 +738,7 @@ static void devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype) if (fmsg->err) return; - item = kzalloc(sizeof(*item), GFP_KERNEL); + item = kzalloc_obj(*item); if (!item) { fmsg->err = -ENOMEM; return; diff --git a/net/devlink/linecard.c b/net/devlink/linecard.c index 67f70a621d27..8315d35cb91d 100644 --- a/net/devlink/linecard.c +++ b/net/devlink/linecard.c @@ -404,8 +404,7 @@ static int devlink_linecard_types_init(struct devlink_linecard *linecard) int i; count = linecard->ops->types_count(linecard, linecard->priv); - linecard->types = kmalloc_array(count, sizeof(*linecard_type), - GFP_KERNEL); + linecard->types = kmalloc_objs(*linecard_type, count); if (!linecard->types) return -ENOMEM; linecard->types_count = count; @@ -451,7 +450,7 @@ devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, if (devlink_linecard_index_exists(devlink, linecard_index)) return ERR_PTR(-EEXIST); - linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); + linecard = kzalloc_obj(*linecard); if (!linecard) return ERR_PTR(-ENOMEM); diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index 9fd00977d59e..f4c61c2b4f22 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/devlink.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -229,7 +230,7 @@ static const struct nla_policy devlink_eswitch_get_nl_policy[DEVLINK_ATTR_DEV_NA static const struct nla_policy devlink_eswitch_set_nl_policy[DEVLINK_ATTR_ESWITCH_ENCAP_MODE + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, - [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 1), + [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 2), [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = NLA_POLICY_MAX(NLA_U8, 3), [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = NLA_POLICY_MAX(NLA_U8, 1), }; @@ -301,12 +302,13 @@ static const struct nla_policy devlink_param_get_dump_nl_policy[DEVLINK_ATTR_DEV }; /* DEVLINK_CMD_PARAM_SET - do */ -static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_VALUE_CMODE + 1] = { +static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_RESET_DEFAULT + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate), [DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2), + [DEVLINK_ATTR_PARAM_RESET_DEFAULT] = { .type = NLA_FLAG, }, }; /* DEVLINK_CMD_REGION_GET - do */ @@ -919,7 +921,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_param_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_param_set_nl_policy, - .maxattr = DEVLINK_ATTR_PARAM_VALUE_CMODE, + .maxattr = DEVLINK_ATTR_PARAM_RESET_DEFAULT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 09cc6f264ccf..2817d53a0eba 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/devlink.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_DEVLINK_GEN_H #define _LINUX_DEVLINK_GEN_H diff --git a/net/devlink/param.c b/net/devlink/param.c index 70e69523412c..cf95268da5b0 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -112,6 +112,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, + .name = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -169,11 +174,12 @@ devlink_param_cmode_is_supported(const struct devlink_param *param, static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { if (!param->get) return -EOPNOTSUPP; - return param->get(devlink, param->id, ctx); + return param->get(devlink, param->id, ctx, extack); } static int devlink_param_set(struct devlink *devlink, @@ -186,68 +192,121 @@ static int devlink_param_set(struct devlink *devlink, return param->set(devlink, param->id, ctx, extack); } -static int -devlink_nl_param_value_fill_one(struct sk_buff *msg, - enum devlink_param_type type, - enum devlink_param_cmode cmode, - union devlink_param_value val) +static int devlink_param_get_default(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { - struct nlattr *param_value_attr; + if (!param->get_default) + return -EOPNOTSUPP; - param_value_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_PARAM_VALUE); - if (!param_value_attr) - goto nla_put_failure; + return param->get_default(devlink, param->id, ctx, extack); +} - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) - goto value_nest_cancel; +static int devlink_param_reset_default(struct devlink *devlink, + const struct devlink_param *param, + enum devlink_param_cmode cmode, + struct netlink_ext_ack *extack) +{ + if (!param->reset_default) + return -EOPNOTSUPP; + + return param->reset_default(devlink, param->id, cmode, extack); +} +static int +devlink_nl_param_value_put(struct sk_buff *msg, enum devlink_param_type type, + int nla_type, union devlink_param_value val, + bool flag_as_u8) +{ switch (type) { case DEVLINK_PARAM_TYPE_U8: - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) - goto value_nest_cancel; + if (nla_put_u8(msg, nla_type, val.vu8)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_U16: - if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) - goto value_nest_cancel; + if (nla_put_u16(msg, nla_type, val.vu16)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_U32: - if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) - goto value_nest_cancel; + if (nla_put_u32(msg, nla_type, val.vu32)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_U64: - if (devlink_nl_put_u64(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, - val.vu64)) - goto value_nest_cancel; + if (devlink_nl_put_u64(msg, nla_type, val.vu64)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_STRING: - if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, - val.vstr)) - goto value_nest_cancel; + if (nla_put_string(msg, nla_type, val.vstr)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_BOOL: - if (val.vbool && - nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) - goto value_nest_cancel; + /* default values of type bool are encoded with u8, so that + * false can be distinguished from not present + */ + if (flag_as_u8) { + if (nla_put_u8(msg, nla_type, val.vbool)) + return -EMSGSIZE; + } else { + if (val.vbool && nla_put_flag(msg, nla_type)) + return -EMSGSIZE; + } break; } + return 0; +} + +static int +devlink_nl_param_value_fill_one(struct sk_buff *msg, + enum devlink_param_type type, + enum devlink_param_cmode cmode, + union devlink_param_value val, + union devlink_param_value default_val, + bool has_default) +{ + struct nlattr *param_value_attr; + int err = -EMSGSIZE; + + param_value_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_PARAM_VALUE); + if (!param_value_attr) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) + goto value_nest_cancel; + + err = devlink_nl_param_value_put(msg, type, + DEVLINK_ATTR_PARAM_VALUE_DATA, + val, false); + if (err) + goto value_nest_cancel; + + if (has_default) { + err = devlink_nl_param_value_put(msg, type, + DEVLINK_ATTR_PARAM_VALUE_DEFAULT, + default_val, true); + if (err) + goto value_nest_cancel; + } nla_nest_end(msg, param_value_attr); return 0; value_nest_cancel: nla_nest_cancel(msg, param_value_attr); -nla_put_failure: - return -EMSGSIZE; + return err; } static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd, - u32 portid, u32 seq, int flags) + u32 portid, u32 seq, int flags, + struct netlink_ext_ack *extack) { + union devlink_param_value default_value[DEVLINK_PARAM_CMODE_MAX + 1]; union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; + bool default_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; const struct devlink_param *param = param_item->param; struct devlink_param_gset_ctx ctx; @@ -268,12 +327,26 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, param_value[i] = param_item->driverinit_value; else return -EOPNOTSUPP; + + if (param_item->driverinit_value_valid) { + default_value[i] = param_item->driverinit_default; + default_value_set[i] = true; + } } else { ctx.cmode = i; - err = devlink_param_get(devlink, param, &ctx); + err = devlink_param_get(devlink, param, &ctx, extack); if (err) return err; param_value[i] = ctx.val; + + err = devlink_param_get_default(devlink, param, &ctx, + extack); + if (!err) { + default_value[i] = ctx.val; + default_value_set[i] = true; + } else if (err != -EOPNOTSUPP) { + return err; + } } param_value_set[i] = true; } @@ -310,7 +383,9 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (!param_value_set[i]) continue; err = devlink_nl_param_value_fill_one(msg, param->type, - i, param_value[i]); + i, param_value[i], + default_value[i], + default_value_set[i]); if (err) goto values_list_nest_cancel; } @@ -352,7 +427,7 @@ static void devlink_param_notify(struct devlink *devlink, if (!msg) return; err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, - 0, 0, 0); + 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; @@ -395,7 +470,8 @@ static int devlink_nl_param_get_dump_one(struct sk_buff *msg, err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); + cb->nlh->nlmsg_seq, flags, + cb->extack); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -504,8 +580,8 @@ int devlink_nl_param_get_doit(struct sk_buff *skb, return -ENOMEM; err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - info->snd_portid, info->snd_seq, 0); + DEVLINK_CMD_PARAM_GET, info->snd_portid, + info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; @@ -526,6 +602,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, struct devlink_param_item *param_item; const struct devlink_param *param; union devlink_param_value value; + bool reset_default; int err = 0; param_item = devlink_param_get_from_info(params, info); @@ -537,13 +614,18 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return err; if (param_type != param->type) return -EINVAL; - err = devlink_param_value_get_from_info(param, info, &value); - if (err) - return err; - if (param->validate) { - err = param->validate(devlink, param->id, value, info->extack); + + reset_default = info->attrs[DEVLINK_ATTR_PARAM_RESET_DEFAULT]; + if (!reset_default) { + err = devlink_param_value_get_from_info(param, info, &value); if (err) return err; + if (param->validate) { + err = param->validate(devlink, param->id, value, + info->extack); + if (err) + return err; + } } if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) @@ -553,6 +635,15 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { + if (reset_default) { + if (!param_item->driverinit_value_valid) { + NL_SET_ERR_MSG(info->extack, + "Default value not available"); + return -EOPNOTSUPP; + } + value = param_item->driverinit_default; + } + param_item->driverinit_value_new = value; param_item->driverinit_value_new_valid = true; } else { @@ -560,7 +651,12 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return -EOPNOTSUPP; ctx.val = value; ctx.cmode = cmode; - err = devlink_param_set(devlink, param, &ctx, info->extack); + if (reset_default) + err = devlink_param_reset_default(devlink, param, cmode, + info->extack); + else + err = devlink_param_set(devlink, param, &ctx, + info->extack); if (err) return err; } @@ -622,7 +718,7 @@ static int devlink_param_register(struct devlink *devlink, else WARN_ON(!param->get || !param->set); - param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + param_item = kzalloc_obj(*param_item); if (!param_item) return -ENOMEM; @@ -808,6 +904,7 @@ void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; + param_item->driverinit_default = init_val; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } diff --git a/net/devlink/rate.c b/net/devlink/rate.c index d157a8419bca..41be2d6c2954 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -12,8 +12,7 @@ devlink_rate_is_leaf(struct devlink_rate *devlink_rate) return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } -static inline bool -devlink_rate_is_node(struct devlink_rate *devlink_rate) +bool devlink_rate_is_node(const struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; } @@ -628,7 +627,7 @@ int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info) else if (rate_node == ERR_PTR(-EINVAL)) return -EINVAL; - rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + rate_node = kzalloc_obj(*rate_node); if (!rate_node) return -ENOMEM; @@ -688,14 +687,16 @@ int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) return err; } -int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack) +int devlink_rates_check(struct devlink *devlink, + bool (*rate_filter)(const struct devlink_rate *), + struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) - if (devlink_rate_is_node(devlink_rate)) { - NL_SET_ERR_MSG(extack, "Rate node(s) exists."); + if (!rate_filter || rate_filter(devlink_rate)) { + if (extack) + NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; @@ -720,7 +721,7 @@ devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, if (!IS_ERR(rate_node)) return ERR_PTR(-EEXIST); - rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + rate_node = kzalloc_obj(*rate_node); if (!rate_node) return ERR_PTR(-ENOMEM); @@ -765,7 +766,7 @@ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, if (WARN_ON(devlink_port->devlink_rate)) return -EBUSY; - devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); + devlink_rate = kzalloc_obj(*devlink_rate); if (!devlink_rate) return -ENOMEM; diff --git a/net/devlink/region.c b/net/devlink/region.c index 63fb297f6d67..5588e3d560b9 100644 --- a/net/devlink/region.c +++ b/net/devlink/region.c @@ -50,7 +50,7 @@ devlink_port_region_get_by_name(struct devlink_port *port, struct devlink_region *region; list_for_each_entry(region, &port->region_list, list) - if (!strcmp(region->ops->name, region_name)) + if (!strcmp(region->port_ops->name, region_name)) return region; return NULL; @@ -428,7 +428,7 @@ __devlink_region_snapshot_create(struct devlink_region *region, if (devlink_region_snapshot_get_by_id(region, snapshot_id)) return -EEXIST; - snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + snapshot = kzalloc_obj(*snapshot); if (!snapshot) return -ENOMEM; @@ -1055,7 +1055,7 @@ struct devlink_region *devl_region_create(struct devlink *devlink, if (devlink_region_get_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc_obj(*region); if (!region) return ERR_PTR(-ENOMEM); @@ -1128,7 +1128,7 @@ devlink_port_region_create(struct devlink_port *port, goto unlock; } - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc_obj(*region); if (!region) { err = -ENOMEM; goto unlock; diff --git a/net/devlink/resource.c b/net/devlink/resource.c index 2d6324f3d91f..351835a710b1 100644 --- a/net/devlink/resource.c +++ b/net/devlink/resource.c @@ -347,7 +347,7 @@ int devl_resource_register(struct devlink *devlink, if (resource) return -EEXIST; - resource = kzalloc(sizeof(*resource), GFP_KERNEL); + resource = kzalloc_obj(*resource); if (!resource) return -ENOMEM; diff --git a/net/devlink/sb.c b/net/devlink/sb.c index 0a76bb32502b..49fcbfe08f15 100644 --- a/net/devlink/sb.c +++ b/net/devlink/sb.c @@ -943,7 +943,7 @@ int devl_sb_register(struct devlink *devlink, unsigned int sb_index, if (devlink_sb_index_exists(devlink, sb_index)) return -EEXIST; - devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); + devlink_sb = kzalloc_obj(*devlink_sb); if (!devlink_sb) return -ENOMEM; devlink_sb->index = sb_index; diff --git a/net/devlink/trap.c b/net/devlink/trap.c index f36087f90db5..8edb31654a68 100644 --- a/net/devlink/trap.c +++ b/net/devlink/trap.c @@ -1271,7 +1271,7 @@ devlink_trap_register(struct devlink *devlink, if (devlink_trap_item_lookup(devlink, trap->name)) return -EEXIST; - trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); + trap_item = kzalloc_obj(*trap_item); if (!trap_item) return -ENOMEM; @@ -1545,7 +1545,7 @@ devlink_trap_group_register(struct devlink *devlink, if (devlink_trap_group_item_lookup(devlink, group->name)) return -EEXIST; - group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); + group_item = kzalloc_obj(*group_item); if (!group_item) return -ENOMEM; @@ -1751,7 +1751,7 @@ devlink_trap_policer_register(struct devlink *devlink, if (devlink_trap_policer_item_lookup(devlink, policer->id)) return -EEXIST; - policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); + policer_item = kzalloc_obj(*policer_item); if (!policer_item) return -ENOMEM; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 82b084cc1cc6..53da62984447 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -78,7 +78,6 @@ int dns_query(struct net *net, { struct key *rkey; struct user_key_payload *upayload; - const struct cred *saved_cred; size_t typelen, desclen; char *desc, *cp; int ret, len; @@ -124,9 +123,8 @@ int dns_query(struct net *net, /* make the upcall, using special credentials to prevent the use of * add_key() to preinstall malicious redirections */ - saved_cred = override_creds(dns_resolver_cache); - rkey = request_key_net(&key_type_dns_resolver, desc, net, options); - revert_creds(saved_cred); + scoped_with_creds(dns_resolver_cache) + rkey = request_key_net(&key_type_dns_resolver, desc, net, options); kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 869cbe57162f..5ed8c704636d 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -104,6 +104,21 @@ config NET_DSA_TAG_MTK Say Y or M if you want to enable support for tagging frames for Mediatek switches. +config NET_DSA_TAG_MXL_862XX + tristate "Tag driver for MaxLinear MxL862xx switches" + help + Say Y or M if you want to enable support for tagging frames for the + MaxLinear MxL86252 and MxL86282 switches using their native 8-byte + tagging protocol. + +config NET_DSA_TAG_MXL_GSW1XX + tristate "Tag driver for MaxLinear GSW1xx switches" + help + The GSW1xx family of switches supports an 8-byte special tag which + can be used on the CPU port of the switch. + Say Y or M if you want to enable support for tagging frames for + MaxLinear GSW1xx switches. + config NET_DSA_TAG_KSZ tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches" help @@ -190,4 +205,10 @@ config NET_DSA_TAG_XRS700X Say Y or M if you want to enable support for tagging frames for Arrow SpeedChips XRS700x switches that use a single byte tag trailer. +config NET_DSA_TAG_YT921X + tristate "Tag driver for Motorcomm YT921x switches" + help + Say Y or M if you want to enable support for tagging frames for + Motorcomm YT921x switches. + endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 555c07cfeb71..bf7247759a64 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -28,6 +28,8 @@ obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_MXL_862XX) += tag_mxl862xx.o +obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o @@ -39,6 +41,7 @@ obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o obj-$(CONFIG_NET_DSA_TAG_VSC73XX_8021Q) += tag_vsc73xx_8021q.o obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o +obj-$(CONFIG_NET_DSA_TAG_YT921X) += tag_yt921x.o # for tracing framework to find trace.h CFLAGS_trace.o := -I$(src) diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c index 4ae255cfb23f..a1b044467bd6 100644 --- a/net/dsa/conduit.c +++ b/net/dsa/conduit.c @@ -26,7 +26,7 @@ static int dsa_conduit_get_regs_len(struct net_device *dev) int ret = 0; int len; - if (ops->get_regs_len) { + if (ops && ops->get_regs_len) { netdev_lock_ops(dev); len = ops->get_regs_len(dev); netdev_unlock_ops(dev); @@ -59,7 +59,7 @@ static void dsa_conduit_get_regs(struct net_device *dev, int port = cpu_dp->index; int len; - if (ops->get_regs_len && ops->get_regs) { + if (ops && ops->get_regs_len && ops->get_regs) { netdev_lock_ops(dev); len = ops->get_regs_len(dev); if (len < 0) { @@ -87,30 +87,56 @@ static void dsa_conduit_get_regs(struct net_device *dev, } } +static ssize_t dsa_conduit_append_port_stats(struct dsa_switch *ds, int port, + u64 *data, size_t start) +{ + int count; + + if (!ds->ops->get_sset_count) + return 0; + + count = ds->ops->get_sset_count(ds, port, ETH_SS_STATS); + if (count < 0) + return count; + + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, port, data + start); + + return count; +} + static void dsa_conduit_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, - uint64_t *data) + u64 *data) { - struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_port *dp, *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - struct dsa_switch *ds = cpu_dp->ds; - int port = cpu_dp->index; - int count = 0; + struct dsa_switch_tree *dst = cpu_dp->dst; + int count, mcount = 0; - if (ops->get_sset_count && ops->get_ethtool_stats) { + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { netdev_lock_ops(dev); - count = ops->get_sset_count(dev, ETH_SS_STATS); + mcount = ops->get_sset_count(dev, ETH_SS_STATS); ops->get_ethtool_stats(dev, stats, data); netdev_unlock_ops(dev); } - if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, port, data + count); + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp)) + continue; + + count = dsa_conduit_append_port_stats(dp->ds, dp->index, + data, mcount); + if (count < 0) + return; + + mcount += count; + } } static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev, struct ethtool_stats *stats, - uint64_t *data) + u64 *data) { struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; @@ -118,11 +144,11 @@ static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev, int port = cpu_dp->index; int count = 0; - if (dev->phydev && !ops->get_ethtool_phy_stats) { + if (dev->phydev && (!ops || !ops->get_ethtool_phy_stats)) { count = phy_ethtool_get_sset_count(dev->phydev); if (count >= 0) phy_ethtool_get_stats(dev->phydev, stats, data); - } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) { + } else if (ops && ops->get_sset_count && ops->get_ethtool_phy_stats) { netdev_lock_ops(dev); count = ops->get_sset_count(dev, ETH_SS_PHY_STATS); ops->get_ethtool_phy_stats(dev, stats, data); @@ -136,45 +162,82 @@ static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev, ds->ops->get_ethtool_phy_stats(ds, port, data + count); } +static void dsa_conduit_append_port_sset_count(struct dsa_switch *ds, int port, + int sset, int *count) +{ + if (ds->ops->get_sset_count) + *count += ds->ops->get_sset_count(ds, port, sset); +} + static int dsa_conduit_get_sset_count(struct net_device *dev, int sset) { - struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_port *dp, *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - struct dsa_switch *ds = cpu_dp->ds; + struct dsa_switch_tree *dst = cpu_dp->dst; int count = 0; netdev_lock_ops(dev); if (sset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) + (!ops || !ops->get_ethtool_phy_stats)) count = phy_ethtool_get_sset_count(dev->phydev); - else if (ops->get_sset_count) + else if (ops && ops->get_sset_count) count = ops->get_sset_count(dev, sset); netdev_unlock_ops(dev); if (count < 0) count = 0; - if (ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds, cpu_dp->index, sset); + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp)) + continue; + + dsa_conduit_append_port_sset_count(dp->ds, dp->index, sset, + &count); + } return count; } -static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset, - uint8_t *data) +static ssize_t dsa_conduit_append_port_strings(struct dsa_switch *ds, int port, + u32 stringset, u8 *data, + size_t start) { - struct dsa_port *cpu_dp = dev->dsa_ptr; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - struct dsa_switch *ds = cpu_dp->ds; - int port = cpu_dp->index; int len = ETH_GSTRING_LEN; - int mcount = 0, count, i; - uint8_t pfx[4]; - uint8_t *ndata; + u8 pfx[8], *ndata; + int count, i; + + if (!ds->ops->get_strings) + return 0; - snprintf(pfx, sizeof(pfx), "p%.2d", port); + snprintf(pfx, sizeof(pfx), "s%.2d_p%.2d", ds->index, port); /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; + ndata = data + start * len; + /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle + * the output after to prepend our CPU port prefix we + * constructed earlier + */ + ds->ops->get_strings(ds, port, stringset, ndata); + count = ds->ops->get_sset_count(ds, port, stringset); + if (count < 0) + return count; + + for (i = 0; i < count; i++) { + memmove(ndata + (i * len + sizeof(pfx)), + ndata + i * len, len - sizeof(pfx)); + memcpy(ndata + i * len, pfx, sizeof(pfx)); + } + + return count; +} + +static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset, + u8 *data) +{ + struct dsa_port *dp, *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch_tree *dst = cpu_dp->dst; + int count, mcount = 0; netdev_lock_ops(dev); if (stringset == ETH_SS_PHY_STATS && dev->phydev && @@ -192,21 +255,17 @@ static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset, } netdev_unlock_ops(dev); - if (ds->ops->get_strings) { - ndata = data + mcount * len; - /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle - * the output after to prepend our CPU port prefix we - * constructed earlier - */ - ds->ops->get_strings(ds, port, stringset, ndata); - count = ds->ops->get_sset_count(ds, port, stringset); + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp)) + continue; + + count = dsa_conduit_append_port_strings(dp->ds, dp->index, + stringset, data, + mcount); if (count < 0) return; - for (i = 0; i < count; i++) { - memmove(ndata + (i * len + sizeof(pfx)), - ndata + i * len, len - sizeof(pfx)); - memcpy(ndata + i * len, pfx, sizeof(pfx)); - } + + mcount += count; } } diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c index f41f9fc2194e..ed342f345692 100644 --- a/net/dsa/devlink.c +++ b/net/dsa/devlink.c @@ -182,7 +182,8 @@ static const struct devlink_ops dsa_devlink_ops = { }; int dsa_devlink_param_get(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dsa_devlink_to_ds(dl); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5b01a0e43ebe..9cb732f6b1e3 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -9,6 +9,7 @@ #include <linux/device.h> #include <linux/err.h> +#include <linux/if_hsr.h> #include <linux/list.h> #include <linux/module.h> #include <linux/netdevice.h> @@ -157,7 +158,7 @@ unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max) bridge_num = find_next_zero_bit(&dsa_fwd_offloading_bridges, DSA_MAX_NUM_OFFLOADING_BRIDGES, 1); - if (bridge_num >= max) + if (bridge_num > max) return 0; set_bit(bridge_num, &dsa_fwd_offloading_bridges); @@ -212,7 +213,7 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) { struct dsa_switch_tree *dst; - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = kzalloc_obj(*dst); if (!dst) return NULL; @@ -297,7 +298,7 @@ static struct dsa_link *dsa_link_touch(struct dsa_port *dp, if (dl->dp == dp && dl->link_dp == link_dp) return dl; - dl = kzalloc(sizeof(*dl), GFP_KERNEL); + dl = kzalloc_obj(*dl); if (!dl) return NULL; @@ -366,16 +367,10 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst) { - struct device_node *ethernet; - struct net_device *conduit; struct dsa_port *cpu_dp; cpu_dp = dsa_tree_find_first_cpu(dst); - ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0); - conduit = of_find_net_device_by_node(ethernet); - of_node_put(ethernet); - - return conduit; + return cpu_dp->conduit; } /* Assign the default CPU port (the first one in the tree) to all ports of the @@ -849,7 +844,7 @@ static int dsa_tree_setup_lags(struct dsa_switch_tree *dst) if (!len) return 0; - dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL); + dst->lags = kzalloc_objs(*dst->lags, len); if (!dst->lags) return -ENOMEM; @@ -1097,7 +1092,7 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) if (dp->index == index) return dp; - dp = kzalloc(sizeof(*dp), GFP_KERNEL); + dp = kzalloc_obj(*dp); if (!dp) return NULL; @@ -1252,14 +1247,25 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) if (ethernet) { struct net_device *conduit; const char *user_protocol; + int err; + rtnl_lock(); conduit = of_find_net_device_by_node(ethernet); of_node_put(ethernet); - if (!conduit) + if (!conduit) { + rtnl_unlock(); return -EPROBE_DEFER; + } + + netdev_hold(conduit, &dp->conduit_tracker, GFP_KERNEL); + put_device(&conduit->dev); + rtnl_unlock(); user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL); - return dsa_port_parse_cpu(dp, conduit, user_protocol); + err = dsa_port_parse_cpu(dp, conduit, user_protocol); + if (err) + netdev_put(conduit, &dp->conduit_tracker); + return err; } if (link) @@ -1392,37 +1398,30 @@ static struct device *dev_find_class(struct device *parent, char *class) return device_find_child(parent, class, dev_is_class); } -static struct net_device *dsa_dev_to_net_device(struct device *dev) -{ - struct device *d; - - d = dev_find_class(dev, "net"); - if (d != NULL) { - struct net_device *nd; - - nd = to_net_dev(d); - dev_hold(nd); - put_device(d); - - return nd; - } - - return NULL; -} - static int dsa_port_parse(struct dsa_port *dp, const char *name, struct device *dev) { if (!strcmp(name, "cpu")) { struct net_device *conduit; + struct device *d; + int err; - conduit = dsa_dev_to_net_device(dev); - if (!conduit) + rtnl_lock(); + d = dev_find_class(dev, "net"); + if (!d) { + rtnl_unlock(); return -EPROBE_DEFER; + } - dev_put(conduit); + conduit = to_net_dev(d); + netdev_hold(conduit, &dp->conduit_tracker, GFP_KERNEL); + put_device(d); + rtnl_unlock(); - return dsa_port_parse_cpu(dp, conduit, NULL); + err = dsa_port_parse_cpu(dp, conduit, NULL); + if (err) + netdev_put(conduit, &dp->conduit_tracker); + return err; } if (!strcmp(name, "dsa")) @@ -1490,6 +1489,9 @@ static void dsa_switch_release_ports(struct dsa_switch *ds) struct dsa_vlan *v, *n; dsa_switch_for_each_port_safe(dp, next, ds) { + if (dsa_port_is_cpu(dp) && dp->conduit) + netdev_put(dp->conduit, &dp->conduit_tracker); + /* These are either entries that upper layers lost track of * (probably due to bugs), or installed through interfaces * where one does not necessarily have to remove them, like @@ -1634,8 +1636,10 @@ void dsa_switch_shutdown(struct dsa_switch *ds) /* Disconnect from further netdevice notifiers on the conduit, * since netdev_uses_dsa() will now return false. */ - dsa_switch_for_each_cpu_port(dp, ds) + dsa_switch_for_each_cpu_port(dp, ds) { dp->conduit->dsa_ptr = NULL; + netdev_put(dp->conduit, &dp->conduit_tracker); + } rtnl_unlock(); out: @@ -1766,6 +1770,70 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, } EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db); +/* Helpers for switches without specific HSR offloads, but which can implement + * NETIF_F_HW_HSR_DUP because their tagger uses dsa_xmit_port_mask() + */ +int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack) +{ + enum hsr_port_type type; + int err; + + err = hsr_get_port_type(hsr, dsa_to_port(ds, port)->user, &type); + if (err) + return err; + + if (type != HSR_PT_SLAVE_A && type != HSR_PT_SLAVE_B) { + NL_SET_ERR_MSG_MOD(extack, + "Only HSR slave ports can be offloaded"); + return -EOPNOTSUPP; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_validate); + +int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_to_port(ds, port), *other_dp; + int err; + + err = dsa_port_simple_hsr_validate(ds, port, hsr, extack); + if (err) + return err; + + dsa_hsr_foreach_port(other_dp, ds, hsr) { + if (other_dp != dp) { + dp->user->features |= NETIF_F_HW_HSR_DUP; + other_dp->user->features |= NETIF_F_HW_HSR_DUP; + break; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_join); + +int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port, + struct net_device *hsr) +{ + struct dsa_port *dp = dsa_to_port(ds, port), *other_dp; + + dsa_hsr_foreach_port(other_dp, ds, hsr) { + if (other_dp != dp) { + dp->user->features &= ~NETIF_F_HW_HSR_DUP; + other_dp->user->features &= ~NETIF_F_HW_HSR_DUP; + break; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_leave); + static const struct dsa_stubs __dsa_stubs = { .conduit_hwtstamp_validate = __dsa_conduit_hwtstamp_validate, }; diff --git a/net/dsa/port.c b/net/dsa/port.c index 082573ae6864..1f5536c0dffc 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -431,7 +431,7 @@ static int dsa_port_bridge_create(struct dsa_port *dp, return 0; } - bridge = kzalloc(sizeof(*bridge), GFP_KERNEL); + bridge = kzalloc_obj(*bridge); if (!bridge) return -ENOMEM; @@ -617,7 +617,7 @@ static int dsa_port_lag_create(struct dsa_port *dp, return 0; } - lag = kzalloc(sizeof(*lag), GFP_KERNEL); + lag = kzalloc_obj(*lag); if (!lag) return -ENOMEM; @@ -1909,6 +1909,9 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr) struct dsa_switch *ds = dp->ds; int err; + if (!dp->hsr_dev) + return; + dp->hsr_dev = NULL; if (ds->ops->port_hsr_leave) { diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 3d2feeea897b..a670a04edb72 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -182,7 +182,7 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp, goto out; } - a = kzalloc(sizeof(*a), GFP_KERNEL); + a = kzalloc_obj(*a); if (!a) { err = -ENOMEM; goto out; @@ -280,7 +280,7 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, goto out; } - a = kzalloc(sizeof(*a), GFP_KERNEL); + a = kzalloc_obj(*a); if (!a) { err = -ENOMEM; goto out; @@ -368,7 +368,7 @@ static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag, goto out; } - a = kzalloc(sizeof(*a), GFP_KERNEL); + a = kzalloc_obj(*a); if (!a) { err = -ENOMEM; goto out; @@ -719,7 +719,7 @@ static int dsa_port_do_vlan_add(struct dsa_port *dp, goto out; } - v = kzalloc(sizeof(*v), GFP_KERNEL); + v = kzalloc_obj(*v); if (!v) { err = -ENOMEM; goto out; diff --git a/net/dsa/tag.h b/net/dsa/tag.h index 5d80ddad4ff6..cf52283fe9df 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -319,6 +319,24 @@ static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) return skb->data + 2 * ETH_ALEN; } +static inline unsigned long dsa_xmit_port_mask(const struct sk_buff *skb, + const struct net_device *dev) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + unsigned long mask = BIT(dp->index); + + if (IS_ENABLED(CONFIG_HSR) && + unlikely(dev->features & NETIF_F_HW_HSR_DUP)) { + struct net_device *hsr_dev = dp->hsr_dev; + struct dsa_port *other_dp; + + dsa_hsr_foreach_port(other_dp, dp->ds, hsr_dev) + mask |= BIT(other_dp->index); + } + + return mask; +} + /* Create 2 modaliases per tagging protocol, one to auto-load the module * given the ID reported by get_tag_protocol(), and the other by name. */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 53e03fd8071b..f91646e60246 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -158,7 +158,7 @@ static int dsa_port_do_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, return 0; } - v = kzalloc(sizeof(*v), GFP_KERNEL); + v = kzalloc_obj(*v); if (!v) return -ENOMEM; @@ -420,7 +420,7 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) struct dsa_8021q_context *ctx; int err; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index eadb358179ce..cf9420439054 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -92,6 +92,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, { struct dsa_port *dp = dsa_user_to_port(dev); u16 queue = skb_get_queue_mapping(skb); + u16 port_mask; u8 *brcm_tag; /* The Ethernet switch we are interfaced with needs packets to be at @@ -119,10 +120,9 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) | ((queue & BRCM_IG_TC_MASK) << BRCM_IG_TC_SHIFT); brcm_tag[1] = 0; - brcm_tag[2] = 0; - if (dp->index == 8) - brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; - brcm_tag[3] = (1 << dp->index) & BRCM_IG_DSTMAP1_MASK; + port_mask = dsa_xmit_port_mask(skb, dev); + brcm_tag[2] = (port_mask >> 8) & BRCM_IG_DSTMAP2_MASK; + brcm_tag[3] = port_mask & BRCM_IG_DSTMAP1_MASK; /* Now tell the conduit network device about the desired output queue * as well diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 51a1f46a567f..5fa436121087 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -48,8 +48,7 @@ /* Byte 3 */ #define GSWIP_TX_DPID_EN BIT(0) -#define GSWIP_TX_PORT_MAP_SHIFT 1 -#define GSWIP_TX_PORT_MAP_MASK GENMASK(6, 1) +#define GSWIP_TX_PORT_MAP GENMASK(6, 1) #define GSWIP_RX_HEADER_LEN 8 @@ -61,7 +60,6 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); u8 *gswip_tag; skb_push(skb, GSWIP_TX_HEADER_LEN); @@ -70,7 +68,7 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, gswip_tag[0] = GSWIP_TX_SLPID_CPU; gswip_tag[1] = GSWIP_TX_DPID_ELAN; gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL; - gswip_tag[3] = BIT(dp->index + GSWIP_TX_PORT_MAP_SHIFT) & GSWIP_TX_PORT_MAP_MASK; + gswip_tag[3] = FIELD_PREP(GSWIP_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev)); gswip_tag[3] |= GSWIP_TX_DPID_EN; return skb; diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 663b25785d95..544ab15685a2 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -20,7 +20,6 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); u8 *tag; /* Calculate checksums (if required) before adding the trailer tag to @@ -33,7 +32,7 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, /* Tag encoding */ tag = skb_put(skb, HELLCREEK_TAG_LEN); - *tag = BIT(dp->index); + *tag = dsa_xmit_port_mask(skb, dev); return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0b7564b53790..d2475c3bbb7d 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -62,7 +62,7 @@ static int ksz_connect(struct dsa_switch *ds) struct ksz_tagger_private *priv; int ret; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (!priv) return -ENOMEM; @@ -120,7 +120,6 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); struct ethhdr *hdr; u8 *tag; @@ -131,7 +130,7 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); hdr = skb_eth_hdr(skb); - *tag = 1 << dp->index; + *tag = dsa_xmit_port_mask(skb, dev); if (is_link_local_ether_addr(hdr->h_dest)) *tag |= KSZ8795_TAIL_TAG_OVERRIDE; @@ -259,7 +258,7 @@ static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb) if (!xmit_work_fn || !xmit_worker) return NULL; - xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC); if (!xmit_work) return NULL; @@ -294,21 +293,12 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN); hdr = skb_eth_hdr(skb); - val = BIT(dp->index); - + val = dsa_xmit_port_mask(skb, dev); val |= FIELD_PREP(KSZ9477_TAIL_TAG_PRIO, prio); if (is_link_local_ether_addr(hdr->h_dest)) val |= KSZ9477_TAIL_TAG_OVERRIDE; - if (dev->features & NETIF_F_HW_HSR_DUP) { - struct net_device *hsr_dev = dp->hsr_dev; - struct dsa_port *other_dp; - - dsa_hsr_foreach_port(other_dp, dp->ds, hsr_dev) - val |= BIT(other_dp->index); - } - *tag = cpu_to_be16(val); return ksz_defer_xmit(dp, skb); @@ -371,8 +361,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); hdr = skb_eth_hdr(skb); - *tag = BIT(dp->index); - + *tag = dsa_xmit_port_mask(skb, dev); *tag |= FIELD_PREP(KSZ9893_TAIL_TAG_PRIO, prio); if (is_link_local_ether_addr(hdr->h_dest)) @@ -436,8 +425,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN); - val = BIT(dp->index); - + val = dsa_xmit_port_mask(skb, dev); val |= FIELD_PREP(LAN937X_TAIL_TAG_PRIO, prio); if (is_link_local_ether_addr(hdr->h_dest)) diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index b670e3c53e91..dea3eecaf093 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -54,7 +54,8 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, * whether that's a combined special tag with 802.1Q header. */ mtk_tag[0] = xmit_tpid; - mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; + mtk_tag[1] = FIELD_PREP(MTK_HDR_XMIT_DP_BIT_MASK, + dsa_xmit_port_mask(skb, dev)); /* Tag control information is kept for 802.1Q */ if (xmit_tpid == MTK_HDR_XMIT_UNTAGGED) { diff --git a/net/dsa/tag_mxl-gsw1xx.c b/net/dsa/tag_mxl-gsw1xx.c new file mode 100644 index 000000000000..60f7c445e656 --- /dev/null +++ b/net/dsa/tag_mxl-gsw1xx.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * DSA driver Special Tag support for MaxLinear GSW1xx switch chips + * + * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org> + * Copyright (C) 2023 - 2024 MaxLinear Inc. + */ + +#include <linux/bitops.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <net/dsa.h> + +#include "tag.h" + +/* To define the outgoing port and to discover the incoming port a special + * tag is used by the GSW1xx. + * + * Dest MAC Src MAC special TAG EtherType + * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |... + * |<--------------->| + */ + +#define GSW1XX_TAG_NAME "gsw1xx" + +/* special tag header length (RX and TX) */ +#define GSW1XX_HEADER_LEN 8 + +/* Word 0 = Ethertype -> 0x88C3 */ + +/* Word 1 */ +#define GSW1XX_TX_PORT_MAP GENMASK(7, 0) +#define GSW1XX_TX_PORT_MAP_EN BIT(15) +#define GSW1XX_TX_CLASS_EN BIT(14) +#define GSW1XX_TX_TIME_STAMP_EN BIT(13) +#define GSW1XX_TX_LRN_DIS BIT(12) +#define GSW1XX_TX_CLASS GENMASK(11, 8) + +/* special tag in RX path header */ +/* Word 2 */ +#define GSW1XX_RX_PORT_MAP GENMASK(15, 8) + +static struct sk_buff *gsw1xx_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + __be16 *gsw1xx_tag; + u16 tag; + + /* provide additional space 'GSW1XX_HEADER_LEN' bytes */ + skb_push(skb, GSW1XX_HEADER_LEN); + + /* add space between MAC address and Ethertype */ + dsa_alloc_etype_header(skb, GSW1XX_HEADER_LEN); + + /* special tag ingress */ + gsw1xx_tag = dsa_etype_header_pos_tx(skb); + gsw1xx_tag[0] = htons(ETH_P_MXLGSW); + + tag = FIELD_PREP(GSW1XX_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev)) | + GSW1XX_TX_PORT_MAP_EN | GSW1XX_TX_LRN_DIS; + gsw1xx_tag[1] = htons(tag); + gsw1xx_tag[2] = 0; + gsw1xx_tag[3] = 0; + + return skb; +} + +static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + int port; + __be16 *gsw1xx_tag; + + if (unlikely(!pskb_may_pull(skb, GSW1XX_HEADER_LEN))) { + dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull SKB\n"); + return NULL; + } + + gsw1xx_tag = dsa_etype_header_pos_rx(skb); + + if (unlikely(ntohs(gsw1xx_tag[0]) != ETH_P_MXLGSW)) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid special tag\n"); + dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag); + return NULL; + } + + /* Get source port information */ + port = FIELD_GET(GSW1XX_RX_PORT_MAP, ntohs(gsw1xx_tag[1])); + skb->dev = dsa_conduit_find_user(dev, 0, port); + if (!skb->dev) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); + dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag); + return NULL; + } + + /* remove the GSW1xx special tag between MAC addresses and the current + * ethertype field. + */ + skb_pull_rcsum(skb, GSW1XX_HEADER_LEN); + dsa_strip_etype_header(skb, GSW1XX_HEADER_LEN); + + return skb; +} + +static const struct dsa_device_ops gsw1xx_netdev_ops = { + .name = GSW1XX_TAG_NAME, + .proto = DSA_TAG_PROTO_MXL_GSW1XX, + .xmit = gsw1xx_tag_xmit, + .rcv = gsw1xx_tag_rcv, + .needed_headroom = GSW1XX_HEADER_LEN, +}; + +MODULE_DESCRIPTION("DSA tag driver for MaxLinear GSW1xx 8 byte protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL_GSW1XX, GSW1XX_TAG_NAME); + +module_dsa_tag_driver(gsw1xx_netdev_ops); diff --git a/net/dsa/tag_mxl862xx.c b/net/dsa/tag_mxl862xx.c new file mode 100644 index 000000000000..01f215868271 --- /dev/null +++ b/net/dsa/tag_mxl862xx.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * DSA Special Tag for MaxLinear 862xx switch chips + * + * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org> + * Copyright (C) 2024 MaxLinear Inc. + */ + +#include <linux/bitops.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <net/dsa.h> +#include "tag.h" + +#define MXL862_NAME "mxl862xx" + +#define MXL862_HEADER_LEN 8 + +/* Word 0 -> EtherType */ + +/* Word 2 */ +#define MXL862_SUBIF_ID GENMASK(4, 0) + +/* Word 3 */ +#define MXL862_IGP_EGP GENMASK(3, 0) + +static struct sk_buff *mxl862_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_port *cpu_dp = dp->cpu_dp; + unsigned int cpu_port, sub_interface; + __be16 *mxl862_tag; + + cpu_port = cpu_dp->index; + + /* target port sub-interface ID relative to the CPU port */ + sub_interface = dp->index + 16 - cpu_port; + + /* provide additional space 'MXL862_HEADER_LEN' bytes */ + skb_push(skb, MXL862_HEADER_LEN); + + /* shift MAC address to the beginning of the enlarged buffer, + * releasing the space required for DSA tag (between MAC address and + * Ethertype) + */ + dsa_alloc_etype_header(skb, MXL862_HEADER_LEN); + + /* special tag ingress (from the perspective of the switch) */ + mxl862_tag = dsa_etype_header_pos_tx(skb); + mxl862_tag[0] = htons(ETH_P_MXLGSW); + mxl862_tag[1] = 0; + mxl862_tag[2] = htons(FIELD_PREP(MXL862_SUBIF_ID, sub_interface)); + mxl862_tag[3] = htons(FIELD_PREP(MXL862_IGP_EGP, cpu_port)); + + return skb; +} + +static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + __be16 *mxl862_tag; + int port; + + if (unlikely(!pskb_may_pull(skb, MXL862_HEADER_LEN))) { + dev_warn_ratelimited(&dev->dev, "Cannot pull SKB, packet dropped\n"); + return NULL; + } + + mxl862_tag = dsa_etype_header_pos_rx(skb); + + if (unlikely(mxl862_tag[0] != htons(ETH_P_MXLGSW))) { + dev_warn_ratelimited(&dev->dev, + "Invalid special tag marker, packet dropped, tag: %8ph\n", + mxl862_tag); + return NULL; + } + + /* Get source port information */ + port = FIELD_GET(MXL862_IGP_EGP, ntohs(mxl862_tag[3])); + skb->dev = dsa_conduit_find_user(dev, 0, port); + if (unlikely(!skb->dev)) { + dev_warn_ratelimited(&dev->dev, + "Invalid source port, packet dropped, tag: %8ph\n", + mxl862_tag); + return NULL; + } + + /* remove the MxL862xx special tag between the MAC addresses and the + * current ethertype field. + */ + skb_pull_rcsum(skb, MXL862_HEADER_LEN); + dsa_strip_etype_header(skb, MXL862_HEADER_LEN); + + return skb; +} + +static const struct dsa_device_ops mxl862_netdev_ops = { + .name = MXL862_NAME, + .proto = DSA_TAG_PROTO_MXL862, + .xmit = mxl862_tag_xmit, + .rcv = mxl862_tag_rcv, + .needed_headroom = MXL862_HEADER_LEN, +}; + +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL862, MXL862_NAME); +MODULE_DESCRIPTION("DSA tag driver for MaxLinear MxL862xx switches"); +MODULE_LICENSE("GPL"); + +module_dsa_tag_driver(mxl862_netdev_ops); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index bf6608fc6be7..3405def79c2d 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -46,11 +46,10 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { - struct dsa_port *dp = dsa_user_to_port(netdev); void *injection; ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection); - ocelot_ifh_set_dest(injection, BIT_ULL(dp->index)); + ocelot_ifh_set_dest(injection, dsa_xmit_port_mask(skb, netdev)); return skb; } @@ -58,11 +57,10 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, static struct sk_buff *seville_xmit(struct sk_buff *skb, struct net_device *netdev) { - struct dsa_port *dp = dsa_user_to_port(netdev); void *injection; ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection); - seville_ifh_set_dest(injection, BIT_ULL(dp->index)); + seville_ifh_set_dest(injection, dsa_xmit_port_mask(skb, netdev)); return skb; } diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 3929584791e4..e89d9254e90a 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -43,7 +43,7 @@ static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp, if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) return NULL; - xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC); if (!xmit_work) return NULL; @@ -106,7 +106,7 @@ static int ocelot_connect(struct dsa_switch *ds) struct ocelot_8021q_tagger_private *priv; int err; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (!priv) return -ENOMEM; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 0cf61286b426..9e3b429e8b36 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -14,7 +14,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); __be16 *phdr; u16 hdr; @@ -26,7 +25,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) /* Set the version field, and set destination port information */ hdr = FIELD_PREP(QCA_HDR_XMIT_VERSION, QCA_HDR_VERSION); hdr |= QCA_HDR_XMIT_FROM_CPU; - hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, BIT(dp->index)); + hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, dsa_xmit_port_mask(skb, dev)); *phdr = htons(hdr); @@ -93,7 +92,7 @@ static int qca_tag_connect(struct dsa_switch *ds) { struct qca_tagger_data *tagger_data; - tagger_data = kzalloc(sizeof(*tagger_data), GFP_KERNEL); + tagger_data = kzalloc_obj(*tagger_data); if (!tagger_data) return -ENOMEM; diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index feaefa0e179b..3cc63eacfa03 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -57,7 +57,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, out = (RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT); /* The lower bits indicate the port number */ - out |= BIT(dp->index); + out |= dsa_xmit_port_mask(skb, dev); p = (__be16 *)(tag + 2); *p = htons(out); diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c index 15c2bae2b429..2464545da4d2 100644 --- a/net/dsa/tag_rtl8_4.c +++ b/net/dsa/tag_rtl8_4.c @@ -103,7 +103,6 @@ static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev, void *tag) { - struct dsa_port *dp = dsa_user_to_port(dev); __be16 tag16[RTL8_4_TAG_LEN / 2]; /* Set Realtek EtherType */ @@ -116,7 +115,7 @@ static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev, tag16[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1)); /* Zero ALLOW; set RX (CPU->switch) forwarding port mask */ - tag16[3] = htons(FIELD_PREP(RTL8_4_RX, BIT(dp->index))); + tag16[3] = htons(FIELD_PREP(RTL8_4_RX, dsa_xmit_port_mask(skb, dev))); memcpy(tag, tag16, RTL8_4_TAG_LEN); } diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c index 69d51221b1e5..10994b3470f6 100644 --- a/net/dsa/tag_rzn1_a5psw.c +++ b/net/dsa/tag_rzn1_a5psw.c @@ -39,7 +39,6 @@ struct a5psw_tag { static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); struct a5psw_tag *ptag; u32 data2_val; @@ -60,7 +59,7 @@ static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *de ptag = dsa_etype_header_pos_tx(skb); - data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, BIT(dp->index)); + data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, dsa_xmit_port_mask(skb, dev)); ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW); ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD); ptag->ctrl_data2_lo = htons(data2_val); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 02adec693811..de6d4ce8668b 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -152,7 +152,7 @@ static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp, if (!xmit_work_fn || !xmit_worker) return NULL; - xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + xmit_work = kzalloc_obj(*xmit_work, GFP_ATOMIC); if (!xmit_work) return NULL; @@ -701,7 +701,7 @@ static int sja1105_connect(struct dsa_switch *ds) struct kthread_worker *xmit_worker; int err; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (!priv) return -ENOMEM; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 22742a53d6f4..4dce24cfe6a7 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -14,12 +14,11 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); u8 *trailer; trailer = skb_put(skb, 4); trailer[0] = 0x80; - trailer[1] = 1 << dp->index; + trailer[1] = dsa_xmit_port_mask(skb, dev); trailer[2] = 0x10; trailer[3] = 0x00; diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index 68d4633ddd5e..a05219f702c6 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -13,16 +13,10 @@ static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *partner, *dp = dsa_user_to_port(dev); u8 *trailer; trailer = skb_put(skb, 1); - trailer[0] = BIT(dp->index); - - if (dp->hsr_dev) - dsa_hsr_foreach_port(partner, dp->ds, dp->hsr_dev) - if (partner != dp) - trailer[0] |= BIT(partner->index); + trailer[0] = dsa_xmit_port_mask(skb, dev); return skb; } diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c new file mode 100644 index 000000000000..aefef8c770e3 --- /dev/null +++ b/net/dsa/tag_yt921x.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Motorcomm YT921x Switch Extended CPU Port Tagging + * + * Copyright (c) 2025 David Yang <mmyangfl@gmail.com> + * + * +----+----+-------+-----+----+--------- + * | DA | SA | TagET | Tag | ET | Payload ... + * +----+----+-------+-----+----+--------- + * 6 6 2 6 2 N + * + * Tag Ethertype: CPU_TAG_TPID_TPID (default: ETH_P_YT921X = 0x9988) + * * Hardcoded for the moment, but still configurable. Discuss it if there + * are conflicts somewhere and/or you want to change it for some reason. + * Tag: + * 2: VLAN Tag + * 2: + * 15b: Rx Port Valid + * 14b-11b: Rx Port + * 10b-8b: Tx/Rx Priority + * 7b: Tx/Rx Code Valid + * 6b-1b: Tx/Rx Code + * 0b: ? (unset) + * 2: + * 15b: Tx Port(s) Valid + * 10b-0b: Tx Port(s) Mask + */ + +#include <linux/etherdevice.h> + +#include "tag.h" + +#define YT921X_TAG_NAME "yt921x" + +#define YT921X_TAG_LEN 8 + +#define YT921X_TAG_PORT_EN BIT(15) +#define YT921X_TAG_RX_PORT_M GENMASK(14, 11) +#define YT921X_TAG_PRIO_M GENMASK(10, 8) +#define YT921X_TAG_PRIO(x) FIELD_PREP(YT921X_TAG_PRIO_M, (x)) +#define YT921X_TAG_CODE_EN BIT(7) +#define YT921X_TAG_CODE_M GENMASK(6, 1) +#define YT921X_TAG_CODE(x) FIELD_PREP(YT921X_TAG_CODE_M, (x)) +#define YT921X_TAG_TX_PORTS_M GENMASK(10, 0) +#define YT921X_TAG_TX_PORTS(x) FIELD_PREP(YT921X_TAG_TX_PORTS_M, (x)) + +/* Incomplete. Some are configurable via RMA_CTRL_CPU_CODE, the meaning of + * others remains unknown. + */ +enum yt921x_tag_code { + YT921X_TAG_CODE_FORWARD = 0, + YT921X_TAG_CODE_UNK_UCAST = 0x19, + YT921X_TAG_CODE_UNK_MCAST = 0x1a, + YT921X_TAG_CODE_PORT_COPY = 0x1b, + YT921X_TAG_CODE_FDB_COPY = 0x1c, +}; + +static struct sk_buff * +yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + __be16 *tag; + u16 ctrl; + + skb_push(skb, YT921X_TAG_LEN); + dsa_alloc_etype_header(skb, YT921X_TAG_LEN); + + tag = dsa_etype_header_pos_tx(skb); + + tag[0] = htons(ETH_P_YT921X); + /* VLAN tag unrelated when TX */ + tag[1] = 0; + ctrl = YT921X_TAG_CODE(YT921X_TAG_CODE_FORWARD) | YT921X_TAG_CODE_EN | + YT921X_TAG_PRIO(skb->priority); + tag[2] = htons(ctrl); + ctrl = YT921X_TAG_TX_PORTS(dsa_xmit_port_mask(skb, netdev)) | + YT921X_TAG_PORT_EN; + tag[3] = htons(ctrl); + + return skb; +} + +static struct sk_buff * +yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev) +{ + unsigned int port; + __be16 *tag; + u16 rx; + + if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN))) + return NULL; + + tag = dsa_etype_header_pos_rx(skb); + + if (unlikely(tag[0] != htons(ETH_P_YT921X))) { + dev_warn_ratelimited(&netdev->dev, + "Unexpected EtherType 0x%04x\n", + ntohs(tag[0])); + return NULL; + } + + /* Locate which port this is coming from */ + rx = ntohs(tag[2]); + if (unlikely((rx & YT921X_TAG_PORT_EN) == 0)) { + dev_warn_ratelimited(&netdev->dev, + "Unexpected rx tag 0x%04x\n", rx); + return NULL; + } + + port = FIELD_GET(YT921X_TAG_RX_PORT_M, rx); + skb->dev = dsa_conduit_find_user(netdev, 0, port); + if (unlikely(!skb->dev)) { + dev_warn_ratelimited(&netdev->dev, + "Couldn't decode source port %u\n", port); + return NULL; + } + + skb->priority = FIELD_GET(YT921X_TAG_PRIO_M, rx); + + if (!(rx & YT921X_TAG_CODE_EN)) { + dev_warn_ratelimited(&netdev->dev, + "Tag code not enabled in rx packet\n"); + } else { + u16 code = FIELD_GET(YT921X_TAG_CODE_M, rx); + + switch (code) { + case YT921X_TAG_CODE_FORWARD: + case YT921X_TAG_CODE_PORT_COPY: + case YT921X_TAG_CODE_FDB_COPY: + /* Already forwarded by hardware */ + dsa_default_offload_fwd_mark(skb); + break; + case YT921X_TAG_CODE_UNK_UCAST: + case YT921X_TAG_CODE_UNK_MCAST: + /* NOTE: hardware doesn't distinguish between TRAP (copy + * to CPU only) and COPY (forward and copy to CPU). In + * order to perform a soft switch, NEVER use COPY action + * in the switch driver. + */ + break; + default: + dev_warn_ratelimited(&netdev->dev, + "Unknown code 0x%02x\n", code); + break; + } + } + + /* Remove YT921x tag and update checksum */ + skb_pull_rcsum(skb, YT921X_TAG_LEN); + dsa_strip_etype_header(skb, YT921X_TAG_LEN); + + return skb; +} + +static const struct dsa_device_ops yt921x_netdev_ops = { + .name = YT921X_TAG_NAME, + .proto = DSA_TAG_PROTO_YT921X, + .xmit = yt921x_tag_xmit, + .rcv = yt921x_tag_rcv, + .needed_headroom = YT921X_TAG_LEN, +}; + +MODULE_DESCRIPTION("DSA tag driver for Motorcomm YT921x switches"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_YT921X, YT921X_TAG_NAME); + +module_dsa_tag_driver(yt921x_netdev_ops); diff --git a/net/dsa/user.c b/net/dsa/user.c index f59d66f0975d..c4bd6fe90b45 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -147,7 +147,7 @@ static int dsa_user_schedule_standalone_work(struct net_device *dev, { struct dsa_standalone_event_work *standalone_work; - standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); + standalone_work = kzalloc_obj(*standalone_work, GFP_ATOMIC); if (!standalone_work) return -ENOMEM; @@ -1318,7 +1318,7 @@ static int dsa_user_netpoll_setup(struct net_device *dev) struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + netpoll = kzalloc_obj(*netpoll); if (!netpoll) return -ENOMEM; @@ -1430,7 +1430,7 @@ dsa_user_add_cls_matchall_mirred(struct net_device *dev, return -EOPNOTSUPP; } - mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); + mall_tc_entry = kzalloc_obj(*mall_tc_entry); if (!mall_tc_entry) return -ENOMEM; @@ -1459,8 +1459,8 @@ dsa_user_add_cls_matchall_police(struct net_device *dev, struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); - struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; + struct flow_action_police *policer; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; @@ -1490,15 +1490,14 @@ dsa_user_add_cls_matchall_police(struct net_device *dev, act = &cls->rule->action.entries[0]; - mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); + mall_tc_entry = kzalloc_obj(*mall_tc_entry); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; - policer->rate_bytes_per_sec = act->police.rate_bytes_ps; - policer->burst = act->police.burst; + *policer = act->police; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { @@ -1824,7 +1823,7 @@ static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto, !dsa_switch_supports_mc_filtering(ds)) return 0; - v = kzalloc(sizeof(*v), GFP_KERNEL); + v = kzalloc_obj(*v); if (!v) { ret = -ENOMEM; goto rollback; @@ -2071,7 +2070,7 @@ static void dsa_bridge_mtu_normalization(struct dsa_port *dp) if (min_mtu > user->mtu) min_mtu = user->mtu; - hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); + hw_port = kzalloc_obj(*hw_port); if (!hw_port) goto out; @@ -3739,7 +3738,7 @@ static int dsa_user_fdb_event(struct net_device *dev, return -EOPNOTSUPP; } - switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); + switchdev_work = kzalloc_obj(*switchdev_work, GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 43e211e611b1..d9faadbe9b6c 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -154,9 +154,9 @@ EXPORT_SYMBOL(eth_get_headlen); */ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { - unsigned short _service_access_point; const unsigned short *sap; const struct ethhdr *eth; + __be16 res; skb->dev = dev; skb_reset_mac_header(skb); @@ -181,26 +181,23 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * the protocol design and runs IPX over 802.3 without an 802.2 LLC * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This * won't work for fault tolerant netware but does for the rest. + * We use skb->dev as temporary storage to not hit + * CONFIG_STACKPROTECTOR_STRONG=y costs on some platforms. */ - sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point); - if (sap && *sap == 0xFFFF) - return htons(ETH_P_802_3); + sap = skb_header_pointer(skb, 0, sizeof(*sap), &skb->dev); + res = (sap && *sap == 0xFFFF) ? htons(ETH_P_802_3) : htons(ETH_P_802_2); - /* - * Real 802.2 LLC - */ - return htons(ETH_P_802_2); + /* restore skb->dev in case it was mangled by skb_header_pointer(). */ + skb->dev = dev; + return res; } EXPORT_SYMBOL(eth_type_trans); -/** - * eth_header_parse - extract hardware address from packet - * @skb: packet to extract header from - * @haddr: destination buffer - */ -int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr) +int eth_header_parse(const struct sk_buff *skb, const struct net_device *dev, + unsigned char *haddr) { const struct ethhdr *eth = eth_hdr(skb); + memcpy(haddr, eth->h_source, ETH_ALEN); return ETH_ALEN; } diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 1e493553b977..629c10916670 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -9,4 +9,4 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \ - phy.o tsconfig.o + phy.o tsconfig.o mse.o diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index 3057576bc81e..3670ca42dd40 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -276,7 +276,7 @@ ethtool_cmis_cdb_init(struct net_device *dev, struct ethtool_cmis_cdb *cdb; int err; - cdb = kzalloc(sizeof(*cdb), GFP_KERNEL); + cdb = kzalloc_obj(*cdb); if (!cdb) return ERR_PTR(-ENOMEM); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 55223ebc2a7e..e252cf20c22f 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -233,6 +233,10 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full), __DEFINE_LINK_MODE_NAME(800000, SR4, Full), __DEFINE_LINK_MODE_NAME(800000, VR4, Full), + __DEFINE_LINK_MODE_NAME(1600000, CR8, Full), + __DEFINE_LINK_MODE_NAME(1600000, KR8, Full), + __DEFINE_LINK_MODE_NAME(1600000, DR8, Full), + __DEFINE_LINK_MODE_NAME(1600000, DR8_2, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -281,12 +285,35 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_DR8_2 8 #define __LINK_MODE_LANES_T1BRR 1 -#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ +#define __DEFINE_LINK_MODE_PARAMS_PAIRS(_speed, _type, _min_pairs, _pairs, _duplex, _medium) \ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ .speed = SPEED_ ## _speed, \ .lanes = __LINK_MODE_LANES_ ## _type, \ - .duplex = __DUPLEX_ ## _duplex \ + .min_pairs = _min_pairs, \ + .pairs = _pairs, \ + .duplex = __DUPLEX_ ## _duplex, \ + .mediums = BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium) \ } + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex, _medium) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .lanes = __LINK_MODE_LANES_ ## _type, \ + .min_pairs = 0, \ + .pairs = 0, \ + .duplex = __DUPLEX_ ## _duplex, \ + .mediums = BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium) \ + } +#define __DEFINE_LINK_MODE_PARAMS_MEDIUMS(_speed, _type, _duplex, _mediums) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .lanes = __LINK_MODE_LANES_ ## _type, \ + .min_pairs = 0, \ + .pairs = 0, \ + .duplex = __DUPLEX_ ## _duplex, \ + .mediums = (_mediums) \ + } +#define __MED(_medium) (BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium)) #define __DUPLEX_Half DUPLEX_HALF #define __DUPLEX_Full DUPLEX_FULL #define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ @@ -294,138 +321,168 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); .speed = SPEED_UNKNOWN, \ .lanes = 0, \ .duplex = DUPLEX_UNKNOWN, \ + .mediums = BIT(ETHTOOL_LINK_MEDIUM_NONE), \ } const struct link_mode_info link_mode_params[] = { - __DEFINE_LINK_MODE_PARAMS(10, T, Half), - __DEFINE_LINK_MODE_PARAMS(10, T, Full), - __DEFINE_LINK_MODE_PARAMS(100, T, Half), - __DEFINE_LINK_MODE_PARAMS(100, T, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T, Half), - __DEFINE_LINK_MODE_PARAMS(1000, T, Full), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T, 2, 4, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T, 2, 4, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T, 2, 4, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T, 2, 4, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T, 4, 4, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T, 4, 4, Full, T), __DEFINE_SPECIAL_MODE_PARAMS(Autoneg), __DEFINE_SPECIAL_MODE_PARAMS(TP), __DEFINE_SPECIAL_MODE_PARAMS(AUI), __DEFINE_SPECIAL_MODE_PARAMS(MII), __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), __DEFINE_SPECIAL_MODE_PARAMS(BNC), - __DEFINE_LINK_MODE_PARAMS(10000, T, Full), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10000, T, 4, 4, Full, T), __DEFINE_SPECIAL_MODE_PARAMS(Pause), __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), - __DEFINE_LINK_MODE_PARAMS(2500, X, Full), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(2500, X, Full, + __MED(C) | __MED(S) | __MED(L)), __DEFINE_SPECIAL_MODE_PARAMS(Backplane), - __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(1000, KX, Full, K), + __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full, K), + __DEFINE_LINK_MODE_PARAMS(10000, KR, Full, K), [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { .speed = SPEED_10000, .lanes = 1, .duplex = DUPLEX_FULL, }, - __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), - __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(1000, X, Full), - __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), - __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), - __DEFINE_LINK_MODE_PARAMS(2500, T, Full), - __DEFINE_LINK_MODE_PARAMS(5000, T, Full), + __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full, MLD), + __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full, L), + __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full, L), + __DEFINE_LINK_MODE_PARAMS(25000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(25000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(25000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR4_ER4, Full, + __MED(L) | __MED(E)), + __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(1000, X, Full, + __MED(C) | __MED(S) | __MED(L)), + __DEFINE_LINK_MODE_PARAMS(10000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(10000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(10000, LR, Full, L), + __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full, L), + __DEFINE_LINK_MODE_PARAMS(10000, ER, Full, E), + __DEFINE_LINK_MODE_PARAMS_PAIRS(2500, T, 4, 4, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(5000, T, 4, 4, Full, T), __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), - __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, T1, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), + __DEFINE_LINK_MODE_PARAMS(50000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(50000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(50000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(50000, LR_ER_FR, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(50000, DR, Full, D), + __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR2_ER2_FR2, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(200000, LR4_ER4_FR4, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T1, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T1, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full, K), + __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(400000, LR8_ER8_FR8, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full, C), __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), - __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, FX, Half), - __DEFINE_LINK_MODE_PARAMS(100, FX, Full), - __DEFINE_LINK_MODE_PARAMS(10, T1L, Full), - __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), - __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), - __DEFINE_LINK_MODE_PARAMS(10, T1S, Full), - __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), - __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), - __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, VR, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full), - __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full), - __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR_ER_FR, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full, D), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(200000, LR2_ER2_FR2, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(400000, LR4_ER4_FR4, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(100, FX, Half, F), + __DEFINE_LINK_MODE_PARAMS(100, FX, Full, F), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1L, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full, C), + __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full, K), + __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full, S), + __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full, V), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S, 1, 1, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S_P2MP, 1, 1, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1BRR, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS(200000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(200000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(200000, DR, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(200000, VR, Full, V), + __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full, V), + __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full, V), + __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full, C), + __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full, K), + __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full, D), + __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full, D), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); EXPORT_SYMBOL_GPL(link_mode_params); +static const char ethtool_link_medium_names[][ETH_GSTRING_LEN] = { + [ETHTOOL_LINK_MEDIUM_BASET] = "BaseT", + [ETHTOOL_LINK_MEDIUM_BASEK] = "BaseK", + [ETHTOOL_LINK_MEDIUM_BASES] = "BaseS", + [ETHTOOL_LINK_MEDIUM_BASEC] = "BaseC", + [ETHTOOL_LINK_MEDIUM_BASEL] = "BaseL", + [ETHTOOL_LINK_MEDIUM_BASED] = "BaseD", + [ETHTOOL_LINK_MEDIUM_BASEE] = "BaseE", + [ETHTOOL_LINK_MEDIUM_BASEF] = "BaseF", + [ETHTOOL_LINK_MEDIUM_BASEV] = "BaseV", + [ETHTOOL_LINK_MEDIUM_BASEMLD] = "BaseMLD", + [ETHTOOL_LINK_MEDIUM_NONE] = "None", +}; +static_assert(ARRAY_SIZE(ethtool_link_medium_names) == __ETHTOOL_LINK_MEDIUM_LAST); + const char netif_msg_class_names[][ETH_GSTRING_LEN] = { [NETIF_MSG_DRV_BIT] = "drv", [NETIF_MSG_PROBE_BIT] = "probe", @@ -580,21 +637,11 @@ int __ethtool_get_link(struct net_device *dev) int ethtool_get_rx_ring_count(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; - struct ethtool_rxnfc rx_rings = {}; - int ret; - - if (ops->get_rx_ring_count) - return ops->get_rx_ring_count(dev); - if (!ops->get_rxnfc) + if (!ops->get_rx_ring_count) return -EOPNOTSUPP; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret < 0) - return ret; - - return rx_rings.data; + return ops->get_rx_ring_count(dev); } static int ethtool_get_rxnfc_rule_count(struct net_device *dev) @@ -640,7 +687,7 @@ static int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max) if (rule_cnt <= 0) return -EINVAL; - info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL); + info = kvzalloc_flex(*info, rule_locs, rule_cnt); if (!info) return -ENOMEM; @@ -723,7 +770,7 @@ static u32 ethtool_get_max_rxfh_channel(struct net_device *dev) if (dev_size == 0) return current_max; - rxfh.indir = kcalloc(dev_size, sizeof(rxfh.indir[0]), GFP_USER); + rxfh.indir = kzalloc_objs(rxfh.indir[0], dev_size, GFP_USER); if (!rxfh.indir) return U32_MAX; @@ -794,7 +841,7 @@ int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context) if (rule_cnt < 0) return -EINVAL; - info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL); + info = kvzalloc_flex(*info, rule_locs, rule_cnt); if (!info) return -ENOMEM; @@ -854,9 +901,6 @@ ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops, ctx->key_off = key_off; ctx->priv_size = ops->rxfh_priv_size; - ctx->hfunc = ETH_RSS_HASH_NO_CHANGE; - ctx->input_xfrm = RXH_XFRM_NO_CHANGE; - return ctx; } @@ -1159,3 +1203,15 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id) ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id); } EXPORT_SYMBOL(ethtool_rxfh_context_lost); + +enum ethtool_link_medium ethtool_str_to_medium(const char *str) +{ + int i; + + for (i = 0; i < __ETHTOOL_LINK_MEDIUM_LAST; i++) + if (!strcmp(ethtool_link_medium_names[i], str)) + return i; + + return ETHTOOL_LINK_MEDIUM_NONE; +} +EXPORT_SYMBOL_GPL(ethtool_str_to_medium); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index fa83ddade4f8..ff4b4780d6af 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2383,7 +2383,10 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) return -ENOMEM; WARN_ON_ONCE(!ret); - gstrings.len = ret; + if (gstrings.len && gstrings.len != ret) + gstrings.len = 0; + else + gstrings.len = ret; if (gstrings.len) { data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); @@ -2509,10 +2512,13 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; - stats.n_stats = n_stats; + if (stats.n_stats && stats.n_stats != n_stats) + stats.n_stats = 0; + else + stats.n_stats = n_stats; - if (n_stats) { - data = vzalloc(array_size(n_stats, sizeof(u64))); + if (stats.n_stats) { + data = vzalloc(array_size(stats.n_stats, sizeof(u64))); if (!data) return -ENOMEM; ops->get_ethtool_stats(dev, &stats, data); @@ -2524,7 +2530,9 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) + if (stats.n_stats && + copy_to_user(useraddr, data, + array_size(stats.n_stats, sizeof(u64)))) goto out; ret = 0; @@ -2560,6 +2568,10 @@ static int ethtool_get_phy_stats_phydev(struct phy_device *phydev, return -EOPNOTSUPP; n_stats = phy_ops->get_sset_count(phydev); + if (stats->n_stats && stats->n_stats != n_stats) { + stats->n_stats = 0; + return 0; + } ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) @@ -2580,6 +2592,10 @@ static int ethtool_get_phy_stats_ethtool(struct net_device *dev, return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); + if (stats->n_stats && stats->n_stats != n_stats) { + stats->n_stats = 0; + return 0; + } ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) @@ -2616,7 +2632,9 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) } useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64)))) + if (stats.n_stats && + copy_to_user(useraddr, data, + array_size(stats.n_stats, sizeof(u64)))) ret = -EFAULT; out: @@ -3022,7 +3040,7 @@ ethtool_set_per_queue_coalesce(struct net_device *dev, bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); - tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); + tmp = backup = kmalloc_objs(*backup, n_queue); if (!backup) return -ENOMEM; @@ -3536,7 +3554,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr) if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; - state = kzalloc(sizeof(*state), GFP_KERNEL); + state = kzalloc_obj(*state); if (!state) return -ENOMEM; diff --git a/net/ethtool/module.c b/net/ethtool/module.c index 4d4e0a82579a..0a761bf4771e 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -301,7 +301,7 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name, struct ethtool_module_fw_flash *module_fw; int err; - module_fw = kzalloc(sizeof(*module_fw), GFP_KERNEL); + module_fw = kzalloc_obj(*module_fw); if (!module_fw) return -ENOMEM; diff --git a/net/ethtool/mse.c b/net/ethtool/mse.c new file mode 100644 index 000000000000..8cb3fc5e7be4 --- /dev/null +++ b/net/ethtool/mse.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> +#include <linux/phy.h> +#include <linux/slab.h> + +#include "netlink.h" +#include "common.h" + +/* Channels A-D only; WORST and LINK are exclusive alternatives */ +#define PHY_MSE_CHANNEL_COUNT 4 + +struct mse_req_info { + struct ethnl_req_info base; +}; + +struct mse_snapshot_entry { + struct phy_mse_snapshot snapshot; + int channel; +}; + +struct mse_reply_data { + struct ethnl_reply_data base; + struct phy_mse_capability capability; + struct mse_snapshot_entry *snapshots; + unsigned int num_snapshots; +}; + +static struct mse_reply_data * +mse_repdata(const struct ethnl_reply_data *reply_base) +{ + return container_of(reply_base, struct mse_reply_data, base); +} + +const struct nla_policy ethnl_mse_get_policy[] = { + [ETHTOOL_A_MSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), +}; + +static int get_snapshot_if_supported(struct phy_device *phydev, + struct mse_reply_data *data, + unsigned int *idx, u32 cap_bit, + enum phy_mse_channel channel) +{ + int ret; + + if (data->capability.supported_caps & cap_bit) { + ret = phydev->drv->get_mse_snapshot(phydev, channel, + &data->snapshots[*idx].snapshot); + if (ret) + return ret; + data->snapshots[*idx].channel = channel; + (*idx)++; + } + + return 0; +} + +static int mse_get_channels(struct phy_device *phydev, + struct mse_reply_data *data) +{ + unsigned int i = 0; + int ret; + + if (!data->capability.supported_caps) + return 0; + + data->snapshots = kzalloc_objs(*data->snapshots, PHY_MSE_CHANNEL_COUNT); + if (!data->snapshots) + return -ENOMEM; + + /* Priority 1: Individual channels */ + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_A, + PHY_MSE_CHANNEL_A); + if (ret) + return ret; + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_B, + PHY_MSE_CHANNEL_B); + if (ret) + return ret; + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_C, + PHY_MSE_CHANNEL_C); + if (ret) + return ret; + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_D, + PHY_MSE_CHANNEL_D); + if (ret) + return ret; + + /* If any individual channels were found, we are done. */ + if (i > 0) { + data->num_snapshots = i; + return 0; + } + + /* Priority 2: Worst channel, if no individual channels supported. */ + ret = get_snapshot_if_supported(phydev, data, &i, + PHY_MSE_CAP_WORST_CHANNEL, + PHY_MSE_CHANNEL_WORST); + if (ret) + return ret; + + /* If worst channel was found, we are done. */ + if (i > 0) { + data->num_snapshots = i; + return 0; + } + + /* Priority 3: Link-wide, if nothing else is supported. */ + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_LINK, + PHY_MSE_CHANNEL_LINK); + if (ret) + return ret; + + data->num_snapshots = i; + return 0; +} + +static int mse_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + const struct genl_info *info) +{ + struct mse_reply_data *data = mse_repdata(reply_base); + struct net_device *dev = reply_base->dev; + struct phy_device *phydev; + int ret; + + phydev = ethnl_req_get_phydev(req_base, info->attrs, + ETHTOOL_A_MSE_HEADER, info->extack); + if (IS_ERR(phydev)) + return PTR_ERR(phydev); + if (!phydev) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret) + return ret; + + mutex_lock(&phydev->lock); + + if (!phydev->drv || !phydev->drv->get_mse_capability || + !phydev->drv->get_mse_snapshot) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + if (!phydev->link) { + ret = -ENETDOWN; + goto out_unlock; + } + + ret = phydev->drv->get_mse_capability(phydev, &data->capability); + if (ret) + goto out_unlock; + + ret = mse_get_channels(phydev, data); + +out_unlock: + mutex_unlock(&phydev->lock); + ethnl_ops_complete(dev); + if (ret) + kfree(data->snapshots); + return ret; +} + +static void mse_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct mse_reply_data *data = mse_repdata(reply_base); + + kfree(data->snapshots); +} + +static int mse_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct mse_reply_data *data = mse_repdata(reply_base); + size_t len = 0; + unsigned int i; + + /* ETHTOOL_A_MSE_CAPABILITIES */ + len += nla_total_size(0); + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) + /* ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE */ + len += nla_total_size(sizeof(u64)); + if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK | + PHY_MSE_CAP_WORST_PEAK)) + /* ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE */ + len += nla_total_size(sizeof(u64)); + /* ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS */ + len += nla_total_size(sizeof(u64)); + /* ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS */ + len += nla_total_size(sizeof(u64)); + + for (i = 0; i < data->num_snapshots; i++) { + size_t snapshot_len = 0; + + /* Per-channel nest (e.g., ETHTOOL_A_MSE_CHANNEL_A / _B / _C / + * _D / _WORST_CHANNEL / _LINK) + */ + snapshot_len += nla_total_size(0); + + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) + snapshot_len += nla_total_size(sizeof(u64)); + if (data->capability.supported_caps & PHY_MSE_CAP_PEAK) + snapshot_len += nla_total_size(sizeof(u64)); + if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK) + snapshot_len += nla_total_size(sizeof(u64)); + + len += snapshot_len; + } + + return len; +} + +static int mse_channel_to_attr(int ch) +{ + switch (ch) { + case PHY_MSE_CHANNEL_A: + return ETHTOOL_A_MSE_CHANNEL_A; + case PHY_MSE_CHANNEL_B: + return ETHTOOL_A_MSE_CHANNEL_B; + case PHY_MSE_CHANNEL_C: + return ETHTOOL_A_MSE_CHANNEL_C; + case PHY_MSE_CHANNEL_D: + return ETHTOOL_A_MSE_CHANNEL_D; + case PHY_MSE_CHANNEL_WORST: + return ETHTOOL_A_MSE_WORST_CHANNEL; + case PHY_MSE_CHANNEL_LINK: + return ETHTOOL_A_MSE_LINK; + default: + return -EINVAL; + } +} + +static int mse_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct mse_reply_data *data = mse_repdata(reply_base); + struct nlattr *nest; + unsigned int i; + int ret; + + nest = nla_nest_start(skb, ETHTOOL_A_MSE_CAPABILITIES); + if (!nest) + return -EMSGSIZE; + + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) { + ret = nla_put_uint(skb, + ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE, + data->capability.max_average_mse); + if (ret < 0) + goto nla_put_nest_failure; + } + + if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK | + PHY_MSE_CAP_WORST_PEAK)) { + ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE, + data->capability.max_peak_mse); + if (ret < 0) + goto nla_put_nest_failure; + } + + ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS, + data->capability.refresh_rate_ps); + if (ret < 0) + goto nla_put_nest_failure; + + ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS, + data->capability.num_symbols); + if (ret < 0) + goto nla_put_nest_failure; + + nla_nest_end(skb, nest); + + for (i = 0; i < data->num_snapshots; i++) { + const struct mse_snapshot_entry *s = &data->snapshots[i]; + int chan_attr; + + chan_attr = mse_channel_to_attr(s->channel); + if (chan_attr < 0) + return chan_attr; + + nest = nla_nest_start(skb, chan_attr); + if (!nest) + return -EMSGSIZE; + + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) { + ret = nla_put_uint(skb, + ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE, + s->snapshot.average_mse); + if (ret) + goto nla_put_nest_failure; + } + if (data->capability.supported_caps & PHY_MSE_CAP_PEAK) { + ret = nla_put_uint(skb, ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE, + s->snapshot.peak_mse); + if (ret) + goto nla_put_nest_failure; + } + if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK) { + ret = nla_put_uint(skb, + ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE, + s->snapshot.worst_peak_mse); + if (ret) + goto nla_put_nest_failure; + } + + nla_nest_end(skb, nest); + } + + return 0; + +nla_put_nest_failure: + nla_nest_cancel(skb, nest); + return ret; +} + +const struct ethnl_request_ops ethnl_mse_request_ops = { + .request_cmd = ETHTOOL_MSG_MSE_GET, + .reply_cmd = ETHTOOL_MSG_MSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_MSE_HEADER, + .req_info_size = sizeof(struct mse_req_info), + .reply_data_size = sizeof(struct mse_reply_data), + + .prepare_data = mse_prepare_data, + .cleanup_data = mse_cleanup_data, + .reply_size = mse_reply_size, + .fill_reply = mse_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 2f813f25f07e..6e5f0f4f815a 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -420,6 +420,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_TSCONFIG_GET] = ðnl_tsconfig_request_ops, [ETHTOOL_MSG_TSCONFIG_SET] = ðnl_tsconfig_request_ops, [ETHTOOL_MSG_PHY_GET] = ðnl_phy_request_ops, + [ETHTOOL_MSG_MSE_GET] = ðnl_mse_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1534,6 +1535,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_rss_delete_policy, .maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_MSE_GET, + .doit = ethnl_default_doit, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, + .policy = ethnl_mse_get_policy, + .maxattr = ARRAY_SIZE(ethnl_mse_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 1d4f9ecb3d26..89010eaa67df 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -442,6 +442,7 @@ extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; +extern const struct ethnl_request_ops ethnl_mse_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -497,6 +498,7 @@ extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; +extern const struct nla_policy ethnl_mse_get_policy[ETHTOOL_A_MSE_HEADER + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 4dced53be4b3..da5934cceb07 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -824,8 +824,8 @@ rss_set_ctx_update(struct ethtool_rxfh_context *ctx, struct nlattr **tb, static int ethnl_rss_set(struct ethnl_req_info *req_info, struct genl_info *info) { - bool indir_reset = false, indir_mod, xfrm_sym = false; struct rss_req_info *request = RSS_REQINFO(req_info); + bool indir_reset = false, indir_mod, xfrm_sym; struct ethtool_rxfh_context *ctx = NULL; struct net_device *dev = req_info->dev; bool mod = false, fields_mod = false; @@ -860,12 +860,7 @@ ethnl_rss_set(struct ethnl_req_info *req_info, struct genl_info *info) rxfh.input_xfrm = data.input_xfrm; ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod); - /* For drivers which don't support input_xfrm it will be set to 0xff - * in the RSS context info. In all other case input_xfrm != 0 means - * symmetric hashing is requested. - */ - if (!request->rss_context || ops->rxfh_per_ctx_key) - xfrm_sym = rxfh.input_xfrm || data.input_xfrm; + xfrm_sym = rxfh.input_xfrm || data.input_xfrm; if (rxfh.input_xfrm == data.input_xfrm) rxfh.input_xfrm = RXH_XFRM_NO_CHANGE; diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index 169b413b31fc..e49e612a68c2 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -202,10 +202,10 @@ static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) int reply_len = 0; int ret; - req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + req_info = kzalloc_obj(*req_info); if (!req_info) return -ENOMEM; - reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL); + reply_data = kmalloc_obj(*reply_data); if (!reply_data) { kfree(req_info); return -ENOMEM; @@ -280,7 +280,7 @@ tsconfig_set_hwprov_from_desc(struct net_device *dev, source = HWTSTAMP_SOURCE_PHYLIB; } - hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL); + hwprov = kzalloc_obj(*hwprov); if (!hwprov) return ERR_PTR(-ENOMEM); diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 8c654caa6805..c0145c752d2f 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -505,10 +505,10 @@ int ethnl_tsinfo_start(struct netlink_callback *cb) BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + req_info = kzalloc_obj(*req_info); if (!req_info) return -ENOMEM; - reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL); + reply_data = kzalloc_obj(*reply_data); if (!reply_data) { ret = -ENOMEM; goto free_req_info; diff --git a/net/handshake/genl.c b/net/handshake/genl.c index f55d14d7b726..870612609491 100644 --- a/net/handshake/genl.c +++ b/net/handshake/genl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> diff --git a/net/handshake/genl.h b/net/handshake/genl.h index ae72a596f6cc..8d3e18672daf 100644 --- a/net/handshake/genl.h +++ b/net/handshake/genl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_HANDSHAKE_GEN_H #define _LINUX_HANDSHAKE_GEN_H diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 7e46d130dce2..b989456fc4c5 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -93,7 +93,7 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) struct handshake_net *hn = handshake_pernet(net); struct handshake_req *req = NULL; struct socket *sock; - int class, fd, err; + int class, err; err = -EOPNOTSUPP; if (!hn) @@ -106,29 +106,28 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) err = -EAGAIN; req = handshake_req_next(hn, class); - if (!req) - goto out_status; - - sock = req->hr_sk->sk_socket; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_complete; - } - - err = req->hr_proto->hp_accept(req, info, fd); - if (err) { - put_unused_fd(fd); - goto out_complete; + if (req) { + sock = req->hr_sk->sk_socket; + + FD_PREPARE(fdf, O_CLOEXEC, sock->file); + if (fdf.err) { + err = fdf.err; + goto out_complete; + } + + get_file(sock->file); /* FD_PREPARE() consumes a reference. */ + err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf)); + if (err) + goto out_complete; /* Automatic cleanup handles fput */ + + trace_handshake_cmd_accept(net, req, req->hr_sk, fd_prepare_fd(fdf)); + fd_publish(fdf); + return 0; } - fd_install(fd, get_file(sock->file)); - - trace_handshake_cmd_accept(net, req, req->hr_sk, fd); - return 0; - out_complete: - handshake_complete(req, -EIO, NULL); + if (req) + handshake_complete(req, -EIO, NULL); out_status: trace_handshake_cmd_accept_err(net, req, NULL, err); return err; diff --git a/net/handshake/request.c b/net/handshake/request.c index 274d2c89b6b2..2829adbeb149 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -119,7 +119,7 @@ struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto, if (!proto->hp_accept || !proto->hp_done) return NULL; - req = kzalloc(struct_size(req, hr_priv, proto->hp_privsize), flags); + req = kzalloc_flex(*req, hr_priv, proto->hp_privsize, flags); if (!req) return NULL; @@ -276,6 +276,8 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, out_unlock: spin_unlock(&hn->hn_lock); out_err: + /* Restore original destructor so socket teardown still runs on failure */ + req->hr_sk->sk_destruct = req->hr_odestruct; trace_handshake_submit_err(net, req, req->hr_sk, ret); handshake_req_destroy(req); return ret; @@ -324,7 +326,11 @@ bool handshake_req_cancel(struct sock *sk) hn = handshake_pernet(net); if (hn && remove_pending(hn, req)) { - /* Request hadn't been accepted */ + /* Request hadn't been accepted - mark cancelled */ + if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { + trace_handshake_cancel_busy(net, req, sk); + return false; + } goto out_true; } if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 492cbc78ab75..d1bfc49b5f01 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -690,6 +690,26 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev, } EXPORT_SYMBOL(hsr_get_port_ndev); +int hsr_get_port_type(struct net_device *hsr_dev, struct net_device *dev, + enum hsr_port_type *type) +{ + struct hsr_priv *hsr = netdev_priv(hsr_dev); + struct hsr_port *port; + + rcu_read_lock(); + hsr_for_each_port(hsr, port) { + if (port->dev == dev) { + *type = port->type; + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + return -EINVAL; +} +EXPORT_SYMBOL(hsr_get_port_type); + /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 339f0d220212..aefc9b6936ba 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -205,6 +205,8 @@ struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, __pskb_copy(frame->skb_prp, skb_headroom(frame->skb_prp), GFP_ATOMIC); + if (!frame->skb_std) + return NULL; } else { /* Unexpected */ WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 3a2a2fa7a0a3..50996f4de7f9 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -11,6 +11,7 @@ * Same code handles filtering of duplicates for PRP as well. */ +#include <kunit/visibility.h> #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/slab.h> @@ -19,24 +20,6 @@ #include "hsr_framereg.h" #include "hsr_netlink.h" -/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, - * false otherwise. - */ -static bool seq_nr_after(u16 a, u16 b) -{ - /* Remove inconsistency where - * seq_nr_after(a, b) == seq_nr_before(a, b) - */ - if ((int)b - a == 32768) - return false; - - return (((s16)(b - a)) < 0); -} - -#define seq_nr_before(a, b) seq_nr_after((b), (a)) -#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) -#define PRP_DROP_WINDOW_LEN 32768 - bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr) { if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox)) @@ -97,7 +80,7 @@ int hsr_create_self_node(struct hsr_priv *hsr, { struct hsr_self_node *sn, *old; - sn = kmalloc(sizeof(*sn), GFP_KERNEL); + sn = kmalloc_obj(*sn); if (!sn) return -ENOMEM; @@ -126,13 +109,29 @@ void hsr_del_self_node(struct hsr_priv *hsr) kfree_rcu(old, rcu_head); } +static void hsr_free_node(struct hsr_node *node) +{ + xa_destroy(&node->seq_blocks); + kfree(node->block_buf); + kfree(node); +} + +static void hsr_free_node_rcu(struct rcu_head *rn) +{ + struct hsr_node *node = container_of(rn, struct hsr_node, rcu_head); + + hsr_free_node(node); +} + void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; struct hsr_node *tmp; - list_for_each_entry_safe(node, tmp, node_db, mac_list) - kfree(node); + list_for_each_entry_safe(node, tmp, node_db, mac_list) { + list_del(&node->mac_list); + hsr_free_node(node); + } } void prp_handle_san_frame(bool san, enum hsr_port_type port, @@ -148,39 +147,43 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port, node->san_b = true; } -/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; - * seq_out is used to initialize filtering of outgoing duplicate frames - * originating from the newly added node. +/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, - unsigned char addr[], - u16 seq_out, bool san, + unsigned char addr[], bool san, enum hsr_port_type rx_port) { - struct hsr_node *new_node, *node; + struct hsr_node *new_node, *node = NULL; unsigned long now; + size_t block_sz; int i; - new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); + new_node = kzalloc_obj(*new_node, GFP_ATOMIC); if (!new_node) return NULL; ether_addr_copy(new_node->macaddress_A, addr); spin_lock_init(&new_node->seq_out_lock); + if (hsr->prot_version == PRP_V1) + new_node->seq_port_cnt = 1; + else + new_node->seq_port_cnt = HSR_PT_PORTS - 1; + + block_sz = hsr_seq_block_size(new_node); + new_node->block_buf = kcalloc(HSR_MAX_SEQ_BLOCKS, block_sz, GFP_ATOMIC); + if (!new_node->block_buf) + goto free; + + xa_init(&new_node->seq_blocks); + /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; - new_node->time_out[i] = now; - } - for (i = 0; i < HSR_PT_PORTS; i++) { - new_node->seq_out[i] = seq_out; - new_node->seq_expected[i] = seq_out + 1; - new_node->seq_start[i] = seq_out + 1; } if (san && hsr->proto_ops->handle_san_frame) @@ -199,6 +202,8 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, return new_node; out: spin_unlock_bh(&hsr->list_lock); + kfree(new_node->block_buf); +free: kfree(new_node); return node; } @@ -223,7 +228,6 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct ethhdr *ethhdr; struct prp_rct *rct; bool san = false; - u16 seq_out; if (!skb_mac_header_was_set(skb)) return NULL; @@ -260,25 +264,72 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return NULL; - - /* Use the existing sequence_nr from the tag as starting point - * for filtering duplicate frames. - */ - seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { rct = skb_get_PRP_rct(skb); - if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { - seq_out = prp_get_skb_sequence_nr(rct); - } else { - if (rx_port != HSR_PT_MASTER) - san = true; - seq_out = HSR_SEQNR_START; + if (!rct && rx_port != HSR_PT_MASTER) + san = true; + } + + return hsr_add_node(hsr, node_db, ethhdr->h_source, san, rx_port); +} + +static bool hsr_seq_block_is_old(struct hsr_seq_block *block) +{ + unsigned long expiry = msecs_to_jiffies(HSR_ENTRY_FORGET_TIME); + + return time_is_before_jiffies(block->time + expiry); +} + +static void hsr_forget_seq_block(struct hsr_node *node, + struct hsr_seq_block *block) +{ + if (block->time) + xa_erase(&node->seq_blocks, block->block_idx); + block->time = 0; +} + +/* Get the currently active sequence number block. If there is no block yet, or + * the existing one is expired, a new block is created. The idea is to maintain + * a "sparse bitmap" where a bitmap for the whole sequence number space is + * split into blocks and not all blocks exist all the time. The blocks can + * expire after time (in low traffic situations) or when they are replaced in + * the backing fixed size buffer (in high traffic situations). + */ +VISIBLE_IF_KUNIT struct hsr_seq_block *hsr_get_seq_block(struct hsr_node *node, + u16 block_idx) +{ + struct hsr_seq_block *block, *res; + size_t block_sz; + + block = xa_load(&node->seq_blocks, block_idx); + + if (block && hsr_seq_block_is_old(block)) { + hsr_forget_seq_block(node, block); + block = NULL; + } + + if (!block) { + block_sz = hsr_seq_block_size(node); + block = node->block_buf + node->next_block * block_sz; + hsr_forget_seq_block(node, block); + + memset(block, 0, block_sz); + block->time = jiffies; + block->block_idx = block_idx; + + res = xa_store(&node->seq_blocks, block_idx, block, GFP_ATOMIC); + if (xa_is_err(res)) { + block->time = 0; + return NULL; } + + node->next_block = + (node->next_block + 1) & (HSR_MAX_SEQ_BLOCKS - 1); } - return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, - san, rx_port); + return block; } +EXPORT_SYMBOL_IF_KUNIT(hsr_get_seq_block); /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate @@ -288,16 +339,18 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) { struct hsr_node *node_curr = frame->node_src; struct hsr_port *port_rcv = frame->port_rcv; + struct hsr_seq_block *src_blk, *merge_blk; struct hsr_priv *hsr = port_rcv->hsr; - struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_sup_tlv; + struct hsr_sup_payload *hsr_sp; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; - int i; - unsigned int pull_size = 0; unsigned int total_pull_size = 0; + unsigned int pull_size = 0; + unsigned long idx; + int i; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol @@ -340,8 +393,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, - HSR_SEQNR_START - 1, true, - port_rcv->type); + true, port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) @@ -388,8 +440,20 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) node_real->time_in_stale[i] = node_curr->time_in_stale[i]; } - if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) - node_real->seq_out[i] = node_curr->seq_out[i]; + } + + xa_for_each(&node_curr->seq_blocks, idx, src_blk) { + if (hsr_seq_block_is_old(src_blk)) + continue; + + merge_blk = hsr_get_seq_block(node_real, src_blk->block_idx); + if (!merge_blk) + continue; + merge_blk->time = min(merge_blk->time, src_blk->time); + for (i = 0; i < node_real->seq_port_cnt; i++) { + bitmap_or(merge_blk->seq_nrs[i], merge_blk->seq_nrs[i], + src_blk->seq_nrs[i], HSR_SEQ_BLOCK_SIZE); + } } spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; @@ -398,7 +462,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) if (!node_curr->removed) { list_del_rcu(&node_curr->mac_list); node_curr->removed = true; - kfree_rcu(node_curr, rcu_head); + call_rcu(&node_curr->rcu_head, hsr_free_node_rcu); } spin_unlock_bh(&hsr->list_lock); @@ -466,56 +530,79 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr) { - /* Don't register incoming frames without a valid sequence number. This - * ensures entries of restarted nodes gets pruned so that they can - * re-register and resume communications. - */ - if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && - seq_nr_before(sequence_nr, node->seq_out[port->type])) - return; - node->time_in[port->type] = jiffies; node->time_in_stale[port->type] = false; } -/* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid - * ethhdr->h_source address and skb->mac_header set. +/* Duplicate discard algorithm: we maintain a bitmap where we set a bit for + * every seen sequence number. The bitmap is split into blocks and the block + * management is detailed in hsr_get_seq_block(). In any case, we err on the + * side of accepting a packet, as the specification requires the algorithm to + * be "designed such that it never rejects a legitimate frame, while occasional + * acceptance of a duplicate can be tolerated." (IEC 62439-3:2021, 4.1.10.3). + * While this requirement is explicit for PRP, applying it to HSR does no harm + * either. + * + * 'frame' is the frame to be sent + * 'port_type' is the type of the outgoing interface * * Return: * 1 if frame can be shown to have been sent recently on this interface, - * 0 otherwise, or - * negative error code on error + * 0 otherwise */ -int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) +static int hsr_check_duplicate(struct hsr_frame_info *frame, + unsigned int port_type) { - struct hsr_node *node = frame->node_src; - u16 sequence_nr = frame->sequence_nr; + u16 sequence_nr, seq_bit, block_idx; + struct hsr_seq_block *block; + struct hsr_node *node; + + node = frame->node_src; + sequence_nr = frame->sequence_nr; + + if (WARN_ON_ONCE(port_type >= node->seq_port_cnt)) + return 0; spin_lock_bh(&node->seq_out_lock); - if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && - time_is_after_jiffies(node->time_out[port->type] + - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) { - spin_unlock_bh(&node->seq_out_lock); - return 1; - } - node->time_out[port->type] = jiffies; - node->seq_out[port->type] = sequence_nr; + block_idx = hsr_seq_block_index(sequence_nr); + block = hsr_get_seq_block(node, block_idx); + if (!block) + goto out_new; + + seq_bit = hsr_seq_block_bit(sequence_nr); + if (__test_and_set_bit(seq_bit, block->seq_nrs[port_type])) + goto out_seen; + +out_new: spin_unlock_bh(&node->seq_out_lock); return 0; + +out_seen: + spin_unlock_bh(&node->seq_out_lock); + return 1; } -/* Adaptation of the PRP duplicate discard algorithm described in wireshark - * wiki (https://wiki.wireshark.org/PRP) +/* HSR duplicate discard: we check if the same frame has already been sent on + * this outgoing interface. The check follows the general duplicate discard + * algorithm. * - * A drop window is maintained for both LANs with start sequence set to the - * first sequence accepted on the LAN that has not been seen on the other LAN, - * and expected sequence set to the latest received sequence number plus one. + * 'port' is the outgoing interface + * 'frame' is the frame to be sent * - * When a frame is received on either LAN it is compared against the received - * frames on the other LAN. If it is outside the drop window of the other LAN - * the frame is accepted and the drop window is updated. - * The drop window for the other LAN is reset. + * Return: + * 1 if frame can be shown to have been sent recently on this interface, + * 0 otherwise + */ +int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) +{ + return hsr_check_duplicate(frame, port->type - 1); +} + +/* PRP duplicate discard: we only consider frames that are received on port A + * or port B and should go to the master port. For those, we check if they have + * already been received by the host, i.e., master port. The check uses the + * general duplicate discard algorithm, but without tracking multiple ports. * * 'port' is the outgoing interface * 'frame' is the frame to be sent @@ -526,18 +613,9 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) */ int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) { - enum hsr_port_type other_port; - enum hsr_port_type rcv_port; - struct hsr_node *node; - u16 sequence_diff; - u16 sequence_exp; - u16 sequence_nr; - - /* out-going frames are always in order - * and can be checked the same way as for HSR - */ + /* out-going frames are always in order */ if (frame->port_rcv->type == HSR_PT_MASTER) - return hsr_register_frame_out(port, frame); + return 0; /* for PRP we should only forward frames from the slave ports * to the master port @@ -545,52 +623,9 @@ int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) if (port->type != HSR_PT_MASTER) return 1; - node = frame->node_src; - sequence_nr = frame->sequence_nr; - sequence_exp = sequence_nr + 1; - rcv_port = frame->port_rcv->type; - other_port = rcv_port == HSR_PT_SLAVE_A ? HSR_PT_SLAVE_B : - HSR_PT_SLAVE_A; - - spin_lock_bh(&node->seq_out_lock); - if (time_is_before_jiffies(node->time_out[port->type] + - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)) || - (node->seq_start[rcv_port] == node->seq_expected[rcv_port] && - node->seq_start[other_port] == node->seq_expected[other_port])) { - /* the node hasn't been sending for a while - * or both drop windows are empty, forward the frame - */ - node->seq_start[rcv_port] = sequence_nr; - } else if (seq_nr_before(sequence_nr, node->seq_expected[other_port]) && - seq_nr_before_or_eq(node->seq_start[other_port], sequence_nr)) { - /* drop the frame, update the drop window for the other port - * and reset our drop window - */ - node->seq_start[other_port] = sequence_exp; - node->seq_expected[rcv_port] = sequence_exp; - node->seq_start[rcv_port] = node->seq_expected[rcv_port]; - spin_unlock_bh(&node->seq_out_lock); - return 1; - } - - /* update the drop window for the port where this frame was received - * and clear the drop window for the other port - */ - node->seq_start[other_port] = node->seq_expected[other_port]; - node->seq_expected[rcv_port] = sequence_exp; - sequence_diff = sequence_exp - node->seq_start[rcv_port]; - if (sequence_diff > PRP_DROP_WINDOW_LEN) - node->seq_start[rcv_port] = sequence_exp - PRP_DROP_WINDOW_LEN; - - node->time_out[port->type] = jiffies; - node->seq_out[port->type] = sequence_nr; - spin_unlock_bh(&node->seq_out_lock); - return 0; + return hsr_check_duplicate(frame, 0); } - -#if IS_MODULE(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST) -EXPORT_SYMBOL(prp_register_frame_out); -#endif +EXPORT_SYMBOL_IF_KUNIT(prp_register_frame_out); static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) @@ -672,7 +707,7 @@ void hsr_prune_nodes(struct timer_list *t) list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ - kfree_rcu(node, rcu_head); + call_rcu(&node->rcu_head, hsr_free_node_rcu); } } } @@ -706,7 +741,7 @@ void hsr_prune_proxy_nodes(struct timer_list *t) list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ - kfree_rcu(node, rcu_head); + call_rcu(&node->rcu_head, hsr_free_node_rcu); } } } @@ -740,6 +775,39 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, return NULL; } +/* Fill the last sequence number that has been received from node on if1 by + * finding the last sequence number sent on port B; accordingly get the last + * received sequence number for if2 using sent sequence numbers on port A. + */ +static void fill_last_seq_nrs(struct hsr_node *node, u16 *if1_seq, u16 *if2_seq) +{ + struct hsr_seq_block *block; + unsigned int block_off; + size_t block_sz; + u16 seq_bit; + + spin_lock_bh(&node->seq_out_lock); + + /* Get last inserted block */ + block_off = (node->next_block - 1) & (HSR_MAX_SEQ_BLOCKS - 1); + block_sz = hsr_seq_block_size(node); + block = node->block_buf + block_off * block_sz; + + if (!bitmap_empty(block->seq_nrs[HSR_PT_SLAVE_B - 1], + HSR_SEQ_BLOCK_SIZE)) { + seq_bit = find_last_bit(block->seq_nrs[HSR_PT_SLAVE_B - 1], + HSR_SEQ_BLOCK_SIZE); + *if1_seq = (block->block_idx << HSR_SEQ_BLOCK_SHIFT) | seq_bit; + } + if (!bitmap_empty(block->seq_nrs[HSR_PT_SLAVE_A - 1], + HSR_SEQ_BLOCK_SIZE)) { + seq_bit = find_last_bit(block->seq_nrs[HSR_PT_SLAVE_A - 1], + HSR_SEQ_BLOCK_SIZE); + *if2_seq = (block->block_idx << HSR_SEQ_BLOCK_SHIFT) | seq_bit; + } + spin_unlock_bh(&node->seq_out_lock); +} + int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], @@ -780,8 +848,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ - *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; - *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; + *if1_seq = 0; + *if2_seq = 0; + if (hsr->prot_version != PRP_V1) + fill_last_seq_nrs(node, if1_seq, if2_seq); if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index b04948659d84..c65ecb925734 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -74,9 +74,30 @@ bool hsr_is_node_in_db(struct list_head *node_db, int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame); +#if IS_ENABLED(CONFIG_KUNIT) +struct hsr_seq_block *hsr_get_seq_block(struct hsr_node *node, u16 block_idx); +#endif + +#define HSR_SEQ_BLOCK_SHIFT 7 /* 128 bits */ +#define HSR_SEQ_BLOCK_SIZE (1 << HSR_SEQ_BLOCK_SHIFT) +#define HSR_SEQ_BLOCK_MASK (HSR_SEQ_BLOCK_SIZE - 1) +#define HSR_MAX_SEQ_BLOCKS 64 + +#define hsr_seq_block_index(sequence_nr) ((sequence_nr) >> HSR_SEQ_BLOCK_SHIFT) +#define hsr_seq_block_bit(sequence_nr) ((sequence_nr) & HSR_SEQ_BLOCK_MASK) + +struct hsr_seq_block { + unsigned long time; + u16 block_idx; + /* Should be a flexible array member of what DECLARE_BITMAP() would + * produce. + */ + unsigned long seq_nrs[][BITS_TO_LONGS(HSR_SEQ_BLOCK_SIZE)]; +}; + struct hsr_node { struct list_head mac_list; - /* Protect R/W access to seq_out */ + /* Protect R/W access seq_blocks */ spinlock_t seq_out_lock; unsigned char macaddress_A[ETH_ALEN]; unsigned char macaddress_B[ETH_ALEN]; @@ -84,16 +105,22 @@ struct hsr_node { enum hsr_port_type addr_B_port; unsigned long time_in[HSR_PT_PORTS]; bool time_in_stale[HSR_PT_PORTS]; - unsigned long time_out[HSR_PT_PORTS]; /* if the node is a SAN */ bool san_a; bool san_b; - u16 seq_out[HSR_PT_PORTS]; bool removed; - /* PRP specific duplicate handling */ - u16 seq_expected[HSR_PT_PORTS]; - u16 seq_start[HSR_PT_PORTS]; + /* Duplicate detection */ + struct xarray seq_blocks; + void *block_buf; + unsigned int next_block; + unsigned int seq_port_cnt; struct rcu_head rcu_head; }; +static inline size_t hsr_seq_block_size(struct hsr_node *node) +{ + WARN_ON_ONCE(node->seq_port_cnt == 0); + return struct_size_t(struct hsr_seq_block, seq_nrs, node->seq_port_cnt); +} + #endif /* __HSR_FRAMEREG_H */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index c96b63adf96f..db0b0af7a692 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -166,12 +166,20 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } + port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); + if (port) { + if (nla_put_u32(skb, IFLA_HSR_INTERLINK, port->dev->ifindex)) + goto nla_put_failure; + } + if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN, hsr->sup_multicast_addr) || nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; if (hsr->prot_version == PRP_V1) proto = HSR_PROTOCOL_PRP; + else if (nla_put_u8(skb, IFLA_HSR_VERSION, hsr->prot_version)) + goto nla_put_failure; if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto)) goto nla_put_failure; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 8177ac6c2d26..44f83c8c56a7 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -198,7 +198,7 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, if (port) return -EBUSY; /* This port already exists */ - port = kzalloc(sizeof(*port), GFP_KERNEL); + port = kzalloc_obj(*port); if (!port) return -ENOMEM; @@ -207,14 +207,14 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, port->type = type; ether_addr_copy(port->original_macaddress, dev->dev_addr); + list_add_tail_rcu(&port->port_list, &hsr->ports); + if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } - list_add_tail_rcu(&port->port_list, &hsr->ports); - master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); @@ -222,7 +222,8 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, return 0; fail_dev_setup: - kfree(port); + list_del_rcu(&port->port_list); + kfree_rcu(port, rcu); return res; } diff --git a/net/hsr/prp_dup_discard_test.c b/net/hsr/prp_dup_discard_test.c index e86b7b633ae8..b028e71e6a0f 100644 --- a/net/hsr/prp_dup_discard_test.c +++ b/net/hsr/prp_dup_discard_test.c @@ -4,6 +4,8 @@ #include "hsr_main.h" #include "hsr_framereg.h" +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + struct prp_test_data { struct hsr_port port; struct hsr_port port_rcv; @@ -13,37 +15,55 @@ struct prp_test_data { static struct prp_test_data *build_prp_test_data(struct kunit *test) { + size_t block_sz; + struct prp_test_data *data = kunit_kzalloc(test, sizeof(struct prp_test_data), GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data); + data->node.seq_port_cnt = 1; + block_sz = hsr_seq_block_size(&data->node); + data->node.block_buf = kunit_kcalloc(test, HSR_MAX_SEQ_BLOCKS, block_sz, + GFP_ATOMIC); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data->node.block_buf); + + xa_init(&data->node.seq_blocks); + spin_lock_init(&data->node.seq_out_lock); + data->frame.node_src = &data->node; data->frame.port_rcv = &data->port_rcv; data->port_rcv.type = HSR_PT_SLAVE_A; - data->node.seq_start[HSR_PT_SLAVE_A] = 1; - data->node.seq_expected[HSR_PT_SLAVE_A] = 1; - data->node.seq_start[HSR_PT_SLAVE_B] = 1; - data->node.seq_expected[HSR_PT_SLAVE_B] = 1; - data->node.seq_out[HSR_PT_MASTER] = 0; - data->node.time_out[HSR_PT_MASTER] = jiffies; data->port.type = HSR_PT_MASTER; return data; } -static void check_prp_counters(struct kunit *test, - struct prp_test_data *data, - u16 seq_start_a, u16 seq_expected_a, - u16 seq_start_b, u16 seq_expected_b) +static void check_prp_frame_seen(struct kunit *test, struct prp_test_data *data, + u16 sequence_nr) +{ + u16 block_idx, seq_bit; + struct hsr_seq_block *block; + + block_idx = sequence_nr >> HSR_SEQ_BLOCK_SHIFT; + block = xa_load(&data->node.seq_blocks, block_idx); + KUNIT_EXPECT_NOT_NULL(test, block); + + seq_bit = sequence_nr & HSR_SEQ_BLOCK_MASK; + KUNIT_EXPECT_TRUE(test, test_bit(seq_bit, block->seq_nrs[0])); +} + +static void check_prp_frame_unseen(struct kunit *test, + struct prp_test_data *data, u16 sequence_nr) { - KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_A], - seq_start_a); - KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_B], - seq_start_b); - KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_A], - seq_expected_a); - KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_B], - seq_expected_b); + u16 block_idx, seq_bit; + struct hsr_seq_block *block; + + block_idx = sequence_nr >> HSR_SEQ_BLOCK_SHIFT; + block = hsr_get_seq_block(&data->node, block_idx); + KUNIT_EXPECT_NOT_NULL(test, block); + + seq_bit = sequence_nr & HSR_SEQ_BLOCK_MASK; + KUNIT_EXPECT_FALSE(test, test_bit(seq_bit, block->seq_nrs[0])); } static void prp_dup_discard_forward(struct kunit *test) @@ -54,52 +74,48 @@ static void prp_dup_discard_forward(struct kunit *test) data->frame.sequence_nr = 2; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 1, 1); + check_prp_frame_seen(test, data, data->frame.sequence_nr); } -static void prp_dup_discard_inside_dropwindow(struct kunit *test) +static void prp_dup_discard_drop_duplicate(struct kunit *test) { - /* Normal situation, other LAN ahead by one. Frame is dropped */ struct prp_test_data *data = build_prp_test_data(test); - unsigned long time = jiffies - 10; - data->frame.sequence_nr = 1; - data->node.seq_expected[HSR_PT_SLAVE_B] = 3; - data->node.seq_out[HSR_PT_MASTER] = 2; - data->node.time_out[HSR_PT_MASTER] = time; + data->frame.sequence_nr = 2; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, 2, data->node.seq_out[HSR_PT_MASTER]); - KUNIT_EXPECT_EQ(test, time, data->node.time_out[HSR_PT_MASTER]); - check_prp_counters(test, data, 2, 2, 2, 3); + check_prp_frame_seen(test, data, data->frame.sequence_nr); } -static void prp_dup_discard_node_timeout(struct kunit *test) +static void prp_dup_discard_entry_timeout(struct kunit *test) { /* Timeout situation, node hasn't sent anything for a while */ struct prp_test_data *data = build_prp_test_data(test); + struct hsr_seq_block *block; + u16 block_idx; data->frame.sequence_nr = 7; - data->node.seq_start[HSR_PT_SLAVE_A] = 1234; - data->node.seq_expected[HSR_PT_SLAVE_A] = 1235; - data->node.seq_start[HSR_PT_SLAVE_B] = 1234; - data->node.seq_expected[HSR_PT_SLAVE_B] = 1234; - data->node.seq_out[HSR_PT_MASTER] = 1234; - data->node.time_out[HSR_PT_MASTER] = - jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); + + data->frame.sequence_nr = 11; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); + + block_idx = data->frame.sequence_nr >> HSR_SEQ_BLOCK_SHIFT; + block = hsr_get_seq_block(&data->node, block_idx); + block->time = jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 1234, 1234); + check_prp_frame_seen(test, data, data->frame.sequence_nr); + check_prp_frame_unseen(test, data, 7); } static void prp_dup_discard_out_of_sequence(struct kunit *test) @@ -107,50 +123,36 @@ static void prp_dup_discard_out_of_sequence(struct kunit *test) /* One frame is received out of sequence on both LANs */ struct prp_test_data *data = build_prp_test_data(test); - data->node.seq_start[HSR_PT_SLAVE_A] = 10; - data->node.seq_expected[HSR_PT_SLAVE_A] = 10; - data->node.seq_start[HSR_PT_SLAVE_B] = 10; - data->node.seq_expected[HSR_PT_SLAVE_B] = 10; - data->node.seq_out[HSR_PT_MASTER] = 9; + /* initial frame, should be accepted */ + data->frame.sequence_nr = 9; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); /* 1st old frame, should be accepted */ data->frame.sequence_nr = 8; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 10, 10); + check_prp_frame_seen(test, data, data->frame.sequence_nr); /* 2nd frame should be dropped */ data->frame.sequence_nr = 8; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1); /* Next frame, this is forwarded */ data->frame.sequence_nr = 10; data->port_rcv.type = HSR_PT_SLAVE_A; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 9, 9); + check_prp_frame_seen(test, data, data->frame.sequence_nr); /* and next one is dropped */ data->frame.sequence_nr = 10; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1); } static void prp_dup_discard_lan_b_late(struct kunit *test) @@ -158,43 +160,31 @@ static void prp_dup_discard_lan_b_late(struct kunit *test) /* LAN B is behind */ struct prp_test_data *data = build_prp_test_data(test); - data->node.seq_start[HSR_PT_SLAVE_A] = 9; - data->node.seq_expected[HSR_PT_SLAVE_A] = 9; - data->node.seq_start[HSR_PT_SLAVE_B] = 9; - data->node.seq_expected[HSR_PT_SLAVE_B] = 9; - data->node.seq_out[HSR_PT_MASTER] = 8; - data->frame.sequence_nr = 9; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, 9, 10, 9, 9); + check_prp_frame_seen(test, data, data->frame.sequence_nr); data->frame.sequence_nr = 10; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, 9, 11, 9, 9); + check_prp_frame_seen(test, data, data->frame.sequence_nr); data->frame.sequence_nr = 9; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, 10, 11, 10, 10); data->frame.sequence_nr = 10; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, 11, 11, 11, 11); } static struct kunit_case prp_dup_discard_test_cases[] = { KUNIT_CASE(prp_dup_discard_forward), - KUNIT_CASE(prp_dup_discard_inside_dropwindow), - KUNIT_CASE(prp_dup_discard_node_timeout), + KUNIT_CASE(prp_dup_discard_drop_duplicate), + KUNIT_CASE(prp_dup_discard_entry_timeout), KUNIT_CASE(prp_dup_discard_out_of_sequence), KUNIT_CASE(prp_dup_discard_lan_b_late), {} diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 5a024ca60d35..115f43d0e158 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -603,7 +603,7 @@ nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); if (!state) { - state = kzalloc(sizeof(*state), GFP_KERNEL); + state = kzalloc_obj(*state); if (!state) { rtnl_unlock(); return -ENOMEM; @@ -1418,7 +1418,7 @@ static int nl802154_trigger_scan(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } - request = kzalloc(sizeof(*request), GFP_KERNEL); + request = kzalloc_obj(*request); if (!request) return -ENOMEM; @@ -1586,7 +1586,7 @@ nl802154_send_beacons(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } - request = kzalloc(sizeof(*request), GFP_KERNEL); + request = kzalloc_obj(*request); if (!request) return -ENOMEM; diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 18d267921bb5..e542fbe113e7 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -96,7 +96,7 @@ static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, return sk->sk_prot->sendmsg(sk, msg, len); } -static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, +static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; @@ -107,7 +107,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, return sock_no_bind(sock, uaddr, addr_len); } -static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, +static int ieee802154_sock_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; @@ -193,7 +193,7 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) +static int raw_bind(struct sock *sk, struct sockaddr_unsized *_uaddr, int len) { struct ieee802154_addr addr; struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr; @@ -227,7 +227,7 @@ out: return err; } -static int raw_connect(struct sock *sk, struct sockaddr *uaddr, +static int raw_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { return -ENOTSUPP; @@ -485,7 +485,7 @@ static void dgram_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) +static int dgram_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct ieee802154_addr haddr; @@ -563,7 +563,7 @@ static int dgram_ioctl(struct sock *sk, int cmd, int *karg) } /* FIXME: autobind */ -static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, +static int dgram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 12850a277251..df922f9f5289 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -748,6 +748,7 @@ config TCP_SIGPOOL config TCP_AO bool "TCP: Authentication Option (RFC5925)" select CRYPTO + select CRYPTO_LIB_UTILS select TCP_SIGPOOL depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64) help @@ -760,9 +761,8 @@ config TCP_AO config TCP_MD5SIG bool "TCP: MD5 Signature Option support (RFC2385)" - select CRYPTO - select CRYPTO_MD5 - select TCP_SIGPOOL + select CRYPTO_LIB_MD5 + select CRYPTO_LIB_UTILS help RFC2385 specifies a method of giving MD5 protection to TCP sessions. Its main (only?) use is to protect BGP sessions between core routers diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ec36d2ec059e..18108a6f0499 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_rate.o tcp_recovery.o tcp_ulp.o \ + tcp_recovery.o tcp_ulp.o \ tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3109c5ec38f3..c7731e300a44 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -124,6 +124,12 @@ #include <trace/events/sock.h> +/* Keep the definition of IPv6 disable here for now, to avoid annoying linker + * issues in case IPv6=m + */ +int disable_ipv6_mod; +EXPORT_SYMBOL(disable_ipv6_mod); + /* The inetsw table contains everything that inet_create needs to * build a new socket. */ @@ -441,7 +447,7 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; int err; @@ -464,13 +470,13 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) return __inet_bind(sk, uaddr, addr_len, flags); } -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { return inet_bind_sk(sock->sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_bind); -int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -567,7 +573,7 @@ out: return err; } -int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; @@ -623,7 +629,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ -int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; @@ -741,7 +747,7 @@ sock_error: } EXPORT_SYMBOL(__inet_stream_connect); -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { int err; @@ -755,6 +761,11 @@ EXPORT_SYMBOL(inet_stream_connect); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) { + if (mem_cgroup_sockets_enabled) { + mem_cgroup_sk_alloc(newsk); + __sk_charge(newsk, GFP_KERNEL); + } + sock_rps_record_flow(newsk); WARN_ON(!((1 << newsk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | @@ -768,6 +779,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new newsock->state = SS_CONNECTED; } +EXPORT_SYMBOL_GPL(__inet_accept); /* * Accept a pending connection. The TCP layer now gives BSD semantics. @@ -813,7 +825,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; @@ -821,7 +833,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETSOCKNAME); } release_sock(sk); @@ -1731,8 +1743,7 @@ static __net_init int ipv4_mib_init_net(struct net *net) net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); if (!net->mib.icmp_statistics) goto err_icmp_mib; - net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), - GFP_KERNEL); + net->mib.icmpmsg_statistics = kzalloc_obj(struct icmpmsg_mib); if (!net->mib.icmpmsg_statistics) goto err_icmpmsg_mib; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 64aec3dff8ec..5fb812443a08 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -484,7 +484,7 @@ static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) goto error; } - ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); + ahp = kzalloc_obj(*ahp); if (!ahp) return -ENOMEM; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 833f2cf97178..51d70180e1cc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -79,6 +79,7 @@ #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> +#include <linux/hex.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> @@ -564,7 +565,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, skb_reserve(skb, hlen); skb_reset_network_header(skb); - arp = skb_put(skb, arp_hdr_len(dev)); + skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) @@ -572,12 +573,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, if (!dest_hw) dest_hw = dev->broadcast; - /* - * Fill the device header for the ARP frame + /* Fill the device header for the ARP frame. + * Note: skb->head can be changed. */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; + arp = arp_hdr(skb); /* * Fill out the arp protocol part. * @@ -1189,7 +1191,7 @@ static int arp_req_get(struct net *net, struct arpreq *r) read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, - min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); + min(dev->addr_len, sizeof(r->arp_ha.sa_data))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); @@ -1217,10 +1219,10 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } return err; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 709021197e1c..a05aa075de1a 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -168,9 +168,8 @@ static int __init cipso_v4_cache_init(void) { u32 iter; - cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, - sizeof(struct cipso_v4_map_cache_bkt), - GFP_KERNEL); + cipso_v4_cache = kzalloc_objs(struct cipso_v4_map_cache_bkt, + CIPSO_V4_CACHE_BUCKETS); if (!cipso_v4_cache) return -ENOMEM; @@ -308,7 +307,7 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr, cipso_ptr_len = cipso_ptr[1]; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); @@ -2196,7 +2195,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ - ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + ret_val = skb_cow(skb, + skb_headroom(skb) + (len_delta > 0 ? len_delta : 0)); if (ret_val < 0) return ret_val; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index c2b2cda1a7e5..1614593b6d72 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -16,7 +16,7 @@ #include <net/tcp_states.h> #include <net/sock_reuseport.h> -int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; @@ -84,7 +84,7 @@ out: } EXPORT_SYMBOL(__ip4_datagram_connect); -int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 942a887bf089..537bb6c315d2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -209,7 +209,7 @@ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; - ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); + ifa = kzalloc_obj(*ifa, GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; @@ -270,7 +270,7 @@ static struct in_device *inetdev_init(struct net_device *dev) ASSERT_RTNL(); - in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); + in_dev = kzalloc_obj(*in_dev); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, @@ -2754,9 +2754,8 @@ static __net_init int devinet_init_net(struct net *net) int i; err = -ENOMEM; - net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, - sizeof(struct hlist_head), - GFP_KERNEL); + net->ipv4.inet_addr_lst = kmalloc_objs(struct hlist_head, + IN4_ADDR_HSIZE); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 05828d4cb6cd..abd77162f5e7 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -122,8 +122,8 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, - XFRM_MODE_SKB_CB(skb)->protocol); + struct xfrm_offload *xo = xfrm_offload(skb); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, xo->proto); __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index f9b9e26c32c1..0b72796dd1ad 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -28,8 +28,10 @@ struct fib_alias { /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { - if (!(fa->fa_state & FA_S_ACCESSED)) - fa->fa_state |= FA_S_ACCESSED; + u8 fa_state = READ_ONCE(fa->fa_state); + + if (!(fa_state & FA_S_ACCESSED)) + WRITE_ONCE(fa->fa_state, fa_state | FA_S_ACCESSED); } /* Exported by fib_semantics.c */ diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index a5f3c8459758..01cb587866d8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -365,8 +365,7 @@ static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) { /* The second half is used for prefsrc */ - return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head), - GFP_KERNEL); + return kvzalloc_objs(struct hlist_head, (1 << hash_bits) * 2); } static void fib_info_hash_free(struct hlist_head *head) @@ -1399,7 +1398,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fib_info_hash_grow(net); - fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); + fi = kzalloc_flex(*fi, fib_nh, nhs); if (!fi) { err = -ENOBUFS; goto failure; @@ -2167,8 +2166,8 @@ void fib_select_multipath(struct fib_result *res, int hash, { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; - bool found = false; bool use_neigh; + int score = -1; __be32 saddr; if (unlikely(res->fi->nh)) { @@ -2180,7 +2179,7 @@ void fib_select_multipath(struct fib_result *res, int hash, saddr = fl4 ? fl4->saddr : 0; change_nexthops(fi) { - int nh_upper_bound; + int nh_upper_bound, nh_score = 0; /* Nexthops without a carrier are assigned an upper bound of * minus one when "ignore_routes_with_linkdown" is set. @@ -2190,24 +2189,18 @@ void fib_select_multipath(struct fib_result *res, int hash, (use_neigh && !fib_good_nh(nexthop_nh))) continue; - if (!found) { + if (saddr && nexthop_nh->nh_saddr == saddr) + nh_score += 2; + if (hash <= nh_upper_bound) + nh_score++; + if (score < nh_score) { res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; - found = !saddr || nexthop_nh->nh_saddr == saddr; + if (nh_score == 3 || (!saddr && nh_score == 1)) + return; + score = nh_score; } - if (hash > nh_upper_bound) - continue; - - if (!saddr || nexthop_nh->nh_saddr == saddr) { - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - return; - } - - if (found) - return; - } endfor_nexthops(fi); } #endif diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 59a6f0a9638f..1308213791f1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1280,7 +1280,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - state = fa->fa_state; + state = READ_ONCE(fa->fa_state); new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; @@ -1745,7 +1745,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fib_remove_alias(t, tp, l, fa_to_delete); - if (fa_to_delete->fa_state & FA_S_ACCESSED) + if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); @@ -2053,10 +2053,11 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) continue; } - /* Do not flush error routes if network namespace is - * not being dismantled + /* When not flushing the entire table, skip error + * routes that are not marked for deletion. */ - if (!flush_all && fib_props[fa->fa_type].error) { + if (!flush_all && fib_props[fa->fa_type].error && + !(fi->fib_flags & RTNH_F_DEAD)) { slen = fa->fa_slen; continue; } diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 3970b6b7ace5..3baaa4df7e42 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -215,6 +215,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) return gue_control_message(skb, guehdr); proto_ctype = guehdr->proto_ctype; + if (unlikely(!proto_ctype)) + goto drop; + __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); @@ -578,7 +581,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, goto error; /* Allocate FOU port structure */ - fou = kzalloc(sizeof(*fou), GFP_KERNEL); + fou = kzalloc_obj(*fou); if (!fou) { err = -ENOMEM; goto error; diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 506260b4a4dc..309d5ba983d0 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -14,7 +15,7 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { [FOU_ATTR_PORT] = { .type = NLA_BE16, }, [FOU_ATTR_AF] = { .type = NLA_U8, }, - [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = NLA_POLICY_MIN(NLA_U8, 1), [FOU_ATTR_TYPE] = { .type = NLA_U8, }, [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h index 63a6c4ed803d..438342dc8507 100644 --- a/net/ipv4/fou_nl.h +++ b/net/ipv4/fou_nl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_FOU_GEN_H #define _LINUX_FOU_GEN_H diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1b7fb5d935ed..568bd1e95d44 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -112,7 +112,9 @@ struct icmp_bxm { __be32 times[3]; } data; int head_len; - struct ip_options_data replyopts; + + /* Must be last as it ends in a flexible-array member. */ + struct ip_options_rcu replyopts; }; /* An array of errno for error messages from dest unreach. */ @@ -248,7 +250,8 @@ bool icmp_global_allow(struct net *net) if (delta < HZ / 50) return false; - incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ; + incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec); + incr = div_u64((u64)incr * delta, HZ); if (!incr) return false; @@ -313,23 +316,29 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct dst_entry *dst = &rt->dst; struct inet_peer *peer; struct net_device *dev; + int peer_timeout; bool rc = true; if (!apply_ratelimit) return true; + peer_timeout = READ_ONCE(net->ipv4.sysctl_icmp_ratelimit); + if (!peer_timeout) + goto out; + /* No rate limit on loopback */ rcu_read_lock(); dev = dst_dev_rcu(dst); if (dev && (dev->flags & IFF_LOOPBACK)) - goto out; + goto out_unlock; peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, l3mdev_master_ifindex_rcu(dev)); - rc = inet_peer_xrlim_allow(peer, - READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); -out: + rc = inet_peer_xrlim_allow(peer, peer_timeout); + +out_unlock: rcu_read_unlock(); +out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else @@ -353,9 +362,12 @@ void icmp_out_count(struct net *net, unsigned char type) static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct icmp_bxm *icmp_param = from; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); __wsum csum; + icmp_param = from; + csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len); @@ -413,7 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; - if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) + if (ip_options_echo(net, &icmp_param->replyopts.opt, skb)) return; /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ @@ -435,10 +447,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - if (icmp_param->replyopts.opt.opt.optlen) { - ipc.opt = &icmp_param->replyopts.opt; + if (icmp_param->replyopts.opt.optlen) { + ipc.opt = &icmp_param->replyopts; if (ipc.opt->opt.srr) - daddr = icmp_param->replyopts.opt.opt.faddr; + daddr = icmp_param->replyopts.opt.faddr; } memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -491,8 +503,8 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, int err; memset(fl4, 0, sizeof(*fl4)); - fl4->daddr = (param->replyopts.opt.opt.srr ? - param->replyopts.opt.opt.faddr : iph->saddr); + fl4->daddr = (param->replyopts.opt.srr ? + param->replyopts.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); @@ -554,6 +566,21 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, /* steal dst entry from skb_in, don't drop refcnt */ skb_dstref_steal(skb_in); skb_dstref_restore(skb_in, orefdst); + + /* + * At this point, fl4_dec.daddr should NOT be local (we + * checked fl4_dec.saddr above). However, a race condition + * may occur if the address is added to the interface + * concurrently. In that case, ip_route_input() returns a + * LOCAL route with dst.output=ip_rt_bug, which must not + * be used for output. + */ + if (!err && rt2 && rt2->rt_type == RTN_LOCAL) { + net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n", + &fl4_dec.daddr, &fl4_dec.saddr); + dst_release(&rt2->dst); + err = -EINVAL; + } } if (err) @@ -582,6 +609,185 @@ relookup_failed: return ERR_PTR(err); } +struct icmp_ext_iio_addr4_subobj { + __be16 afi; + __be16 reserved; + __be32 addr4; +}; + +static unsigned int icmp_ext_iio_len(void) +{ + return sizeof(struct icmp_extobj_hdr) + + /* ifIndex */ + sizeof(__be32) + + /* Interface Address Sub-Object */ + sizeof(struct icmp_ext_iio_addr4_subobj) + + /* Interface Name Sub-Object. Length must be a multiple of 4 + * bytes. + */ + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + + /* MTU */ + sizeof(__be32); +} + +static unsigned int icmp_ext_max_len(u8 ext_objs) +{ + unsigned int ext_max_len; + + ext_max_len = sizeof(struct icmp_ext_hdr); + + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + ext_max_len += icmp_ext_iio_len(); + + return ext_max_len; +} + +static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev) +{ + struct in_device *in_dev; + struct in_ifaddr *ifa; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + return 0; + + /* It is unclear from RFC 5837 which IP address should be chosen, but + * it makes sense to choose a global unicast address. + */ + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) + continue; + if (ifa->ifa_scope != RT_SCOPE_UNIVERSE || + ipv4_is_multicast(ifa->ifa_address)) + continue; + return ifa->ifa_address; + } + + return 0; +} + +static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb, + int iif) +{ + struct icmp_ext_iio_name_subobj *name_subobj; + struct icmp_extobj_hdr *objh; + struct net_device *dev; + __be32 data; + + if (!iif) + return; + + /* Add the fields in the order specified by RFC 5837. */ + objh = skb_put(skb, sizeof(*objh)); + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); + + data = htonl(iif); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); + if (!dev) + goto out; + + data = icmp_ext_iio_addr4_find(dev); + if (data) { + struct icmp_ext_iio_addr4_subobj *addr4_subobj; + + addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj)); + addr4_subobj->afi = htons(ICMP_AFI_IP); + addr4_subobj->addr4 = data; + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; + } + + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); + netdev_copy_name(dev, name_subobj->name); + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; + + data = htonl(READ_ONCE(dev->mtu)); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; + +out: + rcu_read_unlock(); + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); +} + +static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb, + u8 ext_objs, int iif) +{ + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + icmp_ext_iio_iif_append(net, skb, iif); +} + +static struct sk_buff * +icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph, + unsigned int room, int iif) +{ + unsigned int payload_len, ext_max_len, ext_len; + struct icmp_ext_hdr *ext_hdr; + struct sk_buff *skb; + u8 ext_objs; + int nhoff; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return NULL; + } + + ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask); + if (!ext_objs) + return NULL; + + ext_max_len = icmp_ext_max_len(ext_objs); + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) + return NULL; + + skb = skb_clone(skb_in, GFP_ATOMIC); + if (!skb) + return NULL; + + nhoff = skb_network_offset(skb); + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); + + if (!pskb_network_may_pull(skb, payload_len)) + goto free_skb; + + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) + goto free_skb; + + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) + goto free_skb; + + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); + ext_hdr->version = ICMP_EXT_VERSION_2; + + icmp_ext_objs_append(net, skb, ext_objs, iif); + + /* Do not send an empty extension structure. */ + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; + if (ext_len == sizeof(*ext_hdr)) + goto free_skb; + + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); + /* The length of the original datagram in 32-bit words (RFC 4884). */ + icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32); + + return skb; + +free_skb: + consume_skb(skb); + return NULL; +} + /* * Send an ICMP message in response to a situation * @@ -596,11 +802,13 @@ relookup_failed: void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, const struct inet_skb_parm *parm) { + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct iphdr *iph; int room; - struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); bool apply_ratelimit = false; + struct sk_buff *ext_skb; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; @@ -726,7 +934,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + if (__ip_options_echo(net, &icmp_param->replyopts.opt, skb_in, &parm->opt)) goto out_unlock; @@ -735,21 +943,21 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, * Prepare data for ICMP header. */ - icmp_param.data.icmph.type = type; - icmp_param.data.icmph.code = code; - icmp_param.data.icmph.un.gateway = info; - icmp_param.data.icmph.checksum = 0; - icmp_param.skb = skb_in; - icmp_param.offset = skb_network_offset(skb_in); + icmp_param->data.icmph.type = type; + icmp_param->data.icmph.code = code; + icmp_param->data.icmph.un.gateway = info; + icmp_param->data.icmph.checksum = 0; + icmp_param->skb = skb_in; + icmp_param->offset = skb_network_offset(skb_in); ipcm_init(&ipc); ipc.tos = tos; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts.opt; + ipc.opt = &icmp_param->replyopts; ipc.sockc.mark = mark; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, inet_dsfield_to_dscp(tos), mark, type, code, - &icmp_param); + icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -759,10 +967,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_mtu(&rt->dst); + room = dst4_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.optlen; room -= sizeof(struct icmphdr); /* Guard against tiny mtu. We need to include at least one * IP network header for this message to make any sense. @@ -770,10 +978,15 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (room <= (int)sizeof(struct iphdr)) goto ende; - icmp_param.data_len = skb_in->len - icmp_param.offset; - if (icmp_param.data_len > room) - icmp_param.data_len = room; - icmp_param.head_len = sizeof(struct icmphdr); + ext_skb = icmp_ext_append(net, skb_in, &icmp_param->data.icmph, room, + parm->iif); + if (ext_skb) + icmp_param->skb = ext_skb; + + icmp_param->data_len = icmp_param->skb->len - icmp_param->offset; + if (icmp_param->data_len > room) + icmp_param->data_len = room; + icmp_param->head_len = sizeof(struct icmphdr); /* if we don't have a source address at this point, fall back to the * dummy address instead of sending out a packet with a source address @@ -784,7 +997,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, trace_icmp_send(skb_in, type, code); - icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); + + if (ext_skb) + consume_skb(ext_skb); ende: ip_rt_put(rt); out_unlock: @@ -843,24 +1059,32 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); - return; - } + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + goto out; + + /* IPPROTO_RAW sockets are not supposed to receive anything. */ + if (protocol == IPPROTO_RAW) + goto out; raw_icmp_error(skb, protocol, info); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); + return; + +out: + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); } static bool icmp_tag_validation(int proto) { + const struct net_protocol *ipprot; bool ok; rcu_read_lock(); - ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; + ipprot = rcu_dereference(inet_protos[proto]); + ok = ipprot ? ipprot->icmp_strict_tag_validation : false; rcu_read_unlock(); return ok; } @@ -1018,7 +1242,8 @@ static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) static enum skb_drop_reason icmp_echo(struct sk_buff *skb) { - struct icmp_bxm icmp_param; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct net *net; net = skb_dst_dev_net_rcu(skb); @@ -1026,18 +1251,18 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; - icmp_param.data.icmph = *icmp_hdr(skb); - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = skb->len; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data.icmph = *icmp_hdr(skb); + icmp_param->skb = skb; + icmp_param->offset = 0; + icmp_param->data_len = skb->len; + icmp_param->head_len = sizeof(struct icmphdr); - if (icmp_param.data.icmph.type == ICMP_ECHO) - icmp_param.data.icmph.type = ICMP_ECHOREPLY; - else if (!icmp_build_probe(skb, &icmp_param.data.icmph)) + if (icmp_param->data.icmph.type == ICMP_ECHO) + icmp_param->data.icmph.type = ICMP_ECHOREPLY; + else if (!icmp_build_probe(skb, &icmp_param->data.icmph)) return SKB_NOT_DROPPED_YET; - icmp_reply(&icmp_param, skb); + icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; } @@ -1165,7 +1390,8 @@ EXPORT_SYMBOL_GPL(icmp_build_probe); */ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) { - struct icmp_bxm icmp_param; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); /* * Too short. */ @@ -1175,19 +1401,19 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - icmp_param.data.times[1] = inet_current_timestamp(); - icmp_param.data.times[2] = icmp_param.data.times[1]; - - BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); - - icmp_param.data.icmph = *icmp_hdr(skb); - icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; - icmp_param.data.icmph.code = 0; - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = 0; - icmp_param.head_len = sizeof(struct icmphdr) + 12; - icmp_reply(&icmp_param, skb); + icmp_param->data.times[1] = inet_current_timestamp(); + icmp_param->data.times[2] = icmp_param->data.times[1]; + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param->data.times[0], 4)); + + icmp_param->data.icmph = *icmp_hdr(skb); + icmp_param->data.icmph.type = ICMP_TIMESTAMPREPLY; + icmp_param->data.icmph.code = 0; + icmp_param->skb = skb; + icmp_param->offset = 0; + icmp_param->data_len = 0; + icmp_param->head_len = sizeof(struct icmphdr) + 12; + icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; out_err: @@ -1502,6 +1728,7 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + net->ipv4.sysctl_icmp_errors_extension_mask = 0; net->ipv4.sysctl_icmp_msgs_per_sec = 1000; net->ipv4.sysctl_icmp_msgs_burst = 50; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7182f1419c2a..a674fb44ec25 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -227,7 +227,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = get_random_u32_below(in_dev->mr_maxdelay); + int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay)); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && @@ -1009,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ - in_dev->mr_maxdelay = max_delay; + WRITE_ONCE(in_dev->mr_maxdelay, max_delay); /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently * received value was zero, use the default or statically @@ -1187,7 +1187,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ - pmc = kzalloc(sizeof(*pmc), gfp); + pmc = kzalloc_obj(*pmc, gfp); if (!pmc) return; spin_lock_init(&pmc->lock); @@ -1532,7 +1532,7 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, goto out; } - im = kzalloc(sizeof(*im), gfp); + im = kzalloc_obj(*im, gfp); if (!im) goto out; @@ -2075,7 +2075,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, psf_prev = psf; } if (!psf) { - psf = kzalloc(sizeof(*psf), GFP_ATOMIC); + psf = kzalloc_obj(*psf, GFP_ATOMIC); if (!psf) return -ENOBUFS; psf->sf_inaddr = *psfsrc; @@ -2150,7 +2150,7 @@ static int sf_setstate(struct ip_mc_list *pmc) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { - dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); + dpsf = kmalloc_obj(*dpsf, GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index cdd1e12aac8c..5dfac6ce1110 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -20,6 +20,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> @@ -712,31 +713,6 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) release_sock(sk); - if (mem_cgroup_sockets_enabled) { - gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; - int amt = 0; - - /* atomically get the memory usage, set and charge the - * newsk->sk_memcg. - */ - lock_sock(newsk); - - mem_cgroup_sk_alloc(newsk); - if (mem_cgroup_from_sk(newsk)) { - /* The socket has not been accepted yet, no need - * to look at newsk->sk_wmem_queued. - */ - amt = sk_mem_pages(newsk->sk_forward_alloc + - atomic_read(&newsk->sk_rmem_alloc)); - } - - if (amt) - mem_cgroup_sk_charge(newsk, amt, gfp); - kmem_cache_charge(newsk, gfp); - - release_sock(newsk); - } - if (req) reqsk_put(req); @@ -762,9 +738,9 @@ void inet_csk_init_xmit_timers(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); - timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0); timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); - timer_setup(&sk->sk_timer, keepalive_handler, 0); + timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } @@ -775,9 +751,9 @@ void inet_csk_clear_xmit_timers(struct sock *sk) smp_store_release(&icsk->icsk_pending, 0); smp_store_release(&icsk->icsk_ack.pending, 0); - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &sk->tcp_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &icsk->icsk_keepalive_timer); } void inet_csk_clear_xmit_timers_sync(struct sock *sk) @@ -790,9 +766,9 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) smp_store_release(&icsk->icsk_pending, 0); smp_store_release(&icsk->icsk_ack.pending, 0); - sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); - sk_stop_timer_sync(sk, &sk->sk_timer); + sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer); } struct dst_entry *inet_csk_route_req(const struct sock *sk, @@ -910,7 +886,6 @@ reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->syncookie = 0; - req->timeout = 0; req->num_timeout = 0; req->num_retrans = 0; req->sk = NULL; @@ -938,13 +913,22 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); ireq->ireq_family = sk_listener->sk_family; - req->timeout = TCP_TIMEOUT_INIT; } return req; } EXPORT_SYMBOL(inet_reqsk_alloc); +void __reqsk_free(struct request_sock *req) +{ + req->rsk_ops->destructor(req); + if (req->rsk_listener) + sock_put(req->rsk_listener); + kfree(req->saved_syn); + kmem_cache_free(req->rsk_ops->slab, req); +} +EXPORT_SYMBOL_GPL(__reqsk_free); + static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { @@ -1121,16 +1105,20 @@ static void reqsk_timer_handler(struct timer_list *t) young <<= 1; } } + syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); - req->rsk_ops->syn_ack_timeout(req); + tcp_syn_ack_timeout(req); + if (!expire && (!resend || !tcp_rtx_synack(sk_listener, req) || inet_rsk(req)->acked)) { + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; if (req->num_timeout++ == 0) atomic_dec(&queue->young); - mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX)); + mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req)); if (!nreq) return; @@ -1167,8 +1155,7 @@ drop: reqsk_put(oreq); } -static bool reqsk_queue_hash_req(struct request_sock *req, - unsigned long timeout) +static bool reqsk_queue_hash_req(struct request_sock *req) { bool found_dup_sk = false; @@ -1176,8 +1163,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req, return false; /* The timer needs to be setup after a successful insertion. */ + req->timeout = tcp_timeout_init((struct sock *)req); timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); - mod_timer(&req->rsk_timer, jiffies + timeout); + mod_timer(&req->rsk_timer, jiffies + req->timeout); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. @@ -1187,10 +1175,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req, return true; } -bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req) { - if (!reqsk_queue_hash_req(req, timeout)) + if (!reqsk_queue_hash_req(req)) return false; inet_csk_reqsk_queue_added(sk); @@ -1222,7 +1209,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, { struct sock *newsk = sk_clone_lock(sk, priority); struct inet_connection_sock *newicsk; - struct inet_request_sock *ireq; + const struct inet_request_sock *ireq; struct inet_sock *newinet; if (!newsk) @@ -1337,6 +1324,15 @@ static int inet_ulp_can_listen(const struct sock *sk) return 0; } +static void reqsk_queue_alloc(struct request_sock_queue *queue) +{ + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; + + queue->rskq_accept_head = NULL; +} + int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f0b6c5a411a2..9d215485b5c7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -287,17 +287,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits); r->idiag_expires = - jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); + jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = - jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); - } else if (timer_pending(&sk->sk_timer)) { + jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { r->idiag_timer = 2; r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = - jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); + jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies); } if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { @@ -845,7 +845,7 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) struct nlattr *nla; int err; - cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); + cb_data = kzalloc_obj(*cb_data); if (!cb_data) return -ENOMEM; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 025895eb6ec5..393770920abd 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -188,7 +188,7 @@ static void fqdir_work_fn(struct work_struct *work) int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { - struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); + struct fqdir *fqdir = kzalloc_obj(*fqdir); int res; if (!fqdir) @@ -218,6 +218,41 @@ static int __init inet_frag_wq_init(void) pure_initcall(inet_frag_wq_init); +void fqdir_pre_exit(struct fqdir *fqdir) +{ + struct inet_frag_queue *fq; + struct rhashtable_iter hti; + + /* Prevent creation of new frags. + * Pairs with READ_ONCE() in inet_frag_find(). + */ + WRITE_ONCE(fqdir->high_thresh, 0); + + /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() + * and ip6frag_expire_frag_queue(). + */ + WRITE_ONCE(fqdir->dead, true); + + rhashtable_walk_enter(&fqdir->rhashtable, &hti); + rhashtable_walk_start(&hti); + + while ((fq = rhashtable_walk_next(&hti))) { + if (IS_ERR(fq)) { + if (PTR_ERR(fq) != -EAGAIN) + break; + continue; + } + spin_lock_bh(&fq->lock); + if (!(fq->flags & INET_FRAG_COMPLETE)) + inet_frag_queue_flush(fq, 0); + spin_unlock_bh(&fq->lock); + } + + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); +} +EXPORT_SYMBOL(fqdir_pre_exit); + void fqdir_exit(struct fqdir *fqdir) { INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); @@ -263,8 +298,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) kmem_cache_free(f->frags_cachep, q); } -unsigned int inet_frag_rbtree_purge(struct rb_root *root, - enum skb_drop_reason reason) +static unsigned int +inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason) { struct rb_node *p = rb_first(root); unsigned int sum = 0; @@ -284,7 +319,17 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root, } return sum; } -EXPORT_SYMBOL(inet_frag_rbtree_purge); + +void inet_frag_queue_flush(struct inet_frag_queue *q, + enum skb_drop_reason reason) +{ + unsigned int sum; + + reason = reason ?: SKB_DROP_REASON_FRAG_REASM_TIMEOUT; + sum = inet_frag_rbtree_purge(&q->rb_fragments, reason); + sub_frag_mem_limit(q->fqdir, sum); +} +EXPORT_SYMBOL(inet_frag_queue_flush); void inet_frag_destroy(struct inet_frag_queue *q) { @@ -327,7 +372,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - /* One reference for the timer, one for the hash table. */ + /* One reference for the timer, one for the hash table. + * We never take any extra references, only decrement this field. + */ refcount_set(&q->refcnt, 2); return q; @@ -441,6 +488,8 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, } FRAG_CB(skb)->ip_defrag_offset = offset; + if (offset) + nf_reset_ct(skb); return IPFRAG_OK; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index b7024e3d9ac3..9bfccc283fa6 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -200,7 +200,7 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { - inet_sk(sk)->inet_num = port; + WRITE_ONCE(inet_sk(sk)->inet_num, port); inet_csk(sk)->icsk_bind_hash = tb; inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -224,7 +224,7 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; - inet_sk(sk)->inet_num = 0; + WRITE_ONCE(inet_sk(sk)->inet_num, 0); sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); @@ -352,7 +352,7 @@ static inline int compute_score(struct sock *sk, const struct net *net, { int score = -1; - if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && + if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; @@ -720,8 +720,11 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); - ret = sk_nulls_del_node_init_rcu(osk); - } else if (found_dup_sk) { + ret = sk_nulls_replace_node_init_rcu(osk, sk); + goto unlock; + } + + if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; @@ -730,6 +733,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ret) __sk_nulls_add_node_rcu(sk, list); +unlock: spin_unlock(lock); return ret; @@ -1202,7 +1206,7 @@ error: sk->sk_hash = 0; inet_sk(sk)->inet_sport = 0; - inet_sk(sk)->inet_num = 0; + WRITE_ONCE(inet_sk(sk)->inet_num, 0); if (tw) inet_twsk_bind_unhash(tw, hinfo); @@ -1280,7 +1284,7 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, int inet_hashinfo2_init_mod(struct inet_hashinfo *h) { - h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); + h->lhash2 = kmalloc_objs(*h->lhash2, INET_LHTABLE_SIZE); if (!h->lhash2) return -ENOMEM; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c96d61d08854..d4c781a0667f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -88,12 +88,6 @@ void inet_twsk_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(inet_twsk_put); -static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, - struct hlist_nulls_head *list) -{ - hlist_nulls_add_head_rcu(&tw->tw_node, list); -} - static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) { __inet_twsk_schedule(tw, timeo, false); @@ -113,13 +107,12 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead, *bhead2; - /* Step 1: Put TW into bind hash. Original socket stays there too. - Note, that any socket with inet->num != 0 MUST be bound in - binding cache, even if it is closed. + /* Put TW into bind hash. Original socket stays there too. + * Note, that any socket with inet->num != 0 MUST be bound in + * binding cache, even if it is closed. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; @@ -141,19 +134,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_lock(lock); - /* Step 2: Hash TW into tcp ehash chain */ - inet_twsk_add_node_rcu(tw, &ehead->chain); - - /* Step 3: Remove SK from hash chain */ - if (__sk_nulls_del_node_init_rcu(sk)) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - - - /* Ensure above writes are committed into memory before updating the - * refcount. - * Provides ordering vs later refcount_inc(). - */ - smp_wmb(); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. @@ -163,6 +143,15 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, */ refcount_set(&tw->tw_refcnt, 3); + /* Ensure tw_refcnt has been set before tw is published. + * smp_wmb() provides the necessary memory barrier to enforce this + * ordering. + */ + smp_wmb(); + + hlist_nulls_replace_init_rcu(&sk->sk_nulls_node, &tw->tw_node); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + inet_twsk_schedule(tw, timeo); spin_unlock(lock); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index f7012479713b..56b0f738d2f2 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -134,11 +134,6 @@ static void ip_expire(struct timer_list *t) net = qp->q.fqdir->net; rcu_read_lock(); - - /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ - if (READ_ONCE(qp->q.fqdir->dead)) - goto out_rcu_unlock; - spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) @@ -146,6 +141,13 @@ static void ip_expire(struct timer_list *t) qp->q.flags |= INET_FRAG_DROP; inet_frag_kill(&qp->q, &refs); + + /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ + if (READ_ONCE(qp->q.fqdir->dead)) { + inet_frag_queue_flush(&qp->q, 0); + goto out; + } + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -240,16 +242,10 @@ static int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { - unsigned int sum_truesize = 0; - - if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { - refcount_inc(&qp->q.refcnt); + if (!mod_timer_pending(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) return -ETIMEDOUT; - } - sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, - SKB_DROP_REASON_FRAG_TOO_FAR); - sub_frag_mem_limit(qp->q.fqdir, sum_truesize); + inet_frag_queue_flush(&qp->q, SKB_DROP_REASON_FRAG_TOO_FAR); qp->q.flags = 0; qp->q.len = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 761a53c6a89a..35f0baa99d40 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -330,6 +330,10 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!tun_dst) return PACKET_REJECT; + /* MUST set options_len before referencing options */ + info = &tun_dst->u.tun_info; + info->options_len = sizeof(*md); + /* skb can be uncloned in __iptunnel_pull_header, so * old pkt_md is no longer valid and we need to reset * it @@ -344,10 +348,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); - info = &tun_dst->u.tun_info; __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); - info->options_len = sizeof(*md); } skb_reset_mac_header(skb); @@ -889,10 +891,17 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); - struct iphdr *iph; struct gre_base_hdr *greh; + struct iphdr *iph; + int needed; + + needed = t->hlen + sizeof(*iph); + if (skb_headroom(skb) < needed && + pskb_expand_head(skb, HH_DATA_ALIGN(needed - skb_headroom(skb)), + 0, GFP_ATOMIC)) + return -needed; - iph = skb_push(skb, t->hlen + sizeof(*iph)); + iph = skb_push(skb, needed); greh = (struct gre_base_hdr *)(iph+1); greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); @@ -910,7 +919,8 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, return -(t->hlen + sizeof(*iph)); } -static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) +static int ipgre_header_parse(const struct sk_buff *skb, const struct net_device *dev, + unsigned char *haddr) { const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); memcpy(haddr, &iph->saddr, 4); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 273578579a6b..19d3141dad1f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -141,6 +141,8 @@ #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> +#include <net/udp.h> +#include <net/tcp.h> /* * Process Router Attention IP option (RFC 2113) @@ -317,8 +319,6 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, ip_hdr(hint)->tos == iph->tos; } -int tcp_v4_early_demux(struct sk_buff *skb); -enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ff11d3a85a36..e4790cc7b5c2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1300,7 +1300,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, return -EFAULT; cork->fragsize = ip_sk_use_pmtu(sk) ? - dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); if (!inetdev_valid_mtu(cork->fragsize)) return -ENETUNREACH; @@ -1439,7 +1439,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, pmtudisc = READ_ONCE(inet->pmtudisc); if (pmtudisc == IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_PROBE || - (skb->len <= dst_mtu(&rt->dst) && + (skb->len <= dst4_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); @@ -1606,7 +1606,8 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash) { - struct ip_options_data replyopts; + DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); @@ -1615,18 +1616,18 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, int err; int oif; - if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) + if (__ip_options_echo(net, &replyopts->opt, skb, sopt)) return; ipcm_init(&ipc); ipc.addr = daddr; ipc.sockc.transmit_time = transmit_time; - if (replyopts.opt.opt.optlen) { - ipc.opt = &replyopts.opt; + if (replyopts->opt.optlen) { + ipc.opt = replyopts; - if (replyopts.opt.opt.srr) - daddr = replyopts.opt.opt.faddr; + if (replyopts->opt.srr) + daddr = replyopts->opt.faddr; } oif = arg->bound_dev_if; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6d9c5c20b1c4..697e18242d6c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -350,7 +350,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; - new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + new_ra = on ? kmalloc_obj(*new_ra) : NULL; if (on && !new_ra) return -ENOMEM; @@ -1634,7 +1634,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, val = 0; dst = sk_dst_get(sk); if (dst) { - val = dst_mtu(dst); + val = dst4_mtu(dst); dst_release(dst); } if (!val) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 158a30ae7c5f..50d0f5fe4e4c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1281,7 +1281,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); -int ip_tunnel_init(struct net_device *dev) +int __ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -1308,10 +1308,9 @@ int ip_tunnel_init(struct net_device *dev) if (tunnel->collect_md) netif_keep_dst(dev); - netdev_lockdep_set_classes(dev); return 0; } -EXPORT_SYMBOL_GPL(ip_tunnel_init); +EXPORT_SYMBOL_GPL(__ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 2e61ac137128..5683c328990f 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -58,6 +58,19 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, struct iphdr *iph; int err; + if (unlikely(dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT)) { + if (dev) { + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", + dev->name); + DEV_STATS_INC(dev, tx_errors); + } + ip_rt_put(rt); + kfree_skb(skb); + return; + } + + dev_xmit_recursion_inc(); + skb_scrub_packet(skb, xnet); skb_clear_hash_if_not_l4(skb); @@ -88,6 +101,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, pkt_len = 0; iptunnel_xmit_stats(dev, pkt_len); } + + dev_xmit_recursion_dec(); } EXPORT_SYMBOL_GPL(iptunnel_xmit); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 22a7889876c1..a35ffedacc7c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -244,7 +244,7 @@ static int __init ic_open_devs(void) dev->name); continue; } - if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { + if (!(d = kmalloc_obj(struct ic_device))) { rtnl_unlock(); return -ENOMEM; } @@ -679,8 +679,18 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; static void __init ic_dhcp_init_options(u8 *options, struct ic_device *d) { - u8 mt = ((ic_servaddr == NONE) - ? DHCPDISCOVER : DHCPREQUEST); + static const u8 ic_req_params[] = { + 1, /* Subnet mask */ + 3, /* Default gateway */ + 6, /* DNS server */ + 12, /* Host name */ + 15, /* Domain name */ + 17, /* Boot path */ + 26, /* MTU */ + 40, /* NIS domain name */ + 42, /* NTP servers */ + }; + u8 mt = (ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST; u8 *e = options; int len; @@ -705,51 +715,36 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d) e += 4; } - /* always? */ - { - static const u8 ic_req_params[] = { - 1, /* Subnet mask */ - 3, /* Default gateway */ - 6, /* DNS server */ - 12, /* Host name */ - 15, /* Domain name */ - 17, /* Boot path */ - 26, /* MTU */ - 40, /* NIS domain name */ - 42, /* NTP servers */ - }; - - *e++ = 55; /* Parameter request list */ - *e++ = sizeof(ic_req_params); - memcpy(e, ic_req_params, sizeof(ic_req_params)); - e += sizeof(ic_req_params); - - if (ic_host_name_set) { - *e++ = 12; /* host-name */ - len = strlen(utsname()->nodename); - *e++ = len; - memcpy(e, utsname()->nodename, len); - e += len; - } - if (*vendor_class_identifier) { - pr_info("DHCP: sending class identifier \"%s\"\n", - vendor_class_identifier); - *e++ = 60; /* Class-identifier */ - len = strlen(vendor_class_identifier); - *e++ = len; - memcpy(e, vendor_class_identifier, len); - e += len; - } - len = strlen(dhcp_client_identifier + 1); - /* the minimum length of identifier is 2, include 1 byte type, - * and can not be larger than the length of options - */ - if (len >= 1 && len < 312 - (e - options) - 1) { - *e++ = 61; - *e++ = len + 1; - memcpy(e, dhcp_client_identifier, len + 1); - e += len + 1; - } + *e++ = 55; /* Parameter request list */ + *e++ = sizeof(ic_req_params); + memcpy(e, ic_req_params, sizeof(ic_req_params)); + e += sizeof(ic_req_params); + + if (ic_host_name_set) { + *e++ = 12; /* host-name */ + len = strlen(utsname()->nodename); + *e++ = len; + memcpy(e, utsname()->nodename, len); + e += len; + } + if (*vendor_class_identifier) { + pr_info("DHCP: sending class identifier \"%s\"\n", + vendor_class_identifier); + *e++ = 60; /* Class-identifier */ + len = strlen(vendor_class_identifier); + *e++ = len; + memcpy(e, vendor_class_identifier, len); + e += len; + } + len = strlen(dhcp_client_identifier + 1); + /* the minimum length of identifier is 2, include 1 byte type, + * and can not be larger than the length of options + */ + if (len >= 1 && len < 312 - (e - options) - 1) { + *e++ = 61; + *e++ = len + 1; + memcpy(e, dhcp_client_identifier, len + 1); + e += len + 1; } *e++ = 255; /* End of the list */ @@ -1690,7 +1685,8 @@ static int __init ic_proto_name(char *name) *v = 0; if (kstrtou8(client_id, 0, dhcp_client_identifier)) pr_debug("DHCP: Invalid client identifier type\n"); - strncpy(dhcp_client_identifier + 1, v + 1, 251); + strscpy(dhcp_client_identifier + 1, v + 1, + sizeof(dhcp_client_identifier) - 1); *v = ','; } return 1; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3e03af073a1c..ff95b1b9908e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) return ip_tunnel_ctl(dev, p, cmd); } +static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); + const struct iphdr *tiph = &tunnel->parms.iph; + struct rtable *rt; + + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, + RT_SCOPE_UNIVERSE); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + path->type = DEV_PATH_TUN; + path->tun.src_v4.s_addr = tiph->saddr; + path->tun.dst_v4.s_addr = tiph->daddr; + path->tun.l3_proto = IPPROTO_IPIP; + path->dev = ctx->dev; + + ctx->dev = rt->dst.dev; + ip_rt_put(rt); + + return 0; +} + static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, @@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, + .ndo_fill_forward_path = ipip_fill_forward_path, }; #define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ca9eaee4c2ef..131382c388e9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1895,7 +1895,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, return -1; } - if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst4_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not * allow to send ICMP, so that packets will disappear * to blackhole. diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 28d77d454d44..2d62526406ca 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -38,7 +38,7 @@ mr_table_alloc(struct net *net, u32 id, struct mr_table *mrt; int err; - mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + mrt = kzalloc_obj(*mrt); if (!mrt) return ERR_PTR(-ENOMEM); mrt->id = id; diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 8ddac1f595ed..c1463add48c4 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -73,7 +73,7 @@ struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, if (!fc_mx) return (struct dst_metrics *)&dst_default_metrics; - fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL); + fib_metrics = kzalloc_obj(*fib_metrics); if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); @@ -88,4 +88,4 @@ struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, return fib_metrics; } -EXPORT_SYMBOL_GPL(ip_fib_metrics_init); +EXPORT_IPV6_MOD_GPL(ip_fib_metrics_init); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index fae4aa4a5f09..fecf6621f679 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -303,7 +303,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, goto free_nskb; /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) + if (nskb->len > dst4_mtu(skb_dst(nskb))) goto free_nskb; nf_ct_attach(nskb, oldskb); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 7b9d70f9b31c..c942f1282236 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -116,7 +116,7 @@ static int nh_notifier_single_info_init(struct nh_notifier_info *info, struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; - info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); + info->nh = kzalloc_obj(*info->nh); if (!info->nh) return -ENOMEM; @@ -137,8 +137,7 @@ static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; - info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), - GFP_KERNEL); + info->nh_grp = kzalloc_flex(*info->nh_grp, nh_entries, num_nh); if (!info->nh_grp) return -ENOMEM; @@ -318,8 +317,7 @@ static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; - info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), - GFP_KERNEL); + info->nh_res_bucket = kzalloc_obj(*info->nh_res_bucket); if (!info->nh_res_bucket) return -ENOMEM; @@ -535,7 +533,7 @@ static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; - nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); + nh = kzalloc_obj(struct nexthop); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); @@ -550,7 +548,7 @@ static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; - nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); + nhg = kzalloc_flex(*nhg, nh_entries, num_nh); if (nhg) nhg->num_nh = num_nh; @@ -715,9 +713,8 @@ static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, info->id = nh->id; info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; - info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, - stats, nhg->num_nh), - GFP_KERNEL); + info->nh_grp_hw_stats = kzalloc_flex(*info->nh_grp_hw_stats, stats, + nhg->num_nh); if (!info->nh_grp_hw_stats) return -ENOMEM; @@ -2005,7 +2002,8 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg) } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, - struct nl_info *nlinfo) + struct nl_info *nlinfo, + struct list_head *deferred_free) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; @@ -2065,8 +2063,8 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); - free_percpu(nhge->stats); nexthop_put(nhge->nh); + list_add(&nhge->nh_list, deferred_free); /* Removal of a NH from a resilient group is notified through * bucket notifications. @@ -2086,6 +2084,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; + LIST_HEAD(deferred_free); /* If there is nothing to do, let's avoid the costly call to * synchronize_net() @@ -2094,10 +2093,16 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, return; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) - remove_nh_grp_entry(net, nhge, nlinfo); + remove_nh_grp_entry(net, nhge, nlinfo, &deferred_free); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); + + /* Now safe to free percpu stats — all RCU readers have finished */ + list_for_each_entry_safe(nhge, tmp, &deferred_free, nh_list) { + list_del(&nhge->nh_list); + free_percpu(nhge->stats); + } } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) @@ -2897,7 +2902,7 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, if (!nh) return ERR_PTR(-ENOMEM); - nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); + nhi = kzalloc_obj(*nhi); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 5321c5801c64..71d5e17719de 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -148,7 +148,7 @@ void ping_unhash(struct sock *sk) pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); spin_lock(&ping_table.lock); if (sk_del_node_init_rcu(sk)) { - isk->inet_num = 0; + WRITE_ONCE(isk->inet_num, 0); isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } @@ -181,31 +181,35 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) } sk_for_each_rcu(sk, hslot) { + int bound_dev_if; + if (!net_eq(sock_net(sk), net)) continue; isk = inet_sk(sk); pr_debug("iterate\n"); - if (isk->inet_num != ident) + if (READ_ONCE(isk->inet_num) != ident) continue; + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (skb->protocol == htons(ETH_P_IP) && sk->sk_family == AF_INET) { + __be32 rcv_saddr = READ_ONCE(isk->inet_rcv_saddr); + pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, - (int) isk->inet_num, &isk->inet_rcv_saddr, - sk->sk_bound_dev_if); + ident, &rcv_saddr, + bound_dev_if); - if (isk->inet_rcv_saddr && - isk->inet_rcv_saddr != ip_hdr(skb)->daddr) + if (rcv_saddr && rcv_saddr != ip_hdr(skb)->daddr) continue; #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6) && sk->sk_family == AF_INET6) { pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, - (int) isk->inet_num, + ident, &sk->sk_v6_rcv_saddr, - sk->sk_bound_dev_if); + bound_dev_if); if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, @@ -216,8 +220,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) continue; } - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (bound_dev_if && bound_dev_if != dif && + bound_dev_if != sdif) continue; goto exit; @@ -286,7 +290,7 @@ void ping_close(struct sock *sk, long timeout) } EXPORT_IPV6_MOD_GPL(ping_close); -static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and @@ -301,7 +305,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, - struct sockaddr *uaddr, int addr_len) + struct sockaddr_unsized *uaddr, int addr_len) { struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { @@ -387,12 +391,14 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, return 0; } -static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) +static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr) { if (saddr->sa_family == AF_INET) { struct inet_sock *isk = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) saddr; - isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; + + isk->inet_saddr = addr->sin_addr.s_addr; + WRITE_ONCE(isk->inet_rcv_saddr, addr->sin_addr.s_addr); #if IS_ENABLED(CONFIG_IPV6) } else if (saddr->sa_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; @@ -407,7 +413,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) * Moreover, we don't allow binding to multi- and broadcast addresses. */ -int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *isk = inet_sk(sk); unsigned short snum; @@ -690,6 +696,8 @@ EXPORT_IPV6_MOD_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); @@ -697,7 +705,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; - struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; u8 scope; @@ -746,9 +753,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } @@ -828,10 +835,8 @@ out: out_free: if (free) kfree(ipc.opt); - if (!err) { - icmp_out_count(sock_net(sk), user_icmph.type); + if (!err) return len; - } return err; do_confirm: @@ -851,7 +856,8 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct sk_buff *skb; int copied, err; - pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, + READ_ONCE(isk->inet_num)); err = -EOPNOTSUPP; if (flags & MSG_OOB) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index d54ebb7df966..e20c41206e29 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -481,6 +481,8 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; @@ -491,7 +493,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be32 daddr; __be32 saddr; int uc_index, err; - struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; @@ -561,9 +562,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } @@ -697,7 +698,8 @@ static void raw_destroy(struct sock *sk) } /* This gets rid of all the nasties in af_inet. -DaveM */ -static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int raw_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b549d6a57307..463236e0dc2d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -315,7 +315,7 @@ static int rt_acct_proc_show(struct seq_file *m, void *v) struct ip_rt_acct *dst, *src; unsigned int i, j; - dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + dst = kzalloc_objs(struct ip_rt_acct, 256); if (!dst) return -ENOMEM; @@ -659,7 +659,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { - hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); + hash = kzalloc_objs(*hash, FNHE_HASH_SIZE, GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nhc->nhc_exceptions, hash); @@ -702,7 +702,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, depth--; } - fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + fnhe = kzalloc_obj(*fnhe, GFP_ATOMIC); if (!fnhe) goto out_unlock; @@ -1537,9 +1537,9 @@ void rt_add_uncached_list(struct rtable *rt) void rt_del_uncached_list(struct rtable *rt) { - if (!list_empty(&rt->dst.rt_uncached)) { - struct uncached_list *ul = rt->dst.rt_uncached_list; + struct uncached_list *ul = rt->dst.rt_uncached_list; + if (ul) { spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); @@ -1795,8 +1795,8 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ - pr_warn("martian source %pI4 from %pI4, on dev %s\n", - &daddr, &saddr, dev->name); + pr_warn("martian source (src=%pI4, dst=%pI4, dev=%s)\n", + &saddr, &daddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, @@ -2475,8 +2475,8 @@ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) - net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", - &daddr, &saddr, dev->name); + net_warn_ratelimited("martian destination (src=%pI4, dst=%pI4, dev=%s)\n", + &saddr, &daddr, dev->name); #endif goto out; @@ -3687,7 +3687,7 @@ static __net_initdata struct pernet_operations rt_genid_ops = { static int __net_init ipv4_inetpeer_init(struct net *net) { - struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); + struct inet_peer_base *bp = kmalloc_obj(*bp); if (!bp) return -ENOMEM; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 569befcf021b..fc3affd9c801 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -203,7 +203,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, bool own_req; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, - NULL, &own_req); + NULL, &own_req, NULL); if (child) { refcount_set(&req->rsk_refcnt, 1); sock_rps_save_rxhash(child, skb); @@ -378,9 +378,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcp_ts_off(net, - ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr); + union tcp_seq_and_ts_off st; + + st = secure_tcp_seq_and_ts_off(net, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); + tsoff = st.ts_off; tcp_opt.rcv_tsecr -= tsoff; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 24dbc603cc44..5654cc9c8a0b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,7 +47,9 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; -static int tcp_ecn_mode_max = 2; +static int tcp_ecn_mode_max = 5; +static u32 icmp_errors_extension_mask_all = + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -484,7 +486,8 @@ static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) proc_fib_multipath_hash_rand_seed), }; - WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new); + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.user_seed, new.user_seed); + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed, new.mp_seed); } static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write, @@ -498,7 +501,7 @@ static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write int ret; mphs = &net->ipv4.sysctl_fib_multipath_hash_seed; - user_seed = mphs->user_seed; + user_seed = READ_ONCE(mphs->user_seed); tmp = *table; tmp.data = &user_seed; @@ -675,6 +678,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = SYSCTL_ONE }, { + .procname = "icmp_errors_extension_mask", + .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &icmp_errors_extension_mask_all, + }, + { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), @@ -738,7 +750,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, + .extra2 = SYSCTL_THREE, }, { .procname = "tcp_ecn_option_beacon", @@ -1332,6 +1344,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, }, { + .procname = "tcp_rcvbuf_low_rtt", + .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, .maxlen = sizeof(u8), @@ -1441,6 +1462,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_doulongvec_minmax, }, { + .procname = "tcp_comp_sack_rtt_percent", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE_THOUSAND, + }, + { .procname = "tcp_comp_sack_slack_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, .maxlen = sizeof(unsigned long), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8a18aeca7ab0..202a4e57a218 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -243,7 +243,8 @@ #define pr_fmt(fmt) "TCP: " fmt -#include <crypto/hash.h> +#include <crypto/md5.h> +#include <crypto/utils.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> @@ -253,7 +254,6 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> -#include <linux/scatterlist.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> @@ -320,15 +320,6 @@ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_IPV6_MOD(tcp_sockets_allocated); /* - * TCP splice context - */ -struct tcp_splice_state { - struct pipe_inode_info *pipe; - size_t len; - unsigned int flags; -}; - -/* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting @@ -425,7 +416,6 @@ void tcp_md5_destruct_sock(struct sock *sk) tcp_clear_md5_list(sk); kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); } } EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); @@ -503,6 +493,9 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) struct sk_buff *skb = tcp_write_queue_tail(sk); u32 tsflags = sockc->tsflags; + if (unlikely(!skb)) + skb = skb_rb_last(&sk->tcp_rtx_queue); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -519,6 +512,19 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); } +/* @wake is one when sk_stream_write_space() calls us. + * This sends EPOLLOUT only if notsent_bytes is half the limit. + * This mimics the strategy used in sock_def_write_space(). + */ +bool tcp_stream_memory_free(const struct sock *sk, int wake) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); +} +EXPORT_SYMBOL(tcp_stream_memory_free); + static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) @@ -777,8 +783,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now, __tcp_push_pending_frames(sk, mss_now, nonagle); } -static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) +int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; @@ -904,6 +910,33 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_IPV6_MOD(tcp_splice_read); +/* We allow to exceed memory limits for FIN packets to expedite + * connection tear down and (memory) recovery. + * Otherwise tcp_send_fin() could be tempted to either delay FIN + * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() + */ +void sk_forced_mem_schedule(struct sock *sk, int size) +{ + int delta, amt; + + delta = size - sk->sk_forward_alloc; + if (delta <= 0) + return; + + amt = sk_mem_pages(delta); + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); + + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); + + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_add(sk, amt); +} + struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { @@ -928,7 +961,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + tcp_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; @@ -1044,8 +1078,8 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ - tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), - sk->sk_allocation); + tp->fastopen_req = kzalloc_obj(struct tcp_fastopen_request, + sk->sk_allocation); if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; @@ -1062,7 +1096,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, uaddr, + err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst @@ -1075,6 +1109,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, return err; } +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; +} +EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); + int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct net_devmem_dmabuf_binding *binding = NULL; @@ -1395,7 +1447,7 @@ out_err: err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { - sk->sk_write_space(sk); + READ_ONCE(sk->sk_write_space)(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } if (binding) @@ -1557,8 +1609,10 @@ void __tcp_cleanup_rbuf(struct sock *sk, int copied) time_to_ack = true; } } - if (time_to_ack) + if (time_to_ack) { + tcp_mstamp_refresh(tp); tcp_send_ack(sk); + } } void tcp_cleanup_rbuf(struct sock *sk, int copied) @@ -2586,7 +2640,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, if (err) goto out; - atomic_long_inc(&niov->pp_ref_count); + atomic_long_inc(&niov->desc.pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); sent += copy; @@ -2651,10 +2705,8 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (sk->sk_state == TCP_LISTEN) goto out; - if (tp->recvmsg_inq) { + if (tp->recvmsg_inq) *cmsg_flags = TCP_CMSG_INQ; - msg->msg_get_inq = 1; - } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. */ @@ -2928,10 +2980,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); - if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) { + if ((cmsg_flags | msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); - if (msg->msg_get_inq) { + if ((cmsg_flags & TCP_CMSG_INQ) | msg->msg_get_inq) { msg->msg_inq = tcp_inq_hint(sk); if (cmsg_flags & TCP_CMSG_INQ) put_cmsg(msg, SOL_TCP, TCP_CM_INQ, @@ -3419,6 +3471,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; + tp->pkts_acked_ewma = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -3583,9 +3636,12 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); EXPORT_IPV6_MOD(tcp_tx_delay_enabled); -static void tcp_enable_tx_delay(void) +static void tcp_enable_tx_delay(struct sock *sk, int val) { - if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + struct tcp_sock *tp = tcp_sk(sk); + s32 delta = (val - tp->tcp_tx_delay) << 3; + + if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { @@ -3593,6 +3649,22 @@ static void tcp_enable_tx_delay(void) pr_info("TCP_TX_DELAY enabled\n"); } } + /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us, + * tp->rtt_min, icsk_rto and sk->sk_pacing_rate. + * This is best effort. + */ + if (delta && sk->sk_state == TCP_ESTABLISHED) { + s64 srtt = (s64)tp->srtt_us + delta; + + tp->srtt_us = clamp_t(s64, srtt, 1, ~0U); + + /* Note: does not deal with non zero icsk_backoff */ + tcp_set_rto(sk); + + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); + + tcp_update_pacing_rate(sk); + } } /* When set indicates to always queue non-full frames. Later the user clears @@ -4110,7 +4182,7 @@ ao_parse: break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); - sk->sk_write_space(sk); + READ_ONCE(sk->sk_write_space)(sk); break; case TCP_INQ: if (val > 1 || val < 0) @@ -4119,8 +4191,12 @@ ao_parse: tp->recvmsg_inq = val; break; case TCP_TX_DELAY: - if (val) - tcp_enable_tx_delay(); + /* tp->srtt_us is u32, and is shifted by 3 */ + if (val < 0 || val >= (1U << (31 - 3))) { + err = -EINVAL; + break; + } + tcp_enable_tx_delay(sk, val); WRITE_ONCE(tp->tcp_tx_delay, val); break; default: @@ -4298,6 +4374,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + if (tcp_ecn_disabled(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED; + else if (tcp_ecn_mode_rfc3168(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168; + else if (tcp_ecn_mode_accecn(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN; + else if (tcp_ecn_mode_pending(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING; info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; @@ -4815,52 +4899,45 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, EXPORT_IPV6_MOD(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG -int tcp_md5_sigpool_id = -1; -EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id); - -int tcp_md5_alloc_sigpool(void) +void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, + unsigned int header_len) { - size_t scratch_size; - int ret; + const unsigned int head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; + const struct skb_shared_info *shi = skb_shinfo(skb); + struct sk_buff *frag_iter; + unsigned int i; - scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr); - ret = tcp_sigpool_alloc_ahash("md5", scratch_size); - if (ret >= 0) { - /* As long as any md5 sigpool was allocated, the return - * id would stay the same. Re-write the id only for the case - * when previously all MD5 keys were deleted and this call - * allocates the first MD5 key, which may return a different - * sigpool id than was used previously. - */ - WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */ - return 0; - } - return ret; -} + md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len); -void tcp_md5_release_sigpool(void) -{ - tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id)); -} + for (i = 0; i < shi->nr_frags; ++i) { + const skb_frag_t *f = &shi->frags[i]; + u32 p_off, p_len, copied; + const void *vaddr; + struct page *p; -void tcp_md5_add_sigpool(void) -{ - tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id)); + skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), + p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + md5_update(ctx, vaddr + p_off, p_len); + kunmap_local(vaddr); + } + } + + skb_walk_frags(skb, frag_iter) + tcp_md5_hash_skb_data(ctx, frag_iter, 0); } +EXPORT_IPV6_MOD(tcp_md5_hash_skb_data); -int tcp_md5_hash_key(struct tcp_sigpool *hp, - const struct tcp_md5sig_key *key) +void tcp_md5_hash_key(struct md5_ctx *ctx, + const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ - struct scatterlist sg; - - sg_init_one(&sg, key->key, keylen); - ahash_request_set_crypt(hp->req, &sg, NULL, keylen); /* We use data_race() because tcp_md5_do_add() might change * key->key under us */ - return data_race(crypto_ahash_update(hp->req)); + data_race(({ md5_update(ctx, key->key, keylen), 0; })); } EXPORT_IPV6_MOD(tcp_md5_hash_key); @@ -4871,19 +4948,16 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, int family, int l3index, const __u8 *hash_location) { /* This gets called for each TCP segment that has TCP-MD5 option. - * We have 3 drop cases: - * o No MD5 hash and one expected. - * o MD5 hash and we're not expecting one. - * o MD5 hash and its wrong. + * We have 2 drop cases: + * o An MD5 signature is present, but we're not expecting one. + * o The MD5 signature is wrong. */ const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; u8 newhash[16]; - int genhash; key = tcp_md5_do_lookup(sk, l3index, saddr, family); - - if (!key && hash_location) { + if (!key) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); trace_tcp_hash_md5_unexpected(sk, skb); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; @@ -4894,11 +4968,10 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, * IPv4-mapped case. */ if (family == AF_INET) - genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); + tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else - genhash = tp->af_specific->calc_md5_hash(newhash, key, - NULL, skb); - if (genhash || memcmp(hash_location, newhash, 16) != 0) { + tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); + if (crypto_memneq(hash_location, newhash, 16)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; @@ -5180,6 +5253,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 34b8450829d0..a97cdf3e6af4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include <crypto/hash.h> +#include <crypto/utils.h> #include <linux/inetdevice.h> #include <linux/tcp.h> @@ -227,7 +228,7 @@ static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags) { struct tcp_ao_info *ao; - ao = kzalloc(sizeof(*ao), flags); + ao = kzalloc_obj(*ao, flags); if (!ao) return NULL; INIT_HLIST_HEAD(&ao->head); @@ -922,7 +923,7 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, /* XXX: make it per-AF callback? */ tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key, (phash - (u8 *)th), sne); - if (memcmp(phash, hash_buf, maclen)) { + if (crypto_memneq(phash, hash_buf, maclen)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); atomic64_inc(&info->counters.pkt_bad); atomic64_inc(&key->pkt_bad); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a268e1595b22..813d2e498c93 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,6 +10,7 @@ #include <net/inet_common.h> #include <net/tls.h> +#include <asm/ioctls.h> void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { @@ -38,7 +39,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *tmp; int i, ret = 0; - tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); + tmp = kzalloc_obj(*tmp, __GFP_NOWARN | GFP_KERNEL); if (unlikely(!tmp)) return -ENOMEM; @@ -226,6 +227,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, int peek = flags & MSG_PEEK; struct sk_psock *psock; struct tcp_sock *tcp; + int copied_from_self = 0; int copied = 0; u32 seq; @@ -262,7 +264,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, } msg_bytes_ready: - copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. @@ -277,7 +279,7 @@ msg_bytes_ready: goto out; } } - seq += copied; + seq += copied_from_self; if (!copied) { long timeo; int data; @@ -331,6 +333,24 @@ unlock: return copied; } +static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) +{ + bool slow; + + if (cmd != SIOCINQ) + return tcp_ioctl(sk, cmd, karg); + + /* works similar as tcp_ioctl */ + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + slow = lock_sock_fast(sk); + *karg = sk_psock_msg_inq(sk); + unlock_sock_fast(sk, slow); + + return 0; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { @@ -406,8 +426,8 @@ more_data: msg->cork_bytes > msg->sg.size && !enospc) { psock->cork_bytes = msg->cork_bytes - msg->sg.size; if (!psock->cork) { - psock->cork = kzalloc(sizeof(*psock->cork), - GFP_ATOMIC | __GFP_NOWARN); + psock->cork = kzalloc_obj(*psock->cork, + GFP_ATOMIC | __GFP_NOWARN); if (!psock->cork) { sk_msg_free(sk, msg); *copied = 0; @@ -609,6 +629,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; + prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; @@ -704,7 +725,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } else { - sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_write_space, psock->saved_write_space); /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, psock->sk_proto); } diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index fbad6c35dee9..ceabfd690a29 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -378,8 +378,8 @@ static void tcp_cdg_init(struct sock *sk) ca->gradients = NULL; /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) - ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), - GFP_NOWAIT); + ca->gradients = kzalloc_objs(ca->gradients[0], window, + GFP_NOWAIT); ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index df758adbb445..e9f6c77e0631 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -16,6 +16,7 @@ #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); @@ -227,7 +228,7 @@ void tcp_assign_congestion_control(struct sock *sk) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) - INET_ECN_xmit(sk); + INET_ECN_xmit_ect_1_negotiation(sk); else INET_ECN_dontxmit(sk); } @@ -257,7 +258,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) - INET_ECN_xmit(sk); + INET_ECN_xmit_ect_1_negotiation(sk); else INET_ECN_dontxmit(sk); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index d83efd91f461..7935702e394b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -509,7 +509,7 @@ next_chunk: if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && + if (r->id.idiag_sport != htons(READ_ONCE(sk->sk_num)) && r->id.idiag_sport) goto next_normal; if (r->id.idiag_dport != sk->sk_dport && diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 7d945a527daf..9fdc19accafd 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -5,6 +5,92 @@ #include <net/tcp.h> #include <net/busy_poll.h> +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. + * + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; + + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->tfo_listener = false; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + reqsk_put(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. + */ + req->rsk_timer.expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); +} + void tcp_fastopen_init_key_once(struct net *net) { u8 key[TCP_FASTOPEN_KEY_LENGTH]; @@ -63,7 +149,7 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, struct fastopen_queue *q; int err = 0; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kmalloc_obj(*ctx); if (!ctx) { err = -ENOMEM; goto out; @@ -247,7 +333,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, bool own_req; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, - NULL, &own_req); + NULL, &own_req, NULL); if (!child) return NULL; @@ -463,8 +549,8 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) /* Alloc fastopen_req in order for FO option to be included * in SYN */ - tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), - sk->sk_allocation); + tp->fastopen_req = kzalloc_obj(*tp->fastopen_req, + sk->sk_allocation); if (tp->fastopen_req) tp->fastopen_req->cookie = cookie; else diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e4a979b75cc6..cba89733d121 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -488,6 +488,10 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, tcp_count_delivered_ce(tp, delivered); } +#define PKTS_ACKED_WEIGHT 6 +#define PKTS_ACKED_PREC 6 +#define ACK_COMP_THRESH 4 + /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, @@ -499,6 +503,7 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delta, safe_delta, d_ceb; bool opt_deltas_valid; u32 corrected_ace; + u32 ewma; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) @@ -507,6 +512,18 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, opt_deltas_valid = tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (delivered_pkts) { + if (!tp->pkts_acked_ewma) { + ewma = delivered_pkts << PKTS_ACKED_PREC; + } else { + ewma = tp->pkts_acked_ewma; + ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) + + (delivered_pkts << PKTS_ACKED_PREC)) >> + PKTS_ACKED_WEIGHT; + } + tp->pkts_acked_ewma = min_t(u32, ewma, 0xFFFFU); + } + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -555,7 +572,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (d_ceb < safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) return delta; - } + } else if (tp->pkts_acked_ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC)) + return delta; return safe_delta; } @@ -896,6 +914,7 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval) const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 rcvwin, rcvbuf, cap, oldval; + u32 rtt_threshold, rtt_us; u64 grow; oldval = tp->rcvq_space.space; @@ -908,10 +927,19 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval) /* DRS is always one RTT late. */ rcvwin = newval << 1; - /* slow start: allow the sender to double its rate. */ - grow = (u64)rcvwin * (newval - oldval); - do_div(grow, oldval); - rcvwin += grow << 1; + rtt_us = tp->rcv_rtt_est.rtt_us >> 3; + rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt); + if (rtt_us < rtt_threshold) { + /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold. + * It might take few additional ms to reach 'line rate', + * but will avoid sk_rcvbuf inflation and poor cache use. + */ + grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold); + } else { + /* slow start: allow the sender to double its rate. */ + grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval); + } + rcvwin += grow; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; @@ -937,9 +965,15 @@ void tcp_rcv_space_adjust(struct sock *sk) trace_tcp_rcv_space_adjust(sk); - tcp_mstamp_refresh(tp); + if (unlikely(!tp->rcv_rtt_est.rtt_us)) + return; + + /* We do not refresh tp->tcp_mstamp here. + * Some platforms have expensive ktime_get() implementations. + * Using the last cached value is enough for DRS. + */ time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); - if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) + if (time < (tp->rcv_rtt_est.rtt_us >> 3)) return; /* Number of bytes copied to user in last RTT */ @@ -1102,7 +1136,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -static void tcp_update_pacing_rate(struct sock *sk) +void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; @@ -1139,7 +1173,7 @@ static void tcp_update_pacing_rate(struct sock *sk) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void tcp_set_rto(struct sock *sk) +void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -1542,6 +1576,38 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, + u32 end_seq, u64 xmit_time) +{ + u32 rtt_us; + + rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); + if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior + * retransmission) was sacked. + * + * If the original is lost, there is no ambiguity. Otherwise + * we assume the original can be delayed up to aRTT + min_rtt. + * the aRTT term is bounded by the fast recovery or timeout, + * so it's at least one RTT (i.e., retransmission is at least + * an RTT later). + */ + return; + } + tp->rack.advanced = 1; + tp->rack.rtt_us = rtt_us; + if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) { + tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + } +} + /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, @@ -1621,6 +1687,160 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; } +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. + */ + +/* Update the connection delivery information and generate a rate sample. */ +static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + bool is_sack_reneg, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = tp->tcp_mstamp; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ + ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmistted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However continuously + * measuring the delivery rate during loss recovery is crucial + * for connections suffer heavy or prolonged losses. + */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; + } +} + +/* When an skb is sacked or acked, we fill in the rate sample with the (prior) + * delivery information when the skb was last transmitted. + * + * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is + * called multiple times. We favor the information from the most recently + * sent skb, i.e., the skb with the most recently sent time and the highest + * sequence. + */ +static void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs) +{ + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct tcp_sock *tp = tcp_sk(sk); + u64 tx_tstamp; + + if (!scb->tx.delivered_mstamp) + return; + + tx_tstamp = tcp_skb_timestamp_us(skb); + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ + rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being + * used again when it's cumulatively acked. For acked packets + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) + scb->tx.delivered_mstamp = 0; +} + /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ @@ -3979,6 +4199,49 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, return delivered; } +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. + * + * If a DSACK is received that seems like it may have been due to reordering + * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +static void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_STATIC_REO_WND) || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -4113,7 +4376,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_in_ack_event(sk, flag); - if (tp->tlp_high_seq) + if (unlikely(tp->tlp_high_seq)) tcp_process_tlp_ack(sk, ack, flag); if (tcp_ack_is_dubious(sk, flag)) { @@ -4163,7 +4426,7 @@ no_queue: */ tcp_ack_probe(sk); - if (tp->tlp_high_seq) + if (unlikely(tp->tlp_high_seq)) tcp_process_tlp_ack(sk, ack, flag); return 1; @@ -4595,15 +4858,24 @@ static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, */ static enum skb_drop_reason tcp_sequence(const struct sock *sk, - u32 seq, u32 end_seq) + u32 seq, u32 end_seq, + const struct tcphdr *th) { const struct tcp_sock *tp = tcp_sk(sk); + u32 seq_limit; if (before(end_seq, tp->rcv_wup)) return SKB_DROP_REASON_TCP_OLD_SEQUENCE; - if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) { - if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + seq_limit = tp->rcv_nxt + tcp_receive_window(tp); + if (unlikely(after(end_seq, seq_limit))) { + /* Some stacks are known to handle FIN incorrectly; allow the + * FIN to extend beyond the window and check it in detail later. + */ + if (!after(end_seq - th->fin, seq_limit)) + return SKB_NOT_DROPPED_YET; + + if (after(seq, seq_limit)) return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; /* Only accept this packet if receive queue is empty. */ @@ -4783,8 +5055,11 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } -static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) +static void tcp_rcv_spurious_retrans(struct sock *sk, + const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* When the ACK path fails or drops most ACKs, the sender would * timeout and spuriously retransmit the same segment repeatedly. * If it seems our ACKs are not reaching the other side, @@ -4804,6 +5079,14 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) /* Save last flowlabel after a spurious retrans. */ tcp_save_lrcv_flowlabel(sk, skb); #endif + /* Check DSACK info to detect that the previous ACK carrying the + * AccECN option was lost after the second retransmision, and then + * stop sending AccECN option in all subsequent ACKs. + */ + if (tcp_ecn_mode_accecn(tp) && + tp->accecn_opt_sent_w_dsack && + TCP_SKB_CB(skb)->seq == tp->duplicate_sack[0].start_seq) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_SEND); } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -5091,25 +5374,11 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); -/* Check if this incoming skb can be added to socket receive queues - * while satisfying sk->sk_rcvbuf limit. - * - * In theory we should use skb->truesize, but this can cause problems - * when applications use too small SO_RCVBUF values. - * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, - * allowing RWIN to be close to available space. - * Whenever the receive queue gets full, we can receive a small packet - * filling RWIN, but with a high skb->truesize, because most NIC use 4K page - * plus sk_buff metadata even when receiving less than 1500 bytes of payload. - * - * Note that we use skb->len to decide to accept or drop this packet, - * but sk->sk_rmem_alloc is the sum of all skb->truesize. - */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); - return rmem + skb->len <= sk->sk_rcvbuf; + return rmem <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, @@ -5142,7 +5411,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } @@ -5352,7 +5621,7 @@ err: void tcp_data_ready(struct sock *sk) { if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE)) - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -5408,7 +5677,7 @@ queue_and_out: inet_csk(sk)->icsk_ack.pending |= (ICSK_ACK_NOMEM | ICSK_ACK_NOW); inet_csk_schedule_ack(sk); - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; @@ -5511,25 +5780,6 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, return next; } -/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct sk_buff *skb1; - - while (*p) { - parent = *p; - skb1 = rb_to_skb(parent); - if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&skb->rbnode, parent, p); - rb_insert_color(&skb->rbnode, root); -} - /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * @@ -5850,7 +6100,9 @@ static void tcp_new_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } - INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); + INDIRECT_CALL_1(READ_ONCE(sk->sk_write_space), + sk_stream_write_space, + sk); } /* Caller made space either from: @@ -5863,16 +6115,11 @@ static void tcp_new_space(struct sock *sk) * small enough that tcp_stream_memory_free() decides it * is time to generate EPOLLOUT. */ -void tcp_check_space(struct sock *sk) +void __tcp_check_space(struct sock *sk) { - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } static inline void tcp_data_snd_check(struct sock *sk) @@ -5887,7 +6134,9 @@ static inline void tcp_data_snd_check(struct sock *sk) static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); - unsigned long rtt, delay; + struct net *net = sock_net(sk); + unsigned long rtt; + u64 delay; /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -5906,7 +6155,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * Defer the ack until tcp_release_cb(). */ if (sock_owned_by_user_nocheck(sk) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) { + READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) { set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags); return; } @@ -5921,7 +6170,7 @@ send_now: } if (!tcp_is_sack(tp) || - tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) + tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { @@ -5936,18 +6185,26 @@ send_now: if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; - /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ + /* compress ack timer : comp_sack_rtt_percent of rtt, + * but no more than tcp_comp_sack_delay_ns. + */ rtt = tp->rcv_rtt_est.rtt_us; if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; - delay = min_t(unsigned long, - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), - rtt * (NSEC_PER_USEC >> 3)/20); + /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100 + * -> + * delay = rtt * 1.25 * comp_sack_rtt_percent + */ + delay = (u64)(rtt + (rtt >> 2)) * + READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent); + + delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns)); + sock_hold(sk); hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), + READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns), HRTIMER_MODE_REL_PINNED_SOFT); } @@ -6056,7 +6313,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); } } } @@ -6119,7 +6376,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, step1: /* Step 1: check sequence number */ - reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, th); if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." @@ -6196,6 +6454,8 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); @@ -6817,7 +7077,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th, skb); + tcp_ecn_rcv_syn(sk, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -7222,7 +7482,8 @@ static void tcp_ecn_create_request(struct request_sock *req, u32 ecn_ok_dst; if (tcp_accecn_syn_requested(th) && - READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + (READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3 || + tcp_ca_needs_accecn(listen_sk))) { inet_rsk(req)->ecn_ok = 1; tcp_rsk(req)->accecn_ok = 1; tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & @@ -7335,8 +7596,7 @@ static void tcp_reqsk_record_syn(const struct sock *sk, mac_hdrlen = 0; } - saved_syn = kmalloc(struct_size(saved_syn, data, len), - GFP_ATOMIC); + saved_syn = kmalloc_flex(*saved_syn, data, len, GFP_ATOMIC); if (saved_syn) { saved_syn->mac_hdrlen = mac_hdrlen; saved_syn->network_hdrlen = skb_network_header_len(skb); @@ -7386,6 +7646,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; + union tcp_seq_and_ts_off st; struct request_sock *req; bool want_cookie = false; struct dst_entry *dst; @@ -7455,9 +7716,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!dst) goto drop_and_free; + if (tmp_opt.tstamp_ok || (!want_cookie && !isn)) + st = af_ops->init_seq_and_ts_off(net, skb); + if (tmp_opt.tstamp_ok) { tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); - tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); + tcp_rsk(req)->ts_off = st.ts_off; } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); @@ -7479,7 +7743,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb); + isn = st.seq; } tcp_ecn_create_request(req, skb, sk, dst); @@ -7520,20 +7784,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, sock_put(fastopen_sk); goto drop_and_free; } - sk->sk_data_ready(sk); + READ_ONCE(sk->sk_data_ready)(sk); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); } else { tcp_rsk(req)->tfo_listener = false; - if (!want_cookie) { - req->timeout = tcp_timeout_init((struct sock *)req); - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, - req->timeout))) { - reqsk_free(req); - dst_release(dst); - return 0; - } - + if (!want_cookie && + unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) { + reqsk_free(req); + dst_release(dst); + return 0; } af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b1fcf3e4e1ce..c7b2463c2e25 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -53,6 +53,7 @@ #include <linux/module.h> #include <linux/random.h> #include <linux/cache.h> +#include <linux/fips.h> #include <linux/jhash.h> #include <linux/init.h> #include <linux/times.h> @@ -86,14 +87,14 @@ #include <linux/btf_ids.h> #include <linux/skbuff_ref.h> -#include <crypto/hash.h> -#include <linux/scatterlist.h> +#include <crypto/md5.h> +#include <crypto/utils.h> #include <trace/events/tcp.h> #ifdef CONFIG_TCP_MD5SIG -static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - __be32 daddr, __be32 saddr, const struct tcphdr *th); +static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, const struct tcphdr *th); #endif struct inet_hashinfo tcp_hashinfo; @@ -104,17 +105,14 @@ static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { static DEFINE_MUTEX(tcp_exit_batch_mutex); -static u32 tcp_v4_init_seq(const struct sk_buff *skb) +static union tcp_seq_and_ts_off +tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcp_seq(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); -} - -static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) -{ - return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); + return secure_tcp_seq_and_ts_off(net, + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -205,7 +203,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); -static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from tcp_v4_connect() and intended to @@ -221,7 +219,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, } /* This will initiate an outgoing connection. */ -int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; @@ -326,15 +324,16 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = NULL; if (likely(!tp->repair)) { + union tcp_seq_and_ts_off st; + + st = secure_tcp_seq_and_ts_off(net, + inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port); if (!tp->write_seq) - WRITE_ONCE(tp->write_seq, - secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port)); - WRITE_ONCE(tp->tsoffset, - secure_tcp_ts_off(net, inet->inet_saddr, - inet->inet_daddr)); + WRITE_ONCE(tp->write_seq, st.seq); + WRITE_ONCE(tp->tsoffset, st.ts_off); } atomic_set(&inet->inet_id, get_random_u16()); @@ -374,7 +373,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; - u32 mtu; + u32 mtu, dmtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; @@ -386,15 +385,14 @@ void tcp_v4_mtu_reduced(struct sock *sk) /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ - if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + dmtu = dst4_mtu(dst); + if (mtu < dmtu && ip_dont_fragment(sk, dst)) WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); - mtu = dst_mtu(dst); - if (inet->pmtudisc != IP_PMTUDISC_DONT && ip_sk_accept_pmtu(sk) && - inet_csk(sk)->icsk_pmtu_cookie > mtu) { - tcp_sync_mss(sk, mtu); + inet_csk(sk)->icsk_pmtu_cookie > dmtu) { + tcp_sync_mss(sk, dmtu); /* Resend the TCP packet because it's * clear that the old packet has been @@ -754,7 +752,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, struct tcp_md5sig_key *key = NULL; unsigned char newhash[16]; struct sock *sk1 = NULL; - int genhash; #endif u64 transmit_time = 0; struct sock *ctl_sk; @@ -840,11 +837,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, if (!key) goto out; - - genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); - if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) + tcp_v4_md5_hash_skb(newhash, key, NULL, skb); + if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; - } if (key) { @@ -1358,7 +1353,7 @@ static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - md5sig = kmalloc(sizeof(*md5sig), gfp); + md5sig = kmalloc_obj(*md5sig, gfp); if (!md5sig) return -ENOMEM; @@ -1425,13 +1420,13 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { - if (tcp_md5_alloc_sigpool()) - return -ENOMEM; + if (fips_enabled) { + pr_warn_once("TCP-MD5 support is disabled due to FIPS\n"); + return -EOPNOTSUPP; + } - if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { - tcp_md5_release_sigpool(); + if (tcp_md5sig_info_add(sk, GFP_KERNEL)) return -ENOMEM; - } if (!static_branch_inc(&tcp_md5_needed.key)) { struct tcp_md5sig_info *md5sig; @@ -1439,7 +1434,6 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); - tcp_md5_release_sigpool(); return -EUSERS; } } @@ -1456,12 +1450,9 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { - tcp_md5_add_sigpool(); - if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { - tcp_md5_release_sigpool(); + if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) return -ENOMEM; - } if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { struct tcp_md5sig_info *md5sig; @@ -1470,7 +1461,6 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); - tcp_md5_release_sigpool(); return -EUSERS; } } @@ -1578,66 +1568,44 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, cmd.tcpm_key, cmd.tcpm_keylen); } -static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, - __be32 daddr, __be32 saddr, - const struct tcphdr *th, int nbytes) +static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx, + __be32 daddr, __be32 saddr, + const struct tcphdr *th, int nbytes) { - struct tcp4_pseudohdr *bp; - struct scatterlist sg; - struct tcphdr *_th; - - bp = hp->scratch; - bp->saddr = saddr; - bp->daddr = daddr; - bp->pad = 0; - bp->protocol = IPPROTO_TCP; - bp->len = cpu_to_be16(nbytes); - - _th = (struct tcphdr *)(bp + 1); - memcpy(_th, th, sizeof(*th)); - _th->check = 0; + struct { + struct tcp4_pseudohdr ip; + struct tcphdr tcp; + } h; - sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); - ahash_request_set_crypt(hp->req, &sg, NULL, - sizeof(*bp) + sizeof(*th)); - return crypto_ahash_update(hp->req); + h.ip.saddr = saddr; + h.ip.daddr = daddr; + h.ip.pad = 0; + h.ip.protocol = IPPROTO_TCP; + h.ip.len = cpu_to_be16(nbytes); + h.tcp = *th; + h.tcp.check = 0; + md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); } -static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - __be32 daddr, __be32 saddr, const struct tcphdr *th) +static noinline_for_stack void +tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, const struct tcphdr *th) { - struct tcp_sigpool hp; - - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; + struct md5_ctx ctx; -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } -int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, - const struct sock *sk, - const struct sk_buff *skb) +noinline_for_stack void +tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); - struct tcp_sigpool hp; __be32 saddr, daddr; + struct md5_ctx ctx; if (sk) { /* valid for establish/request sockets */ saddr = sk->sk_rcv_saddr; @@ -1648,30 +1616,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, daddr = iph->daddr; } - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - - if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) - goto clear_hash; - if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; - -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); + tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); @@ -1709,7 +1658,6 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { @@ -1727,8 +1675,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, - .init_seq = tcp_v4_init_seq, - .init_ts_off = tcp_v4_init_ts_off, + .init_seq_and_ts_off = tcp_v4_init_seq_and_ts_off, .send_synack = tcp_v4_send_synack, }; @@ -1756,7 +1703,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, - bool *own_req) + bool *own_req, + void (*opt_child_init)(struct sock *newsk, + const struct sock *sk)) { struct inet_request_sock *ireq; bool found_dup_sk = false; @@ -1808,9 +1757,13 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, } sk_setup_caps(newsk, dst); +#if IS_ENABLED(CONFIG_IPV6) + if (opt_child_init) + opt_child_init(newsk, sk); +#endif tcp_ca_openreq_child(newsk, dst); - tcp_sync_mss(newsk, dst_mtu(dst)); + tcp_sync_mss(newsk, dst4_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); @@ -2160,14 +2113,6 @@ no_coalesce: } EXPORT_IPV6_MOD(tcp_add_backlog); -int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) -{ - struct tcphdr *th = (struct tcphdr *)skb->data; - - return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); -} -EXPORT_IPV6_MOD(tcp_filter); - static void tcp_v4_restore_cb(struct sk_buff *skb) { memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, @@ -2919,13 +2864,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk_timeout(icsk); + timer_expires = tcp_timeout_expires(sk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk_timeout(icsk); - } else if (timer_pending(&sk->sk_timer)) { + timer_expires = tcp_timeout_expires(sk); + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; - timer_expires = sk->sk_timer.expires; + timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = jiffies; @@ -3468,20 +3413,6 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* @wake is one when sk_stream_write_space() calls us. - * This sends EPOLLOUT only if notsent_bytes is half the limit. - * This mimics the strategy used in sock_def_write_space(). - */ -bool tcp_stream_memory_free(const struct sock *sk, int wake) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = READ_ONCE(tp->write_seq) - - READ_ONCE(tp->snd_nxt); - - return (notsent_bytes << wake) < tcp_notsent_lowat(tp); -} -EXPORT_SYMBOL(tcp_stream_memory_free); - struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -3524,6 +3455,8 @@ struct proto tcp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .freeptr_offset = offsetof(struct tcp_sock, + inet_conn.icsk_inet.sk.sk_freeptr), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, @@ -3616,6 +3549,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -3643,8 +3577,9 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; - net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; + net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; + net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; net->ipv4.sysctl_tcp_backlog_ack_defer = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 52fe17167460..976b56644a8a 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -23,9 +23,9 @@ * Original Author: * Aleksandar Kuzmanovic <akuzma@northwestern.edu> * Available from: - * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * https://users.cs.northwestern.edu/~akuzma/doc/TCP-LP-ToN.pdf * Original implementation for 2.4.19: - * http://www-ece.rice.edu/networks/TCP-LP/ + * https://users.cs.northwestern.edu/~akuzma/rice/TCP-LP/linux/tcp-lp-linux.htm * * 2.6.x module Authors: * Wong Hoi Sing, Edison <hswong3i@gmail.com> @@ -113,6 +113,8 @@ static void tcp_lp_init(struct sock *sk) /** * tcp_lp_cong_avoid * @sk: socket to avoid congesting + * @ack: current ack sequence number + * @acked: number of ACKed packets * * Implementation of cong_avoid. * Will only call newReno CA when away from inference. @@ -261,6 +263,7 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) /** * tcp_lp_pkts_acked * @sk: socket requiring congestion avoidance calculations + * @sample: ACK sample containing timing and rate information * * Implementation of pkts_acked. * Deal with active drop under Early Congestion Indication. diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 45b6ecd16412..06b1d5d3b6df 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -197,7 +197,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, } tm = oldest; } else { - tm = kzalloc(sizeof(*tm), GFP_ATOMIC); + tm = kzalloc_obj(*tm, GFP_ATOMIC); if (!tm) goto out_unlock; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2ec8c6f1cdcc..dafb63b923d0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -312,7 +312,6 @@ static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; - tcp_md5_add_sigpool(); } return; out_free: @@ -338,7 +337,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -406,7 +404,6 @@ void tcp_twsk_destructor(struct sock *sk) if (twsk->tw_md5_key) { kfree(twsk->tw_md5_key); static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); } } #endif @@ -484,13 +481,18 @@ static void tcp_ecn_openreq_child(struct sock *sk, tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); tp->saw_accecn_opt = treq->saw_accecn_opt; + if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_SEND); + if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + if (inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk)) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + else + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } } @@ -716,7 +718,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; + tmp_opt.ts_recent_stamp = ktime_get_seconds() - + tcp_reqsk_timeout(req) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -750,16 +753,28 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, - &tcp_rsk(req)->last_oow_ack_time) && - - !tcp_rtx_synack(sk, req)) { - unsigned long expires = jiffies; - - expires += reqsk_timeout(req, TCP_RTO_MAX); - if (!fastopen) - mod_timer_pending(&req->rsk_timer, expires); - else - req->rsk_timer.expires = expires; + &tcp_rsk(req)->last_oow_ack_time)) { + if (tcp_rsk(req)->accecn_ok) { + u8 ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + + tcp_rsk(req)->syn_ect_rcv = ect_rcv; + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_RECV; + } + if (!tcp_rtx_synack(sk, req)) { + unsigned long expires = jiffies; + + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; + + expires += tcp_reqsk_timeout(req); + if (!fastopen) + mod_timer_pending(&req->rsk_timer, + expires); + else + req->rsk_timer.expires = expires; + } } return NULL; } @@ -910,7 +925,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, - req, &own_req); + req, &own_req, NULL); if (!child) goto listen_overflow; @@ -989,7 +1004,7 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) - parent->sk_data_ready(parent); + READ_ONCE(parent->sk_data_ready)(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2cb93da93abc..3b1fdcd3cb29 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -107,7 +107,8 @@ static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) { struct tcphdr *th = tcp_hdr(skb); - if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) + if ((skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) && + !(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) return __tcp4_gso_segment_list(skb, features); skb->ip_summed = CHECKSUM_NONE; @@ -282,33 +283,6 @@ struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) return NULL; } -struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) -{ - unsigned int thlen, hlen, off; - struct tcphdr *th; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*th); - th = skb_gro_header(skb, hlen, off); - if (unlikely(!th)) - return NULL; - - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - return NULL; - - hlen = off + thlen; - if (!skb_gro_may_pull(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - return NULL; - } - - skb_gro_pull(skb, thlen); - - return th; -} - struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th) { @@ -330,8 +304,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, goto out_check_final; th2 = tcp_hdr(p); - flush = (__force int)(flags & TCP_FLAG_CWR); - flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + flush = (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b94efb3050d2..326b58ff1118 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -40,6 +40,7 @@ #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/mptcp.h> +#include <net/smc.h> #include <net/proto_memory.h> #include <net/psp.h> @@ -65,6 +66,25 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_to_skb(parent); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { @@ -333,8 +353,11 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, return; if (tcp_ecn_mode_accecn(tp)) { - if (!tcp_accecn_ace_fail_recv(tp)) + if (!tcp_accecn_ace_fail_recv(tp) && + !tcp_accecn_ace_fail_send(tp)) INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { @@ -711,9 +734,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, if (tp) { tp->accecn_minlen = 0; tp->accecn_opt_tstamp = tp->tcp_mstamp; + tp->accecn_opt_sent_w_dsack = tp->rx_opt.dsack; if (tp->accecn_opt_demand) tp->accecn_opt_demand--; } + } else if (tp) { + tp->accecn_opt_sent_w_dsack = 0; } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -802,34 +828,36 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, mptcp_options_write(th, ptr, tp, opts); } -static void smc_set_option(const struct tcp_sock *tp, +static void smc_set_option(struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) - if (static_branch_unlikely(&tcp_have_smc)) { - if (tp->syn_smc) { - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= OPTION_SMC; - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; - } + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) { + tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option); + /* re-check syn_smc */ + if (tp->syn_smc && + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } #endif } static void smc_set_option_cond(const struct tcp_sock *tp, - const struct inet_request_sock *ireq, + struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) - if (static_branch_unlikely(&tcp_have_smc)) { - if (tp->syn_smc && ireq->smc_ok) { - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= OPTION_SMC; - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; - } + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) { + ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq); + /* re-check smc_ok */ + if (ireq->smc_ok && + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } #endif @@ -1103,7 +1131,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { + synack_type != TCP_SYNACK_RETRANS && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1183,7 +1211,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && + if (ecn_opt && tp->saw_accecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_PERSIST || + !tcp_accecn_opt_fail_send(tp)) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; @@ -1429,6 +1459,41 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +/* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +static void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include + * the full time the network needs to deliver all in-flight + * packets. If there are no packets in flight yet, then we + * know that any ACKs after now indicate that the network was + * able to deliver those packets completely in the sampling + * interval between now and the next ACK. + * + * Note that we use packets_out instead of tcp_packets_in_flight(tp) + * because the latter is a guess based on RTO and loss-marking + * heuristics. We don't want spurious RTOs or loss markings to cause + * a spuriously small time interval, causing a spuriously high + * bandwidth estimate. + */ + if (!tp->packets_out) { + u64 tstamp_us = tcp_skb_timestamp_us(skb); + + tp->first_tx_mstamp = tstamp_us; + tp->delivered_mstamp = tstamp_us; + } + + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; +} + INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); @@ -1527,7 +1592,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ skb->pfmemalloc = 0; - skb_push(skb, tcp_header_size); + __skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); @@ -3568,12 +3633,15 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback - * As AccECN uses the same SYN flags (+ AE), this check covers both - * cases. - */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) - tcp_ecn_clear_syn(sk, skb); + if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) { + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check + * covers both cases. + */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == + TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + } /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); @@ -3729,28 +3797,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk(sk)->icsk_rto, true); } -/* We allow to exceed memory limits for FIN packets to expedite - * connection tear down and (memory) recovery. - * Otherwise tcp_send_fin() could be tempted to either delay FIN - * or even be forced to close flow without any FIN. - * In general, we want to allow one skb per socket to avoid hangs - * with edge trigger epoll() - */ -void sk_forced_mem_schedule(struct sock *sk, int size) -{ - int delta, amt; - - delta = size - sk->sk_forward_alloc; - if (delta <= 0) - return; - amt = sk_mem_pages(delta); - sk_forward_alloc_add(sk, amt << PAGE_SHIFT); - sk_memory_allocated_add(sk, amt); - - if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); -} - /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. */ @@ -3910,6 +3956,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, switch (synack_type) { case TCP_SYNACK_NORMAL: + case TCP_SYNACK_RETRANS: skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: @@ -3992,7 +4039,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th); + tcp_ecn_make_synack(req, th, synack_type); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; @@ -4595,7 +4642,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_RETRANS, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c deleted file mode 100644 index a8f6d9d06f2e..000000000000 --- a/net/ipv4/tcp_rate.c +++ /dev/null @@ -1,209 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include <net/tcp.h> - -/* The bandwidth estimator estimates the rate at which the network - * can currently deliver outbound data packets for this flow. At a high - * level, it operates by taking a delivery rate sample for each ACK. - * - * A rate sample records the rate at which the network delivered packets - * for this flow, calculated over the time interval between the transmission - * of a data packet and the acknowledgment of that packet. - * - * Specifically, over the interval between each transmit and corresponding ACK, - * the estimator generates a delivery rate sample. Typically it uses the rate - * at which packets were acknowledged. However, the approach of using only the - * acknowledgment rate faces a challenge under the prevalent ACK decimation or - * compression: packets can temporarily appear to be delivered much quicker - * than the bottleneck rate. Since it is physically impossible to do that in a - * sustained fashion, when the estimator notices that the ACK rate is faster - * than the transmit rate, it uses the latter: - * - * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) - * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) - * bw = min(send_rate, ack_rate) - * - * Notice the estimator essentially estimates the goodput, not always the - * network bottleneck link rate when the sending or receiving is limited by - * other factors like applications or receiver window limits. The estimator - * deliberately avoids using the inter-packet spacing approach because that - * approach requires a large number of samples and sophisticated filtering. - * - * TCP flows can often be application-limited in request/response workloads. - * The estimator marks a bandwidth sample as application-limited if there - * was some moment during the sampled window of packets when there was no data - * ready to send in the write queue. - */ - -/* Snapshot the current delivery information in the skb, to generate - * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). - */ -void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - /* In general we need to start delivery rate samples from the - * time we received the most recent ACK, to ensure we include - * the full time the network needs to deliver all in-flight - * packets. If there are no packets in flight yet, then we - * know that any ACKs after now indicate that the network was - * able to deliver those packets completely in the sampling - * interval between now and the next ACK. - * - * Note that we use packets_out instead of tcp_packets_in_flight(tp) - * because the latter is a guess based on RTO and loss-marking - * heuristics. We don't want spurious RTOs or loss markings to cause - * a spuriously small time interval, causing a spuriously high - * bandwidth estimate. - */ - if (!tp->packets_out) { - u64 tstamp_us = tcp_skb_timestamp_us(skb); - - tp->first_tx_mstamp = tstamp_us; - tp->delivered_mstamp = tstamp_us; - } - - TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; - TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; - TCP_SKB_CB(skb)->tx.delivered = tp->delivered; - TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; - TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; -} - -/* When an skb is sacked or acked, we fill in the rate sample with the (prior) - * delivery information when the skb was last transmitted. - * - * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is - * called multiple times. We favor the information from the most recently - * sent skb, i.e., the skb with the most recently sent time and the highest - * sequence. - */ -void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u64 tx_tstamp; - - if (!scb->tx.delivered_mstamp) - return; - - tx_tstamp = tcp_skb_timestamp_us(skb); - if (!rs->prior_delivered || - tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, - scb->end_seq, rs->last_end_seq)) { - rs->prior_delivered_ce = scb->tx.delivered_ce; - rs->prior_delivered = scb->tx.delivered; - rs->prior_mstamp = scb->tx.delivered_mstamp; - rs->is_app_limited = scb->tx.is_app_limited; - rs->is_retrans = scb->sacked & TCPCB_RETRANS; - rs->last_end_seq = scb->end_seq; - - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = tx_tstamp; - /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); - - } - /* Mark off the skb delivered once it's sacked to avoid being - * used again when it's cumulatively acked. For acked packets - * we don't need to reset since it'll be freed soon. - */ - if (scb->sacked & TCPCB_SACKED_ACKED) - scb->tx.delivered_mstamp = 0; -} - -/* Update the connection delivery information and generate a rate sample. */ -void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - bool is_sack_reneg, struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 snd_us, ack_us; - - /* Clear app limited if bubble is acked and gone. */ - if (tp->app_limited && after(tp->delivered, tp->app_limited)) - tp->app_limited = 0; - - /* TODO: there are multiple places throughout tcp_ack() to get - * current time. Refactor the code using a new "tcp_acktag_state" - * to carry current time, flags, stats like "tcp_sacktag_state". - */ - if (delivered) - tp->delivered_mstamp = tp->tcp_mstamp; - - rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ - rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available or - * in recovery from loss with SACK reneging. Rate samples taken during - * a SACK reneging event may overestimate bw by including packets that - * were SACKed before the reneg. - */ - if (!rs->prior_mstamp || is_sack_reneg) { - rs->delivered = -1; - rs->interval_us = -1; - return; - } - rs->delivered = tp->delivered - rs->prior_delivered; - - rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; - /* delivered_ce occupies less than 32 bits in the skb control block */ - rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; - - /* Model sending data and receiving ACKs as separate pipeline phases - * for a window. Usually the ACK phase is longer, but with ACK - * compression the send phase can be longer. To be safe we use the - * longer phase. - */ - snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, - rs->prior_mstamp); /* ack phase */ - rs->interval_us = max(snd_us, ack_us); - - /* Record both segment send and ack receive intervals */ - rs->snd_interval_us = snd_us; - rs->rcv_interval_us = ack_us; - - /* Normally we expect interval_us >= min-rtt. - * Note that rate may still be over-estimated when a spuriously - * retransmistted skb was first (s)acked because "interval_us" - * is under-estimated (up to an RTT). However continuously - * measuring the delivery rate during loss recovery is crucial - * for connections suffer heavy or prolonged losses. - */ - if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - if (!rs->is_retrans) - pr_debug("tcp rate: %ld %d %u %u %u\n", - rs->interval_us, rs->delivered, - inet_csk(sk)->icsk_ca_state, - tp->rx_opt.sack_ok, tcp_min_rtt(tp)); - rs->interval_us = -1; - return; - } - - /* Record the last non-app-limited or the highest app-limited bw */ - if (!rs->is_app_limited || - ((u64)rs->delivered * tp->rate_interval_us >= - (u64)tp->rate_delivered * rs->interval_us)) { - tp->rate_delivered = rs->delivered; - tp->rate_interval_us = rs->interval_us; - tp->rate_app_limited = rs->is_app_limited; - } -} - -/* If a gap is detected between sends, mark the socket application-limited. */ -void tcp_rate_check_app_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (/* We have less than one packet to send. */ - tp->write_seq - tp->snd_nxt < tp->mss_cache && - /* Nothing in sending host's qdisc queues or NIC tx queue. */ - sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && - /* We are not limited by CWND. */ - tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && - /* All lost packets have been retransmitted. */ - tp->lost_out <= tp->retrans_out) - tp->app_limited = - (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; -} -EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index c52fd3254b6e..139646751073 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -111,38 +111,6 @@ bool tcp_rack_mark_lost(struct sock *sk) return !!timeout; } -/* Record the most recently (re)sent time among the (s)acked packets - * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from - * draft-cheng-tcpm-rack-00.txt - */ -void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - u64 xmit_time) -{ - u32 rtt_us; - - rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); - if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { - /* If the sacked packet was retransmitted, it's ambiguous - * whether the retransmission or the original (or the prior - * retransmission) was sacked. - * - * If the original is lost, there is no ambiguity. Otherwise - * we assume the original can be delayed up to aRTT + min_rtt. - * the aRTT term is bounded by the fast recovery or timeout, - * so it's at least one RTT (i.e., retransmission is at least - * an RTT later). - */ - return; - } - tp->rack.advanced = 1; - tp->rack.rtt_us = rtt_us; - if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, - end_seq, tp->rack.end_seq)) { - tp->rack.mstamp = xmit_time; - tp->rack.end_seq = end_seq; - } -} - /* We have waited long enough to accommodate reordering. Mark the expired * packets lost and retransmit them. */ @@ -166,49 +134,6 @@ void tcp_rack_reo_timeout(struct sock *sk) tcp_rearm_rto(sk); } -/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. - * - * If a DSACK is received that seems like it may have been due to reordering - * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded - * by srtt), since there is possibility that spurious retransmission was - * due to reordering delay longer than reo_wnd. - * - * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) - * no. of successful recoveries (accounts for full DSACK-based loss - * recovery undo). After that, reset it to default (min_rtt/4). - * - * At max, reo_wnd is incremented only once per rtt. So that the new - * DSACK on which we are reacting, is due to the spurious retx (approx) - * after the reo_wnd has been updated last time. - * - * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than - * absolute value to account for change in rtt. - */ -void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & - TCP_RACK_STATIC_REO_WND) || - !rs->prior_delivered) - return; - - /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ - if (before(rs->prior_delivered, tp->rack.last_delivered)) - tp->rack.dsack_seen = 0; - - /* Adjust the reo_wnd if update is pending */ - if (tp->rack.dsack_seen) { - tp->rack.reo_wnd_steps = min_t(u32, 0xFF, - tp->rack.reo_wnd_steps + 1); - tp->rack.dsack_seen = 0; - tp->rack.last_delivered = tp->delivered; - tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; - } else if (!tp->rack.reo_wnd_persist) { - tp->rack.reo_wnd_steps = 1; - } -} - /* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits * the next unacked packet upon receiving * a) three or more DUPACKs to start the fast recovery diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c index d8a4f192873a..10b2e5970c40 100644 --- a/net/ipv4/tcp_sigpool.c +++ b/net/ipv4/tcp_sigpool.c @@ -257,7 +257,7 @@ void tcp_sigpool_get(unsigned int id) } EXPORT_SYMBOL_GPL(tcp_sigpool_get); -int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RCU_BH) +int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(0, RCU_BH) { struct crypto_ahash *hash; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 2dd73a4e8e51..5a14a53a3c9e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/rstreason.h> static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) @@ -458,7 +459,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) struct tcp_sock *tp = tcp_sk(sk); int max_retries; - req->rsk_ops->syn_ack_timeout(req); + tcp_syn_ack_timeout(req); /* Add one more retry for fastopen. * Paired with WRITE_ONCE() in tcp_sock_set_syncnt() @@ -479,6 +480,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * it's not good to give up too easily. */ tcp_rtx_synack(sk, req); + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) @@ -510,7 +513,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, * and tp->rcv_tstamp might very well have been written recently. * rcv_delta can thus be negative. */ - rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp; + rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; @@ -697,9 +700,9 @@ void tcp_write_timer_handler(struct sock *sk) !icsk->icsk_pending) return; - if (time_after(icsk_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk_timeout(icsk)); + if (time_after(tcp_timeout_expires(sk), jiffies)) { + sk_reset_timer(sk, &sk->tcp_retransmit_timer, + tcp_timeout_expires(sk)); return; } tcp_mstamp_refresh(tcp_sk(sk)); @@ -725,12 +728,10 @@ void tcp_write_timer_handler(struct sock *sk) static void tcp_write_timer(struct timer_list *t) { - struct inet_connection_sock *icsk = - timer_container_of(icsk, t, icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; + struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer); /* Avoid locking the socket when there is no pending event. */ - if (!smp_load_acquire(&icsk->icsk_pending)) + if (!smp_load_acquire(&inet_csk(sk)->icsk_pending)) goto out; bh_lock_sock(sk); @@ -752,16 +753,15 @@ void tcp_syn_ack_timeout(const struct request_sock *req) __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } -EXPORT_IPV6_MOD(tcp_syn_ack_timeout); void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len) { - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); + sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len); } static void tcp_delete_keepalive_timer(struct sock *sk) { - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer); } void tcp_set_keepalive(struct sock *sk, int val) @@ -778,8 +778,9 @@ EXPORT_IPV6_MOD_GPL(tcp_set_keepalive); static void tcp_keepalive_timer(struct timer_list *t) { - struct sock *sk = timer_container_of(sk, t, sk_timer); - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + timer_container_of(icsk, t, icsk_keepalive_timer); + struct sock *sk = &icsk->icsk_inet.sk; struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 30dfbf73729d..b60fad393e18 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1193,7 +1193,7 @@ csum_partial: send: err = ip_send_skb(sock_net(sk), skb); - if (err) { + if (unlikely(err)) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), @@ -1269,6 +1269,8 @@ EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); @@ -1286,7 +1288,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; - struct ip_options_data opt_copy; int uc_index; if (len > 0xFFFF) @@ -1368,9 +1369,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } @@ -1786,21 +1787,39 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * using prepare_to_wait_exclusive(). */ while (nb) { - INDIRECT_CALL_1(sk->sk_data_ready, + INDIRECT_CALL_1(READ_ONCE(sk->sk_data_ready), sock_def_readable, sk); nb--; } } if (unlikely(to_drop)) { + int err_ipv4 = 0; + int err_ipv6 = 0; + for (nb = 0; to_drop != NULL; nb++) { skb = to_drop; + if (skb->protocol == htons(ETH_P_IP)) + err_ipv4++; + else + err_ipv6++; to_drop = skb->next; skb_mark_not_on_list(skb); - /* TODO: update SNMP values. */ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); } numa_drop_add(&udp_sk(sk)->drop_counters, nb); + if (err_ipv4 > 0) { + SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_MEMERRORS, + err_ipv4); + SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_INERRORS, + err_ipv4); + } + if (err_ipv6 > 0) { + SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_MEMERRORS, + err_ipv6); + SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_INERRORS, + err_ipv6); + } } atomic_sub(total_size, &udp_prod_queue->rmem_alloc); @@ -1851,6 +1870,7 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) sk_peek_offset_bwd(sk, len); if (!skb_shared(skb)) { + skb_orphan(skb); skb_attempt_defer_free(skb); return; } @@ -2159,7 +2179,8 @@ csum_copy_err: goto try_again; } -int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes @@ -2172,7 +2193,8 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_IPV6_MOD(udp_pre_connect); -static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int udp_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { int res; @@ -2265,7 +2287,6 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); - udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -2299,19 +2320,25 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) if (udp_hashed4(sk)) { spin_lock_bh(&hslot->lock); - udp_rehash4(udptable, sk, newhash4); - if (hslot2 != nhslot2) { - spin_lock(&hslot2->lock); - udp_hash4_dec(hslot2); - spin_unlock(&hslot2->lock); - - spin_lock(&nhslot2->lock); - udp_hash4_inc(nhslot2); - spin_unlock(&nhslot2->lock); + if (inet_rcv_saddr_any(sk)) { + udp_unhash4(udptable, sk); + } else { + udp_rehash4(udptable, sk, newhash4); + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); + } } spin_unlock_bh(&hslot->lock); } + + udp_sk(sk)->udp_portaddr_hash = newhash; } } EXPORT_IPV6_MOD(udp_lib_rehash); @@ -2426,7 +2453,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { + if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) && + UDP_SKB_CB(skb)->partial_cov)) { u16 pcrlen = READ_ONCE(up->pcrlen); /* @@ -3836,7 +3864,7 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent unsigned int slot_size; int i; - udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); + udptable = kmalloc_obj(*udptable); if (!udptable) goto out; @@ -3949,8 +3977,8 @@ static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, { union bpf_udp_iter_batch_item *new_batch; - new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), - flags | __GFP_NOWARN); + new_batch = kvmalloc_objs(*new_batch, new_batch_sz, + flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 0735d820e413..779a3a03762f 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -5,6 +5,7 @@ #include <net/sock.h> #include <net/udp.h> #include <net/inet_common.h> +#include <asm/ioctls.h> #include "udp_impl.h" @@ -111,12 +112,26 @@ enum { static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; +static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg) +{ + if (cmd != SIOCINQ) + return udp_ioctl(sk, cmd, karg); + + /* Since we don't hold a lock, sk_receive_queue may contain data. + * BPF might only be processing this data at the moment. We only + * care about the data in the ingress_msg here. + */ + *karg = sk_msg_first_len(sk); + return 0; +} + static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { - *prot = *base; - prot->close = sock_map_close; - prot->recvmsg = udp_bpf_recvmsg; - prot->sock_is_readable = sk_msg_is_readable; + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = udp_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; + prot->ioctl = udp_bpf_ioctl; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -143,7 +158,7 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; if (restore) { - sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_write_space, psock->saved_write_space); sock_replace_proto(sk, psock->sk_proto); return 0; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 19d0b5b09ffa..6b1654c1ad4a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -483,11 +483,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, struct sock *sk = gso_skb->sk; unsigned int sum_truesize = 0; struct sk_buff *segs, *seg; + __be16 newlen, msslen; struct udphdr *uh; unsigned int mss; bool copy_dtor; __sum16 check; - __be16 newlen; int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; @@ -514,7 +514,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) { /* Detect modified geometry and pass those to skb_segment. */ - if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) + if ((skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) && + !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_DODGY)) return __udp_gso_segment_list(gso_skb, features, is_ipv6); ret = __skb_linearize(gso_skb); @@ -555,6 +556,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return segs; } + msslen = htons(sizeof(*uh) + mss); + /* GSO partial and frag_list segmentation only requires splitting * the frame into an MSS multiple and possibly a remainder, both * cases return a GSO skb. So update the mss now. @@ -584,7 +587,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (!seg->next) break; - uh->len = newlen; + uh->len = msslen; uh->check = check; if (seg->ip_summed == CHECKSUM_PARTIAL) diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 54386e06a813..b1f667c52cb2 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -29,7 +29,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr)); if (err < 0) goto error; @@ -38,7 +38,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; - err = kernel_connect(sock, (struct sockaddr *)&udp_addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr), 0); if (err < 0) goto error; diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index 944b3cf25468..9944ed923ddf 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -753,7 +753,7 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, struct udp_tunnel_nic *utn; unsigned int i; - utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL); + utn = kzalloc_flex(*utn, entries, n_tables); if (!utn) return NULL; utn->n_tables = n_tables; @@ -761,8 +761,8 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, mutex_init(&utn->lock); for (i = 0; i < n_tables; i++) { - utn->entries[i] = kcalloc(info->tables[i].n_entries, - sizeof(*utn->entries[i]), GFP_KERNEL); + utn->entries[i] = kzalloc_objs(*utn->entries[i], + info->tables[i].n_entries); if (!utn->entries[i]) goto err_free_prev_entries; } @@ -820,7 +820,7 @@ static int udp_tunnel_nic_register(struct net_device *dev) /* Create UDP tunnel state structures */ if (info->shared) { - node = kzalloc(sizeof(*node), GFP_KERNEL); + node = kzalloc_obj(*node); if (!node) return -ENOMEM; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index d3e621a11a1a..826e9e79eb19 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -20,10 +20,9 @@ EXPORT_SYMBOL(udplite_table); /* Designate sk as UDP-Lite socket */ static int udplite_sk_init(struct sock *sk) { - udp_init_sock(sk); pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); - return 0; + return udp_init_sock(sk); } static int udplite_rcv(struct sk_buff *skb) diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index d283c59df4c1..0492f1a0b491 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -45,7 +45,7 @@ obj-$(CONFIG_IPV6_FOU) += fou6.o obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o \ - ip6_offload.o tcpv6_offload.o exthdrs_offload.o + ip6_offload.o exthdrs_offload.o obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 40e9c336f6c5..0e55f139e05d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -355,12 +355,11 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) } - idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device), - GFP_KERNEL); + idev->stats.icmpv6dev = kzalloc_obj(struct icmpv6_mib_device); if (!idev->stats.icmpv6dev) goto err_icmp; - idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), - GFP_KERNEL_ACCOUNT); + idev->stats.icmpv6msgdev = kzalloc_obj(struct icmpv6msg_mib_device, + GFP_KERNEL_ACCOUNT); if (!idev->stats.icmpv6msgdev) goto err_icmpmsg; @@ -385,7 +384,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev) return ERR_PTR(-EINVAL); - ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT); + ndev = kzalloc_obj(*ndev, GFP_KERNEL_ACCOUNT); if (!ndev) return ERR_PTR(err); @@ -1013,7 +1012,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_for_each(p, &idev->addr_list) { struct inet6_ifaddr *ifa = list_entry(p, struct inet6_ifaddr, if_list); - if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) + if (ifp_scope > ipv6_addr_src_scope(&ifa->addr)) break; } @@ -1117,7 +1116,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT); + ifa = kzalloc_obj(*ifa, gfp_flags | __GFP_ACCOUNT); if (!ifa) { err = -ENOBUFS; goto out; @@ -1324,7 +1323,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) __in6_ifa_put(ifp); } - if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) + if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) action = check_cleanup_prefix_route(ifp, &expires); list_del_rcu(&ifp->if_list); @@ -3112,12 +3111,12 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); - ipv6_del_addr(ifp); - if (!(ifp->flags & IFA_F_TEMPORARY) && (ifp->flags & IFA_F_MANAGETEMPADDR)) delete_tempaddrs(idev, ifp); + ipv6_del_addr(ifp); + addrconf_verify_rtnl(net); if (ipv6_addr_is_multicast(pfx)) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, @@ -3339,11 +3338,10 @@ static int ipv6_generate_stable_address(struct in6_addr *address, const struct inet6_dev *idev) { static DEFINE_SPINLOCK(lock); - static __u32 digest[SHA1_DIGEST_WORDS]; - static __u32 workspace[SHA1_WORKSPACE_WORDS]; + static struct sha1_ctx sha_ctx; static union { - char __data[SHA1_BLOCK_SIZE]; + u8 __data[SHA1_BLOCK_SIZE]; struct { struct in6_addr secret; __be32 prefix[2]; @@ -3368,20 +3366,26 @@ static int ipv6_generate_stable_address(struct in6_addr *address, retry: spin_lock_bh(&lock); - sha1_init_raw(digest); + sha1_init(&sha_ctx); + memset(&data, 0, sizeof(data)); - memset(workspace, 0, sizeof(workspace)); memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); data.prefix[0] = address->s6_addr32[0]; data.prefix[1] = address->s6_addr32[1]; data.secret = secret; data.dad_count = dad_count; - sha1_transform(digest, data.__data, workspace); + sha1_update(&sha_ctx, data.__data, sizeof(data)); + /* + * Note that the SHA-1 finalization is omitted here, and the digest is + * pulled directly from the internal SHA-1 state (making it incompatible + * with standard SHA-1). Unusual, but technically okay since the data + * length is fixed and is a multiple of the SHA-1 block size. + */ temp = *address; - temp.s6_addr32[2] = (__force __be32)digest[0]; - temp.s6_addr32[3] = (__force __be32)digest[1]; + temp.s6_addr32[2] = (__force __be32)sha_ctx.state.h[0]; + temp.s6_addr32[3] = (__force __be32)sha_ctx.state.h[1]; spin_unlock_bh(&lock); @@ -7393,9 +7397,8 @@ static int __net_init addrconf_init_net(struct net *net) spin_lock_init(&net->ipv6.addrconf_hash_lock); INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work); - net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE, - sizeof(struct hlist_head), - GFP_KERNEL); + net->ipv6.inet6_addr_lst = kzalloc_objs(struct hlist_head, + IN6_ADDR_HSIZE); if (!net->ipv6.inet6_addr_lst) goto err_alloc_addr; diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 567efd626ab4..f4b2618446bd 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -180,7 +180,7 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, break; } - newp = kmalloc(sizeof(*newp), GFP_KERNEL); + newp = kmalloc_obj(*newp); if (!newp) return ERR_PTR(-ENOMEM); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1b0314644e0c..4cbd45b68088 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -86,8 +86,6 @@ struct ipv6_params ipv6_defaults = { .autoconf = 1, }; -static int disable_ipv6_mod; - module_param_named(disable, disable_ipv6_mod, int, 0444); MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); @@ -97,12 +95,6 @@ MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); -bool ipv6_mod_enabled(void) -{ - return disable_ipv6_mod == 0; -} -EXPORT_SYMBOL_GPL(ipv6_mod_enabled); - static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->ipv6_pinfo_offset; @@ -224,8 +216,8 @@ lookup_protocol: inet6_set_bit(MC6_LOOP, sk); inet6_set_bit(MC6_ALL, sk); np->pmtudisc = IPV6_PMTUDISC_WANT; - inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect & - FLOWLABEL_REFLECT_ESTABLISHED); + inet6_assign_bit(REPFLOW, sk, READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & + FLOWLABEL_REFLECT_ESTABLISHED); sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); @@ -277,7 +269,7 @@ out_sk_release: goto out; } -static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +static int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; @@ -438,7 +430,7 @@ out_unlock: goto out; } -int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; const struct proto *prot; @@ -465,7 +457,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) } /* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { return inet6_bind_sk(sock->sk, uaddr, addr_len); } @@ -824,45 +816,42 @@ EXPORT_SYMBOL(inet6_unregister_protosw); int inet6_sk_rebuild_header(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p; struct dst_entry *dst; + struct flowi6 *fl6; dst = __sk_dst_check(sk, np->dst_cookie); + if (dst) + return 0; + + fl6 = &inet->cork.fl.u.ip6; + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = sk->sk_protocol; + fl6->daddr = sk->sk_v6_daddr; + fl6->saddr = np->saddr; + fl6->flowlabel = np->flow_label; + fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_mark = sk->sk_mark; + fl6->fl6_dport = inet->inet_dport; + fl6->fl6_sport = inet->inet_sport; + fl6->flowi6_uid = sk_uid(sk); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (!dst) { - struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p, final; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = sk->sk_protocol; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = np->saddr; - fl6.flowlabel = np->flow_label; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; - fl6.fl6_dport = inet->inet_dport; - fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sk_uid(sk); - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - - rcu_read_lock(); - final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), - &final); - rcu_read_unlock(); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - sk->sk_route_caps = 0; - WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); - return PTR_ERR(dst); - } + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final); + rcu_read_unlock(); - ip6_dst_store(sk, dst, false, false); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + if (IS_ERR(dst)) { + sk->sk_route_caps = 0; + WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); + return PTR_ERR(dst); } + ip6_dst_store(sk, dst, false, false); return 0; } -EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt) @@ -924,8 +913,7 @@ static int __net_init ipv6_init_mibs(struct net *net) net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); if (!net->mib.icmpv6_statistics) goto err_icmp_mib; - net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), - GFP_KERNEL); + net->mib.icmpv6msg_statistics = kzalloc_obj(struct icmpv6msg_mib); if (!net->mib.icmpv6msg_statistics) goto err_icmpmsg_mib; return 0; @@ -955,11 +943,12 @@ static int __net_init inet6_net_init(struct net *net) int err = 0; net->ipv6.sysctl.bindv6only = 0; - net->ipv6.sysctl.icmpv6_time = 1*HZ; + net->ipv6.sysctl.icmpv6_time = HZ / 10; net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0; + net->ipv6.sysctl.icmpv6_errors_extension_mask = 0; /* By default, rate limit error messages. * Except for pmtu discovery, it would break it. diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 95372e0f1d21..cb26beea4398 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -691,7 +691,7 @@ static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) goto error; } - ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); + ahp = kzalloc_obj(*ahp); if (!ahp) return -ENOMEM; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 52599584422b..67a42e01dfc3 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -279,7 +279,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, { struct ifacaddr6 *aca; - aca = kzalloc(sizeof(*aca), GFP_ATOMIC); + aca = kzalloc_obj(*aca, GFP_ATOMIC); if (!aca) return NULL; diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index df1986973430..c6a34334e657 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -133,9 +133,8 @@ static int __init calipso_cache_init(void) { u32 iter; - calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, - sizeof(struct calipso_map_cache_bkt), - GFP_KERNEL); + calipso_cache = kzalloc_objs(struct calipso_map_cache_bkt, + CALIPSO_CACHE_BUCKETS); if (!calipso_cache) return -ENOMEM; @@ -275,7 +274,7 @@ static int calipso_cache_add(const unsigned char *calipso_ptr, calipso_ptr_len = calipso_ptr[1]; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); @@ -1342,7 +1341,8 @@ static int calipso_skbuff_setattr(struct sk_buff *skb, /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ pad = ((new_end & 4) + (end & 7)) & 7; len_delta = new_end - (int)end + pad; - ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + ret_val = skb_cow(skb, + skb_headroom(skb) + (len_delta > 0 ? len_delta : 0)); if (ret_val < 0) return ret_val; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 33ebe93d80e3..c564b68a0562 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -72,12 +72,12 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; - struct in6_addr *final_p, final; - struct ipv6_txoptions *opt; - struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi6 fl6; + struct ipv6_txoptions *opt; + struct in6_addr *final_p; + struct dst_entry *dst; + struct flowi6 *fl6; int err = 0; if (inet6_test_bit(SNDFLOW, sk) && @@ -86,14 +86,15 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) if (IS_ERR(flowlabel)) return -EINVAL; } - ip6_datagram_flow_key_init(&fl6, sk); + fl6 = &inet_sk(sk)->cork.fl.u.ip6; + ip6_datagram_flow_key_init(fl6, sk); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); - final_p = fl6_update_dst(&fl6, opt, &final); + final_p = fl6_update_dst(fl6, opt, &np->final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; @@ -101,17 +102,17 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) if (fix_sk_saddr) { if (ipv6_addr_any(&np->saddr)) - np->saddr = fl6.saddr; + np->saddr = fl6->saddr; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - sk->sk_v6_rcv_saddr = fl6.saddr; + sk->sk_v6_rcv_saddr = fl6->saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } } - ip6_sk_dst_store_flow(sk, dst, &fl6); + ip6_sk_dst_store_flow(sk, dst, fl6); out: fl6_sock_release(flowlabel); @@ -138,7 +139,7 @@ void ip6_datagram_release_cb(struct sock *sk) } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); -int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, +int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; @@ -194,7 +195,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, - (struct sockaddr *) &sin, + (struct sockaddr_unsized *)&sin, sizeof(sin)); ipv4_connected: @@ -271,7 +272,7 @@ out: } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); -int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; @@ -282,7 +283,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_SYMBOL_GPL(ip6_datagram_connect); -int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 22410243ebe8..22895521a57d 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -158,8 +158,8 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, - XFRM_MODE_SKB_CB(skb)->protocol); + struct xfrm_offload *xo = xfrm_offload(skb); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, xo->proto); __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP) : htons(ETH_P_IPV6); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index a23eb8734e15..95558fd6f447 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -314,7 +314,7 @@ fail_and_free: } extlen = (skb_transport_header(skb)[1] + 1) << 3; - if (extlen > net->ipv6.sysctl.max_dst_opts_len) + if (extlen > READ_ONCE(net->ipv6.sysctl.max_dst_opts_len)) goto fail_and_free; opt->lastopt = opt->dst1 = skb_network_header_len(skb); @@ -322,7 +322,8 @@ fail_and_free: dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { + if (ip6_parse_tlv(false, skb, + READ_ONCE(net->ipv6.sysctl.max_dst_opts_cnt))) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -378,6 +379,10 @@ static int ipv6_srh_rcv(struct sk_buff *skb) hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); idev = __in6_dev_get(skb->dev); + if (!idev) { + kfree_skb(skb); + return -1; + } accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled), READ_ONCE(idev->cnf.seg6_enabled)); @@ -930,6 +935,11 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) goto drop; + /* Inconsistent Pre-allocated Trace header */ + if (trace->nodelen != + ioam6_trace_compute_nodelen(be32_to_cpu(trace->type_be32))) + goto drop; + /* Ignore if the IOAM namespace is unknown */ ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id); if (!ns) @@ -1049,11 +1059,12 @@ fail_and_free: } extlen = (skb_transport_header(skb)[1] + 1) << 3; - if (extlen > net->ipv6.sysctl.max_hbh_opts_len) + if (extlen > READ_ONCE(net->ipv6.sysctl.max_hbh_opts_len)) goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { + if (ip6_parse_tlv(true, skb, + READ_ONCE(net->ipv6.sysctl.max_hbh_opts_cnt))) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); @@ -1072,9 +1083,9 @@ fail_and_free: * for headers. */ -static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, - struct ipv6_rt_hdr *opt, - struct in6_addr **addr_p, struct in6_addr *saddr) +static u8 ipv6_push_rthdr0(struct sk_buff *skb, u8 proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p, struct in6_addr *saddr) { struct rt0_hdr *phdr, *ihdr; int hops; @@ -1093,13 +1104,13 @@ static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; - phdr->rt_hdr.nexthdr = *proto; - *proto = NEXTHDR_ROUTING; + phdr->rt_hdr.nexthdr = proto; + return NEXTHDR_ROUTING; } -static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, - struct ipv6_rt_hdr *opt, - struct in6_addr **addr_p, struct in6_addr *saddr) +static u8 ipv6_push_rthdr4(struct sk_buff *skb, u8 proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p, struct in6_addr *saddr) { struct ipv6_sr_hdr *sr_phdr, *sr_ihdr; int plen, hops; @@ -1142,58 +1153,61 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, } #endif - sr_phdr->nexthdr = *proto; - *proto = NEXTHDR_ROUTING; + sr_phdr->nexthdr = proto; + return NEXTHDR_ROUTING; } -static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, - struct ipv6_rt_hdr *opt, - struct in6_addr **addr_p, struct in6_addr *saddr) +static u8 ipv6_push_rthdr(struct sk_buff *skb, u8 proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p, struct in6_addr *saddr) { switch (opt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: - ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); + proto = ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: - ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); + proto = ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); break; default: break; } + return proto; } -static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +static u8 ipv6_push_exthdr(struct sk_buff *skb, u8 proto, u8 type, struct ipv6_opt_hdr *opt) { struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt)); memcpy(h, opt, ipv6_optlen(opt)); - h->nexthdr = *proto; - *proto = type; + h->nexthdr = proto; + return type; } -void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, - u8 *proto, - struct in6_addr **daddr, struct in6_addr *saddr) +u8 ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 proto, + struct in6_addr **daddr, struct in6_addr *saddr) { if (opt->srcrt) { - ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); + proto = ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); /* * IPV6_RTHDRDSTOPTS is ignored * unless IPV6_RTHDR is set (RFC3542). */ if (opt->dst0opt) - ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); } if (opt->hopopt) - ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); + proto = ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); + return proto; } -void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +u8 ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 proto) { if (opt->dst1opt) - ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); + proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); + return proto; } EXPORT_SYMBOL(ipv6_push_frag_opts); @@ -1334,21 +1348,21 @@ struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, EXPORT_SYMBOL_GPL(__ipv6_fixup_options); /** - * fl6_update_dst - update flowi destination address with info given + * __fl6_update_dst - update flowi destination address with info given * by srcrt option, if any. * * @fl6: flowi6 for which daddr is to be updated * @opt: struct ipv6_txoptions in which to look for srcrt opt * @orig: copy of original daddr address if modified * - * Returns NULL if no txoptions or no srcrt, otherwise returns orig + * Return: NULL if no srcrt or invalid srcrt type, otherwise returns orig * and initial value of fl6->daddr set in orig */ -struct in6_addr *fl6_update_dst(struct flowi6 *fl6, - const struct ipv6_txoptions *opt, - struct in6_addr *orig) +struct in6_addr *__fl6_update_dst(struct flowi6 *fl6, + const struct ipv6_txoptions *opt, + struct in6_addr *orig) { - if (!opt || !opt->srcrt) + if (!opt->srcrt) return NULL; *orig = fl6->daddr; @@ -1372,4 +1386,4 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, return orig; } -EXPORT_SYMBOL_GPL(fl6_update_dst); +EXPORT_SYMBOL_GPL(__fl6_update_dst); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 56c974cf75d1..813d2e9edb8b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -217,16 +217,15 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else if (dev && (dev->flags & IFF_LOOPBACK)) { res = true; } else { - struct rt6_info *rt = dst_rt6_info(dst); - int tmo = net->ipv6.sysctl.icmpv6_time; + int tmo = READ_ONCE(net->ipv6.sysctl.icmpv6_time); struct inet_peer *peer; - /* Give more bandwidth to wider prefixes. */ - if (rt->rt6i_dst.plen < 128) - tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); - res = inet_peer_xrlim_allow(peer, tmo); + if (!tmo) { + res = true; + } else { + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); + res = inet_peer_xrlim_allow(peer, tmo); + } } rcu_read_unlock(); if (!res) @@ -444,6 +443,193 @@ static int icmp6_iif(const struct sk_buff *skb) return icmp6_dev(skb)->ifindex; } +struct icmp6_ext_iio_addr6_subobj { + __be16 afi; + __be16 reserved; + struct in6_addr addr6; +}; + +static unsigned int icmp6_ext_iio_len(void) +{ + return sizeof(struct icmp_extobj_hdr) + + /* ifIndex */ + sizeof(__be32) + + /* Interface Address Sub-Object */ + sizeof(struct icmp6_ext_iio_addr6_subobj) + + /* Interface Name Sub-Object. Length must be a multiple of 4 + * bytes. + */ + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + + /* MTU */ + sizeof(__be32); +} + +static unsigned int icmp6_ext_max_len(u8 ext_objs) +{ + unsigned int ext_max_len; + + ext_max_len = sizeof(struct icmp_ext_hdr); + + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + ext_max_len += icmp6_ext_iio_len(); + + return ext_max_len; +} + +static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev) +{ + struct inet6_dev *in6_dev; + struct inet6_ifaddr *ifa; + + in6_dev = __in6_dev_get(dev); + if (!in6_dev) + return NULL; + + /* It is unclear from RFC 5837 which IP address should be chosen, but + * it makes sense to choose a global unicast address. + */ + list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED)) + continue; + if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST || + ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL) + continue; + return &ifa->addr; + } + + return NULL; +} + +static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb, + int iif) +{ + struct icmp_ext_iio_name_subobj *name_subobj; + struct icmp_extobj_hdr *objh; + struct net_device *dev; + struct in6_addr *addr6; + __be32 data; + + if (!iif) + return; + + /* Add the fields in the order specified by RFC 5837. */ + objh = skb_put(skb, sizeof(*objh)); + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); + + data = htonl(iif); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); + if (!dev) + goto out; + + addr6 = icmp6_ext_iio_addr6_find(dev); + if (addr6) { + struct icmp6_ext_iio_addr6_subobj *addr6_subobj; + + addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj)); + addr6_subobj->afi = htons(ICMP_AFI_IP6); + addr6_subobj->addr6 = *addr6; + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; + } + + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); + netdev_copy_name(dev, name_subobj->name); + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; + + data = htonl(READ_ONCE(dev->mtu)); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; + +out: + rcu_read_unlock(); + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); +} + +static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb, + u8 ext_objs, int iif) +{ + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + icmp6_ext_iio_iif_append(net, skb, iif); +} + +static struct sk_buff * +icmp6_ext_append(struct net *net, struct sk_buff *skb_in, + struct icmp6hdr *icmp6h, unsigned int room, int iif) +{ + unsigned int payload_len, ext_max_len, ext_len; + struct icmp_ext_hdr *ext_hdr; + struct sk_buff *skb; + u8 ext_objs; + int nhoff; + + switch (icmp6h->icmp6_type) { + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEED: + break; + default: + return NULL; + } + + /* Do not overwrite existing extensions. This can happen when we + * receive an ICMPv4 message with extensions from a tunnel and + * translate it to an ICMPv6 message towards an IPv6 host in the + * overlay network. + */ + if (icmp6h->icmp6_datagram_len) + return NULL; + + ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask); + if (!ext_objs) + return NULL; + + ext_max_len = icmp6_ext_max_len(ext_objs); + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) + return NULL; + + skb = skb_clone(skb_in, GFP_ATOMIC); + if (!skb) + return NULL; + + nhoff = skb_network_offset(skb); + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); + + if (!pskb_network_may_pull(skb, payload_len)) + goto free_skb; + + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) + goto free_skb; + + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) + goto free_skb; + + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); + ext_hdr->version = ICMP_EXT_VERSION_2; + + icmp6_ext_objs_append(net, skb, ext_objs, iif); + + /* Do not send an empty extension structure. */ + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; + if (ext_len == sizeof(*ext_hdr)) + goto free_skb; + + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); + /* The length of the original datagram in 64-bit words (RFC 4884). */ + icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64); + + return skb; + +free_skb: + consume_skb(skb); + return NULL; +} + /* * Send an ICMP message in response to a packet in error */ @@ -458,7 +644,9 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; bool apply_ratelimit = false; + struct sk_buff *ext_skb; struct dst_entry *dst; + unsigned int room; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; @@ -612,8 +800,13 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, msg.offset = skb_network_offset(skb); msg.type = type; - len = skb->len - msg.offset; - len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); + room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr); + ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif); + if (ext_skb) + msg.skb = ext_skb; + + len = msg.skb->len - msg.offset; + len = min_t(unsigned int, len, room); if (len < 0) { net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); @@ -635,6 +828,8 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, } out_dst_release: + if (ext_skb) + consume_skb(ext_skb); dst_release(dst); out_unlock: icmpv6_xmit_unlock(sk); @@ -762,14 +957,17 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) tmp_hdr.icmp6_type = type; memset(&fl6, 0, sizeof(fl6)); - if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) + if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & + FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = icmp6_iif(skb); + fl6.flowi6_oif = ipv6_addr_loopback(&fl6.daddr) ? + skb->dev->ifindex : + icmp6_iif(skb); fl6.fl6_icmp_type = type; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -868,6 +1066,12 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, if (reason != SKB_NOT_DROPPED_YET) goto out; + if (nexthdr == IPPROTO_RAW) { + /* Add a more specific reason later ? */ + reason = SKB_DROP_REASON_NOT_SPECIFIED; + goto out; + } + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed pmtu discovery. @@ -1171,6 +1375,10 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL + +static u32 icmpv6_errors_extension_mask_all = + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); + static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", @@ -1216,6 +1424,15 @@ static struct ctl_table ipv6_icmp_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "errors_extension_mask", + .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &icmpv6_errors_extension_mask_all, + }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) @@ -1233,6 +1450,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast; + table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask; } return table; } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 1d41b2ab4884..8991805fc3d6 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -220,7 +220,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) return err; } - ila = kzalloc(sizeof(*ila), GFP_KERNEL); + ila = kzalloc_obj(*ila); if (!ila) return -ENOMEM; @@ -509,7 +509,7 @@ int ila_xlat_nl_dump_start(struct netlink_callback *cb) struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_dump_iter *iter; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc_obj(*iter); if (!iter) return -ENOMEM; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index ea5cf3fdfdd6..11fc2f7de2fe 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -25,14 +25,14 @@ #include <net/sock_reuseport.h> struct dst_entry *inet6_csk_route_req(const struct sock *sk, + struct dst_entry *dst, struct flowi6 *fl6, const struct request_sock *req, u8 proto) { - struct inet_request_sock *ireq = inet_rsk(req); + const struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; - struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; @@ -48,25 +48,20 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); - dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); - if (IS_ERR(dst)) - return NULL; - + if (!dst) { + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + if (IS_ERR(dst)) + return NULL; + } return dst; } -static inline -struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) -{ - return __sk_dst_check(sk, cookie); -} - static struct dst_entry *inet6_csk_route_socket(struct sock *sk, struct flowi6 *fl6) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *final_p, final; + struct in6_addr *final_p; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); @@ -83,41 +78,41 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); - final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final); rcu_read_unlock(); - dst = __inet6_csk_dst_check(sk, np->dst_cookie); - if (!dst) { - dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + + if (!IS_ERR(dst)) + ip6_dst_store(sk, dst, false, false); - if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, false, false); - } return dst; } int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) { + struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6; struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi6 fl6; struct dst_entry *dst; int res; - dst = inet6_csk_route_socket(sk, &fl6); - if (IS_ERR(dst)) { - WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); - sk->sk_route_caps = 0; - kfree_skb(skb); - return PTR_ERR(dst); + dst = __sk_dst_check(sk, np->dst_cookie); + if (unlikely(!dst)) { + dst = inet6_csk_route_socket(sk, fl6); + if (IS_ERR(dst)) { + WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); + sk->sk_route_caps = 0; + kfree_skb(skb); + return PTR_ERR(dst); + } + /* Restore final destination back after routing done */ + fl6->daddr = sk->sk_v6_daddr; } rcu_read_lock(); skb_dst_set_noref(skb, dst); - /* Restore final destination back after routing done */ - fl6.daddr = sk->sk_v6_daddr; - - res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), + res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), np->tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; @@ -126,13 +121,15 @@ EXPORT_SYMBOL_GPL(inet6_csk_xmit); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) { - struct flowi6 fl6; - struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6); + struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6; + struct dst_entry *dst; + + dst = inet6_csk_route_socket(sk, fl6); if (IS_ERR(dst)) return NULL; dst->ops->update_pmtu(dst, sk, NULL, mtu, true); - dst = inet6_csk_route_socket(sk, &fl6); + dst = inet6_csk_route_socket(sk, fl6); return IS_ERR(dst) ? NULL : dst; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 5e1da088d8e1..182d38e6d6d8 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -95,7 +95,8 @@ static inline int compute_score(struct sock *sk, const struct net *net, { int score = -1; - if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && + if (net_eq(sock_net(sk), net) && + READ_ONCE(inet_sk(sk)->inet_num) == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 9553a3200081..b76f89d92e7b 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -127,7 +127,7 @@ static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } - ns = kzalloc(sizeof(*ns), GFP_KERNEL); + ns = kzalloc_obj(*ns); if (!ns) { err = -ENOMEM; goto out_unlock; @@ -245,7 +245,7 @@ static int ioam6_genl_dumpns_start(struct netlink_callback *cb) struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc_obj(*iter); if (!iter) return -ENOMEM; @@ -431,7 +431,7 @@ static int ioam6_genl_dumpsc_start(struct netlink_callback *cb) struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc_obj(*iter); if (!iter) return -ENOMEM; @@ -690,6 +690,20 @@ struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); } +#define IOAM6_MASK_SHORT_FIELDS 0xff1ffc00 +#define IOAM6_MASK_WIDE_FIELDS 0x00e00000 + +u8 ioam6_trace_compute_nodelen(u32 trace_type) +{ + u8 nodelen = hweight32(trace_type & IOAM6_MASK_SHORT_FIELDS) + * (sizeof(__be32) / 4); + + nodelen += hweight32(trace_type & IOAM6_MASK_WIDE_FIELDS) + * (sizeof(__be64) / 4); + + return nodelen; +} + static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_namespace *ns, struct ioam6_trace_hdr *trace, @@ -961,7 +975,7 @@ static int __net_init ioam6_net_init(struct net *net) struct ioam6_pernet_data *nsdata; int err = -ENOMEM; - nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL); + nsdata = kzalloc_obj(*nsdata); if (!nsdata) goto out; diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 1fe7894f14dd..b9f6d892a566 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -22,9 +22,6 @@ #include <net/ip6_route.h> #include <net/addrconf.h> -#define IOAM6_MASK_SHORT_FIELDS 0xff100000 -#define IOAM6_MASK_WIDE_FIELDS 0xe00000 - struct ioam6_lwt_encap { struct ipv6_hopopt_hdr eh; u8 pad[2]; /* 2-octet padding for 4n-alignment */ @@ -93,13 +90,8 @@ static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) trace->type.bit21 | trace->type.bit23) return false; - trace->nodelen = 0; fields = be32_to_cpu(trace->type_be32); - - trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS) - * (sizeof(__be32) / 4); - trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS) - * (sizeof(__be64) / 4); + trace->nodelen = ioam6_trace_compute_nodelen(fields); return true; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 02c16909f618..9058e71241dc 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -234,7 +234,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; - table = kzalloc(sizeof(*table), GFP_ATOMIC); + table = kzalloc_obj(*table, GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, @@ -494,7 +494,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb, unsigned int h; int err = 0; - w = kzalloc(sizeof(*w), GFP_ATOMIC); + w = kzalloc_obj(*w, GFP_ATOMIC); if (!w) return -ENOMEM; @@ -659,7 +659,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * * 1. allocate and initialize walker. */ - w = kzalloc(sizeof(*w), GFP_ATOMIC); + w = kzalloc_obj(*w, GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; @@ -731,7 +731,7 @@ void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) return; if (f6i->fib6_metrics == &dst_default_metrics) { - struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); + struct dst_metrics *p = kzalloc_obj(*p, GFP_ATOMIC); if (!p) return; @@ -1138,6 +1138,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } + if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT)) && + (iter->nh || !iter->fib6_nh->fib_nh_gw_family)) { + iter->fib6_flags &= ~RTF_ADDRCONF; + iter->fib6_flags &= ~RTF_PREFIX_RT; + } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, @@ -1370,14 +1375,14 @@ static void fib6_start_gc(struct net *net, struct fib6_info *rt) if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, - jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); + jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, - jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); + jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, @@ -2409,6 +2414,7 @@ static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; + int ip6_rt_gc_interval; unsigned long now; if (force) { @@ -2417,8 +2423,8 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } - gc_args.timeout = expires ? (int)expires : - net->ipv6.sysctl.ip6_rt_gc_interval; + ip6_rt_gc_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval); + gc_args.timeout = expires ? (int)expires : ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); @@ -2427,8 +2433,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, - round_jiffies(now - + net->ipv6.sysctl.ip6_rt_gc_interval)); + round_jiffies(now + ip6_rt_gc_interval)); else timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); @@ -2459,7 +2464,7 @@ static int __net_init fib6_net_init(struct net *net) INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); - net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); + net->ipv6.rt6_stats = kzalloc_obj(*net->ipv6.rt6_stats); if (!net->ipv6.rt6_stats) goto out_notifier; @@ -2472,8 +2477,7 @@ static int __net_init fib6_net_init(struct net *net) spin_lock_init(&net->ipv6.fib_table_hash_lock); - net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), - GFP_KERNEL); + net->ipv6.fib6_main_tbl = kzalloc_obj(*net->ipv6.fib6_main_tbl); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; @@ -2486,8 +2490,7 @@ static int __net_init fib6_net_init(struct net *net) INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), - GFP_KERNEL); + net->ipv6.fib6_local_tbl = kzalloc_obj(*net->ipv6.fib6_local_tbl); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a3ff575798dd..7c12bf75beed 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -66,8 +66,8 @@ EXPORT_SYMBOL(ipv6_flowlabel_exclusive); fl != NULL; \ fl = rcu_dereference(fl->next)) -#define for_each_sk_fl_rcu(np, sfl) \ - for (sfl = rcu_dereference(np->ipv6_fl_list); \ +#define for_each_sk_fl_rcu(sk, sfl) \ + for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) @@ -262,12 +262,11 @@ static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; - struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { @@ -283,16 +282,16 @@ EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ipv6_fl_socklist *sfl; - if (!rcu_access_pointer(np->ipv6_fl_list)) + if (!rcu_access_pointer(inet->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); - while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { - np->ipv6_fl_list = sfl->next; + inet->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); @@ -387,7 +386,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, goto done; err = -ENOMEM; - fl = kzalloc(sizeof(*fl), GFP_KERNEL); + fl = kzalloc_obj(*fl); if (!fl) goto done; @@ -470,16 +469,15 @@ done: static int mem_check(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); + struct ipv6_fl_socklist *sfl; int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) + for_each_sk_fl_rcu(sk, sfl) count++; rcu_read_unlock(); @@ -492,13 +490,15 @@ static int mem_check(struct sock *sk) return 0; } -static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, - struct ip6_flowlabel *fl) +static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) { + struct inet_sock *inet = inet_sk(sk); + spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; - sfl->next = np->ipv6_fl_list; - rcu_assign_pointer(np->ipv6_fl_list, sfl); + sfl->next = inet->ipv6_fl_list; + rcu_assign_pointer(inet->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } @@ -520,7 +520,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; @@ -559,7 +559,7 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) } spin_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; + for (sflp = &inet_sk(sk)->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) @@ -579,13 +579,12 @@ found: static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); @@ -614,7 +613,6 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; @@ -640,12 +638,12 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (!fl) return err; - sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + sfl1 = kmalloc_obj(*sfl1); if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); @@ -682,7 +680,7 @@ recheck: fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; - fl_link(np, sfl1, fl1); + fl_link(sk, sfl1, fl1); fl_free(fl); return 0; @@ -716,7 +714,7 @@ release: } } - fl_link(np, sfl1, fl); + fl_link(sk, sfl1, fl); return 0; done: fl_free(fl); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c82a75510c0e..dafcc0dcd77a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -535,6 +535,10 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (!tun_dst) return PACKET_REJECT; + /* MUST set options_len before referencing options */ + info = &tun_dst->u.tun_info; + info->options_len = sizeof(*md); + /* skb can be uncloned in __iptunnel_pull_header, so * old pkt_md is no longer valid and we need to reset * it @@ -543,7 +547,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, skb_network_header_len(skb); pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + sizeof(*ershdr)); - info = &tun_dst->u.tun_info; md = ip_tunnel_info_opts(info); md->version = ver; md2 = &md->u.md2; @@ -551,7 +554,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, ERSPAN_V2_MDSIZE); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); - info->options_len = sizeof(*md); ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); @@ -1055,7 +1057,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst) { mtu = READ_ONCE(dst_dev(dst)->mtu); - if (dst_mtu(dst) > mtu) + if (dst6_mtu(dst) > mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, @@ -1366,9 +1368,16 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; + int needed; __be16 *p; - ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + needed = t->hlen + sizeof(*ipv6h); + if (skb_headroom(skb) < needed && + pskb_expand_head(skb, HH_DATA_ALIGN(needed - skb_headroom(skb)), + 0, GFP_ATOMIC)) + return -needed; + + ipv6h = skb_push(skb, needed); ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, t->fl.u.ip6.flowlabel, true, &t->fl.u.ip6)); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 168ec07e31cc..2bcb981c91aa 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -262,7 +262,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); - pkt_len = ntohs(hdr->payload_len); + pkt_len = ipv6_payload_len(skb, hdr); /* pkt_len may be zero if Jumbo payload option is present */ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index fce91183797a..bd7f780e37a5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -19,23 +19,7 @@ #include <net/gso.h> #include "ip6_offload.h" - -/* All GRO functions are always builtin, except UDP over ipv6, which lays in - * ipv6 module, as it depends on UDPv6 lookup function, so we need special care - * when ipv6 is built as a module - */ -#if IS_BUILTIN(CONFIG_IPV6) -#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) -#else -#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) -#endif - -#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ -({ \ - unlikely(gro_recursion_inc_test(skb)) ? \ - NAPI_GRO_CB(skb)->flush |= 1, NULL : \ - INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ -}) +#include "tcpv6_offload.c" static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) { @@ -110,7 +94,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; - int proto, err; + int proto; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; @@ -120,9 +104,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, bool gso_partial; skb_reset_network_header(skb); - err = ipv6_hopopt_jumbo_remove(skb); - if (err) - return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; @@ -298,9 +279,19 @@ not_same_flow: skb_gro_postpull_rcsum(skb, iph, nlen); - pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, - ops->callbacks.gro_receive, head, skb); + if (unlikely(gro_recursion_inc_test(skb))) { + flush = 1; + goto out; + } + if (likely(proto == IPPROTO_TCP)) + pp = tcp6_gro_receive(head, skb); +#if IS_BUILTIN(CONFIG_IPV6) + else if (likely(proto == IPPROTO_UDP)) + pp = udp6_gro_receive(head, skb); +#endif + else + pp = ops->callbacks.gro_receive(head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -342,48 +333,28 @@ INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) const struct net_offload *ops; struct ipv6hdr *iph; int err = -ENOSYS; - u32 payload_len; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); } - payload_len = skb->len - nhoff - sizeof(*iph); - if (unlikely(payload_len > IPV6_MAXPLEN)) { - struct hop_jumbo_hdr *hop_jumbo; - int hoplen = sizeof(*hop_jumbo); - - /* Move network header left */ - memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb), - skb->transport_header - skb->mac_header); - skb->data -= hoplen; - skb->len += hoplen; - skb->mac_header -= hoplen; - skb->network_header -= hoplen; - iph = (struct ipv6hdr *)(skb->data + nhoff); - hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1); - - /* Build hop-by-hop options */ - hop_jumbo->nexthdr = iph->nexthdr; - hop_jumbo->hdrlen = 0; - hop_jumbo->tlv_type = IPV6_TLV_JUMBO; - hop_jumbo->tlv_len = 4; - hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen); - - iph->nexthdr = NEXTHDR_HOP; - iph->payload_len = 0; - } else { - iph = (struct ipv6hdr *)(skb->data + nhoff); - iph->payload_len = htons(payload_len); - } + iph = (struct ipv6hdr *)(skb->data + nhoff); + ipv6_set_payload_len(iph, skb->len - nhoff - sizeof(*iph)); nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); + + if (likely(ops == &net_hotdata.tcpv6_offload)) + return tcp6_gro_complete(skb, nhoff); +#if IS_BUILTIN(CONFIG_IPV6) + if (ops == &net_hotdata.udpv6_offload) + return udp6_gro_complete(skb, nhoff); +#endif + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; - err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, - udp6_gro_complete, skb, nhoff); + err = ops->callbacks.gro_complete(skb, nhoff); out: return err; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f904739e99b9..8e2a6b28cea7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -80,7 +80,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * hdr = ipv6_hdr(skb); daddr = &hdr->daddr; - if (ipv6_addr_is_multicast(daddr)) { + if (unlikely(ipv6_addr_is_multicast(daddr))) { if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || @@ -179,8 +179,7 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, static int ip6_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { - if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && - !skb_gso_validate_network_len(skb, mtu)) + if (unlikely(!skb_gso_validate_network_len(skb, mtu))) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); return ip6_finish_output2(net, sk, skb); @@ -202,8 +201,8 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff if (skb_is_gso(skb)) return ip6_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || - (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) + if (unlikely(skb->len > mtu || + (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))) return ip6_fragment(net, sk, skb, ip6_finish_output2); return ip6_finish_output2(net, sk, skb); @@ -273,8 +272,6 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct inet6_dev *idev = ip6_dst_idev(dst); - struct hop_jumbo_hdr *hop_jumbo; - int hoplen = sizeof(*hop_jumbo); struct net *net = sock_net(sk); unsigned int head_room; struct net_device *dev; @@ -287,7 +284,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, rcu_read_lock(); dev = dst_dev_rcu(dst); - head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; @@ -301,32 +298,22 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, } } - if (opt) { + if (unlikely(opt)) { seg_len += opt->opt_nflen + opt->opt_flen; if (opt->opt_flen) - ipv6_push_frag_opts(skb, opt, &proto); + proto = ipv6_push_frag_opts(skb, opt, proto); if (opt->opt_nflen) - ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, - &fl6->saddr); + proto = ipv6_push_nfrag_opts(skb, opt, proto, + &first_hop, + &fl6->saddr); } - if (unlikely(seg_len > IPV6_MAXPLEN)) { - hop_jumbo = skb_push(skb, hoplen); - - hop_jumbo->nexthdr = proto; - hop_jumbo->hdrlen = 0; - hop_jumbo->tlv_type = IPV6_TLV_JUMBO; - hop_jumbo->tlv_len = 4; - hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); - - proto = IPPROTO_HOPOPTS; + if (unlikely(seg_len > IPV6_MAXPLEN)) seg_len = 0; - IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; - } - skb_push(skb, sizeof(struct ipv6hdr)); + __skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); @@ -352,8 +339,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, skb->priority = priority; skb->mark = mark; - mtu = dst_mtu(dst); - if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { + mtu = dst6_mtu(dst); + if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); /* if egress device is enslaved to an L3 master device pass the @@ -382,7 +369,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); unlock: rcu_read_unlock(); return ret; @@ -653,7 +640,7 @@ int ip6_forward(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (ip6_pkt_too_big(skb, mtu)) { + if (unlikely(ip6_pkt_too_big(skb, mtu))) { /* Again, force OUTPUT device used as source address */ skb->dev = dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -1352,12 +1339,13 @@ static void ip6_append_data_mtu(unsigned int *mtu, } static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, - struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, + struct ipcm6_cookie *ipc6, struct rt6_info *rt) { + struct ipv6_txoptions *nopt, *opt = ipc6->opt; + struct inet6_cork *v6_cork = &cork->base6; struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu, frag_size; - struct ipv6_txoptions *nopt, *opt = ipc6->opt; /* callers pass dst together with a reference, set it first so * ip6_cork_release() can put it down even in case of an error. @@ -1367,11 +1355,11 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, /* * setup for corking */ - if (opt) { + if (unlikely(opt)) { if (WARN_ON(v6_cork->opt)) return -EINVAL; - nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); + nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation); if (unlikely(!nopt)) return -ENOBUFS; @@ -1402,10 +1390,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? - READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst); else mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? - READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); + READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst)); frag_size = READ_ONCE(np->frag_size); if (frag_size && frag_size < mtu) @@ -1430,17 +1418,17 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, static int __ip6_append_data(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork_full, - struct inet6_cork *v6_cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, unsigned int flags) { - struct sk_buff *skb, *skb_prev = NULL; + unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct inet6_cork *v6_cork = &cork_full->base6; struct inet_cork *cork = &cork_full->base; struct flowi6 *fl6 = &cork_full->fl.u.ip6; - unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct sk_buff *skb, *skb_prev = NULL; struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; @@ -1843,7 +1831,6 @@ int ip6_append_data(struct sock *sk, struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); int exthdrlen; int err; @@ -1854,7 +1841,7 @@ int ip6_append_data(struct sock *sk, * setup for corking */ dst_hold(&rt->dst); - err = ip6_setup_cork(sk, &inet->cork, &np->cork, + err = ip6_setup_cork(sk, &inet->cork, ipc6, rt); if (err) return err; @@ -1868,7 +1855,7 @@ int ip6_append_data(struct sock *sk, } return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, - &np->cork, sk_page_frag(sk), getfrag, + sk_page_frag(sk), getfrag, from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1881,10 +1868,11 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) skb_dst_set(skb, dst); } -static void ip6_cork_release(struct inet_cork_full *cork, - struct inet6_cork *v6_cork) +static void ip6_cork_release(struct inet_cork_full *cork) { - if (v6_cork->opt) { + struct inet6_cork *v6_cork = &cork->base6; + + if (unlikely(v6_cork->opt)) { struct ipv6_txoptions *opt = v6_cork->opt; kfree(opt->dst0opt); @@ -1903,15 +1891,14 @@ static void ip6_cork_release(struct inet_cork_full *cork, struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, - struct inet_cork_full *cork, - struct inet6_cork *v6_cork) + struct inet_cork_full *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct in6_addr *final_dst; struct net *net = sock_net(sk); struct ipv6hdr *hdr; - struct ipv6_txoptions *opt = v6_cork->opt; + struct ipv6_txoptions *opt; struct rt6_info *rt = dst_rt6_info(cork->base.dst); struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; @@ -1940,19 +1927,22 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, __skb_pull(skb, skb_network_header_len(skb)); final_dst = &fl6->daddr; - if (opt && opt->opt_flen) - ipv6_push_frag_opts(skb, opt, &proto); - if (opt && opt->opt_nflen) - ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); - + opt = cork->base6.opt; + if (unlikely(opt)) { + if (opt->opt_flen) + proto = ipv6_push_frag_opts(skb, opt, proto); + if (opt->opt_nflen) + proto = ipv6_push_nfrag_opts(skb, opt, proto, + &final_dst, &fl6->saddr); + } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, v6_cork->tclass, + ip6_flow_hdr(hdr, cork->base6.tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, ip6_autoflowlabel(net, sk), fl6)); - hdr->hop_limit = v6_cork->hop_limit; + hdr->hop_limit = cork->base6.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; @@ -1966,7 +1956,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_cork_steal_dst(skb, cork); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); - if (proto == IPPROTO_ICMPV6) { + if (unlikely(proto == IPPROTO_ICMPV6)) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; @@ -1979,7 +1969,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } - ip6_cork_release(cork, v6_cork); + ip6_cork_release(cork); out: return skb; } @@ -2018,8 +2008,7 @@ EXPORT_SYMBOL_GPL(ip6_push_pending_frames); static void __ip6_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, - struct inet_cork_full *cork, - struct inet6_cork *v6_cork) + struct inet_cork_full *cork) { struct sk_buff *skb; @@ -2030,13 +2019,13 @@ static void __ip6_flush_pending_frames(struct sock *sk, kfree_skb(skb); } - ip6_cork_release(cork, v6_cork); + ip6_cork_release(cork); } void ip6_flush_pending_frames(struct sock *sk) { __ip6_flush_pending_frames(sk, &sk->sk_write_queue, - &inet_sk(sk)->cork, &inet6_sk(sk)->cork); + &inet_sk(sk)->cork); } EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); @@ -2047,9 +2036,8 @@ struct sk_buff *ip6_make_skb(struct sock *sk, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork) { - struct inet6_cork v6_cork; - struct sk_buff_head queue; int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); + struct sk_buff_head queue; int err; if (flags & MSG_PROBE) { @@ -2062,21 +2050,21 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork->base.flags = 0; cork->base.addr = 0; cork->base.opt = NULL; - v6_cork.opt = NULL; - err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt); + cork->base6.opt = NULL; + err = ip6_setup_cork(sk, cork, ipc6, rt); if (err) { - ip6_cork_release(cork, &v6_cork); + ip6_cork_release(cork); return ERR_PTR(err); } - err = __ip6_append_data(sk, &queue, cork, &v6_cork, + err = __ip6_append_data(sk, &queue, cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, flags); if (err) { - __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); + __ip6_flush_pending_frames(sk, &queue, cork); return ERR_PTR(err); } - return __ip6_make_skb(sk, &queue, cork, &v6_cork); + return __ip6_make_skb(sk, &queue, cork); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 6405072050e0..4c29aa94e86e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -638,7 +638,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { - if (rel_info > dst_mtu(skb_dst(skb2))) + if (rel_info > dst6_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); @@ -844,7 +844,7 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_reset_network_header(skb); - if (!pskb_inet_may_pull(skb)) { + if (skb_vlan_inet_prepare(skb, true)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; @@ -1187,7 +1187,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; + mtu = dst6_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1265,7 +1265,7 @@ route_lookup: if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); - ipv6_push_frag_opts(skb, &opt.ops, &proto); + proto = ipv6_push_frag_opts(skb, &opt.ops, proto); } skb_push(skb, sizeof(struct ipv6hdr)); @@ -1828,6 +1828,32 @@ int ip6_tnl_encap_setup(struct ip6_tnl *t, } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); +static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip6_tnl *t = netdev_priv(ctx->dev); + struct flowi6 fl6 = { + .daddr = t->parms.raddr, + }; + struct dst_entry *dst; + int err; + + dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6); + if (!dst->error) { + path->type = DEV_PATH_TUN; + path->tun.src_v6 = t->parms.laddr; + path->tun.dst_v6 = t->parms.raddr; + path->tun.l3_proto = IPPROTO_IPV6; + path->dev = ctx->dev; + ctx->dev = dst->dev; + } + + err = dst->error; + dst_release(dst); + + return err; +} + static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, @@ -1836,6 +1862,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, + .ndo_fill_forward_path = ip6_tnl_fill_forward_path, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 0ff547a4bff7..cef3e0210744 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -40,7 +40,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; @@ -52,7 +52,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->peer_udp_port; err = kernel_connect(sock, - (struct sockaddr *)&udp6_addr, + (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a61e742794f9..02c4cab60c69 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -66,7 +66,7 @@ int ip6_ra_control(struct sock *sk, int sel) if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; - new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + new_ra = (sel >= 0) ? kmalloc_obj(*new_ra) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; @@ -1184,7 +1184,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) - val = dst_mtu(dst); + val = dst6_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; @@ -1283,7 +1283,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) - mtuinfo.ip6m_mtu = dst_mtu(dst); + mtuinfo.ip6m_mtu = dst6_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 016b572e7d6f..3330adcf26db 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -740,7 +740,7 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ - pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); + pmc = kzalloc_obj(*pmc); if (!pmc) return; @@ -868,7 +868,7 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, mc_assert_locked(idev); - mc = kzalloc(sizeof(*mc), GFP_KERNEL); + mc = kzalloc_obj(*mc); if (!mc) return NULL; @@ -2415,7 +2415,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, psf_prev = psf; } if (!psf) { - psf = kzalloc(sizeof(*psf), GFP_KERNEL); + psf = kzalloc_obj(*psf); if (!psf) return -ENOBUFS; @@ -2500,7 +2500,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) &psf->sf_addr)) break; if (!dpsf) { - dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL); + dpsf = kmalloc_obj(*dpsf); if (!dpsf) continue; *dpsf = *psf; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f427e41e9c49..f6a5d8c73af9 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1449,7 +1449,7 @@ skip_defrtr: BASE_REACHABLE_TIME, rtime); NEIGH_VAR_SET(in6_dev->nd_parms, GC_STALETIME, 3 * rtime); - in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + neigh_set_reach_time(in6_dev->nd_parms); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } @@ -1555,8 +1555,8 @@ skip_routeinfo: memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); - if (in6_dev->ra_mtu != mtu) { - in6_dev->ra_mtu = mtu; + if (READ_ONCE(in6_dev->ra_mtu) != mtu) { + WRITE_ONCE(in6_dev->ra_mtu, mtu); send_ifinfo_notify = true; } @@ -1948,9 +1948,9 @@ int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buf ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { - if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) - idev->nd_parms->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); + if (ctl->data == NEIGH_VAR_PTR(idev->nd_parms, BASE_REACHABLE_TIME)) + neigh_set_reach_time(idev->nd_parms); + WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 1c9b283a4132..cba1684a3f30 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -125,12 +125,7 @@ EXPORT_SYMBOL(ip6_dst_hoplimit); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; - ipv6_hdr(skb)->payload_len = htons(len); + ipv6_set_payload_len(ipv6_hdr(skb), skb->len - sizeof(struct ipv6hdr)); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); /* if egress device is enslaved to an L3 master device pass the diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index d7a2cdaa2631..e4afc651731a 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -45,7 +45,7 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, return 0; } -static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int ping_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip6_datagram_connect() and diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e369f54844dd..27a268059168 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -90,23 +90,24 @@ EXPORT_SYMBOL_GPL(raw_v6_match); * 0 - deliver * 1 - block */ -static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) +static int icmpv6_filter(const struct sock *sk, struct sk_buff *skb) { - struct icmp6hdr _hdr; const struct icmp6hdr *hdr; + const __u32 *data; + unsigned int type; /* We require only the four bytes of the ICMPv6 header, not any * additional bytes of message body in "struct icmp6hdr". */ - hdr = skb_header_pointer(skb, skb_transport_offset(skb), - ICMPV6_HDRLEN, &_hdr); - if (hdr) { - const __u32 *data = &raw6_sk(sk)->filter.data[0]; - unsigned int type = hdr->icmp6_type; + if (!pskb_may_pull(skb, ICMPV6_HDRLEN)) + return 1; - return (data[type >> 5] & (1U << (type & 31))) != 0; - } - return 1; + hdr = (struct icmp6hdr *)skb->data; + type = hdr->icmp6_type; + + data = &raw6_sk(sk)->filter.data[0]; + + return (data[type >> 5] & (1U << (type & 31))) != 0; } #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -141,15 +142,13 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); - const struct in6_addr *saddr; - const struct in6_addr *daddr; + const struct ipv6hdr *ip6h; struct hlist_head *hlist; - struct sock *sk; bool delivered = false; + struct sock *sk; __u8 hash; - saddr = &ipv6_hdr(skb)->saddr; - daddr = saddr + 1; + ip6h = ipv6_hdr(skb); hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; @@ -157,7 +156,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) sk_for_each_rcu(sk, hlist) { int filtered; - if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, + if (!raw_v6_match(net, sk, nexthdr, &ip6h->daddr, &ip6h->saddr, inet6_iif(skb), inet6_sdif(skb))) continue; @@ -171,6 +170,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); + ip6h = ipv6_hdr(skb); break; #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -214,7 +214,8 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) } /* This cleans up af_inet6 a bit. -DaveM */ -static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int rawv6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -528,7 +529,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; - opt = inet6_sk(sk)->cork.opt; + opt = inet_sk(sk)->cork.base6.opt; total_len -= opt ? opt->opt_flen : 0; if (offset >= total_len - 1) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aee6a10b112a..08cd86f49bf9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -148,9 +148,9 @@ void rt6_uncached_list_add(struct rt6_info *rt) void rt6_uncached_list_del(struct rt6_info *rt) { - if (!list_empty(&rt->dst.rt_uncached)) { - struct uncached_list *ul = rt->dst.rt_uncached_list; + struct uncached_list *ul = rt->dst.rt_uncached_list; + if (ul) { spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); @@ -684,14 +684,14 @@ static void rt6_probe(struct fib6_nh *fib6_nh) time_after(jiffies, neigh->updated + READ_ONCE(idev->cnf.rtr_probe_interval))) { - work = kmalloc(sizeof(*work), GFP_ATOMIC); + work = kmalloc_obj(*work, GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); } write_unlock_bh(&neigh->lock); } else if (time_after(jiffies, last_probe + READ_ONCE(idev->cnf.rtr_probe_interval))) { - work = kmalloc(sizeof(*work), GFP_ATOMIC); + work = kmalloc_obj(*work, GFP_ATOMIC); } if (!work || cmpxchg(&fib6_nh->last_probe, @@ -1063,7 +1063,8 @@ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) - dev = l3mdev_master_dev_rcu(dev); + dev = l3mdev_master_dev_rcu(dev) ? : + dev_net(dev)->loopback_dev; else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which @@ -1470,7 +1471,18 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); - BUG_ON(prev); + if (unlikely(prev)) { + /* + * Another task on this CPU already installed a pcpu_rt. + * This can happen on PREEMPT_RT where preemption is possible. + * Free our allocation and return the existing one. + */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RT)); + + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + return prev; + } if (res->f6i->fib6_destroying) { struct fib6_info *from; @@ -1712,8 +1724,8 @@ static int rt6_insert_exception(struct rt6_info *nrt, bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { - bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), - GFP_ATOMIC); + bucket = kzalloc_objs(*bucket, FIB6_EXCEPTION_BUCKET_SIZE, + GFP_ATOMIC); if (!bucket) { err = -ENOMEM; goto out; @@ -1748,7 +1760,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, if (rt6_ex) rt6_remove_exception(bucket, rt6_ex); - rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); + rt6_ex = kzalloc_obj(*rt6_ex, GFP_ATOMIC); if (!rt6_ex) { err = -ENOMEM; goto out; @@ -2038,6 +2050,8 @@ unlock: static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { + u32 dmtu = dst6_mtu(&rt->dst); + /* If the new MTU is lower than the route PMTU, this new MTU will be the * lowest MTU in the path: always allow updating the route PMTU to * reflect PMTU decreases. @@ -2048,10 +2062,10 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, * handle this. */ - if (dst_mtu(&rt->dst) >= mtu) + if (dmtu >= mtu) return true; - if (dst_mtu(&rt->dst) == idev->cnf.mtu6) + if (dmtu == idev->cnf.mtu6) return true; return false; @@ -2884,7 +2898,7 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; - rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); + rt6_update_expires(rt, READ_ONCE(net->ipv6.sysctl.ip6_rt_mtu_expires)); } static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) @@ -2921,7 +2935,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (mtu < IPV6_MIN_MTU) return; - if (mtu >= dst_mtu(dst)) + if (mtu >= dst6_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { @@ -3237,7 +3251,7 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - unsigned int mtu = dst_mtu(dst); + unsigned int mtu = dst6_mtu(dst); struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); @@ -3245,8 +3259,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) rcu_read_lock(); net = dst_dev_net_rcu(dst); - if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) - mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + mtu = max_t(unsigned int, mtu, + READ_ONCE(net->ipv6.sysctl.ip6_rt_min_advmss)); rcu_read_unlock(); @@ -3348,10 +3362,10 @@ out: static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); - int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; - int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; - unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + int rt_min_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_min_interval); + int rt_elasticity = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_elasticity); + int rt_gc_timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_timeout); + unsigned long rt_last_gc = READ_ONCE(net->ipv6.ip6_rt_last_gc); unsigned int val; int entries; @@ -3408,11 +3422,8 @@ static int ip6_route_check_nh_onlink(struct net *net, err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); if (!err && !(res.fib6_flags & RTF_REJECT) && - /* ignore match if it is the default route */ - !ipv6_addr_any(&res.f6i->fib6_dst.addr) && - (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway or device mismatch"); + res.fib6_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); err = -EINVAL; } @@ -3572,7 +3583,6 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; - int addr_type; int err; fib6_nh->fib_nh_family = AF_INET6; @@ -3614,11 +3624,10 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, fib6_nh->fib_nh_weight = 1; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes + /* Reset the nexthop device to the loopback device in case of reject + * routes. */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { + if (cfg->fc_flags & RTF_REJECT) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { @@ -4997,7 +5006,7 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event) }; struct net *net = dev_net(dev); - if (net->ipv6.sysctl.skip_notify_on_dev_down) + if (READ_ONCE(net->ipv6.sysctl.skip_notify_on_dev_down)) fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); else fib6_clean_all(net, fib6_ifdown, &arg); @@ -5321,7 +5330,7 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, return -EEXIST; } - nh = kzalloc(sizeof(*nh), GFP_KERNEL); + nh = kzalloc_obj(*nh); if (!nh) return -ENOMEM; @@ -6397,6 +6406,7 @@ errout: void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed) { + u8 fib_notify_on_flag_change; struct sk_buff *skb; int err; @@ -6408,8 +6418,9 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, WRITE_ONCE(f6i->offload, offload); WRITE_ONCE(f6i->trap, trap); + fib_notify_on_flag_change = READ_ONCE(net->ipv6.sysctl.fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ - if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && + if (fib_notify_on_flag_change == 2 && READ_ONCE(f6i->offload_failed) == offload_failed) return; @@ -6421,7 +6432,7 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, */ return; - if (!net->ipv6.sysctl.fib_notify_on_flag_change) + if (!fib_notify_on_flag_change) return; skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); @@ -6518,7 +6529,7 @@ static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, return ret; net = (struct net *)ctl->extra1; - delay = net->ipv6.sysctl.flush_delay; + delay = READ_ONCE(net->ipv6.sysctl.flush_delay); fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } @@ -6766,7 +6777,7 @@ static struct pernet_operations ip6_route_net_ops = { static int __net_init ipv6_inetpeer_init(struct net *net) { - struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); + struct inet_peer_base *bp = kmalloc_obj(*bp); if (!bp) return -ENOMEM; diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index a5c4c629b788..1c3ad25700c4 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -202,7 +202,7 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) secret = (char *)nla_data(info->attrs[SEG6_ATTR_SECRET]); - hinfo = kzalloc(sizeof(*hinfo), GFP_KERNEL); + hinfo = kzalloc_obj(*hinfo); if (!hinfo) { err = -ENOMEM; goto out_unlock; @@ -339,7 +339,7 @@ static int seg6_genl_dumphmac_start(struct netlink_callback *cb) iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc_obj(*iter); if (!iter) return -ENOMEM; @@ -421,13 +421,13 @@ static int __net_init seg6_net_init(struct net *net) { struct seg6_pernet_data *sdata; - sdata = kzalloc(sizeof(*sdata), GFP_KERNEL); + sdata = kzalloc_obj(*sdata); if (!sdata) return -ENOMEM; mutex_init(&sdata->lock); - sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL); + sdata->tun_src = kzalloc_obj(*sdata->tun_src); if (!sdata->tun_src) { kfree(sdata); return -ENOMEM; diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index ee6bac0160ac..e6964c6b0d38 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -184,6 +184,8 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) int require_hmac; idev = __in6_dev_get(skb->dev); + if (!idev) + return false; srh = (struct ipv6_sr_hdr *)skb_transport_header(skb); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index cf37ad9686e6..6a7b8abb0477 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -323,7 +323,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? - kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : + kzalloc_objs(*kp, cmax, GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; ca = min(t->prl_count, cmax); @@ -334,8 +334,8 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u * For root users, retry allocating enough memory for * the answer. */ - kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | - __GFP_NOWARN); + kp = kzalloc_objs(*kp, ca, + GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; @@ -394,7 +394,7 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) goto out; } - p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL); + p = kzalloc_obj(struct ip_tunnel_prl_entry); if (!p) { err = -ENOBUFS; goto out; @@ -962,7 +962,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (df) { - mtu = dst_mtu(&rt->dst) - t_hlen; + mtu = dst4_mtu(&rt->dst) - t_hlen; if (mtu < IPV4_MIN_MTU) { DEV_STATS_INC(dev, collisions); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7e007f013ec8..4f6f0d751d6c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -151,9 +151,14 @@ static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcpv6_ts_off(net, - ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32); + union tcp_seq_and_ts_off st; + + st = secure_tcpv6_seq_and_ts_off(net, + ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); + tsoff = st.ts_off; tcp_opt.rcv_tsecr -= tsoff; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 59c4977a811a..bb09d5ccf599 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -67,8 +67,8 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <crypto/hash.h> -#include <linux/scatterlist.h> +#include <crypto/md5.h> +#include <crypto/utils.h> #include <trace/events/tcp.h> @@ -105,21 +105,17 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) } } -static u32 tcp_v6_init_seq(const struct sk_buff *skb) +static union tcp_seq_and_ts_off +tcp_v6_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source); + return secure_tcpv6_seq_and_ts_off(net, + ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); } -static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) -{ - return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32); -} - -static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to @@ -134,20 +130,20 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } -static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); - struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); + struct in6_addr *saddr = NULL, *final_p; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; - struct flowi6 fl6; + struct flowi6 *fl6; int addr_type; int err; @@ -157,14 +153,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - memset(&fl6, 0, sizeof(fl6)); + fl6 = &inet_sk(sk)->cork.fl.u.ip6; + memset(fl6, 0, sizeof(*fl6)); if (inet6_test_bit(SNDFLOW, sk)) { - fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl6.flowlabel); - if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + fl6->flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl6->flowlabel); + if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); @@ -213,7 +210,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } sk->sk_v6_daddr = usin->sin6_addr; - np->flow_label = fl6.flowlabel; + np->flow_label = fl6->flowlabel; /* * TCP over IPv4 @@ -239,7 +236,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif - err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + err = tcp_v4_connect(sk, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; @@ -261,24 +258,24 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; - fl6.flowi6_proto = IPPROTO_TCP; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = saddr ? *saddr : np->saddr; - fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; - fl6.fl6_dport = usin->sin6_port; - fl6.fl6_sport = inet->inet_sport; - if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) - fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; - fl6.flowi6_uid = sk_uid(sk); + fl6->flowi6_proto = IPPROTO_TCP; + fl6->daddr = sk->sk_v6_daddr; + fl6->saddr = saddr ? *saddr : np->saddr; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); + fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_mark = sk->sk_mark; + fl6->fl6_dport = usin->sin6_port; + fl6->fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6->fl6_sport) + fl6->flowi6_flags = FLOWI_FLAG_ANY_SPORT; + fl6->flowi6_uid = sk_uid(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - final_p = fl6_update_dst(&fl6, opt, &final); + final_p = fl6_update_dst(fl6, opt, &np->final); - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(net, sk, fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; @@ -288,7 +285,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { - saddr = &fl6.saddr; + saddr = &fl6->saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) @@ -319,14 +316,16 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk_set_txhash(sk); if (likely(!tp->repair)) { + union tcp_seq_and_ts_off st; + + st = secure_tcpv6_seq_and_ts_off(net, + np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); if (!tp->write_seq) - WRITE_ONCE(tp->write_seq, - secure_tcpv6_seq(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport)); - tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32); + WRITE_ONCE(tp->write_seq, st.seq); + tp->tsoffset = st.ts_off; } if (tcp_fastopen_defer_connect(sk, &err)) @@ -352,7 +351,7 @@ failure: static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; - u32 mtu; + u32 mtu, dmtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; @@ -369,8 +368,9 @@ static void tcp_v6_mtu_reduced(struct sock *sk) if (!dst) return; - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { - tcp_sync_mss(sk, dst_mtu(dst)); + dmtu = dst6_mtu(dst); + if (inet_csk(sk)->icsk_pmtu_cookie > dmtu) { + tcp_sync_mss(sk, dmtu); tcp_simple_retransmit(sk); } } @@ -538,7 +538,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, u8 tclass; /* First, grab a route. */ - if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, + if (!dst && (dst = inet6_csk_route_req(sk, NULL, fl6, req, IPPROTO_TCP)) == NULL) goto done; @@ -691,69 +691,45 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, cmd.tcpm_key, cmd.tcpm_keylen); } -static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - const struct tcphdr *th, int nbytes) +static void tcp_v6_md5_hash_headers(struct md5_ctx *ctx, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + const struct tcphdr *th, int nbytes) { - struct tcp6_pseudohdr *bp; - struct scatterlist sg; - struct tcphdr *_th; - - bp = hp->scratch; - /* 1. TCP pseudo-header (RFC2460) */ - bp->saddr = *saddr; - bp->daddr = *daddr; - bp->protocol = cpu_to_be32(IPPROTO_TCP); - bp->len = cpu_to_be32(nbytes); - - _th = (struct tcphdr *)(bp + 1); - memcpy(_th, th, sizeof(*th)); - _th->check = 0; - - sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); - ahash_request_set_crypt(hp->req, &sg, NULL, - sizeof(*bp) + sizeof(*th)); - return crypto_ahash_update(hp->req); + struct { + struct tcp6_pseudohdr ip; /* TCP pseudo-header (RFC2460) */ + struct tcphdr tcp; + } h; + + h.ip.saddr = *saddr; + h.ip.daddr = *daddr; + h.ip.protocol = cpu_to_be32(IPPROTO_TCP); + h.ip.len = cpu_to_be32(nbytes); + h.tcp = *th; + h.tcp.check = 0; + md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); } -static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - const struct in6_addr *daddr, struct in6_addr *saddr, - const struct tcphdr *th) +static noinline_for_stack void +tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, + const struct in6_addr *daddr, struct in6_addr *saddr, + const struct tcphdr *th) { - struct tcp_sigpool hp; - - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; + struct md5_ctx ctx; -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } -static int tcp_v6_md5_hash_skb(char *md5_hash, - const struct tcp_md5sig_key *key, - const struct sock *sk, - const struct sk_buff *skb) +static noinline_for_stack void +tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; - struct tcp_sigpool hp; + struct md5_ctx ctx; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; @@ -764,30 +740,11 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, daddr = &ip6h->daddr; } - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - - if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) - goto clear_hash; - if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; - -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); + tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } #endif @@ -831,7 +788,7 @@ static struct dst_entry *tcp_v6_route_req(const struct sock *sk, if (security_inet_conn_request(sk, skb, req)) return NULL; - return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); + return inet6_csk_route_req(sk, NULL, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { @@ -840,7 +797,6 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { @@ -859,8 +815,7 @@ const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, - .init_seq = tcp_v6_init_seq, - .init_ts_off = tcp_v6_init_ts_off, + .init_seq_and_ts_off = tcp_v6_init_seq_and_ts_off, .send_synack = tcp_v6_send_synack, }; @@ -1032,7 +987,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; - int genhash; struct sock *sk1 = NULL; #endif @@ -1091,8 +1045,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, goto out; key.type = TCP_KEY_MD5; - genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); - if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) + tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); + if (crypto_memneq(md5_hash_location, newhash, 16)) goto out; } #endif @@ -1131,7 +1085,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, txhash = inet_twsk(sk)->tw_txhash; } } else { - if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) + if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & + FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } @@ -1355,18 +1310,55 @@ static void tcp_v6_restore_cb(struct sk_buff *skb) sizeof(struct inet6_skb_parm)); } +/* Called from tcp_v4_syn_recv_sock() for v6_mapped children. */ +static void tcp_v6_mapped_child_init(struct sock *newsk, const struct sock *sk) +{ + struct inet_sock *newinet = inet_sk(newsk); + struct ipv6_pinfo *newnp; + + newinet->pinet6 = newnp = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; + + memcpy(newnp, tcp_inet6_sk(sk), sizeof(struct ipv6_pinfo)); + + newnp->saddr = newsk->sk_v6_rcv_saddr; + + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + if (sk_is_mptcp(newsk)) + mptcpv6_handle_mapped(newsk, true); + newsk->sk_backlog_rcv = tcp_v4_do_rcv; +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) + tcp_sk(newsk)->af_specific = &tcp_sock_ipv6_mapped_specific; +#endif + + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->pktoptions = NULL; + newnp->opt = NULL; + + /* tcp_v4_syn_recv_sock() has initialized newinet->mc_{index,ttl} */ + newnp->mcast_oif = newinet->mc_index; + newnp->mcast_hops = newinet->mc_ttl; + + newnp->rcv_flowinfo = 0; + if (inet6_test_bit(REPFLOW, sk)) + newnp->flow_label = 0; +} + static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, - bool *own_req) + bool *own_req, + void (*opt_child_init)(struct sock *newsk, + const struct sock *sk)) { - struct inet_request_sock *ireq; - struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = tcp_inet6_sk(sk); + struct inet_request_sock *ireq; struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; + struct ipv6_pinfo *newnp; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG @@ -1375,70 +1367,18 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * #endif struct flowi6 fl6; - if (skb->protocol == htons(ETH_P_IP)) { - /* - * v6 mapped - */ - - newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst, - req_unhash, own_req); - - if (!newsk) - return NULL; - - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); - - newnp = tcp_inet6_sk(newsk); - newtp = tcp_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = newsk->sk_v6_rcv_saddr; - - inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; - if (sk_is_mptcp(newsk)) - mptcpv6_handle_mapped(newsk, true); - newsk->sk_backlog_rcv = tcp_v4_do_rcv; -#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) - newtp->af_specific = &tcp_sock_ipv6_mapped_specific; -#endif - - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->mcast_oif = inet_iif(skb); - newnp->mcast_hops = ip_hdr(skb)->ttl; - newnp->rcv_flowinfo = 0; - if (inet6_test_bit(REPFLOW, sk)) - newnp->flow_label = 0; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks count - * here, tcp_create_openreq_child now does this for us, see the comment in - * that function for the gory details. -acme - */ - - /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 icsk.icsk_af_ops. - Sync it now. - */ - tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); - - return newsk; - } - + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_syn_recv_sock(sk, skb, req, dst, + req_unhash, own_req, + tcp_v6_mapped_child_init); ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst) { - dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); - if (!dst) - goto exit; - } + dst = inet6_csk_route_req(sk, dst, &fl6, req, IPPROTO_TCP); + if (!dst) + goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) @@ -1453,10 +1393,13 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); + newinet = inet_sk(newsk); + newinet->cork.fl.u.ip6 = fl6; + newinet->pinet6 = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; + newinet->inet_opt = NULL; newtp = tcp_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); @@ -1469,10 +1412,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * First: no IPv4 options. */ - newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; @@ -1511,7 +1452,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_ca_openreq_child(newsk, dst); - tcp_sync_mss(newsk, dst_mtu(dst)); + tcp_sync_mss(newsk, dst6_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); @@ -2208,13 +2149,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk_timeout(icsk); + timer_expires = tcp_timeout_expires(sp); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk_timeout(icsk); - } else if (timer_pending(&sp->sk_timer)) { + timer_expires = tcp_timeout_expires(sp); + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; - timer_expires = sp->sk_timer.expires; + timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = jiffies; @@ -2376,6 +2317,8 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .freeptr_offset = offsetof(struct tcp6_sock, + tcp.inet_conn.icsk_inet.sk.sk_freeptr), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index effeba58630b..f2a659cd6183 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -24,9 +24,6 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, struct net *net; int iif, sdif; - if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) - return; - p = tcp_gro_lookup(head, th); if (p) { NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; @@ -45,8 +42,8 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, #endif /* IS_ENABLED(CONFIG_IPV6) */ } -INDIRECT_CALLABLE_SCOPE -struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) +static __always_inline struct sk_buff *tcp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct tcphdr *th; @@ -60,7 +57,8 @@ struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) if (!th) goto flush; - tcp6_check_fraglist_gro(head, skb, th); + if (unlikely(skb->dev->features & NETIF_F_GRO_FRAGLIST)) + tcp6_check_fraglist_gro(head, skb, th); return tcp_gro_receive(head, skb, th); @@ -69,7 +67,7 @@ flush: return NULL; } -INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) +static __always_inline int tcp6_gro_complete(struct sk_buff *skb, int thoff) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); @@ -170,7 +168,8 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) { struct tcphdr *th = tcp_hdr(skb); - if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) + if ((skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) && + !(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) return __tcp6_gso_segment_list(skb, features); skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 813a2ba75824..010b909275dd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -875,7 +875,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). */ - if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { + if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) && + UDP_SKB_CB(skb)->partial_cov)) { u16 pcrlen = READ_ONCE(up->pcrlen); if (pcrlen == 0) { /* full coverage was set */ @@ -1282,7 +1283,7 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } -static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int udpv6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { if (addr_len < offsetofend(struct sockaddr, sa_family)) @@ -1303,7 +1304,8 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } -static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int udpv6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { int res; @@ -1438,7 +1440,7 @@ csum_partial: send: err = ip6_send_skb(skb); - if (err) { + if (unlikely(err)) { if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 046f13b1d77a..e003b8494dc0 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -132,7 +132,6 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, sdif, net->ipv4.udp_table, NULL); } -INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); @@ -165,7 +164,7 @@ flush: return NULL; } -INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) +int udp6_gro_complete(struct sk_buff *skb, int nhoff) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 2cec542437f7..e867721cda4d 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -16,10 +16,9 @@ static int udplitev6_sk_init(struct sock *sk) { - udpv6_init_sock(sk); pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); - return 0; + return udpv6_init_sock(sk); } static int udplitev6_rcv(struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 1f19b6f14484..125ea9a5b8a0 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -57,6 +57,7 @@ static int xfrm6_get_saddr(xfrm_address_t *saddr, struct dst_entry *dst; struct net_device *dev; struct inet6_dev *idev; + int err; dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) @@ -68,9 +69,11 @@ static int xfrm6_get_saddr(xfrm_address_t *saddr, return -EHOSTUNREACH; } dev = idev->dev; - ipv6_dev_get_saddr(dev_net(dev), dev, ¶ms->daddr->in6, 0, - &saddr->in6); + err = ipv6_dev_get_saddr(dev_net(dev), dev, ¶ms->daddr->in6, 0, + &saddr->in6); dst_release(dst); + if (err) + return -EHOSTUNREACH; return 0; } diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6c717a7ef292..6554d2cffc19 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -10,8 +10,7 @@ * Ursula Braun <ursula.braun@de.ibm.com> */ -#define KMSG_COMPONENT "af_iucv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "af_iucv: " fmt #include <linux/filter.h> #include <linux/module.h> @@ -553,16 +552,17 @@ static void __iucv_auto_name(struct iucv_sock *iucv) { char name[12]; - sprintf(name, "%08x", atomic_inc_return(&iucv_sk_list.autobind_name)); + scnprintf(name, sizeof(name), + "%08x", atomic_inc_return(&iucv_sk_list.autobind_name)); while (__iucv_get_sock_by_name(name)) { - sprintf(name, "%08x", - atomic_inc_return(&iucv_sk_list.autobind_name)); + scnprintf(name, sizeof(name), "%08x", + atomic_inc_return(&iucv_sk_list.autobind_name)); } memcpy(iucv->src_name, name, 8); } /* Bind an unbound socket */ -static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, +static int iucv_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); @@ -667,7 +667,7 @@ static int iucv_sock_autobind(struct sock *sk) return err; } -static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr) +static int afiucv_path_connect(struct socket *sock, struct sockaddr_unsized *addr) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); struct sock *sk = sock->sk; @@ -713,7 +713,7 @@ done: } /* Connect an unconnected socket */ -static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr, +static int iucv_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); @@ -1719,7 +1719,7 @@ static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg) goto out_unlock; save_message: - save_msg = kzalloc(sizeof(struct sock_msg_q), GFP_ATOMIC | GFP_DMA); + save_msg = kzalloc_obj(struct sock_msg_q, GFP_ATOMIC | GFP_DMA); if (!save_msg) goto out_unlock; save_msg->path = path; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 473a7847d80b..6641c49b09ac 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -20,8 +20,7 @@ * CP Programming Service, IBM document # SC24-5760 */ -#define KMSG_COMPONENT "iucv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "iucv: " fmt #include <linux/kernel_stat.h> #include <linux/export.h> @@ -91,11 +90,11 @@ struct device *iucv_alloc_device(const struct attribute_group **attrs, char buf[20]; int rc; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc_obj(*dev); if (!dev) goto out_error; va_start(vargs, fmt); - vsnprintf(buf, sizeof(buf), fmt, vargs); + vscnprintf(buf, sizeof(buf), fmt, vargs); rc = dev_set_name(dev, "%s", buf); va_end(vargs); if (rc) @@ -314,13 +313,12 @@ static union iucv_param *iucv_param[NR_CPUS]; static union iucv_param *iucv_param_irq[NR_CPUS]; /** - * __iucv_call_b2f0 + * __iucv_call_b2f0 - Calls CP to execute IUCV commands. + * * @command: identifier of IUCV call to CP. * @parm: pointer to a struct iucv_parm block * - * Calls CP to execute IUCV commands. - * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ static inline int __iucv_call_b2f0(int command, union iucv_param *parm) { @@ -349,11 +347,10 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm) } /* - * iucv_query_maxconn + * iucv_query_maxconn - Determine the maximum number of connections that + * may be established. * - * Determines the maximum number of connections that may be established. - * - * Returns the maximum number of connections or -EPERM is IUCV is not + * Returns: the maximum number of connections or -EPERM is IUCV is not * available. */ static int __iucv_query_maxconn(void *param, unsigned long *max_pathid) @@ -381,7 +378,7 @@ static int iucv_query_maxconn(void) void *param; int ccode; - param = kzalloc(sizeof(union iucv_param), GFP_KERNEL | GFP_DMA); + param = kzalloc_obj(union iucv_param, GFP_KERNEL | GFP_DMA); if (!param) return -ENOMEM; ccode = __iucv_query_maxconn(param, &max_pathid); @@ -392,10 +389,9 @@ static int iucv_query_maxconn(void) } /** - * iucv_allow_cpu - * @data: unused + * iucv_allow_cpu - Allow iucv interrupts on this cpu. * - * Allow iucv interrupts on this cpu. + * @data: unused */ static void iucv_allow_cpu(void *data) { @@ -433,10 +429,9 @@ static void iucv_allow_cpu(void *data) } /** - * iucv_block_cpu - * @data: unused + * iucv_block_cpu - Block iucv interrupts on this cpu. * - * Block iucv interrupts on this cpu. + * @data: unused */ static void iucv_block_cpu(void *data) { @@ -453,10 +448,9 @@ static void iucv_block_cpu(void *data) } /** - * iucv_declare_cpu - * @data: unused + * iucv_declare_cpu - Declare a interrupt buffer on this cpu. * - * Declare a interrupt buffer on this cpu. + * @data: unused */ static void iucv_declare_cpu(void *data) { @@ -508,10 +502,9 @@ static void iucv_declare_cpu(void *data) } /** - * iucv_retrieve_cpu - * @data: unused + * iucv_retrieve_cpu - Retrieve interrupt buffer on this cpu. * - * Retrieve interrupt buffer on this cpu. + * @data: unused */ static void iucv_retrieve_cpu(void *data) { @@ -533,9 +526,7 @@ static void iucv_retrieve_cpu(void *data) } /* - * iucv_setmask_mp - * - * Allow iucv interrupts on all cpus. + * iucv_setmask_mp - Allow iucv interrupts on all cpus. */ static void iucv_setmask_mp(void) { @@ -552,9 +543,7 @@ static void iucv_setmask_mp(void) } /* - * iucv_setmask_up - * - * Allow iucv interrupts on a single cpu. + * iucv_setmask_up - Allow iucv interrupts on a single cpu. */ static void iucv_setmask_up(void) { @@ -569,12 +558,11 @@ static void iucv_setmask_up(void) } /* - * iucv_enable + * iucv_enable - Make the iucv ready for use * - * This function makes iucv ready for use. It allocates the pathid - * table, declares an iucv interrupt buffer and enables the iucv - * interrupts. Called when the first user has registered an iucv - * handler. + * It allocates the pathid table, declares an iucv interrupt buffer and + * enables the iucv interrupts. Called when the first user has registered + * an iucv handler. */ static int iucv_enable(void) { @@ -604,11 +592,10 @@ out: } /* - * iucv_disable + * iucv_disable - Shuts down iucv. * - * This function shuts down iucv. It disables iucv interrupts, retrieves - * the iucv interrupt buffer and frees the pathid table. Called after the - * last user unregister its iucv handler. + * It disables iucv interrupts, retrieves the iucv interrupt buffer and frees + * the pathid table. Called after the last user unregister its iucv handler. */ static void iucv_disable(void) { @@ -696,11 +683,10 @@ __free_cpumask: } /** - * iucv_sever_pathid + * iucv_sever_pathid - Sever an iucv path to free up the pathid. Used internally. + * * @pathid: path identification number. * @userdata: 16-bytes of user data. - * - * Sever an iucv path to free up the pathid. Used internally. */ static int iucv_sever_pathid(u16 pathid, u8 *userdata) { @@ -715,22 +701,20 @@ static int iucv_sever_pathid(u16 pathid, u8 *userdata) } /** - * __iucv_cleanup_queue - * @dummy: unused dummy argument + * __iucv_cleanup_queue - Nop function called via smp_call_function to force + * work items from pending external iucv interrupts to the work queue. * - * Nop function called via smp_call_function to force work items from - * pending external iucv interrupts to the work queue. + * @dummy: unused dummy argument */ static void __iucv_cleanup_queue(void *dummy) { } /** - * iucv_cleanup_queue + * iucv_cleanup_queue - Called after a path has been severed to find all + * remaining work items for the now stale pathid. * - * Function called after a path has been severed to find all remaining - * work items for the now stale pathid. The caller needs to hold the - * iucv_table_lock. + * The caller needs to hold the iucv_table_lock. */ static void iucv_cleanup_queue(void) { @@ -758,13 +742,12 @@ static void iucv_cleanup_queue(void) } /** - * iucv_register: + * iucv_register - Registers a driver with IUCV. + * * @handler: address of iucv handler structure * @smp: != 0 indicates that the handler can deal with out of order messages * - * Registers a driver with IUCV. - * - * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. */ int iucv_register(struct iucv_handler *handler, int smp) @@ -795,11 +778,10 @@ out_mutex: EXPORT_SYMBOL(iucv_register); /** - * iucv_unregister + * iucv_unregister - Unregister driver from IUCV. + * * @handler: address of iucv handler structure * @smp: != 0 indicates that the handler can deal with out of order messages - * - * Unregister driver from IUCV. */ void iucv_unregister(struct iucv_handler *handler, int smp) { @@ -853,7 +835,8 @@ static struct notifier_block iucv_reboot_notifier = { }; /** - * iucv_path_accept + * iucv_path_accept - Complete the IUCV communication path + * * @path: address of iucv path structure * @handler: address of iucv handler structure * @userdata: 16 bytes of data reflected to the communication partner @@ -862,7 +845,7 @@ static struct notifier_block iucv_reboot_notifier = { * This function is issued after the user received a connection pending * external interrupt and now wishes to complete the IUCV communication path. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private) @@ -897,7 +880,8 @@ out: EXPORT_SYMBOL(iucv_path_accept); /** - * iucv_path_connect + * iucv_path_connect - Establish an IUCV path + * * @path: address of iucv path structure * @handler: address of iucv handler structure * @userid: 8-byte user identification @@ -909,7 +893,7 @@ EXPORT_SYMBOL(iucv_path_accept); * successfully, you are not able to use the path until you receive an IUCV * Connection Complete external interrupt. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, @@ -965,14 +949,14 @@ out: EXPORT_SYMBOL(iucv_path_connect); /** - * iucv_path_quiesce: + * iucv_path_quiesce - Temporarily suspend incoming messages * @path: address of iucv path structure * @userdata: 16 bytes of data reflected to the communication partner * * This function temporarily suspends incoming messages on an IUCV path. * You can later reactivate the path by invoking the iucv_resume function. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata) { @@ -997,14 +981,15 @@ out: EXPORT_SYMBOL(iucv_path_quiesce); /** - * iucv_path_resume: + * iucv_path_resume - Resume incoming messages on a suspended IUCV path + * * @path: address of iucv path structure * @userdata: 16 bytes of data reflected to the communication partner * * This function resumes incoming messages on an IUCV path that has * been stopped with iucv_path_quiesce. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_resume(struct iucv_path *path, u8 *userdata) { @@ -1028,13 +1013,12 @@ out: } /** - * iucv_path_sever + * iucv_path_sever - Terminates an IUCV path. + * * @path: address of iucv path structure * @userdata: 16 bytes of data reflected to the communication partner * - * This function terminates an IUCV path. - * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_sever(struct iucv_path *path, u8 *userdata) { @@ -1059,14 +1043,13 @@ out: EXPORT_SYMBOL(iucv_path_sever); /** - * iucv_message_purge + * iucv_message_purge - Cancels a message you have sent. + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @srccls: source class of message * - * Cancels a message you have sent. - * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls) @@ -1097,13 +1080,15 @@ out: EXPORT_SYMBOL(iucv_message_purge); /** - * iucv_message_receive_iprmdata + * iucv_message_receive_iprmdata - Internal function to receive RMDATA + * stored in &struct iucv_message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is received (IUCV_IPBUFLST) * @buffer: address of data buffer or address of struct iucv_array * @size: length of data buffer - * @residual: + * @residual: number of bytes remaining in the data buffer * * Internal function used by iucv_message_receive and __iucv_message_receive * to receive RMDATA data stored in struct iucv_message. @@ -1141,10 +1126,11 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path, } /** - * __iucv_message_receive + * __iucv_message_receive - Receives messages on an established path (no locking) + * * @path: address of iucv path structure * @msg: address of iucv msg structure - * @flags: how the message is received (IUCV_IPBUFLST) + * @flags: flags that affect how the message is received (IUCV_IPBUFLST) * @buffer: address of data buffer or address of struct iucv_array * @size: length of data buffer * @residual: @@ -1155,7 +1141,7 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path, * * Locking: no locking * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual) @@ -1189,10 +1175,11 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, EXPORT_SYMBOL(__iucv_message_receive); /** - * iucv_message_receive + * iucv_message_receive - Receives messages on an established path, with locking + * * @path: address of iucv path structure * @msg: address of iucv msg structure - * @flags: how the message is received (IUCV_IPBUFLST) + * @flags: flags that affect how the message is received (IUCV_IPBUFLST) * @buffer: address of data buffer or address of struct iucv_array * @size: length of data buffer * @residual: @@ -1203,7 +1190,7 @@ EXPORT_SYMBOL(__iucv_message_receive); * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual) @@ -1221,7 +1208,8 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, EXPORT_SYMBOL(iucv_message_receive); /** - * iucv_message_reject + * iucv_message_reject - Refuses a specified message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @@ -1229,7 +1217,7 @@ EXPORT_SYMBOL(iucv_message_receive); * are notified of a message and the time that you complete the message, * the message may be rejected. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg) { @@ -1255,7 +1243,8 @@ out: EXPORT_SYMBOL(iucv_message_reject); /** - * iucv_message_reply + * iucv_message_reply - Replies to a specified message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) @@ -1263,11 +1252,11 @@ EXPORT_SYMBOL(iucv_message_reject); * @size: length of reply data buffer * * This function responds to the two-way messages that you receive. You - * must identify completely the message to which you wish to reply. ie, + * must identify completely the message to which you wish to reply. I.e., * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into * the parameter list. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size) @@ -1304,7 +1293,8 @@ out: EXPORT_SYMBOL(iucv_message_reply); /** - * __iucv_message_send + * __iucv_message_send - Transmits a one-way message, no locking + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) @@ -1318,7 +1308,7 @@ EXPORT_SYMBOL(iucv_message_reply); * * Locking: no locking * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size) @@ -1358,7 +1348,8 @@ out: EXPORT_SYMBOL(__iucv_message_send); /** - * iucv_message_send + * iucv_message_send - Transmits a one-way message, with locking + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) @@ -1372,7 +1363,7 @@ EXPORT_SYMBOL(__iucv_message_send); * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size) @@ -1387,7 +1378,8 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, EXPORT_SYMBOL(iucv_message_send); /** - * iucv_message_send2way + * iucv_message_send2way - Transmits a two-way message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is sent and the reply is received @@ -1404,7 +1396,7 @@ EXPORT_SYMBOL(iucv_message_send); * reply to the message and a buffer is provided into which IUCV moves * the reply to this message. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, @@ -1463,11 +1455,11 @@ struct iucv_path_pending { } __packed; /** - * iucv_path_pending + * iucv_path_pending - Process connection pending work item + * * @data: Pointer to external interrupt buffer * - * Process connection pending work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_pending(struct iucv_irq_data *data) { @@ -1524,11 +1516,11 @@ struct iucv_path_complete { } __packed; /** - * iucv_path_complete + * iucv_path_complete - Process connection complete work item + * * @data: Pointer to external interrupt buffer * - * Process connection complete work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_complete(struct iucv_irq_data *data) { @@ -1554,11 +1546,11 @@ struct iucv_path_severed { } __packed; /** - * iucv_path_severed + * iucv_path_severed - Process connection severed work item. + * * @data: Pointer to external interrupt buffer * - * Process connection severed work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_severed(struct iucv_irq_data *data) { @@ -1590,11 +1582,11 @@ struct iucv_path_quiesced { } __packed; /** - * iucv_path_quiesced + * iucv_path_quiesced - Process connection quiesced work item. + * * @data: Pointer to external interrupt buffer * - * Process connection quiesced work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_quiesced(struct iucv_irq_data *data) { @@ -1618,11 +1610,11 @@ struct iucv_path_resumed { } __packed; /** - * iucv_path_resumed + * iucv_path_resumed - Process connection resumed work item. + * * @data: Pointer to external interrupt buffer * - * Process connection resumed work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_resumed(struct iucv_irq_data *data) { @@ -1649,11 +1641,11 @@ struct iucv_message_complete { } __packed; /** - * iucv_message_complete + * iucv_message_complete - Process message complete work item. + * * @data: Pointer to external interrupt buffer * - * Process message complete work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_message_complete(struct iucv_irq_data *data) { @@ -1696,11 +1688,11 @@ struct iucv_message_pending { } __packed; /** - * iucv_message_pending + * iucv_message_pending - Process message pending work item. + * * @data: Pointer to external interrupt buffer * - * Process message pending work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_message_pending(struct iucv_irq_data *data) { @@ -1723,7 +1715,7 @@ static void iucv_message_pending(struct iucv_irq_data *data) } /* - * iucv_tasklet_fn: + * iucv_tasklet_fn - Process the queue of IRQ buffers * * This tasklet loops over the queue of irq buffers created by * iucv_external_interrupt, calls the appropriate action handler @@ -1767,7 +1759,7 @@ static void iucv_tasklet_fn(unsigned long ignored) } /* - * iucv_work_fn: + * iucv_work_fn - Process the queue of path pending IRQ blocks * * This work function loops over the queue of path pending irq blocks * created by iucv_external_interrupt, calls the appropriate action @@ -1798,9 +1790,8 @@ static void iucv_work_fn(struct work_struct *work) } /* - * iucv_external_interrupt + * iucv_external_interrupt - Handles external interrupts coming in from CP. * - * Handles external interrupts coming in from CP. * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). */ static void iucv_external_interrupt(struct ext_code ext_code, @@ -1817,7 +1808,7 @@ static void iucv_external_interrupt(struct ext_code ext_code, return; } BUG_ON(p->iptype < 0x01 || p->iptype > 0x09); - work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); + work = kmalloc_obj(struct iucv_irq_list, GFP_ATOMIC); if (!work) { pr_warn("iucv_external_interrupt: out of memory\n"); return; @@ -1858,10 +1849,9 @@ struct iucv_interface iucv_if = { EXPORT_SYMBOL(iucv_if); static enum cpuhp_state iucv_online; + /** - * iucv_init - * - * Allocates and initializes various data structures. + * iucv_init - Allocates and initializes various data structures. */ static int __init iucv_init(void) { @@ -1925,9 +1915,7 @@ out: } /** - * iucv_exit - * - * Frees everything allocated from iucv_init. + * iucv_exit - Frees everything allocated from iucv_init. */ static void __exit iucv_exit(void) { diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig index 16f39f2565d9..66660a06cacf 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -7,5 +7,5 @@ config AF_KCM select STREAM_PARSER help KCM (Kernel Connection Multiplexor) sockets provide a method - for multiplexing messages of a message based application - protocol over kernel connectons (e.g. TCP connections). + for multiplexing messages of a message-based application + protocol over kernel connections (e.g. TCP connections). diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index b4f01cb07561..3912e75079f5 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -628,7 +628,7 @@ retry: skb = txm->frag_skb; } - if (WARN_ON(!skb_shinfo(skb)->nr_frags) || + if (WARN_ON_ONCE(!skb_shinfo(skb)->nr_frags) || WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) { ret = -EINVAL; goto out; @@ -749,7 +749,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); - struct sk_buff *skb = NULL, *head = NULL; + struct sk_buff *skb = NULL, *head = NULL, *frag_prev = NULL; size_t copy, copied = 0; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); int eor = (sock->type == SOCK_DGRAM) ? @@ -824,6 +824,7 @@ start: else skb->next = tskb; + frag_prev = skb; skb = tskb; skb->ip_summed = CHECKSUM_UNNECESSARY; continue; @@ -933,6 +934,22 @@ partial_message: out_error: kcm_push(kcm); + /* When MAX_SKB_FRAGS was reached, a new skb was allocated and + * linked into the frag_list before data copy. If the copy + * subsequently failed, this skb has zero frags. Remove it from + * the frag_list to prevent kcm_write_msgs from later hitting + * WARN_ON(!skb_shinfo(skb)->nr_frags). + */ + if (frag_prev && !skb_shinfo(skb)->nr_frags) { + if (head == frag_prev) + skb_shinfo(head)->frag_list = NULL; + else + frag_prev->next = NULL; + kfree_skb(skb); + /* Update skb as it may be saved in partial_message via goto */ + skb = frag_prev; + } + if (sock->type == SOCK_SEQPACKET) { /* Wrote some bytes before encountering an * error, return partial success. @@ -1560,24 +1577,16 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCKCMCLONE: { struct kcm_clone info; - struct file *file; - info.fd = get_unused_fd_flags(0); - if (unlikely(info.fd < 0)) - return info.fd; + FD_PREPARE(fdf, 0, kcm_clone(sock)); + if (fdf.err) + return fdf.err; - file = kcm_clone(sock); - if (IS_ERR(file)) { - put_unused_fd(info.fd); - return PTR_ERR(file); - } - if (copy_to_user((void __user *)arg, &info, - sizeof(info))) { - put_unused_fd(info.fd); - fput(file); + info.fd = fd_prepare_fd(fdf); + if (copy_to_user((void __user *)arg, &info, sizeof(info))) return -EFAULT; - } - fd_install(info.fd, file); + + fd_publish(fdf); err = 0; break; } diff --git a/net/key/af_key.c b/net/key/af_key.c index 2ebde0352245..0756bac62f7c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1196,7 +1196,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, err = -ENOSYS; goto out; } - x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); + x->calg = kmalloc_obj(*x->calg); if (!x->calg) { err = -ENOMEM; goto out; @@ -1261,7 +1261,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; - x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); + x->encap = kzalloc_obj(*x->encap); if (!x->encap) { err = -ENOMEM; goto out; @@ -1855,7 +1855,7 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms mutex_unlock(&pfk->dump_lock); return -EINVAL; } - filter = kmalloc(sizeof(*filter), GFP_KERNEL); + filter = kmalloc_obj(*filter); if (filter == NULL) { mutex_unlock(&pfk->dump_lock); return -ENOMEM; @@ -3903,6 +3903,8 @@ static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); + pr_warn_once("PFKEY is deprecated and scheduled to be removed in 2027, " + "please contact the netdev mailing list\n"); if (err != 0) goto out; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 0710281dd95a..c89ae52764b8 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -487,7 +487,7 @@ static int l2tp_session_collision_add(struct l2tp_net *pn, /* First collision. Allocate list to manage the collided sessions * and add the existing session to the list. */ - clist = kmalloc(sizeof(*clist), GFP_ATOMIC); + clist = kmalloc_obj(*clist, GFP_ATOMIC); if (!clist) return -ENOMEM; @@ -1086,8 +1086,10 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) tunnel = session->tunnel; /* Check protocol version */ - if (version != tunnel->version) + if (version != tunnel->version) { + l2tp_session_put(session); goto invalid; + } if (version == L2TP_HDR_VER_3 && l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { @@ -1414,8 +1416,6 @@ static void l2tp_tunnel_del_work(struct work_struct *work) { struct l2tp_tunnel *tunnel = container_of(work, struct l2tp_tunnel, del_work); - struct sock *sk = tunnel->sock; - struct socket *sock = sk->sk_socket; l2tp_tunnel_closeall(tunnel); @@ -1423,6 +1423,8 @@ static void l2tp_tunnel_del_work(struct work_struct *work) * the sk API to release it here. */ if (tunnel->fd < 0) { + struct socket *sock = tunnel->sock->sk_socket; + if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); @@ -1503,7 +1505,7 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *)&ip6_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&ip6_addr, sizeof(ip6_addr)); if (err < 0) goto out; @@ -1513,7 +1515,7 @@ static int l2tp_tunnel_sock_create(struct net *net, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, - (struct sockaddr *)&ip6_addr, + (struct sockaddr_unsized *)&ip6_addr, sizeof(ip6_addr), 0); if (err < 0) goto out; @@ -1530,7 +1532,7 @@ static int l2tp_tunnel_sock_create(struct net *net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *)&ip_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&ip_addr, sizeof(ip_addr)); if (err < 0) goto out; @@ -1538,7 +1540,7 @@ static int l2tp_tunnel_sock_create(struct net *net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->peer_ip; ip_addr.l2tp_conn_id = peer_tunnel_id; - err = kernel_connect(sock, (struct sockaddr *)&ip_addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&ip_addr, sizeof(ip_addr), 0); if (err < 0) goto out; @@ -1570,7 +1572,7 @@ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, if (cfg) encap = cfg->encap; - tunnel = kzalloc(sizeof(*tunnel), GFP_KERNEL); + tunnel = kzalloc_obj(*tunnel); if (!tunnel) { err = -ENOMEM; goto err; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 2d0c8275a3a8..b26986fda9d6 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -163,7 +163,7 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) seq_printf(m, " %d sessions, refcnt %d/%d\n", session_count, tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0, refcount_read(&tunnel->ref_count)); - seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n", + seq_printf(m, " %08x tx %ld/%ld/%ld rx %ld/%ld/%ld\n", 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), @@ -269,7 +269,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file) struct seq_file *seq; int rc = -ENOMEM; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); + pd = kzalloc_obj(*pd); if (!pd) goto out; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 29795d2839e8..cac1ff59cb83 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -267,7 +267,8 @@ static void l2tp_ip_destroy_sock(struct sock *sk) } } -static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int l2tp_ip_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; @@ -328,7 +329,8 @@ out: return ret; } -static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int l2tp_ip_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ea232f338dcb..05a396ba6a3e 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -280,7 +280,8 @@ static void l2tp_ip6_destroy_sock(struct sock *sk) } } -static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int l2tp_ip6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -383,7 +384,7 @@ out_unlock: return err; } -static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, +static int l2tp_ip6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 5e12e7ce17d8..ae4543d5597b 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -684,7 +684,7 @@ static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ -static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, +static int pppol2tp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index a0596e1f91da..670e0859a3e6 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -110,7 +110,7 @@ static struct lapb_cb *lapb_devtostruct(struct net_device *dev) */ static struct lapb_cb *lapb_create_cb(void) { - struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC); + struct lapb_cb *lapb = kzalloc_obj(*lapb, GFP_ATOMIC); if (!lapb) goto out; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5958a80fe14c..59d593bb5d18 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -337,7 +337,7 @@ out: * otherwise all hell will break loose. * Returns: 0 upon success, negative otherwise. */ -static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) +static int llc_ui_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen) { struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; @@ -477,7 +477,7 @@ out: * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. */ -static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, +static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen, int flags) { struct sock *sk = sock->sk; diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 4f16d9c88350..d73f5414d8ce 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -32,7 +32,7 @@ static DEFINE_SPINLOCK(llc_sap_list_lock); */ static struct llc_sap *llc_sap_alloc(void) { - struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC); + struct llc_sap *sap = kzalloc_obj(*sap, GFP_ATOMIC); int i; if (sap) { diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index a33884967f21..b0e392eb7753 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -36,7 +36,7 @@ mac80211-y := \ tdls.o \ ocb.o \ airtime.o \ - eht.o + eht.o uhr.o mac80211-$(CONFIG_MAC80211_LEDS) += led.o mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 48c04f89de20..0827965455dc 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -16,56 +16,48 @@ #include "key.h" #include "aes_cmac.h" -#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */ -#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */ #define AAD_LEN 20 -static const u8 zero[CMAC_TLEN_256]; +static const u8 zero[IEEE80211_CMAC_256_MIC_LEN]; -void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic) +int ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic, + unsigned int mic_len) { + int err; SHASH_DESC_ON_STACK(desc, tfm); u8 out[AES_BLOCK_SIZE]; const __le16 *fc; desc->tfm = tfm; - crypto_shash_init(desc); - crypto_shash_update(desc, aad, AAD_LEN); + err = crypto_shash_init(desc); + if (err) + return err; + err = crypto_shash_update(desc, aad, AAD_LEN); + if (err) + return err; fc = (const __le16 *)aad; if (ieee80211_is_beacon(*fc)) { /* mask Timestamp field to zero */ - crypto_shash_update(desc, zero, 8); - crypto_shash_update(desc, data + 8, data_len - 8 - CMAC_TLEN); + err = crypto_shash_update(desc, zero, 8); + if (err) + return err; + err = crypto_shash_update(desc, data + 8, + data_len - 8 - mic_len); + if (err) + return err; } else { - crypto_shash_update(desc, data, data_len - CMAC_TLEN); + err = crypto_shash_update(desc, data, data_len - mic_len); + if (err) + return err; } - crypto_shash_finup(desc, zero, CMAC_TLEN, out); + err = crypto_shash_finup(desc, zero, mic_len, out); + if (err) + return err; + memcpy(mic, out, mic_len); - memcpy(mic, out, CMAC_TLEN); -} - -void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic) -{ - SHASH_DESC_ON_STACK(desc, tfm); - const __le16 *fc; - - desc->tfm = tfm; - - crypto_shash_init(desc); - crypto_shash_update(desc, aad, AAD_LEN); - fc = (const __le16 *)aad; - if (ieee80211_is_beacon(*fc)) { - /* mask Timestamp field to zero */ - crypto_shash_update(desc, zero, 8); - crypto_shash_update(desc, data + 8, - data_len - 8 - CMAC_TLEN_256); - } else { - crypto_shash_update(desc, data, data_len - CMAC_TLEN_256); - } - crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic); + return 0; } struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[], diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h index 76817446fb83..5f971a8298cb 100644 --- a/net/mac80211/aes_cmac.h +++ b/net/mac80211/aes_cmac.h @@ -11,10 +11,9 @@ struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[], size_t key_len); -void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic); -void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic); +int ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic, + unsigned int mic_len); void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm); #endif /* AES_CMAC_H */ diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c index 512cab073f2e..811a83d8d525 100644 --- a/net/mac80211/aes_gmac.c +++ b/net/mac80211/aes_gmac.c @@ -24,15 +24,16 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, const __le16 *fc; int ret; - if (data_len < GMAC_MIC_LEN) + if (data_len < IEEE80211_GMAC_MIC_LEN) return -EINVAL; - aead_req = kzalloc(reqsize + GMAC_MIC_LEN + GMAC_AAD_LEN, GFP_ATOMIC); + aead_req = kzalloc(reqsize + IEEE80211_GMAC_MIC_LEN + GMAC_AAD_LEN, + GFP_ATOMIC); if (!aead_req) return -ENOMEM; zero = (u8 *)aead_req + reqsize; - __aad = zero + GMAC_MIC_LEN; + __aad = zero + IEEE80211_GMAC_MIC_LEN; memcpy(__aad, aad, GMAC_AAD_LEN); fc = (const __le16 *)aad; @@ -41,15 +42,16 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, sg_init_table(sg, 5); sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN); sg_set_buf(&sg[1], zero, 8); - sg_set_buf(&sg[2], data + 8, data_len - 8 - GMAC_MIC_LEN); - sg_set_buf(&sg[3], zero, GMAC_MIC_LEN); - sg_set_buf(&sg[4], mic, GMAC_MIC_LEN); + sg_set_buf(&sg[2], data + 8, + data_len - 8 - IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[3], zero, IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[4], mic, IEEE80211_GMAC_MIC_LEN); } else { sg_init_table(sg, 4); sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN); - sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN); - sg_set_buf(&sg[2], zero, GMAC_MIC_LEN); - sg_set_buf(&sg[3], mic, GMAC_MIC_LEN); + sg_set_buf(&sg[1], data, data_len - IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[2], zero, IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[3], mic, IEEE80211_GMAC_MIC_LEN); } memcpy(iv, nonce, GMAC_NONCE_LEN); @@ -78,7 +80,7 @@ struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[], err = crypto_aead_setkey(tfm, key, key_len); if (!err) - err = crypto_aead_setauthsize(tfm, GMAC_MIC_LEN); + err = crypto_aead_setauthsize(tfm, IEEE80211_GMAC_MIC_LEN); if (!err) return tfm; diff --git a/net/mac80211/aes_gmac.h b/net/mac80211/aes_gmac.h index c739356bae2a..206136b60bca 100644 --- a/net/mac80211/aes_gmac.h +++ b/net/mac80211/aes_gmac.h @@ -9,7 +9,6 @@ #include <linux/crypto.h> #define GMAC_AAD_LEN 20 -#define GMAC_MIC_LEN 16 #define GMAC_NONCE_LEN 12 struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[], diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index e38f46ffebfa..f301a8622bee 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ /** @@ -206,7 +206,10 @@ u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, if (elem_len <= 0) return 0; - elems = ieee802_11_parse_elems(elem_data, elem_len, true, NULL); + elems = ieee802_11_parse_elems(elem_data, elem_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems || elems->parse_error || !elems->addba_ext_ie) goto free; @@ -392,7 +395,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } /* prepare A-MPDU MLME for Rx aggregation */ - tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL); + tid_agg_rx = kzalloc_obj(*tid_agg_rx); if (!tid_agg_rx) goto end; @@ -408,7 +411,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, /* prepare reordering buffer */ tid_agg_rx->reorder_buf = - kcalloc(buf_size, sizeof(struct sk_buff_head), GFP_KERNEL); + kzalloc_objs(struct sk_buff_head, buf_size); tid_agg_rx->reorder_time = kcalloc(buf_size, sizeof(unsigned long), GFP_KERNEL); if (!tid_agg_rx->reorder_buf || !tid_agg_rx->reorder_time) { diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index d981b0fc57bf..93b47a7ba9c4 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -710,7 +710,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, } /* prepare A-MPDU MLME for Tx aggregation */ - tid_tx = kzalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC); + tid_tx = kzalloc_obj(struct tid_ampdu_tx, GFP_ATOMIC); if (!tid_tx) { ret = -ENOMEM; goto err_unlock_sta; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c52b0456039d..b85375ceb575 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/ieee80211.h> @@ -63,12 +63,14 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, memcpy(sdata->vif.bss_conf.mu_group.position, params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, WLAN_USER_POSITION_LEN); - ieee80211_link_info_change_notify(sdata, &sdata->deflink, - BSS_CHANGED_MU_GROUPS); + /* don't care about endianness - just check for 0 */ memcpy(&membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); mu_mimo_groups = membership != 0; + + /* Unset following if configured explicitly */ + eth_broadcast_addr(sdata->u.mntr.mu_follow_addr); } if (params->vht_mumimo_follow_addr) { @@ -76,16 +78,26 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, is_valid_ether_addr(params->vht_mumimo_follow_addr); ether_addr_copy(sdata->u.mntr.mu_follow_addr, params->vht_mumimo_follow_addr); + + /* Unset current membership until a management frame is RXed */ + memset(sdata->vif.bss_conf.mu_group.membership, 0, + WLAN_MEMBERSHIP_LEN); } sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow; + + /* Notify only after setting mu_mimo_owner */ + if (sdata->vif.bss_conf.mu_mimo_owner && + sdata->flags & IEEE80211_SDATA_IN_DRIVER) + ieee80211_link_info_change_notify(sdata, &sdata->deflink, + BSS_CHANGED_MU_GROUPS); } static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { struct ieee80211_local *local = sdata->local; - struct ieee80211_sub_if_data *monitor_sdata; + struct ieee80211_sub_if_data *monitor_sdata = NULL; /* check flags first */ if (params->flags && ieee80211_sdata_running(sdata)) { @@ -103,23 +115,28 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, return -EBUSY; } - /* also validate MU-MIMO change */ - if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) - monitor_sdata = sdata; - else - monitor_sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); - - if (!monitor_sdata && + /* validate whether MU-MIMO can be configured */ + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) return -EOPNOTSUPP; + /* Also update dependent monitor_sdata if required */ + if (test_bit(SDATA_STATE_RUNNING, &sdata->state) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + monitor_sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); + /* apply all changes now - no failures allowed */ - if (monitor_sdata && - (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || - ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))) - ieee80211_set_mu_mimo_follow(monitor_sdata, params); + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { + /* This is copied in when the VIF is activated */ + ieee80211_set_mu_mimo_follow(sdata, params); + + if (monitor_sdata) + ieee80211_set_mu_mimo_follow(monitor_sdata, params); + } if (params->flags) { if (ieee80211_sdata_running(sdata)) { @@ -663,10 +680,18 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, * association has completed, this rejects that attempt * so it will set the key again after association. * + * With (re)association frame encryption enabled, cfg80211 + * may deliver keys to mac80211 before the station has + * associated. In that case, accept the key if the station + * is an Enhanced Privacy Protection (EPP) peer. + * If (re)association frame encryption support is not present, + * cfg80211 will not allow key installation in non‑AP STA mode. + * * TODO: accept the key if we have a station entry and - * add it to the device after the station. + * add it to the device after the station associates. */ - if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) { + if (!sta || (!sta->sta.epp_peer && + !test_sta_flag(sta, WLAN_STA_ASSOC))) { ieee80211_key_free_unused(key); return -ENOENT; } @@ -1328,7 +1353,6 @@ ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, size = sizeof(*new) + new_head_len + new_tail_len; - /* new or old multiple BSSID elements? */ if (params->mbssid_ies) { mbssid = params->mbssid_ies; size += struct_size(new->mbssid_ies, elem, mbssid->cnt); @@ -1338,15 +1362,6 @@ ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, } size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, mbssid->cnt); - } else if (old && old->mbssid_ies) { - mbssid = old->mbssid_ies; - size += struct_size(new->mbssid_ies, elem, mbssid->cnt); - if (old && old->rnr_ies) { - rnr = old->rnr_ies; - size += struct_size(new->rnr_ies, elem, rnr->cnt); - } - size += ieee80211_get_mbssid_beacon_len(mbssid, rnr, - mbssid->cnt); } new = kzalloc(size, GFP_KERNEL); @@ -1593,6 +1608,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->eht_mu_beamformer = false; } + if (params->uhr_oper) { + if (!link_conf->eht_support) + return -EOPNOTSUPP; + + link_conf->uhr_support = true; + } + if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, @@ -1882,12 +1904,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, __sta_info_flush(sdata, true, link_id, NULL); - ieee80211_remove_link_keys(link, &keys); - if (!list_empty(&keys)) { - synchronize_net(); - ieee80211_free_key_list(local, &keys); - } - ieee80211_stop_mbssid(sdata); RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); @@ -1899,9 +1915,15 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BEACON_ENABLED); + ieee80211_remove_link_keys(link, &keys); + if (!list_empty(&keys)) { + synchronize_net(); + ieee80211_free_key_list(local, &keys); + } + if (sdata->wdev.links[link_id].cac_started) { chandef = link_conf->chanreq.oper; - wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work); + wiphy_hrtimer_work_cancel(wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); @@ -2070,6 +2092,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, params->vht_capa || params->he_capa || params->eht_capa || + params->uhr_capa || params->s1g_capa || params->opmode_notif_used; @@ -2118,8 +2141,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, if (params->supported_rates && params->supported_rates_len && - !ieee80211_parse_bitrates(link->conf->chanreq.oper.width, - sband, params->supported_rates, + !ieee80211_parse_bitrates(sband, params->supported_rates, params->supported_rates_len, &link_sta->pub->supp_rates[sband->band])) return -EINVAL; @@ -2149,6 +2171,12 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, params->eht_capa_len, link_sta); + if (params->uhr_capa) + ieee80211_uhr_cap_ie_to_sta_uhr_cap(sdata, sband, + params->uhr_capa, + params->uhr_capa_len, + link_sta); + if (params->s1g_capa) ieee80211_s1g_cap_to_sta_s1g_cap(sdata, params->s1g_capa, link_sta); @@ -2192,6 +2220,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, mask = params->sta_flags_mask; set = params->sta_flags_set; + if (params->epp_peer) + sta->sta.epp_peer = true; + if (ieee80211_vif_is_mesh(&sdata->vif)) { /* * In mesh mode, ASSOCIATED isn't part of the nl80211 @@ -2980,8 +3011,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, return -EINVAL; if (params->basic_rates) { - if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width, - wiphy->bands[sband->band], + if (!ieee80211_parse_bitrates(sband, params->basic_rates, params->basic_rates_len, &link->conf->basic_rates)) @@ -3858,8 +3888,8 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, if (err) return err; - wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work, - msecs_to_jiffies(cac_time_ms)); + wiphy_hrtimer_work_queue(wiphy, &link_data->dfs_cac_timer_work, + ms_to_ktime(cac_time_ms)); return 0; } @@ -3878,7 +3908,7 @@ static void ieee80211_end_cac(struct wiphy *wiphy, if (!link_data) continue; - wiphy_delayed_work_cancel(wiphy, + wiphy_hrtimer_work_cancel(wiphy, &link_data->dfs_cac_timer_work); if (sdata->wdev.links[link_id].cac_started) { @@ -3910,9 +3940,8 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) { new_beacon->mbssid_ies = - kzalloc(struct_size(new_beacon->mbssid_ies, - elem, beacon->mbssid_ies->cnt), - GFP_KERNEL); + kzalloc_flex(*new_beacon->mbssid_ies, elem, + beacon->mbssid_ies->cnt); if (!new_beacon->mbssid_ies) { kfree(new_beacon); return NULL; @@ -3920,9 +3949,8 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) if (beacon->rnr_ies && beacon->rnr_ies->cnt) { new_beacon->rnr_ies = - kzalloc(struct_size(new_beacon->rnr_ies, - elem, beacon->rnr_ies->cnt), - GFP_KERNEL); + kzalloc_flex(*new_beacon->rnr_ies, elem, + beacon->rnr_ies->cnt); if (!new_beacon->rnr_ies) { kfree(new_beacon->mbssid_ies); kfree(new_beacon); @@ -4144,12 +4172,21 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data) { struct ieee80211_sub_if_data *sdata = link_data->sdata; + int link_id = -1; if (__ieee80211_csa_finalize(link_data)) { sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n", link_data->link_id); - cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev, - GFP_KERNEL); + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_P2P_GO) + /* + * link_id is expected only for AP/P2P_GO type + * currently + */ + link_id = link_data->link_id; + + cfg80211_stop_link(sdata->local->hw.wiphy, &sdata->wdev, + link_id, GFP_KERNEL); } } @@ -4393,7 +4430,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; /* if reservation is invalid then this will fail */ - err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0, -1); + err = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; @@ -4705,7 +4742,7 @@ static int ieee80211_set_qos_map(struct wiphy *wiphy, struct mac80211_qos_map *new_qos_map, *old_qos_map; if (qos_map) { - new_qos_map = kzalloc(sizeof(*new_qos_map), GFP_KERNEL); + new_qos_map = kzalloc_obj(*new_qos_map); if (!new_qos_map) return -ENOMEM; memcpy(&new_qos_map->qos_map, qos_map, sizeof(*qos_map)); diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 7f8799fd673e..05f45e66999b 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -12,15 +12,134 @@ #include "driver-ops.h" #include "rate.h" +struct ieee80211_chanctx_user_iter { + struct ieee80211_chan_req *chanreq; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_link_data *link; + enum nl80211_iftype iftype; + bool reserved, radar_required, done; + enum { + CHANCTX_ITER_POS_ASSIGNED, + CHANCTX_ITER_POS_RESERVED, + CHANCTX_ITER_POS_DONE, + } per_link; +}; + +enum ieee80211_chanctx_iter_type { + CHANCTX_ITER_ALL, + CHANCTX_ITER_RESERVED, + CHANCTX_ITER_ASSIGNED, +}; + +static void ieee80211_chanctx_user_iter_next(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_chanctx_user_iter *iter, + enum ieee80211_chanctx_iter_type type, + bool start) +{ + lockdep_assert_wiphy(local->hw.wiphy); + + if (start) { + memset(iter, 0, sizeof(*iter)); + goto next_interface; + } + +next_link: + for (int link_id = iter->link ? iter->link->link_id : 0; + link_id < ARRAY_SIZE(iter->sdata->link); + link_id++) { + struct ieee80211_link_data *link; + + link = sdata_dereference(iter->sdata->link[link_id], + iter->sdata); + if (!link) + continue; + + switch (iter->per_link) { + case CHANCTX_ITER_POS_ASSIGNED: + iter->per_link = CHANCTX_ITER_POS_RESERVED; + if (type != CHANCTX_ITER_RESERVED && + rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { + iter->link = link; + iter->reserved = false; + iter->radar_required = link->radar_required; + iter->chanreq = &link->conf->chanreq; + return; + } + fallthrough; + case CHANCTX_ITER_POS_RESERVED: + iter->per_link = CHANCTX_ITER_POS_DONE; + if (type != CHANCTX_ITER_ASSIGNED && + link->reserved_chanctx == ctx) { + iter->link = link; + iter->reserved = true; + iter->radar_required = + link->reserved_radar_required; + + iter->chanreq = &link->reserved; + return; + } + fallthrough; + case CHANCTX_ITER_POS_DONE: + iter->per_link = CHANCTX_ITER_POS_ASSIGNED; + continue; + } + } + +next_interface: + /* next (or first) interface */ + iter->sdata = list_prepare_entry(iter->sdata, &local->interfaces, list); + list_for_each_entry_continue(iter->sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(iter->sdata)) + continue; + + /* AP_VLAN has a chanctx pointer but follows AP */ + if (iter->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; + + iter->link = NULL; + iter->per_link = CHANCTX_ITER_POS_ASSIGNED; + iter->iftype = iter->sdata->vif.type; + goto next_link; + } + + iter->done = true; +} + +#define for_each_chanctx_user_assigned(local, ctx, iter) \ + for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ASSIGNED, \ + true); \ + !((iter)->done); \ + ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ASSIGNED, \ + false)) + +#define for_each_chanctx_user_reserved(local, ctx, iter) \ + for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_RESERVED, \ + true); \ + !((iter)->done); \ + ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_RESERVED, \ + false)) + +#define for_each_chanctx_user_all(local, ctx, iter) \ + for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ALL, \ + true); \ + !((iter)->done); \ + ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ALL, \ + false)) + static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; int num = 0; - lockdep_assert_wiphy(local->hw.wiphy); - - list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) + for_each_chanctx_user_assigned(local, ctx, &iter) num++; return num; @@ -29,12 +148,10 @@ static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local, static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; int num = 0; - lockdep_assert_wiphy(local->hw.wiphy); - - list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) + for_each_chanctx_user_reserved(local, ctx, &iter) num++; return num; @@ -43,8 +160,13 @@ static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local, int ieee80211_chanctx_refcount(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - return ieee80211_chanctx_num_assigned(local, ctx) + - ieee80211_chanctx_num_reserved(local, ctx); + struct ieee80211_chanctx_user_iter iter; + int num = 0; + + for_each_chanctx_user_all(local, ctx, &iter) + num++; + + return num; } static int ieee80211_num_chanctx(struct ieee80211_local *local, int radio_idx) @@ -143,15 +265,15 @@ ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local, const struct ieee80211_chan_req *req, struct ieee80211_chan_req *tmp) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!req)) return NULL; - list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { - req = ieee80211_chanreq_compatible(&link->reserved, req, tmp); + for_each_chanctx_user_reserved(local, ctx, &iter) { + req = ieee80211_chanreq_compatible(iter.chanreq, req, tmp); if (!req) break; } @@ -165,18 +287,16 @@ ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local, const struct ieee80211_chan_req *compat, struct ieee80211_chan_req *tmp) { - struct ieee80211_link_data *link; const struct ieee80211_chan_req *comp_def = compat; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); - list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { - struct ieee80211_bss_conf *link_conf = link->conf; - - if (link->reserved_chanctx) + for_each_chanctx_user_assigned(local, ctx, &iter) { + if (iter.link->reserved_chanctx) continue; - comp_def = ieee80211_chanreq_compatible(&link_conf->chanreq, + comp_def = ieee80211_chanreq_compatible(iter.chanreq, comp_def, tmp); if (!comp_def) break; @@ -200,7 +320,7 @@ ieee80211_chanctx_can_reserve(struct ieee80211_local *local, if (!ieee80211_chanctx_non_reserved_chandef(local, ctx, req, &tmp)) return false; - if (!list_empty(&ctx->reserved_links) && + if (ieee80211_chanctx_num_reserved(local, ctx) != 0 && ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp)) return true; @@ -389,10 +509,10 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, * channel context. */ static u32 -_ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for, - bool check_reserved) +__ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for, + bool check_reserved) { enum nl80211_chan_width max_bw; struct cfg80211_chan_def min_def; @@ -441,14 +561,16 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { - struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_sub_if_data *sdata; enum ieee80211_sta_rx_bandwidth new_sta_bw; unsigned int link_id; if (!ieee80211_sdata_running(sta->sdata)) continue; - for (link_id = 0; link_id < ARRAY_SIZE(sta->sdata->link); link_id++) { + sdata = get_bss_sdata(sta->sdata); + + for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link = rcu_dereference(sdata->link[link_id]); struct ieee80211_bss_conf *link_conf; @@ -497,13 +619,14 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, * the max of min required widths of all the interfaces bound to this * channel context. */ -void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for, - bool check_reserved) +static void +_ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for, + bool check_reserved) { - u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, - check_reserved); + u32 changed = __ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, + check_reserved); if (!changed) return; @@ -517,6 +640,12 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, ieee80211_chan_bw_change(local, ctx, false, false); } +void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); +} + static void _ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, @@ -530,8 +659,19 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, }; u32 changed = 0; - /* expected to handle only 20/40/80/160/320 channel widths */ + /* 5/10 MHz not handled here */ switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: + /* + * mac80211 currently only supports sharing identical + * chanctx's for S1G interfaces. + */ + WARN_ON(!ieee80211_chanreq_identical(&ctx_req, chanreq)); + return; case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: @@ -551,7 +691,7 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, ieee80211_chan_bw_change(local, old_ctx, false, true); if (ieee80211_chanreq_identical(&ctx_req, chanreq)) { - ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, false); + _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, false); return; } @@ -572,7 +712,8 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, ctx->conf.ap = chanreq->ap; /* check if min chanctx also changed */ - changed |= _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, false); + changed |= __ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, + false); ieee80211_add_wbrf(local, &ctx->conf.def); @@ -633,8 +774,6 @@ ieee80211_find_chanctx(struct ieee80211_local *local, * context to actually be removed. */ link->reserved_chanctx = ctx; - list_add(&link->reserved_chanctx_list, - &ctx->reserved_links); ieee80211_change_chanctx(local, ctx, ctx, compat); @@ -675,17 +814,13 @@ static bool ieee80211_chanctx_radar_required(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - struct ieee80211_chanctx_conf *conf = &ctx->conf; - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); - for_each_sdata_link(local, link) { - if (rcu_access_pointer(link->conf->chanctx_conf) != conf) - continue; - if (!link->radar_required) - continue; - return true; + for_each_chanctx_user_assigned(local, ctx, &iter) { + if (iter.radar_required) + return true; } return false; @@ -705,8 +840,6 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, if (!ctx) return NULL; - INIT_LIST_HEAD(&ctx->assigned_links); - INIT_LIST_HEAD(&ctx->reserved_links); ctx->conf.def = chanreq->oper; ctx->conf.ap = chanreq->ap; ctx->conf.rx_chains_static = 1; @@ -715,7 +848,7 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, ctx->conf.radar_enabled = false; ctx->conf.radio_idx = radio_idx; ctx->radar_detected = false; - _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); + __ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); return ctx; } @@ -804,27 +937,17 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, { struct ieee80211_chanctx_conf *conf = &ctx->conf; const struct ieee80211_chan_req *compat = NULL; - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; struct ieee80211_chan_req tmp; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); - for_each_sdata_link(local, link) { - struct ieee80211_bss_conf *link_conf; - - if (link->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - continue; - - link_conf = link->conf; - - if (rcu_access_pointer(link_conf->chanctx_conf) != conf) - continue; - + for_each_chanctx_user_assigned(local, ctx, &iter) { if (!compat) - compat = &link_conf->chanreq; + compat = iter.chanreq; - compat = ieee80211_chanreq_compatible(&link_conf->chanreq, + compat = ieee80211_chanreq_compatible(iter.chanreq, compat, &tmp); if (WARN_ON_ONCE(!compat)) return; @@ -837,6 +960,7 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, list_for_each_entry(sta, &local->sta_list, list) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_chan_req tdls_chanreq = {}; + struct ieee80211_link_data *link; int tdls_link_id; if (!sta->uploaded || @@ -904,12 +1028,11 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, drv_unassign_vif_chanctx(local, sdata, link->conf, curr_ctx); conf = NULL; - list_del(&link->assigned_chanctx_list); } if (new_ctx) { /* recalc considering the link we'll use it for now */ - ieee80211_recalc_chanctx_min_def(local, new_ctx, link, false); + _ieee80211_recalc_chanctx_min_def(local, new_ctx, link, false); ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx); if (assign_on_failure || !ret) { @@ -919,9 +1042,6 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, /* succeeded, so commit it to the data structures */ conf = &new_ctx->conf; - if (!local->in_reconfig) - list_add(&link->assigned_chanctx_list, - &new_ctx->assigned_links); } } else { ret = 0; @@ -933,12 +1053,12 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, ieee80211_recalc_chanctx_chantype(local, curr_ctx); ieee80211_recalc_smps_chanctx(local, curr_ctx); ieee80211_recalc_radar_chanctx(local, curr_ctx); - ieee80211_recalc_chanctx_min_def(local, curr_ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, curr_ctx); } if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { ieee80211_recalc_txpower(link, false); - ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, new_ctx); } if (conf) { @@ -971,21 +1091,21 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { + struct ieee80211_chanctx_user_iter iter; struct ieee80211_sub_if_data *sdata; u8 rx_chains_static, rx_chains_dynamic; - struct ieee80211_link_data *link; lockdep_assert_wiphy(local->hw.wiphy); rx_chains_static = 1; rx_chains_dynamic = 1; - for_each_sdata_link(local, link) { + for_each_chanctx_user_assigned(local, chanctx, &iter) { u8 needed_static, needed_dynamic; - switch (link->sdata->vif.type) { + switch (iter.iftype) { case NL80211_IFTYPE_STATION: - if (!link->sdata->u.mgd.associated) + if (!iter.sdata->u.mgd.associated) continue; break; case NL80211_IFTYPE_MONITOR: @@ -1001,26 +1121,23 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, continue; } - if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) - continue; - - if (link->sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (iter.iftype == NL80211_IFTYPE_MONITOR) { rx_chains_dynamic = rx_chains_static = local->rx_chains; break; } - switch (link->smps_mode) { + switch (iter.link->smps_mode) { default: WARN_ONCE(1, "Invalid SMPS mode %d\n", - link->smps_mode); + iter.link->smps_mode); fallthrough; case IEEE80211_SMPS_OFF: - needed_static = link->needed_rx_chains; - needed_dynamic = link->needed_rx_chains; + needed_static = iter.link->needed_rx_chains; + needed_dynamic = iter.link->needed_rx_chains; break; case IEEE80211_SMPS_DYNAMIC: needed_static = 1; - needed_dynamic = link->needed_rx_chains; + needed_dynamic = iter.link->needed_rx_chains; break; case IEEE80211_SMPS_STATIC: needed_static = 1; @@ -1108,7 +1225,6 @@ void ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) if (WARN_ON(!ctx)) return; - list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) { @@ -1142,9 +1258,9 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, struct wiphy *wiphy = local->hw.wiphy; const struct wiphy_radio *radio; - if (!curr_ctx || (curr_ctx->replace_state == - IEEE80211_CHANCTX_WILL_BE_REPLACED) || - !list_empty(&curr_ctx->reserved_links)) { + if (!curr_ctx || + curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED || + ieee80211_chanctx_num_reserved(local, curr_ctx) != 0) { /* * Another link already requested this context for a * reservation. Find another one hoping all links assigned @@ -1167,7 +1283,7 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, IEEE80211_CHANCTX_REPLACE_NONE) continue; - if (!list_empty(&ctx->reserved_links)) + if (ieee80211_chanctx_num_reserved(local, ctx) != 0) continue; if (ctx->conf.radio_idx >= 0) { @@ -1185,9 +1301,9 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, * If that's true then all available contexts already have reservations * and cannot be used. */ - if (!curr_ctx || (curr_ctx->replace_state == - IEEE80211_CHANCTX_WILL_BE_REPLACED) || - !list_empty(&curr_ctx->reserved_links)) + if (!curr_ctx || + curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED || + ieee80211_chanctx_num_reserved(local, curr_ctx) != 0) return ERR_PTR(-EBUSY); new_ctx = ieee80211_alloc_chanctx(local, chanreq, mode, -1); @@ -1267,7 +1383,6 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, return PTR_ERR(new_ctx); } - list_add(&link->reserved_chanctx_list, &new_ctx->reserved_links); link->reserved_chanctx = new_ctx; link->reserved = *chanreq; link->reserved_radar_required = radar_required; @@ -1381,7 +1496,6 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) vif_chsw[0].new_ctx = &new_ctx->conf; vif_chsw[0].link_conf = link->conf; - list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = drv_switch_vif_chanctx(local, vif_chsw, 1, @@ -1394,7 +1508,6 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) } link->radar_required = link->reserved_radar_required; - list_move(&link->assigned_chanctx_list, &new_ctx->assigned_links); rcu_assign_pointer(link_conf->chanctx_conf, &new_ctx->conf); if (sdata->vif.type == NL80211_IFTYPE_AP) @@ -1405,7 +1518,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx, false); - ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, new_ctx); ieee80211_recalc_smps_chanctx(local, new_ctx); ieee80211_recalc_radar_chanctx(local, new_ctx); @@ -1451,7 +1564,6 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link) ieee80211_change_chanctx(local, new_ctx, new_ctx, chanreq); - list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = ieee80211_assign_link_chanctx(link, new_ctx, false); @@ -1497,18 +1609,19 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, int n_vifs) { struct ieee80211_vif_chanctx_switch *vif_chsw; - struct ieee80211_link_data *link; struct ieee80211_chanctx *ctx, *old_ctx; int i, err; lockdep_assert_wiphy(local->hw.wiphy); - vif_chsw = kcalloc(n_vifs, sizeof(vif_chsw[0]), GFP_KERNEL); + vif_chsw = kzalloc_objs(vif_chsw[0], n_vifs); if (!vif_chsw) return -ENOMEM; i = 0; list_for_each_entry(ctx, &local->chanctx_list, list) { + struct ieee80211_chanctx_user_iter iter; + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1517,16 +1630,15 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, goto out; } - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - if (!ieee80211_link_has_in_place_reservation(link)) + for_each_chanctx_user_reserved(local, ctx, &iter) { + if (!ieee80211_link_has_in_place_reservation(iter.link)) continue; - old_ctx = ieee80211_link_get_chanctx(link); - vif_chsw[i].vif = &link->sdata->vif; + old_ctx = ieee80211_link_get_chanctx(iter.link); + vif_chsw[i].vif = &iter.sdata->vif; vif_chsw[i].old_ctx = &old_ctx->conf; vif_chsw[i].new_ctx = &ctx->conf; - vif_chsw[i].link_conf = link->conf; + vif_chsw[i].link_conf = iter.link->conf; i++; } @@ -1551,7 +1663,7 @@ static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local) if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; - if (!list_empty(&ctx->replace_ctx->assigned_links)) + if (ieee80211_chanctx_num_assigned(local, ctx) != 0) continue; ieee80211_del_chanctx(local, ctx->replace_ctx, false); @@ -1568,7 +1680,7 @@ err: if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; - if (!list_empty(&ctx->replace_ctx->assigned_links)) + if (ieee80211_chanctx_num_assigned(local, ctx) != 0) continue; ieee80211_del_chanctx(local, ctx, false); @@ -1603,7 +1715,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) */ list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1619,12 +1731,11 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) n_reserved = 0; n_ready = 0; - list_for_each_entry(link, &ctx->replace_ctx->assigned_links, - assigned_chanctx_list) { + for_each_chanctx_user_assigned(local, ctx->replace_ctx, &iter) { n_assigned++; - if (link->reserved_chanctx) { + if (iter.link->reserved_chanctx) { n_reserved++; - if (link->reserved_ready) + if (iter.link->reserved_ready) n_ready++; } } @@ -1641,13 +1752,12 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) } ctx->conf.radar_enabled = false; - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - if (ieee80211_link_has_in_place_reservation(link) && - !link->reserved_ready) + for_each_chanctx_user_reserved(local, ctx, &iter) { + if (ieee80211_link_has_in_place_reservation(iter.link) && + !iter.link->reserved_ready) return -EAGAIN; - old_ctx = ieee80211_link_get_chanctx(link); + old_ctx = ieee80211_link_get_chanctx(iter.link); if (old_ctx) { if (old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) @@ -1658,7 +1768,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) n_vifs_ctxless++; } - if (link->reserved_radar_required) + if (iter.radar_required) ctx->conf.radar_enabled = true; } } @@ -1673,7 +1783,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) /* update station rate control and min width before switch */ list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1683,17 +1793,16 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) goto err; } - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - if (!ieee80211_link_has_in_place_reservation(link)) + for_each_chanctx_user_reserved(local, ctx, &iter) { + if (!ieee80211_link_has_in_place_reservation(iter.link)) continue; ieee80211_chan_bw_change(local, - ieee80211_link_get_chanctx(link), + ieee80211_link_get_chanctx(iter.link), true, true); } - ieee80211_recalc_chanctx_min_def(local, ctx, NULL, true); + _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, true); } /* @@ -1718,7 +1827,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) * context(s). */ list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link, *link_tmp; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1728,9 +1837,9 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) goto err; } - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - struct ieee80211_sub_if_data *sdata = link->sdata; + for_each_chanctx_user_reserved(local, ctx, &iter) { + struct ieee80211_link_data *link = iter.link; + struct ieee80211_sub_if_data *sdata = iter.sdata; struct ieee80211_bss_conf *link_conf = link->conf; u64 changed = 0; @@ -1746,9 +1855,9 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_check_fast_xmit_iface(sdata); - link->radar_required = link->reserved_radar_required; + link->radar_required = iter.radar_required; - if (link_conf->chanreq.oper.width != link->reserved.oper.width) + if (link_conf->chanreq.oper.width != iter.chanreq->oper.width) changed = BSS_CHANGED_BANDWIDTH; ieee80211_link_update_chanreq(link, &link->reserved); @@ -1763,19 +1872,15 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_recalc_chanctx_chantype(local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); - ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, ctx); - list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, - reserved_chanctx_list) { - if (ieee80211_link_get_chanctx(link) != ctx) + for_each_chanctx_user_reserved(local, ctx, &iter) { + if (ieee80211_link_get_chanctx(iter.link) != ctx) continue; - list_del(&link->reserved_chanctx_list); - list_move(&link->assigned_chanctx_list, - &ctx->assigned_links); - link->reserved_chanctx = NULL; + iter.link->reserved_chanctx = NULL; - ieee80211_link_chanctx_reservation_complete(link); + ieee80211_link_chanctx_reservation_complete(iter.link); ieee80211_chan_bw_change(local, ctx, false, false); } @@ -1786,12 +1891,10 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) * reservation for originally requested interface has already * succeeded at this point. */ - list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, - reserved_chanctx_list) { - if (WARN_ON(ieee80211_link_has_in_place_reservation(link))) - continue; + for_each_chanctx_user_reserved(local, ctx, &iter) { + struct ieee80211_link_data *link = iter.link; - if (WARN_ON(link->reserved_chanctx != ctx)) + if (WARN_ON(ieee80211_link_has_in_place_reservation(link))) continue; if (!link->reserved_ready) @@ -1834,15 +1937,14 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) err: list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link, *link_tmp; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; - list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, - reserved_chanctx_list) { - ieee80211_link_unreserve_chanctx(link); - ieee80211_link_chanctx_reservation_complete(link); + for_each_chanctx_user_reserved(local, ctx, &iter) { + ieee80211_link_unreserve_chanctx(iter.link); + ieee80211_link_chanctx_reservation_complete(iter.link); } } @@ -1949,7 +2051,6 @@ int _ieee80211_link_use_channel(struct ieee80211_link_data *link, /* remove reservation */ WARN_ON(link->reserved_chanctx != ctx); link->reserved_chanctx = NULL; - list_del(&link->reserved_chanctx_list); } if (ret) { @@ -2046,29 +2147,17 @@ ieee80211_chanctx_recheck(struct ieee80211_local *local, struct ieee80211_chan_req *tmp) { const struct ieee80211_chan_req *ret = req; - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); - for_each_sdata_link(local, link) { - if (link == skip_link) + for_each_chanctx_user_all(local, ctx, &iter) { + if (iter.link == skip_link) continue; - if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { - ret = ieee80211_chanreq_compatible(ret, - &link->conf->chanreq, - tmp); - if (!ret) - return NULL; - } - - if (link->reserved_chanctx == ctx) { - ret = ieee80211_chanreq_compatible(ret, - &link->reserved, - tmp); - if (!ret) - return NULL; - } + ret = ieee80211_chanreq_compatible(ret, iter.chanreq, tmp); + if (!ret) + return NULL; } *tmp = *ret; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index d02f07368c51..687a66cd4943 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -320,7 +320,6 @@ static ssize_t aql_enable_read(struct file *file, char __user *user_buf, static ssize_t aql_enable_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - bool aql_disabled = static_key_false(&aql_disable.key); char buf[3]; size_t len; @@ -335,15 +334,12 @@ static ssize_t aql_enable_write(struct file *file, const char __user *user_buf, if (len > 0 && buf[len - 1] == '\n') buf[len - 1] = 0; - if (buf[0] == '0' && buf[1] == '\0') { - if (!aql_disabled) - static_branch_inc(&aql_disable); - } else if (buf[0] == '1' && buf[1] == '\0') { - if (aql_disabled) - static_branch_dec(&aql_disable); - } else { + if (buf[0] == '0' && buf[1] == '\0') + static_branch_enable(&aql_disable); + else if (buf[0] == '1' && buf[1] == '\0') + static_branch_disable(&aql_disable); + else return -EINVAL; - } return count; } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 30a5a978a678..f3c6a41e4911 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/device.h> +#include <linux/hex.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/interrupt.h> diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index ba9fba165926..49753b73aba2 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -476,8 +476,12 @@ void drv_link_info_changed(struct ieee80211_local *local, if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !sdata->vif.bss_conf.mu_mimo_owner && - !(changed & BSS_CHANGED_TXPOWER)))) + changed & ~(BSS_CHANGED_TXPOWER | + BSS_CHANGED_MU_GROUPS)))) + return; + + if (WARN_ON_ONCE(changed & BSS_CHANGED_MU_GROUPS && + !sdata->vif.bss_conf.mu_mimo_owner)) return; if (!check_sdata_in_driver(sdata)) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 55105d238d6b..51bf3c7822a7 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1772,4 +1772,25 @@ drv_prep_add_interface(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline int drv_set_eml_op_mode(struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + struct ieee80211_eml_params *eml_params) +{ + struct ieee80211_local *local = sdata->local; + int ret = -EOPNOTSUPP; + + might_sleep(); + lockdep_assert_wiphy(local->hw.wiphy); + + trace_drv_set_eml_op_mode(local, sdata, sta, eml_params->link_id, + eml_params->control, + eml_params->link_bitmap); + if (local->ops->set_eml_op_mode) + ret = local->ops->set_eml_op_mode(&local->hw, &sdata->vif, + sta, eml_params); + trace_drv_return_int(local, ret); + + return ret; +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h index eb9ab310f91c..f06a8aa905c5 100644 --- a/net/mac80211/drop.h +++ b/net/mac80211/drop.h @@ -2,7 +2,7 @@ /* * mac80211 drop reason list * - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2024, 2026 Intel Corporation */ #ifndef MAC80211_DROP_H @@ -65,6 +65,49 @@ typedef unsigned int __bitwise ieee80211_rx_result; /* 0x30 */ \ R(RX_DROP_U_BAD_MGMT_KEYIDX) \ R(RX_DROP_U_UNKNOWN_ACTION_REJECTED) \ + R(RX_DROP_U_MESH_DS_BITS) \ + R(RX_DROP_U_MESH_A3_MISMATCH) \ + R(RX_DROP_U_MESH_NO_A4) \ + R(RX_DROP_U_MESH_A4_MISMATCH) \ + R(RX_DROP_U_MESH_UNEXP_DATA) \ + R(RX_DROP_U_MESH_WRONG_ACTION) \ + R(RX_DROP_U_MESH_UNEXP_MGMT) \ + R(RX_DROP_U_SPURIOUS_NOTIF) \ + R(RX_DROP_U_RUNT_DATA) \ + R(RX_DROP_U_KEY_TAINTED) \ + R(RX_DROP_U_UNPROTECTED) \ + R(RX_DROP_U_MCAST_FRAGMENT) \ + R(RX_DROP_U_DEFRAG_MISMATCH) \ + R(RX_DROP_U_RUNT_MESH_DATA) \ + /* 0x40 */ \ + R(RX_DROP_U_MESH_NO_TTL) \ + R(RX_DROP_U_MESH_RMC) \ + R(RX_DROP_U_MESH_BAD_AE) \ + R(RX_DROP_U_MESH_TTL_EXPIRED) \ + R(RX_DROP_U_MESH_NOT_FORWARDING) \ + R(RX_DROP_U_AMSDU_WITHOUT_DATA) \ + R(RX_DROP_U_NULL_DATA) \ + R(RX_DROP_U_UNEXPECTED_4ADDR) \ + R(RX_DROP_U_PORT_CONTROL) \ + R(RX_DROP_U_UNKNOWN_STA) \ + R(RX_DROP_U_RUNT_BAR) \ + R(RX_DROP_U_BAR_OUTSIDE_SESSION) \ + R(RX_DROP_U_CTRL_FRAME) \ + R(RX_DROP_U_RUNT_MGMT) \ + R(RX_DROP_U_EXPECTED_MGMT) \ + R(RX_DROP_U_NONBCAST_BEACON) \ + /* 0x50 */ \ + R(RX_DROP_U_MALFORMED_ACTION) \ + R(RX_DROP_U_UNKNOWN_MCAST_ACTION) \ + R(RX_DROP_U_UNEXPECTED_EXT_FRAME) \ + R(RX_DROP_U_UNHANDLED_MGMT) \ + R(RX_DROP_U_MCAST_DEAUTH) \ + R(RX_DROP_U_UNHANDLED_DEAUTH) \ + R(RX_DROP_U_MCAST_DISASSOC) \ + R(RX_DROP_U_UNHANDLED_DISASSOC) \ + R(RX_DROP_U_UNHANDLED_PREQ) \ + R(RX_DROP_U_UNHANDLED_MGMT_STYPE) \ + R(RX_DROP_U_NO_LINK) \ /* this line for the trailing \ - add before this */ /* having two enums allows for checking ieee80211_rx_result use with sparse */ @@ -85,7 +128,6 @@ enum ___mac80211_drop_reason { enum mac80211_drop_reason { RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE, RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED, - RX_DROP = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE, #define DEF(x) x = (__force ieee80211_rx_result)___ ## x, MAC80211_DROP_REASONS_UNUSABLE(DEF) #undef DEF diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index fd41046e3b68..078e1e23d8d1 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -5,6 +5,7 @@ * Copyright(c) 2021-2025 Intel Corporation */ +#include "driver-ops.h" #include "ieee80211_i.h" void @@ -102,3 +103,178 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } + +static void +ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *req, int opt_len) +{ + int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + + len += opt_len; /* optional len */ + skb = dev_alloc_skb(local->tx_headroom + len); + if (!skb) + return; + + skb_reserve(skb, local->tx_headroom); + mgmt = skb_put_zero(skb, len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, req->sa, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + mgmt->u.action.u.eml_omn.action_code = + WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF; + mgmt->u.action.u.eml_omn.dialog_token = + req->u.action.u.eml_omn.dialog_token; + mgmt->u.action.u.eml_omn.control = req->u.action.u.eml_omn.control & + ~(IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE | + IEEE80211_EML_CTRL_INDEV_COEX_ACT); + /* Copy optional fields from the received notification frame */ + memcpy(mgmt->u.action.u.eml_omn.variable, + req->u.action.u.eml_omn.variable, opt_len); + + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + enum nl80211_iftype type = ieee80211_vif_type_p2p(&sdata->vif); + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + const struct wiphy_iftype_ext_capab *ift_ext_capa; + struct ieee80211_mgmt *mgmt = (void *)skb->data; + struct ieee80211_local *local = sdata->local; + u8 control = mgmt->u.action.u.eml_omn.control; + u8 *ptr = mgmt->u.action.u.eml_omn.variable; + struct ieee80211_eml_params eml_params = { + .link_id = status->link_id, + .control = control, + }; + struct sta_info *sta; + int opt_len = 0; + + if (!ieee80211_vif_is_mld(&sdata->vif)) + return; + + /* eMLSR and eMLMR can't be enabled at the same time */ + if ((control & IEEE80211_EML_CTRL_EMLSR_MODE) && + (control & IEEE80211_EML_CTRL_EMLMR_MODE)) + return; + + if ((control & IEEE80211_EML_CTRL_EMLMR_MODE) && + (control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE)) + return; + + ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, type); + if (!ift_ext_capa) + return; + + if (!status->link_valid) + return; + + sta = sta_info_get_bss(sdata, mgmt->sa); + if (!sta) + return; + + if (control & IEEE80211_EML_CTRL_EMLSR_MODE) { + u8 emlsr_param_update_len; + + if (!(ift_ext_capa->eml_capabilities & + IEEE80211_EML_CAP_EMLSR_SUPP)) + return; + + opt_len += sizeof(__le16); /* eMLSR link_bitmap */ + /* eMLSR param update field is not part of Notfication frame + * sent by the AP to client so account it separately. + */ + emlsr_param_update_len = + !!(control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE); + + if (skb->len < len + opt_len + emlsr_param_update_len) + return; + + if (control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE) { + u8 pad_delay, trans_delay; + + pad_delay = u8_get_bits(ptr[2], + IEEE80211_EML_EMLSR_PAD_DELAY); + if (pad_delay > + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) + return; + + trans_delay = u8_get_bits(ptr[2], + IEEE80211_EML_EMLSR_TRANS_DELAY); + if (trans_delay > + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) + return; + + /* Update sta padding and transition delay */ + sta->sta.eml_cap = + u8_replace_bits(sta->sta.eml_cap, + pad_delay, + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); + sta->sta.eml_cap = + u8_replace_bits(sta->sta.eml_cap, + trans_delay, + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); + } + } + + if (control & IEEE80211_EML_CTRL_EMLMR_MODE) { + u8 mcs_map_size; + int i; + + if (!(ift_ext_capa->eml_capabilities & + IEEE80211_EML_CAP_EMLMR_SUPPORT)) + return; + + opt_len += sizeof(__le16); /* eMLMR link_bitmap */ + opt_len++; /* eMLMR mcs_map_count */ + if (skb->len < len + opt_len) + return; + + eml_params.emlmr_mcs_map_count = ptr[2]; + if (eml_params.emlmr_mcs_map_count > 2) + return; + + mcs_map_size = 3 * (1 + eml_params.emlmr_mcs_map_count); + opt_len += mcs_map_size; + if (skb->len < len + opt_len) + return; + + for (i = 0; i < mcs_map_size; i++) { + u8 rx_mcs, tx_mcs; + + rx_mcs = u8_get_bits(ptr[3 + i], + IEEE80211_EML_EMLMR_RX_MCS_MAP); + if (rx_mcs > 8) + return; + + tx_mcs = u8_get_bits(ptr[3 + i], + IEEE80211_EML_EMLMR_TX_MCS_MAP); + if (tx_mcs > 8) + return; + } + + memcpy(eml_params.emlmr_mcs_map_bw, &ptr[3], mcs_map_size); + } + + if ((control & IEEE80211_EML_CTRL_EMLSR_MODE) || + (control & IEEE80211_EML_CTRL_EMLMR_MODE)) { + eml_params.link_bitmap = get_unaligned_le16(ptr); + if ((eml_params.link_bitmap & sdata->vif.active_links) != + eml_params.link_bitmap) + return; + } + + if (drv_set_eml_op_mode(sdata, &sta->sta, &eml_params)) + return; + + ieee80211_send_eml_op_mode_notif(sdata, mgmt, opt_len); +} diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 5792ef77e986..f7b05e59374c 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -3,7 +3,7 @@ * HE handling * * Copyright(c) 2017 Intel Deutschland GmbH - * Copyright(c) 2019 - 2024 Intel Corporation + * Copyright(c) 2019-2025 Intel Corporation */ #include "ieee80211_i.h" @@ -313,7 +313,7 @@ bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta, ieee80211_link_sta_rc_update_omi(link, link_sta); } else { link_sta->rx_omi_bw_rx = bw; - ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, chanctx); } link_sta->rx_omi_bw_staging = bw; @@ -359,7 +359,7 @@ void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta) /* channel context in finalize only when narrowing bandwidth */ WARN_ON(link_sta->rx_omi_bw_rx < link_sta->rx_omi_bw_staging); link_sta->rx_omi_bw_rx = link_sta->rx_omi_bw_staging; - ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, chanctx); } trace_api_return_void(local); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 6e36b09fe97f..168f84a1353b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2024 Intel Corporation + * Copyright(c) 2018-2025 Intel Corporation */ #include <linux/delay.h> @@ -1554,6 +1554,7 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, { size_t baselen; struct ieee802_11_elems *elems; + u16 type; BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) != offsetof(typeof(mgmt->u.beacon), variable)); @@ -1566,8 +1567,9 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; + type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, - len - baselen, false, NULL); + len - baselen, type, NULL); if (elems) { ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, elems); @@ -1616,9 +1618,11 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, if (ies_len < 0) break; - elems = ieee802_11_parse_elems( - mgmt->u.action.u.chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 878c3b14aeb8..e60b814dd89e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #ifndef IEEE80211_I_H @@ -394,9 +394,10 @@ enum ieee80211_conn_mode { IEEE80211_CONN_MODE_VHT, IEEE80211_CONN_MODE_HE, IEEE80211_CONN_MODE_EHT, + IEEE80211_CONN_MODE_UHR, }; -#define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_EHT +#define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_UHR enum ieee80211_conn_bw_limit { IEEE80211_CONN_BW_LIMIT_20, @@ -430,7 +431,7 @@ struct ieee80211_mgd_auth_data { u8 ap_addr[ETH_ALEN] __aligned(2); - u16 sae_trans, sae_status; + u16 trans, status; size_t data_len; u8 data[]; }; @@ -451,8 +452,6 @@ struct ieee80211_mgd_assoc_data { struct ieee80211_conn_settings conn; u16 status; - - bool disabled; } link[IEEE80211_MLD_MAX_NUM_LINKS]; u8 ap_addr[ETH_ALEN] __aligned(2); @@ -916,9 +915,6 @@ struct ieee80211_chanctx { struct list_head list; struct rcu_head rcu_head; - struct list_head assigned_links; - struct list_head reserved_links; - enum ieee80211_chanctx_replace_state replace_state; struct ieee80211_chanctx *replace_ctx; @@ -1071,9 +1067,6 @@ struct ieee80211_link_data { struct ieee80211_sub_if_data *sdata; unsigned int link_id; - struct list_head assigned_chanctx_list; /* protected by wiphy mutex */ - struct list_head reserved_chanctx_list; /* protected by wiphy mutex */ - /* multicast keys only */ struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + @@ -1107,7 +1100,7 @@ struct ieee80211_link_data { int ap_power_level; /* in dBm */ bool radar_required; - struct wiphy_delayed_work dfs_cac_timer_work; + struct wiphy_hrtimer_work dfs_cac_timer_work; union { struct ieee80211_link_data_managed mgd; @@ -1239,9 +1232,12 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) for (struct ieee80211_sub_if_data *___sdata = NULL; \ !___sdata; \ ___sdata = (void *)~0 /* always stop */) \ + for (int ___link_id = ARRAY_SIZE(___sdata->link); \ + ___link_id; ___link_id = 0 /* always stop */) \ list_for_each_entry(___sdata, &(_local)->interfaces, list) \ - if (ieee80211_sdata_running(___sdata)) \ - for (int ___link_id = 0; \ + if (___link_id == ARRAY_SIZE(___sdata->link) && \ + ieee80211_sdata_running(___sdata)) \ + for (___link_id = 0; \ ___link_id < ARRAY_SIZE(___sdata->link); \ ___link_id++) \ if ((_link = wiphy_dereference((_local)->hw.wiphy, \ @@ -1255,9 +1251,12 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) for (struct ieee80211_sub_if_data *___sdata = NULL; \ !___sdata; \ ___sdata = (void *)~0 /* always stop */) \ - list_for_each_entry_rcu(___sdata, &(_local)->interfaces, list) \ - if (ieee80211_sdata_running(___sdata)) \ - for (int ___link_id = 0; \ + for (int ___link_id = ARRAY_SIZE(___sdata->link); \ + ___link_id; ___link_id = 0 /* always stop */) \ + list_for_each_entry(___sdata, &(_local)->interfaces, list) \ + if (___link_id == ARRAY_SIZE(___sdata->link) && \ + ieee80211_sdata_running(___sdata)) \ + for (___link_id = 0; \ ___link_id < ARRAY_SIZE((___sdata)->link); \ ___link_id++) \ if ((_link = rcu_dereference((___sdata)->link[___link_id]))) @@ -1826,6 +1825,8 @@ struct ieee802_11_elems { const struct ieee80211_multi_link_elem *ml_epcs; const struct ieee80211_bandwidth_indication *bandwidth_indication; const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT]; + const struct ieee80211_uhr_cap *uhr_cap; + const struct ieee80211_uhr_operation *uhr_operation; /* not the order in the psd values is per element, not per chandef */ struct ieee80211_parsed_tpe tpe; @@ -1850,6 +1851,8 @@ struct ieee802_11_elems { u8 country_elem_len; u8 bssid_index_len; u8 eht_cap_len; + u8 uhr_cap_len; + u8 uhr_operation_len; /* mult-link element can be de-fragmented and thus u8 is not sufficient */ size_t ml_basic_len; @@ -2107,7 +2110,8 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset); int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up); void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata); -int ieee80211_add_virtual_monitor(struct ieee80211_local *local); +int ieee80211_add_virtual_monitor(struct ieee80211_local *local, + struct ieee80211_sub_if_data *creator_sdata); void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link); @@ -2392,6 +2396,14 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id, enum nl80211_band band); +static inline bool ieee80211_require_encrypted_assoc(__le16 fc, + struct sta_info *sta) +{ + return (sta && sta->sta.epp_peer && + (ieee80211_is_assoc_req(fc) || ieee80211_is_reassoc_req(fc) || + ieee80211_is_assoc_resp(fc) || ieee80211_is_reassoc_resp(fc))); +} + /* sta_out needs to be checked for ERR_PTR() before using */ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, @@ -2422,7 +2434,8 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, * @mode: connection mode for parsing * @start: pointer to the elements * @len: length of the elements - * @action: %true if the elements came from an action frame + * @type: type of the frame the elements came from + * (action, probe response, beacon, etc.) * @filter: bitmap of element IDs to filter out while calculating * the element CRC * @crc: CRC starting value @@ -2440,7 +2453,7 @@ struct ieee80211_elems_parse_params { enum ieee80211_conn_mode mode; const u8 *start; size_t len; - bool action; + u8 type; u64 filter; u32 crc; struct cfg80211_bss *bss; @@ -2452,17 +2465,14 @@ struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params); static inline struct ieee802_11_elems * -ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, - u64 filter, u32 crc, - struct cfg80211_bss *bss) +ieee802_11_parse_elems(const u8 *start, size_t len, u8 type, + struct cfg80211_bss *bss) { struct ieee80211_elems_parse_params params = { .mode = IEEE80211_CONN_MODE_HIGHEST, .start = start, .len = len, - .action = action, - .filter = filter, - .crc = crc, + .type = type, .bss = bss, .link_id = -1, }; @@ -2470,13 +2480,6 @@ ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, return ieee802_11_parse_elems_full(¶ms); } -static inline struct ieee802_11_elems * -ieee802_11_parse_elems(const u8 *start, size_t len, bool action, - struct cfg80211_bss *bss) -{ - return ieee802_11_parse_elems_crc(start, len, action, 0, 0, bss); -} - extern const int ieee802_1d_to_ac[8]; static inline int ieee80211_ac_from_tid(int tid) @@ -2668,8 +2671,7 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata); u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap); -int ieee80211_parse_bitrates(enum nl80211_chan_width width, - const struct ieee80211_supported_band *sband, +int ieee80211_parse_bitrates(const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, @@ -2694,6 +2696,9 @@ int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn); +int ieee80211_put_uhr_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband); int ieee80211_put_reg_conn(struct sk_buff *skb, enum ieee80211_channel_flags flags); @@ -2768,9 +2773,7 @@ int ieee80211_chanctx_refcount(struct ieee80211_local *local, void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for, - bool check_reserved); + struct ieee80211_chanctx *ctx); bool ieee80211_is_radar_required(struct ieee80211_local *local, struct cfg80211_scan_request *req); bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, @@ -2840,6 +2843,8 @@ void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache); u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata); +void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, @@ -2871,6 +2876,13 @@ void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata); +void +ieee80211_uhr_cap_ie_to_sta_uhr_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_uhr_cap *uhr_cap, + u8 uhr_cap_len, + struct link_sta_info *link_sta); + #if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 0ca55b9655a7..676b2a43c9f2 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -8,7 +8,7 @@ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> @@ -350,6 +350,8 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { if (nsdata != sdata && ieee80211_sdata_running(nsdata)) { + struct ieee80211_link_data *link; + /* * Only OCB and monitor mode may coexist */ @@ -376,8 +378,10 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, * will not add another interface while any channel * switch is active. */ - if (nsdata->vif.bss_conf.csa_active) - return -EBUSY; + for_each_link_data(nsdata, link) { + if (link->conf->csa_active) + return -EBUSY; + } /* * The remaining checks are only performed for interfaces @@ -561,7 +565,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa.finalize_work); wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.color_change_finalize_work); - wiphy_delayed_work_cancel(local->hw.wiphy, + wiphy_hrtimer_work_cancel(local->hw.wiphy, &sdata->deflink.dfs_cac_timer_work); if (sdata->wdev.links[0].cac_started) { @@ -741,8 +745,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_configure_filter(local); ieee80211_hw_config(local, -1, hw_reconf_flags); + /* Passing NULL means an interface is picked for configuration */ if (local->virt_monitors == local->open_count) - ieee80211_add_virtual_monitor(local); + ieee80211_add_virtual_monitor(local, NULL); } void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) @@ -1176,7 +1181,8 @@ static void ieee80211_sdata_init(struct ieee80211_local *local, ieee80211_link_init(sdata, -1, &sdata->deflink, &sdata->vif.bss_conf); } -int ieee80211_add_virtual_monitor(struct ieee80211_local *local) +int ieee80211_add_virtual_monitor(struct ieee80211_local *local, + struct ieee80211_sub_if_data *creator_sdata) { struct ieee80211_sub_if_data *sdata; int ret; @@ -1184,10 +1190,14 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); - if (local->monitor_sdata || - ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) return 0; + /* Already have a monitor set up, configure it */ + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); + if (sdata) + goto configure_monitor; + sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); if (!sdata) return -ENOMEM; @@ -1240,6 +1250,32 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) skb_queue_head_init(&sdata->status_queue); wiphy_work_init(&sdata->work, ieee80211_iface_work); +configure_monitor: + /* Copy in the MU-MIMO configuration if set */ + if (!creator_sdata) { + struct ieee80211_sub_if_data *other; + + list_for_each_entry_rcu(other, &local->mon_list, u.mntr.list) { + if (!other->vif.bss_conf.mu_mimo_owner) + continue; + + creator_sdata = other; + break; + } + } + + if (creator_sdata && creator_sdata->vif.bss_conf.mu_mimo_owner) { + sdata->vif.bss_conf.mu_mimo_owner = true; + memcpy(&sdata->vif.bss_conf.mu_group, + &creator_sdata->vif.bss_conf.mu_group, + sizeof(sdata->vif.bss_conf.mu_group)); + memcpy(&sdata->u.mntr.mu_follow_addr, + creator_sdata->u.mntr.mu_follow_addr, ETH_ALEN); + + ieee80211_link_info_change_notify(sdata, &sdata->deflink, + BSS_CHANGED_MU_GROUPS); + } + return 0; } @@ -1396,11 +1432,13 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (res) goto err_stop; } else { - if (local->virt_monitors == 0 && local->open_count == 0) { - res = ieee80211_add_virtual_monitor(local); + /* add/configure if there is no non-monitor interface */ + if (local->virt_monitors == local->open_count) { + res = ieee80211_add_virtual_monitor(local, sdata); if (res) goto err_stop; } + local->virt_monitors++; /* must be before the call to ieee80211_configure_filter */ @@ -1630,7 +1668,15 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) { - if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->vif.type == NL80211_IFTYPE_AP) { + switch (mgmt->u.action.u.eml_omn.action_code) { + case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: + ieee80211_rx_eml_op_mode_notif(sdata, skb); + break; + default: + break; + } + } else if (sdata->vif.type == NL80211_IFTYPE_STATION) { switch (mgmt->u.action.u.ttlm_req.action_code) { case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: ieee80211_process_neg_ttlm_req(sdata, mgmt, @@ -1755,7 +1801,7 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work) else ieee80211_iface_process_skb(local, sdata, skb); - kfree_skb(skb); + consume_skb(skb); kcov_remote_stop(); } @@ -1764,7 +1810,7 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work) kcov_remote_start_common(skb_get_kcov_handle(skb)); ieee80211_iface_process_status(sdata, skb); - kfree_skb(skb); + consume_skb(skb); kcov_remote_stop(); } diff --git a/net/mac80211/key.c b/net/mac80211/key.c index d5da7ccea66e..04c8809173d7 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -987,7 +987,8 @@ void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata) if (ieee80211_sdata_running(sdata)) { list_for_each_entry(key, &sdata->key_list, list) { - increment_tailroom_need_count(sdata); + if (!(key->flags & KEY_FLAG_TAINTED)) + increment_tailroom_need_count(sdata); ieee80211_key_enable_hw_accel(key); } } diff --git a/net/mac80211/led.c b/net/mac80211/led.c index fabbffdd3ac2..b5600d223452 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -298,7 +298,7 @@ __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, if (WARN_ON(local->tpt_led_trigger)) return NULL; - tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL); + tpt_trig = kzalloc_obj(struct tpt_led_trigger); if (!tpt_trig) return NULL; diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 4a19b765ccb6..03bfca27d205 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -23,9 +23,6 @@ static void ieee80211_update_apvlan_links(struct ieee80211_sub_if_data *sdata) list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { int link_id; - if (!vlan) - continue; - /* No support for 4addr with MLO yet */ if (vlan->wdev.use_4addr) return; @@ -119,9 +116,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_color_change_finalize_work); wiphy_delayed_work_init(&link->color_collision_detect_work, ieee80211_color_collision_detection_work); - INIT_LIST_HEAD(&link->assigned_chanctx_list); - INIT_LIST_HEAD(&link->reserved_chanctx_list); - wiphy_delayed_work_init(&link->dfs_cac_timer_work, + wiphy_hrtimer_work_init(&link->dfs_cac_timer_work, ieee80211_dfs_cac_timer_work); if (!deflink) { @@ -160,7 +155,7 @@ void ieee80211_link_stop(struct ieee80211_link_data *link) &link->csa.finalize_work); if (link->sdata->wdev.links[link->link_id].cac_started) { - wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(link->sdata->local->hw.wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(link->sdata->dev, &link->conf->chanreq.oper, @@ -286,6 +281,7 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]; struct ieee80211_link_data *old_data[IEEE80211_MLD_MAX_NUM_LINKS]; bool use_deflink = old_links == 0; /* set for error case */ + bool non_sta = sdata->vif.type != NL80211_IFTYPE_STATION; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -300,7 +296,7 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata, /* allocate new link structures first */ for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { - link = kzalloc(sizeof(*link), GFP_KERNEL); + link = kzalloc_obj(*link); if (!link) { ret = -ENOMEM; goto free; @@ -342,6 +338,7 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata, link = links[link_id]; ieee80211_link_init(sdata, link_id, &link->data, &link->conf); ieee80211_link_setup(&link->data); + ieee80211_set_wmm_default(&link->data, true, non_sta); } if (new_links == 0) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index eefa6f7e899b..616f86b1a7e4 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <net/mac80211.h> @@ -356,8 +356,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !sdata->vif.bss_conf.mu_mimo_owner && - !(changed & BSS_CHANGED_TXPOWER)))) + changed & ~BSS_CHANGED_TXPOWER))) return; if (!check_sdata_in_driver(sdata)) @@ -1124,7 +1123,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g; + bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g, supp_uhr; struct cfg80211_chan_def dflt_chandef = {}; if (ieee80211_hw_check(hw, QUEUE_CONTROL) && @@ -1238,6 +1237,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_he = false; supp_eht = false; supp_s1g = false; + supp_uhr = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_supported_band *sband; @@ -1294,6 +1294,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_he = supp_he || iftd->he_cap.has_he; supp_eht = supp_eht || iftd->eht_cap.has_eht; + supp_uhr = supp_uhr || iftd->uhr_cap.has_uhr; if (band == NL80211_BAND_2GHZ) he_40_mhz_cap = @@ -1326,6 +1327,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (WARN_ON(supp_eht && !supp_he)) return -EINVAL; + /* UHR requires EHT support */ + if (WARN_ON(supp_uhr && !supp_eht)) + return -EINVAL; + if (!sband->ht_cap.ht_supported) continue; @@ -1354,9 +1359,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_MONITOR); - local->int_scan_req = kzalloc(struct_size(local->int_scan_req, - channels, channels), - GFP_KERNEL); + local->int_scan_req = kzalloc_flex(*local->int_scan_req, channels, + channels); if (!local->int_scan_req) return -ENOMEM; @@ -1438,6 +1442,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) IEEE80211_EHT_PPE_THRES_MAX_LEN; } + if (supp_uhr) + local->scan_ies_len += + 3 + sizeof(struct ieee80211_uhr_cap) + + sizeof(struct ieee80211_uhr_cap_phy); + if (!local->ops->hw_scan) { /* For hw_scan, driver needs to set these up. */ local->hw.wiphy->max_scan_ssids = 4; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index f37068a533f4..8fdbdf9ba2a9 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation * Authors: Luis Carlos Cobo <luisca@cozybit.com> * Javier Cardona <javier@cozybit.com> */ @@ -79,6 +79,9 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, * - MDA enabled * - Power management control on fc */ + if (!ie->mesh_config) + return false; + if (!(ifmsh->mesh_id_len == ie->mesh_id_len && memcmp(ifmsh->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 && (ifmsh->mesh_pp_id == ie->mesh_config->meshconf_psel) && @@ -178,7 +181,7 @@ int mesh_rmc_init(struct ieee80211_sub_if_data *sdata) { int i; - sdata->u.mesh.rmc = kmalloc(sizeof(struct mesh_rmc), GFP_KERNEL); + sdata->u.mesh.rmc = kmalloc_obj(struct mesh_rmc); if (!sdata->u.mesh.rmc) return -ENOMEM; sdata->u.mesh.rmc->idx_mask = RMC_BUCKETS - 1; @@ -1410,7 +1413,10 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; - elems = ieee802_11_parse_elems(pos, len - baselen, false, NULL); + elems = ieee802_11_parse_elems(pos, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_REQ, + NULL); if (!elems) return; @@ -1455,11 +1461,11 @@ free: } static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, - u16 stype, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { + u16 type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE; struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee802_11_elems *elems; @@ -1469,7 +1475,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, enum nl80211_band band = rx_status->band; /* ignore ProbeResp to foreign address */ - if (stype == IEEE80211_STYPE_PROBE_RESP && + if (type == (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP) && !ether_addr_equal(mgmt->da, sdata->vif.addr)) return; @@ -1478,8 +1484,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, return; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, - len - baselen, - false, NULL); + len - baselen, type, NULL); if (!elems) return; @@ -1514,7 +1519,9 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, } if (ifmsh->sync_ops) - ifmsh->sync_ops->rx_bcn_presp(sdata, stype, mgmt, len, + ifmsh->sync_ops->rx_bcn_presp(sdata, + type & IEEE80211_FCTL_STYPE, + mgmt, len, elems->mesh_config, rx_status); free: kfree(elems); @@ -1556,8 +1563,7 @@ int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(sdata->local->hw.wiphy); - tmp_csa_settings = kmalloc(sizeof(*tmp_csa_settings), - GFP_ATOMIC); + tmp_csa_settings = kmalloc_obj(*tmp_csa_settings, GFP_ATOMIC); if (!tmp_csa_settings) return -ENOMEM; @@ -1622,13 +1628,19 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, pos = mgmt->u.action.u.chan_switch.variable; baselen = offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); - elems = ieee802_11_parse_elems(pos, len - baselen, true, NULL); + elems = ieee802_11_parse_elems(pos, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; if (!mesh_matches_local(sdata, elems)) goto free; + if (!elems->mesh_chansw_params_ie) + goto free; + ifmsh->chsw_ttl = elems->mesh_chansw_params_ie->mesh_ttl; if (!--ifmsh->chsw_ttl) fwd_csa = false; @@ -1699,8 +1711,7 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, switch (stype) { case IEEE80211_STYPE_PROBE_RESP: case IEEE80211_STYPE_BEACON: - ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len, - rx_status); + ieee80211_mesh_rx_bcn_presp(sdata, mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_PROBE_REQ: ieee80211_mesh_rx_probe_req(sdata, mgmt, skb->len); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 9101858525dd..98d5aaa36d00 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2023 Intel Corporation + * Copyright (C) 2019, 2021-2023, 2025 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ @@ -951,7 +951,10 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; elems = ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, - len - baselen, false, NULL); + len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; @@ -1002,7 +1005,7 @@ static void mesh_queue_preq(struct mesh_path *mpath, u8 flags) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_preq_queue *preq_node; - preq_node = kmalloc(sizeof(struct mesh_preq_queue), GFP_ATOMIC); + preq_node = kmalloc_obj(struct mesh_preq_queue, GFP_ATOMIC); if (!preq_node) { mhwmp_dbg(sdata, "could not allocate PREQ node\n"); return; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 0319674be832..03171cf00855 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -405,7 +405,7 @@ struct mesh_path *mesh_path_new(struct ieee80211_sub_if_data *sdata, { struct mesh_path *new_mpath; - new_mpath = kzalloc(sizeof(struct mesh_path), gfp_flags); + new_mpath = kzalloc_obj(struct mesh_path, gfp_flags); if (!new_mpath) return NULL; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index cb45a5d2009d..04c931cd2063 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2024 Intel Corporation + * Copyright (C) 2019, 2021-2025 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ #include <linux/gfp.h> @@ -1248,7 +1248,10 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; } - elems = ieee802_11_parse_elems(baseaddr, len - baselen, true, NULL); + elems = ieee802_11_parse_elems(baseaddr, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems) { mesh_process_plink_frame(sdata, mgmt, elems, rx_status); kfree(elems); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f3138d158535..810bea1aacc5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2025 Intel Corporation + * Copyright (C) 2018 - 2026 Intel Corporation */ #include <linux/delay.h> @@ -162,6 +162,7 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, const struct ieee80211_vht_operation *vht_oper = elems->vht_operation; const struct ieee80211_he_operation *he_oper = elems->he_operation; const struct ieee80211_eht_operation *eht_oper = elems->eht_operation; + const struct ieee80211_uhr_operation *uhr_oper = elems->uhr_operation; struct ieee80211_supported_band *sband = sdata->local->hw.wiphy->bands[channel->band]; struct cfg80211_chan_def vht_chandef; @@ -192,7 +193,7 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, /* get special 6 GHz case out of the way */ if (sband->band == NL80211_BAND_6GHZ) { - enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_EHT; + enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_HIGHEST; /* this is an error */ if (conn->mode < IEEE80211_CONN_MODE_HE) @@ -215,7 +216,9 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, return IEEE80211_CONN_MODE_LEGACY; } - return mode; + if (mode <= IEEE80211_CONN_MODE_EHT) + return mode; + goto check_uhr; } /* now we have the progression HT, VHT, ... */ @@ -276,11 +279,8 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, return IEEE80211_CONN_MODE_VHT; } } else if (!vht_oper || !elems->vht_cap_elem) { - if (sband->band == NL80211_BAND_5GHZ) { - sdata_info(sdata, - "VHT information is missing, disabling VHT\n"); + if (sband->band == NL80211_BAND_5GHZ) return IEEE80211_CONN_MODE_HT; - } no_vht = true; } else if (sband->band == NL80211_BAND_2GHZ) { no_vht = true; @@ -343,7 +343,63 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, *chandef = eht_chandef; } - return IEEE80211_CONN_MODE_EHT; +check_uhr: + if (conn->mode < IEEE80211_CONN_MODE_UHR || !uhr_oper) + return IEEE80211_CONN_MODE_EHT; + + /* + * In beacons we don't have all the data - but we know the size was OK, + * so if the size is valid as a non-beacon case, we have more data and + * can validate the NPCA parameters. + */ + if (ieee80211_uhr_oper_size_ok((const void *)uhr_oper, + elems->uhr_operation_len, + false)) { + struct cfg80211_chan_def npca_chandef = *chandef; + const struct ieee80211_uhr_npca_info *npca; + const __le16 *dis_subch_bmap; + u16 punct = chandef->punctured, npca_punct; + + npca = ieee80211_uhr_npca_info(uhr_oper); + if (npca) { + int width = cfg80211_chandef_get_width(chandef); + u8 offs = le32_get_bits(npca->params, + IEEE80211_UHR_NPCA_PARAMS_PRIMARY_CHAN_OFFS); + u32 cf1 = chandef->center_freq1; + bool pri_upper, npca_upper; + + pri_upper = chandef->chan->center_freq > cf1; + npca_upper = 20 * offs >= width / 2; + + if (20 * offs >= cfg80211_chandef_get_width(chandef) || + pri_upper == npca_upper) { + sdata_info(sdata, + "AP UHR NPCA primary channel invalid, disabling UHR\n"); + return IEEE80211_CONN_MODE_EHT; + } + } + + dis_subch_bmap = ieee80211_uhr_npca_dis_subch_bitmap(uhr_oper); + + if (dis_subch_bmap) { + npca_punct = get_unaligned_le16(dis_subch_bmap); + npca_chandef.punctured = npca_punct; + } + + /* + * must be a valid puncturing pattern for this channel as + * well as puncturing all subchannels that are already in + * the disabled subchannel bitmap on the primary channel + */ + if (!cfg80211_chandef_valid(&npca_chandef) || + ((punct & npca_punct) != punct)) { + sdata_info(sdata, + "AP UHR NPCA disabled subchannel bitmap invalid, disabling UHR\n"); + return IEEE80211_CONN_MODE_EHT; + } + } + + return IEEE80211_CONN_MODE_UHR; } static bool @@ -1002,6 +1058,9 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, .from_ap = true, .start = ies->data, .len = ies->len, + .type = ies->from_beacon ? + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON : + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP, }; struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; @@ -1091,6 +1150,7 @@ again: IEEE80211_CONN_BW_LIMIT_160); break; case IEEE80211_CONN_MODE_EHT: + case IEEE80211_CONN_MODE_UHR: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_320); @@ -1108,6 +1168,8 @@ again: set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_EHT) set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors); + if (conn->mode >= IEEE80211_CONN_MODE_UHR) + set_bit(BSS_MEMBERSHIP_SELECTOR_UHR_PHY, sta_selectors); /* * We do not support EPD or GLK so never add them. @@ -1126,7 +1188,10 @@ again: while (!ieee80211_chandef_usable(sdata, &chanreq->oper, IEEE80211_CHAN_DISABLED)) { - if (WARN_ON(chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT)) { + if (chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT) { + link_id_info(sdata, link_id, + "unusable channel (%d MHz) for connection\n", + chanreq->oper.chan->center_freq); ret = -EINVAL; goto free; } @@ -1152,6 +1217,11 @@ again: IEEE80211_CONN_BW_LIMIT_160); } + if (conn->mode >= IEEE80211_CONN_MODE_UHR && + !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_NO_UHR)) + conn->mode = IEEE80211_CONN_MODE_EHT; + if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode) link_id_info(sdata, link_id, "regulatory prevented using AP config, downgraded\n"); @@ -1545,7 +1615,7 @@ static void ieee80211_assoc_add_rates(struct ieee80211_local *local, * in the association request (e.g. D-Link DAP 1353 in * b-only mode)... */ - ieee80211_parse_bitrates(width, sband, + ieee80211_parse_bitrates(sband, assoc_data->supp_rates, assoc_data->supp_rates_len, &rates); @@ -1881,11 +1951,13 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, /* * careful - need to know about all the present elems before - * calling ieee80211_assoc_add_ml_elem(), so add this one if - * we're going to put it after the ML element + * calling ieee80211_assoc_add_ml_elem(), so add these if + * we're going to put them after the ML element */ if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_EHT_CAPABILITY); + if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_UHR) + ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_UHR_CAPA); if (link_id == assoc_data->assoc_link_id) ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa, @@ -1898,6 +1970,9 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, ieee80211_put_eht_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); + if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_UHR) + ieee80211_put_uhr_cap(skb, sdata, sband); + if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_add_aid_request_ie(sdata, skb); ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb); @@ -2132,6 +2207,9 @@ ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, sizeof(struct ieee80211_eht_mcs_nss_supp) + IEEE80211_EHT_PPE_THRES_MAX_LEN; + size += 2 + 1 + sizeof(struct ieee80211_uhr_cap) + + sizeof(struct ieee80211_uhr_cap_phy); + return size; } @@ -2152,6 +2230,8 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) struct ieee80211_prep_tx_info info = {}; unsigned int link_id, n_links = 0; u16 present_elems[PRESENT_ELEMS_MAX] = {}; + struct sta_info *sta; + bool assoc_encrypt; void *capab_pos; size_t size; int ret; @@ -2332,7 +2412,15 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) info.link_id = assoc_data->assoc_link_id; drv_mgd_prepare_tx(local, sdata, &info); - IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + sta = sta_info_get_bss(sdata, sdata->vif.cfg.ap_addr); + + assoc_encrypt = sta && sta->sta.epp_peer && + wiphy_dereference(sdata->local->hw.wiphy, + sta->ptk[sta->ptk_idx]); + + if (!assoc_encrypt) + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; @@ -2508,6 +2596,16 @@ static void ieee80211_csa_switch_work(struct wiphy *wiphy, link->u.mgd.csa.waiting_bcn = true; + /* + * The next beacon really should always be different, so this should + * have no effect whatsoever. However, some APs (we observed this in + * an Asus AXE11000), the beacon after the CSA might be identical to + * the last beacon on the old channel - in this case we'd ignore it. + * Resetting the CRC will lead us to handle it better (albeit with a + * disconnect, but clearly the AP is broken.) + */ + link->u.mgd.beacon_crc_valid = false; + /* apply new TPE restrictions immediately on the new channel */ if (link->u.mgd.csa.ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { @@ -4898,6 +4996,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, case WLAN_AUTH_FILS_SK: case WLAN_AUTH_FILS_SK_PFS: case WLAN_AUTH_FILS_PK: + case WLAN_AUTH_EPPKE: break; case WLAN_AUTH_SHARED_KEY: if (ifmgd->auth_data->expected_transaction != 4) { @@ -5170,7 +5269,9 @@ static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) continue; } - elems = ieee802_11_parse_elems(ies->data, ies->len, false, + elems = ieee802_11_parse_elems(ies->data, ies->len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON, NULL); if (!elems) { rcu_read_unlock(); @@ -5216,6 +5317,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, .len = elem_len, .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; @@ -5504,6 +5606,18 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, bss_conf->epcs_support = false; } + if (elems->uhr_operation && elems->uhr_cap && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_UHR) { + ieee80211_uhr_cap_ie_to_sta_uhr_cap(sdata, sband, + elems->uhr_cap, + elems->uhr_cap_len, + link_sta); + + bss_conf->uhr_support = link_sta->pub->uhr_cap.has_uhr; + } else { + bss_conf->uhr_support = false; + } + if (elems->s1g_oper && link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G && elems->s1g_capab) @@ -5794,6 +5908,7 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, bool is_6ghz = sband->band == NL80211_BAND_6GHZ; const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; + const struct ieee80211_sta_uhr_cap *uhr_cap; struct ieee80211_sta_vht_cap vht_cap; if (sband->band == NL80211_BAND_S1GHZ) { @@ -5969,9 +6084,6 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, "no EHT support, limiting to HE\n"); goto out; } - - /* we have EHT */ - conn->mode = IEEE80211_CONN_MODE_EHT; /* check bandwidth */ @@ -5982,6 +6094,20 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, mlme_link_id_dbg(sdata, link_id, "no EHT 320 MHz cap in 6 GHz, limiting to 160 MHz\n"); + if (req && req->flags & ASSOC_REQ_DISABLE_UHR) { + mlme_link_id_dbg(sdata, link_id, + "UHR disabled by flag, limiting to EHT\n"); + goto out; + } + + uhr_cap = ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif); + if (!uhr_cap) { + mlme_link_id_dbg(sdata, link_id, + "no UHR support, limiting to EHT\n"); + goto out; + } + conn->mode = IEEE80211_CONN_MODE_UHR; + out: mlme_link_id_dbg(sdata, link_id, "determined local STA to be %s, BW limited to %d MHz\n", @@ -6021,24 +6147,6 @@ ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata, conn->bw_limit, tmp.bw_limit); } -static enum ieee80211_ap_reg_power -ieee80211_ap_power_type(u8 control) -{ - switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { - case IEEE80211_6GHZ_CTRL_REG_LPI_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: - return IEEE80211_REG_LPI_AP; - case IEEE80211_6GHZ_CTRL_REG_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: - return IEEE80211_REG_SP_AP; - case IEEE80211_6GHZ_CTRL_REG_VLP_AP: - return IEEE80211_REG_VLP_AP; - default: - return IEEE80211_REG_UNSET_AP; - } -} - static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, int link_id, @@ -6081,7 +6189,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); if (he_6ghz_oper) link->conf->power_type = - ieee80211_ap_power_type(he_6ghz_oper->control); + cfg80211_6ghz_power_type(he_6ghz_oper->control, + cbss->channel->flags); else link_info(link, "HE 6 GHz operation missing (on %d MHz), expect issues\n", @@ -6112,9 +6221,10 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); - /* don't downgrade for 5 and 10 MHz channels, though. */ + /* don't downgrade for 5/10/S1G MHz channels, though. */ if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || - chanreq.oper.width == NL80211_CHAN_WIDTH_10) + chanreq.oper.width == NL80211_CHAN_WIDTH_10 || + cfg80211_chandef_is_s1g(&chanreq.oper)) return ret; while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) { @@ -6161,6 +6271,100 @@ static bool ieee80211_get_dtim(const struct cfg80211_bss_ies *ies, return true; } +static u16 ieee80211_get_ttlm(u8 bm_size, u8 *data) +{ + if (bm_size == 1) + return *data; + + return get_unaligned_le16(data); +} + +static int +ieee80211_parse_adv_t2l(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_ttlm_elem *ttlm, + struct ieee80211_adv_ttlm_info *ttlm_info) +{ + /* The element size was already validated in + * ieee80211_tid_to_link_map_size_ok() + */ + u8 control, link_map_presence, map_size, tid; + u8 *pos; + + memset(ttlm_info, 0, sizeof(*ttlm_info)); + pos = (void *)ttlm->optional; + control = ttlm->control; + + if ((control & IEEE80211_TTLM_CONTROL_DIRECTION) != + IEEE80211_TTLM_DIRECTION_BOTH) { + sdata_info(sdata, "Invalid advertised T2L map direction\n"); + return -EINVAL; + } + + if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) { + link_map_presence = *pos; + pos++; + } + + if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT) { + ttlm_info->switch_time = get_unaligned_le16(pos); + + /* Since ttlm_info->switch_time == 0 means no switch time, bump + * it by 1. + */ + if (!ttlm_info->switch_time) + ttlm_info->switch_time = 1; + + pos += 2; + } + + if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) { + ttlm_info->duration = pos[0] | pos[1] << 8 | pos[2] << 16; + pos += 3; + } + + if (control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) { + ttlm_info->map = 0xffff; + return 0; + } + + if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) + map_size = 1; + else + map_size = 2; + + /* According to Draft P802.11be_D3.0 clause 35.3.7.1.7, an AP MLD shall + * not advertise a TID-to-link mapping that does not map all TIDs to the + * same link set, reject frame if not all links have mapping + */ + if (link_map_presence != 0xff) { + sdata_info(sdata, + "Invalid advertised T2L mapping presence indicator\n"); + return -EINVAL; + } + + ttlm_info->map = ieee80211_get_ttlm(map_size, pos); + if (!ttlm_info->map) { + sdata_info(sdata, + "Invalid advertised T2L map for TID 0\n"); + return -EINVAL; + } + + pos += map_size; + + for (tid = 1; tid < 8; tid++) { + u16 map = ieee80211_get_ttlm(map_size, pos); + + if (map != ttlm_info->map) { + sdata_info(sdata, "Invalid advertised T2L map for tid %d\n", + tid); + return -EINVAL; + } + + pos += map_size; + } + return 0; +} + static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, struct ieee802_11_elems *elems, @@ -6192,8 +6396,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, continue; valid_links |= BIT(link_id); - if (assoc_data->link[link_id].disabled) - dormant_links |= BIT(link_id); if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_allocate_link(sta, link_id); @@ -6202,6 +6404,33 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } } + /* + * We do not support setting a negotiated TTLM during + * association. As such, we can assume that if there is a TTLM, + * then it is the currently active advertised TTLM. + * In that case, there must be exactly one TTLM that does not + * have a switch time set. This mapping should also leave us + * with at least one usable link. + */ + if (elems->ttlm_num > 1) { + sdata_info(sdata, + "More than one advertised TTLM in association response\n"); + goto out_err; + } else if (elems->ttlm_num == 1) { + if (ieee80211_parse_adv_t2l(sdata, elems->ttlm[0], + &sdata->u.mgd.ttlm_info) || + sdata->u.mgd.ttlm_info.switch_time != 0 || + !(valid_links & sdata->u.mgd.ttlm_info.map)) { + sdata_info(sdata, + "Invalid advertised TTLM in association response\n"); + goto out_err; + } + + sdata->u.mgd.ttlm_info.active = true; + dormant_links = + valid_links & ~sdata->u.mgd.ttlm_info.map; + } + ieee80211_vif_set_links(sdata, valid_links, dormant_links); } @@ -6349,6 +6578,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, .bss = NULL, .link_id = -1, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; struct ieee802_11_elems *elems; int ac; @@ -6610,8 +6840,8 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_link_data *link, * Response frame shall be set to the broadcast address [..]" * So, on 6GHz band we should also accept broadcast responses. */ - channel = ieee80211_get_channel(sdata->local->hw.wiphy, - rx_status->freq); + channel = ieee80211_get_channel_khz(sdata->local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; @@ -6855,6 +7085,9 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, control = le16_to_cpu(prof->control); link_id = control & IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID; + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + continue; + removed_links |= BIT(link_id); /* the MAC address should not be included, but handle it */ @@ -6991,95 +7224,6 @@ static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy, sdata->u.mgd.ttlm_info.switch_time = 0; } -static u16 ieee80211_get_ttlm(u8 bm_size, u8 *data) -{ - if (bm_size == 1) - return *data; - else - return get_unaligned_le16(data); -} - -static int -ieee80211_parse_adv_t2l(struct ieee80211_sub_if_data *sdata, - const struct ieee80211_ttlm_elem *ttlm, - struct ieee80211_adv_ttlm_info *ttlm_info) -{ - /* The element size was already validated in - * ieee80211_tid_to_link_map_size_ok() - */ - u8 control, link_map_presence, map_size, tid; - u8 *pos; - - memset(ttlm_info, 0, sizeof(*ttlm_info)); - pos = (void *)ttlm->optional; - control = ttlm->control; - - if ((control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) || - !(control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)) - return 0; - - if ((control & IEEE80211_TTLM_CONTROL_DIRECTION) != - IEEE80211_TTLM_DIRECTION_BOTH) { - sdata_info(sdata, "Invalid advertised T2L map direction\n"); - return -EINVAL; - } - - link_map_presence = *pos; - pos++; - - ttlm_info->switch_time = get_unaligned_le16(pos); - - /* Since ttlm_info->switch_time == 0 means no switch time, bump it - * by 1. - */ - if (!ttlm_info->switch_time) - ttlm_info->switch_time = 1; - - pos += 2; - - if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) { - ttlm_info->duration = pos[0] | pos[1] << 8 | pos[2] << 16; - pos += 3; - } - - if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) - map_size = 1; - else - map_size = 2; - - /* According to Draft P802.11be_D3.0 clause 35.3.7.1.7, an AP MLD shall - * not advertise a TID-to-link mapping that does not map all TIDs to the - * same link set, reject frame if not all links have mapping - */ - if (link_map_presence != 0xff) { - sdata_info(sdata, - "Invalid advertised T2L mapping presence indicator\n"); - return -EINVAL; - } - - ttlm_info->map = ieee80211_get_ttlm(map_size, pos); - if (!ttlm_info->map) { - sdata_info(sdata, - "Invalid advertised T2L map for TID 0\n"); - return -EINVAL; - } - - pos += map_size; - - for (tid = 1; tid < 8; tid++) { - u16 map = ieee80211_get_ttlm(map_size, pos); - - if (map != ttlm_info->map) { - sdata_info(sdata, "Invalid advertised T2L map for tid %d\n", - tid); - return -EINVAL; - } - - pos += map_size; - } - return 0; -} - static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, u64 beacon_ts) @@ -7257,7 +7401,9 @@ ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata, (prof->sta_info_len - 1), len - (prof->sta_info_len - 1), - false, NULL); + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON, + NULL); /* memory allocation failed - let's hope that's transient */ if (!prof_elems) @@ -7361,6 +7507,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, .mode = link->u.mgd.conn.mode, .link_id = -1, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; lockdep_assert_wiphy(local->hw.wiphy); @@ -7963,7 +8110,10 @@ void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.ttlm_req.variable); elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, - ies_len, true, NULL); + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; @@ -8169,9 +8319,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, break; /* CSA IE cannot be overridden, no need for BSSID */ - elems = ieee802_11_parse_elems( - mgmt->u.action.u.chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src = @@ -8198,9 +8350,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, * extended CSA IE can't be overridden, no need for * BSSID */ - elems = ieee802_11_parse_elems( - mgmt->u.action.u.ext_chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.ext_chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src; @@ -8266,6 +8420,12 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) if (WARN_ON_ONCE(!auth_data)) return -EINVAL; + if (auth_data->algorithm == WLAN_AUTH_EPPKE && + ieee80211_vif_is_mld(&sdata->vif) && + !cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, + auth_data->data, auth_data->data_len)) + return -EINVAL; + auth_data->tries++; if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) { @@ -8294,9 +8454,12 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) auth_data->expected_transaction = 2; if (auth_data->algorithm == WLAN_AUTH_SAE) { - trans = auth_data->sae_trans; - status = auth_data->sae_status; + trans = auth_data->trans; + status = auth_data->status; auth_data->expected_transaction = trans; + } else if (auth_data->algorithm == WLAN_AUTH_EPPKE) { + trans = auth_data->trans; + status = auth_data->status; } if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) @@ -8953,6 +9116,10 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, goto out_err; } + if (ifmgd->auth_data && + ifmgd->auth_data->algorithm == WLAN_AUTH_EPPKE) + new_sta->sta.epp_peer = true; + new_sta->sta.mlo = mlo; } @@ -9207,6 +9374,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, case NL80211_AUTHTYPE_FILS_PK: auth_alg = WLAN_AUTH_FILS_PK; break; + case NL80211_AUTHTYPE_EPPKE: + auth_alg = WLAN_AUTH_EPPKE; + break; default: return -EOPNOTSUPP; } @@ -9231,12 +9401,14 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, auth_data->link_id = req->link_id; if (req->auth_data_len >= 4) { - if (req->auth_type == NL80211_AUTHTYPE_SAE) { + if (req->auth_type == NL80211_AUTHTYPE_SAE || + req->auth_type == NL80211_AUTHTYPE_EPPKE) { __le16 *pos = (__le16 *) req->auth_data; - auth_data->sae_trans = le16_to_cpu(pos[0]); - auth_data->sae_status = le16_to_cpu(pos[1]); + auth_data->trans = le16_to_cpu(pos[0]); + auth_data->status = le16_to_cpu(pos[1]); } + memcpy(auth_data->data, req->auth_data + 4, req->auth_data_len - 4); auth_data->data_len += req->auth_data_len - 4; @@ -9287,7 +9459,11 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, * out SAE Confirm. */ if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE && - auth_data->peer_confirmed && auth_data->sae_trans == 2) + auth_data->peer_confirmed && auth_data->trans == 2) + ieee80211_mark_sta_auth(sdata); + + if (cont_auth && req->auth_type == NL80211_AUTHTYPE_EPPKE && + auth_data->trans == 3) ieee80211_mark_sta_auth(sdata); if (ifmgd->associated) { @@ -9726,7 +9902,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, req, true, i, &assoc_data->link[i].conn); assoc_data->link[i].bss = link_cbss; - assoc_data->link[i].disabled = req->links[i].disabled; if (!bss->uapsd_supported) uapsd_supported = false; @@ -10667,7 +10842,7 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, if (added_links) { bool uapsd_supported; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return -ENOMEM; @@ -10708,8 +10883,6 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, &data->link[link_id].conn); data->link[link_id].bss = link_cbss; - data->link[link_id].disabled = - req->add_links[link_id].disabled; data->link[link_id].elems = (u8 *)req->add_links[link_id].elems; data->link[link_id].elems_len = @@ -10978,7 +11151,10 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, pos = scratch + sizeof(control); len -= sizeof(control); - link_elems = ieee802_11_parse_elems(pos, len, false, NULL); + link_elems = ieee802_11_parse_elems(pos, len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!link_elems) continue; @@ -11029,7 +11205,10 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, u.action.u.epcs.variable) - IEEE80211_EPCS_ENA_RESP_BODY_LEN; - elems = ieee802_11_parse_elems(pos, ies_len, true, NULL); + elems = ieee802_11_parse_elems(pos, ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index a5d4358f122a..ebb4f4d88c23 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -47,6 +47,9 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; int band; + if (!ifocb->joined) + return; + /* XXX: Consider removing the least recently used entry and * allow new one to be added. */ diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index ae82533e3c02..f60f6a58948b 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -579,7 +579,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (!local->emulate_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; - roc = kzalloc(sizeof(*roc), GFP_KERNEL); + roc = kzalloc_obj(*roc); if (!roc) return -ENOMEM; diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index c5e0f7f46004..2b3632c6008a 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation * * element parsing for mac80211 */ @@ -189,6 +189,26 @@ ieee80211_parse_extension_element(u32 *crc, elems->ttlm_num++; } break; + case WLAN_EID_EXT_UHR_OPER: + if (params->mode < IEEE80211_CONN_MODE_UHR) + break; + calc_crc = true; + if (ieee80211_uhr_oper_size_ok(data, len, + params->type == (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON))) { + elems->uhr_operation = data; + elems->uhr_operation_len = len; + } + break; + case WLAN_EID_EXT_UHR_CAPA: + if (params->mode < IEEE80211_CONN_MODE_UHR) + break; + calc_crc = true; + if (ieee80211_uhr_capa_size_ok(data, len, true)) { + elems->uhr_cap = data; + elems->uhr_cap_len = len; + } + break; } if (crc && calc_crc) @@ -286,6 +306,24 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, bitmap_zero(seen_elems, 256); + switch (params->type) { + /* we don't need to parse assoc request, luckily (it's value 0) */ + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ: + default: + WARN(1, "invalid frame type 0x%x for element parsing\n", + params->type); + break; + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION: + case IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON: + break; + } + for_each_element(elem, params->start, params->len) { const struct element *subelem; u8 elem_parse_failed; @@ -566,7 +604,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, if (params->mode < IEEE80211_CONN_MODE_VHT) break; - if (!params->action) { + if (params->type != (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION)) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; @@ -582,7 +621,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, case WLAN_EID_CHANNEL_SWITCH_WRAPPER: if (params->mode < IEEE80211_CONN_MODE_VHT) break; - if (params->action) { + if (params->type == (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION)) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; @@ -942,7 +982,7 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, sub->len = end - sub->start; sub->mode = params->mode; - sub->action = params->action; + sub->type = params->type; sub->from_ap = params->from_ap; sub->link_id = -1; @@ -1008,8 +1048,8 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) if (WARN_ON(params->link_id >= 0 && params->bss)) return NULL; - elems_parse = kzalloc(struct_size(elems_parse, scratch, scratch_len), - GFP_ATOMIC); + elems_parse = kzalloc_flex(*elems_parse, scratch, scratch_len, + GFP_ATOMIC); if (!elems_parse) return NULL; @@ -1041,7 +1081,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) sub.start = elems_parse->scratch_pos; sub.mode = params->mode; sub.len = nontx_len; - sub.action = params->action; + sub.type = params->type; sub.link_id = params->link_id; /* consume the space used for non-transmitted profile */ @@ -1095,8 +1135,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) } EXPORT_SYMBOL_IF_KUNIT(ieee802_11_parse_elems_full); -int ieee80211_parse_bitrates(enum nl80211_chan_width width, - const struct ieee80211_supported_band *sband, +int ieee80211_parse_bitrates(const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) { struct ieee80211_rate *br; diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index e441f8541603..31af7dd6aedc 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -163,7 +163,7 @@ int ieee80211_rate_control_register(const struct rate_control_ops *ops) } } - alg = kzalloc(sizeof(*alg), GFP_KERNEL); + alg = kzalloc_obj(*alg); if (alg == NULL) { mutex_unlock(&rate_ctrl_mutex); return -ENOMEM; @@ -263,7 +263,7 @@ rate_control_alloc(const char *name, struct ieee80211_local *local) { struct rate_control_ref *ref; - ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); + ref = kmalloc_obj(struct rate_control_ref); if (!ref) return NULL; ref->ops = ieee80211_rate_control_ops_get(name); diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index f66910013218..62745ca00e06 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1553,7 +1553,7 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) int i = 0; int max_rates = min_t(int, mp->hw->max_rates, IEEE80211_TX_RATE_TABLE_SIZE); - rates = kzalloc(sizeof(*rates), GFP_ATOMIC); + rates = kzalloc_obj(*rates, GFP_ATOMIC); if (!rates) return; @@ -1862,7 +1862,7 @@ minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) max_rates = sband->n_bitrates; } - return kzalloc(sizeof(*mi), gfp); + return kzalloc_obj(*mi, gfp); } static void @@ -1930,7 +1930,7 @@ minstrel_ht_alloc(struct ieee80211_hw *hw) struct minstrel_priv *mp; int i; - mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC); + mp = kzalloc_obj(struct minstrel_priv, GFP_ATOMIC); if (!mp) return NULL; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5b4c3fe9970a..11d6c56c9d7e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -6,7 +6,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/jiffies.h> @@ -59,7 +59,8 @@ static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, status->flag &= ~(RX_FLAG_RADIOTAP_TLV_AT_END | RX_FLAG_RADIOTAP_LSIG | RX_FLAG_RADIOTAP_HE_MU | - RX_FLAG_RADIOTAP_HE); + RX_FLAG_RADIOTAP_HE | + RX_FLAG_RADIOTAP_VHT); hdr = (void *)skb->data; fc = hdr->frame_control; @@ -151,8 +152,10 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, } if (status->encoding == RX_ENC_VHT) { + /* Included even if RX_FLAG_RADIOTAP_VHT is not set */ len = ALIGN(len, 2); len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_vht) != 12); } if (local->hw.radiotap_timestamp.units_pos >= 0) { @@ -195,6 +198,9 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, * The position to look at depends on the existence (or non- * existence) of other elements, so take that into account... */ + if (status->flag & RX_FLAG_RADIOTAP_VHT) + tlv_offset += + sizeof(struct ieee80211_radiotap_vht); if (status->flag & RX_FLAG_RADIOTAP_HE) tlv_offset += sizeof(struct ieee80211_radiotap_he); @@ -319,10 +325,17 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, u32 tlvs_len = 0; int mpdulen, chain; unsigned long chains = status->chains; + struct ieee80211_radiotap_vht vht = {}; struct ieee80211_radiotap_he he = {}; struct ieee80211_radiotap_he_mu he_mu = {}; struct ieee80211_radiotap_lsig lsig = {}; + if (status->flag & RX_FLAG_RADIOTAP_VHT) { + vht = *(struct ieee80211_radiotap_vht *)skb->data; + skb_pull(skb, sizeof(vht)); + WARN_ON_ONCE(status->encoding != RX_ENC_VHT); + } + if (status->flag & RX_FLAG_RADIOTAP_HE) { he = *(struct ieee80211_radiotap_he *)skb->data; skb_pull(skb, sizeof(he)); @@ -530,45 +543,61 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } if (status->encoding == RX_ENC_VHT) { - u16 known = local->hw.radiotap_vht_details; + u16 fill = local->hw.radiotap_vht_details; - rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); - put_unaligned_le16(known, pos); - pos += 2; - /* flags */ - if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) - *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; + /* Leave driver filled fields alone */ + fill &= ~le16_to_cpu(vht.known); + vht.known |= cpu_to_le16(fill); + + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_GI && + status->enc_flags & RX_ENC_FLAG_SHORT_GI) + vht.flags |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; /* in VHT, STBC is binary */ - if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) - *pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC; - if (status->enc_flags & RX_ENC_FLAG_BF) + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_STBC && + status->enc_flags & RX_ENC_FLAG_STBC_MASK) + vht.flags |= IEEE80211_RADIOTAP_VHT_FLAG_STBC; + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_BEAMFORMED && + status->enc_flags & RX_ENC_FLAG_BF) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED; - pos++; - /* bandwidth */ - switch (status->bw) { - case RATE_INFO_BW_80: - *pos++ = 4; - break; - case RATE_INFO_BW_160: - *pos++ = 11; - break; - case RATE_INFO_BW_40: - *pos++ = 1; - break; - default: - *pos++ = 0; + + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) { + switch (status->bw) { + case RATE_INFO_BW_40: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_40; + break; + case RATE_INFO_BW_80: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_80; + break; + case RATE_INFO_BW_160: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_160; + break; + default: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_20; + break; + } } - /* MCS/NSS */ - *pos = (status->rate_idx << 4) | status->nss; - pos += 4; - /* coding field */ - if (status->enc_flags & RX_ENC_FLAG_LDPC) - *pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0; - pos++; - /* group ID */ - pos++; - /* partial_aid */ - pos += 2; + + /* + * If the driver filled in mcs_nss[0], then do not touch it. + * + * Otherwise, put some information about MCS/NSS into the + * user 0 field. Note that this is not technically correct for + * an MU frame as we might have decoded a different user. + */ + if (!vht.mcs_nss[0]) { + vht.mcs_nss[0] = (status->rate_idx << 4) | status->nss; + + /* coding field */ + if (status->enc_flags & RX_ENC_FLAG_LDPC) + vht.coding |= IEEE80211_RADIOTAP_CODING_LDPC_USER0; + } + + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); + memcpy(pos, &vht, sizeof(vht)); + pos += sizeof(vht); } if (local->hw.radiotap_timestamp.units_pos >= 0) { @@ -763,6 +792,51 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local, return skb; } +static bool +ieee80211_validate_monitor_radio(struct ieee80211_sub_if_data *sdata, + struct ieee80211_local *local, + struct ieee80211_rx_status *status) +{ + struct wiphy *wiphy = local->hw.wiphy; + int i, freq, bw; + + if (!wiphy->n_radio) + return true; + + switch (status->bw) { + case RATE_INFO_BW_20: + bw = 20000; + break; + case RATE_INFO_BW_40: + bw = 40000; + break; + case RATE_INFO_BW_80: + bw = 80000; + break; + case RATE_INFO_BW_160: + bw = 160000; + break; + case RATE_INFO_BW_320: + bw = 320000; + break; + default: + return false; + } + + freq = MHZ_TO_KHZ(status->freq); + + for (i = 0; i < wiphy->n_radio; i++) { + if (!(sdata->wdev.radio_mask & BIT(i))) + continue; + + if (!ieee80211_radio_freq_range_valid(&wiphy->radio[i], freq, bw)) + continue; + + return true; + } + return false; +} + /* * This function copies a received frame to all monitor interfaces and * returns a cleaned-up SKB that no longer includes the FCS nor the @@ -789,6 +863,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } + if (status->flag & RX_FLAG_RADIOTAP_VHT) + rtap_space += sizeof(struct ieee80211_radiotap_vht); + if (status->flag & RX_FLAG_RADIOTAP_HE) rtap_space += sizeof(struct ieee80211_radiotap_he); @@ -855,6 +932,10 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, chandef->chan->center_freq != status->freq) continue; + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && + !ieee80211_validate_monitor_radio(sdata, local, status)) + continue; + if (!prev_sdata) { prev_sdata = sdata; continue; @@ -1056,14 +1137,14 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { if (ieee80211_has_tods(hdr->frame_control) || !ieee80211_has_fromds(hdr->frame_control)) - return RX_DROP; + return RX_DROP_U_MESH_DS_BITS; if (ether_addr_equal(hdr->addr3, dev_addr)) - return RX_DROP; + return RX_DROP_U_MESH_A3_MISMATCH; } else { if (!ieee80211_has_a4(hdr->frame_control)) - return RX_DROP; + return RX_DROP_U_MESH_NO_A4; if (ether_addr_equal(hdr->addr4, dev_addr)) - return RX_DROP; + return RX_DROP_U_MESH_A4_MISMATCH; } } @@ -1075,20 +1156,20 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) struct ieee80211_mgmt *mgmt; if (!ieee80211_is_mgmt(hdr->frame_control)) - return RX_DROP; + return RX_DROP_U_MESH_UNEXP_DATA; if (ieee80211_is_action(hdr->frame_control)) { u8 category; /* make sure category field is present */ if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) - return RX_DROP; + return RX_DROP_U_RUNT_ACTION; mgmt = (struct ieee80211_mgmt *)hdr; category = mgmt->u.action.category; if (category != WLAN_CATEGORY_MESH_ACTION && category != WLAN_CATEGORY_SELF_PROTECTED) - return RX_DROP; + return RX_DROP_U_MESH_WRONG_ACTION; return RX_CONTINUE; } @@ -1098,7 +1179,7 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) ieee80211_is_auth(hdr->frame_control)) return RX_CONTINUE; - return RX_DROP; + return RX_DROP_U_MESH_UNEXP_MGMT; } return RX_CONTINUE; @@ -1524,7 +1605,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); if (rx->skb->len < hdrlen + 8) - return RX_DROP; + return RX_DROP_U_RUNT_DATA; skb_copy_bits(rx->skb, hdrlen + 6, ðertype, 2); if (ethertype == rx->sdata->control_port_protocol) @@ -1534,9 +1615,9 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) if (rx->sdata->vif.type == NL80211_IFTYPE_AP && cfg80211_rx_spurious_frame(rx->sdata->dev, hdr->addr2, rx->link_id, GFP_ATOMIC)) - return RX_DROP_U_SPURIOUS; + return RX_DROP_U_SPURIOUS_NOTIF; - return RX_DROP; + return RX_DROP_U_SPURIOUS; } return RX_CONTINUE; @@ -1799,7 +1880,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) link_sta->rx_stats.fragments++; u64_stats_update_begin(&link_sta->rx_stats.syncp); - link_sta->rx_stats.bytes += rx->skb->len; + u64_stats_add(&link_sta->rx_stats.bytes, rx->skb->len); u64_stats_update_end(&link_sta->rx_stats.syncp); if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { @@ -2025,7 +2106,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (rx->link_sta) { if (ieee80211_is_group_privacy_action(skb) && test_sta_flag(rx->sta, WLAN_STA_MFP)) - return RX_DROP; + return RX_DROP_U_UNPROTECTED; rx->key = rcu_dereference(rx->link_sta->gtk[mmie_keyidx]); } @@ -2110,11 +2191,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (rx->key) { if (unlikely(rx->key->flags & KEY_FLAG_TAINTED)) - return RX_DROP; + return RX_DROP_U_KEY_TAINTED; /* TODO: add threshold stuff again */ } else { - return RX_DROP; + return RX_DROP_U_UNPROTECTED; } switch (rx->key->conf.cipher) { @@ -2134,10 +2215,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) rx, IEEE80211_CCMP_256_MIC_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: - result = ieee80211_crypto_aes_cmac_decrypt(rx); + result = ieee80211_crypto_aes_cmac_decrypt( + rx, IEEE80211_CMAC_128_MIC_LEN); break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: - result = ieee80211_crypto_aes_cmac_256_decrypt(rx); + result = ieee80211_crypto_aes_cmac_decrypt( + rx, IEEE80211_CMAC_256_MIC_LEN); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: @@ -2288,7 +2371,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) goto out; if (is_multicast_ether_addr(hdr->addr1)) - return RX_DROP; + return RX_DROP_U_MCAST_FRAGMENT; I802_DEBUG_INC(rx->local->rx_handlers_fragments); @@ -2343,7 +2426,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, hdr); if (!entry) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); - return RX_DROP; + return RX_DROP_U_DEFRAG_MISMATCH; } /* "The receiver shall discard MSDUs and MMPDUs whose constituent @@ -2526,6 +2609,14 @@ ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC))) return RX_DROP_U_UNPROT_ROBUST_ACTION; + /* + * Drop unprotected (Re)Association Request/Response frame received from + * an EPP Peer. + */ + if (!ieee80211_has_protected(fc) && + ieee80211_require_encrypted_assoc(fc, rx->sta)) + return RX_DROP_U_UNPROT_UCAST_MGMT; + return RX_CONTINUE; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_drop_unencrypted_mgmt); @@ -2694,7 +2785,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) * frame, so count MSDUs. */ u64_stats_update_begin(&rx->link_sta->rx_stats.syncp); - rx->link_sta->rx_stats.msdu[rx->seqno_idx]++; + u64_stats_inc(&rx->link_sta->rx_stats.msdu[rx->seqno_idx]); u64_stats_update_end(&rx->link_sta->rx_stats.syncp); } @@ -2865,25 +2956,25 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta return RX_CONTINUE; if (!pskb_may_pull(skb, sizeof(*eth) + 6)) - return RX_DROP; + return RX_DROP_U_RUNT_MESH_DATA; mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(*eth)); mesh_hdrlen = ieee80211_get_mesh_hdrlen(mesh_hdr); if (!pskb_may_pull(skb, sizeof(*eth) + mesh_hdrlen)) - return RX_DROP; + return RX_DROP_U_RUNT_MESH_DATA; eth = (struct ethhdr *)skb->data; multicast = is_multicast_ether_addr(eth->h_dest); mesh_hdr = (struct ieee80211s_hdr *)(eth + 1); if (!mesh_hdr->ttl) - return RX_DROP; + return RX_DROP_U_MESH_NO_TTL; /* frame is in RMC, don't forward */ if (is_multicast_ether_addr(eth->h_dest) && mesh_rmc_check(sdata, eth->h_source, mesh_hdr)) - return RX_DROP; + return RX_DROP_U_MESH_RMC; /* forward packet */ if (sdata->crypto_tx_tailroom_needed_cnt) @@ -2900,7 +2991,7 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta /* has_a4 already checked in ieee80211_rx_mesh_check */ proxied_addr = mesh_hdr->eaddr2; else - return RX_DROP; + return RX_DROP_U_MESH_BAD_AE; rcu_read_lock(); mppath = mpp_path_lookup(sdata, proxied_addr); @@ -2932,14 +3023,14 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta goto rx_accept; IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl); - return RX_DROP; + return RX_DROP_U_MESH_TTL_EXPIRED; } if (!ifmsh->mshcfg.dot11MeshForwarding) { if (is_multicast_ether_addr(eth->h_dest)) goto rx_accept; - return RX_DROP; + return RX_DROP_U_MESH_NOT_FORWARDING; } skb_set_queue_mapping(skb, ieee802_1d_to_ac[skb->priority]); @@ -3125,7 +3216,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(fc))) - return RX_DROP; + return RX_DROP_U_AMSDU_WITHOUT_DATA; if (unlikely(ieee80211_has_a4(hdr->frame_control))) { switch (rx->sdata->vif.type) { @@ -3182,7 +3273,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) - return RX_DROP; + return RX_DROP_U_NULL_DATA; /* Send unexpected-4addr-frame event to hostapd */ if (ieee80211_has_a4(hdr->frame_control) && @@ -3192,7 +3283,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, rx->sta->sta.addr, rx->link_id, GFP_ATOMIC); - return RX_DROP; + return RX_DROP_U_UNEXPECTED_4ADDR; } res = __ieee80211_data_to_8023(rx, &port_control); @@ -3204,7 +3295,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return res; if (!ieee80211_frame_allowed(rx, fc)) - return RX_DROP; + return RX_DROP_U_PORT_CONTROL; /* directly handle TDLS channel switch requests/responses */ if (unlikely(((struct ethhdr *)rx->skb->data)->h_proto == @@ -3269,11 +3360,11 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) }; if (!rx->sta) - return RX_DROP; + return RX_DROP_U_UNKNOWN_STA; if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control), &bar_data, sizeof(bar_data))) - return RX_DROP; + return RX_DROP_U_RUNT_BAR; tid = le16_to_cpu(bar_data.control) >> 12; @@ -3285,7 +3376,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) - return RX_DROP; + return RX_DROP_U_BAR_OUTSIDE_SESSION; start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; event.u.ba.tid = tid; @@ -3309,7 +3400,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_QUEUED; } - return RX_DROP; + return RX_DROP_U_CTRL_FRAME; } static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, @@ -3418,16 +3509,21 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) * and unknown (reserved) frames are useless. */ if (rx->skb->len < 24) - return RX_DROP; + return RX_DROP_U_RUNT_MGMT; if (!ieee80211_is_mgmt(mgmt->frame_control)) - return RX_DROP; + return RX_DROP_U_EXPECTED_MGMT; /* drop too small action frames */ if (ieee80211_is_action(mgmt->frame_control) && rx->skb->len < IEEE80211_MIN_ACTION_SIZE) return RX_DROP_U_RUNT_ACTION; + /* Drop non-broadcast Beacon frames */ + if (ieee80211_is_beacon(mgmt->frame_control) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_U_NONBCAST_BEACON; + if (rx->sdata->vif.type == NL80211_IFTYPE_AP && ieee80211_is_beacon(mgmt->frame_control) && !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) { @@ -3521,8 +3617,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.category) { case WLAN_CATEGORY_HT: - /* reject HT action frames from stations not supporting HT */ - if (!rx->link_sta->pub->ht_cap.ht_supported) + /* reject HT action frames from stations not supporting HT + * or not HE Capable + */ + if (!rx->link_sta->pub->ht_cap.ht_supported && + !rx->link_sta->pub->he_cap.has_he) goto invalid; if (sdata->vif.type != NL80211_IFTYPE_STATION && @@ -3829,6 +3928,14 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) u.action.u.epcs)) goto invalid; goto queue; + case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: + if (sdata->vif.type != NL80211_IFTYPE_AP) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.eml_omn)) + goto invalid; + goto queue; default: break; } @@ -3955,10 +4062,10 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) - return RX_DROP; + return RX_DROP_U_MALFORMED_ACTION; if (is_multicast_ether_addr(mgmt->da)) - return RX_DROP; + return RX_DROP_U_UNKNOWN_MCAST_ACTION; /* do not return rejected action frames */ if (mgmt->u.action.category & 0x80) @@ -4003,7 +4110,7 @@ ieee80211_rx_h_ext(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP; + return RX_DROP_U_UNEXPECTED_EXT_FRAME; /* for now only beacons are ext, so queue them */ ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4024,7 +4131,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_OCB && sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP; + return RX_DROP_U_UNHANDLED_MGMT; switch (stype) { case cpu_to_le16(IEEE80211_STYPE_AUTH): @@ -4035,32 +4142,32 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_DEAUTH): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP; + return RX_DROP_U_MCAST_DEAUTH; /* process only for station/IBSS */ if (sdata->vif.type != NL80211_IFTYPE_STATION && sdata->vif.type != NL80211_IFTYPE_ADHOC) - return RX_DROP; + return RX_DROP_U_UNHANDLED_DEAUTH; break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP; + return RX_DROP_U_MCAST_DISASSOC; /* process only for station */ if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP; + return RX_DROP_U_UNHANDLED_DISASSOC; break; case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ): /* process only for ibss and mesh */ if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) - return RX_DROP; + return RX_DROP_U_UNHANDLED_PREQ; break; default: - return RX_DROP; + return RX_DROP_U_UNHANDLED_MGMT_STYPE; } ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4088,7 +4195,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) { - ieee80211_rx_result res = RX_DROP; + ieee80211_rx_result res; struct sk_buff *skb; #define CALL_RXH(rxh) \ @@ -4114,8 +4221,10 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, */ rx->skb = skb; - if (WARN_ON_ONCE(!rx->link)) + if (WARN_ON_ONCE(!rx->link)) { + res = RX_DROP_U_NO_LINK; goto rxh_next; + } CALL_RXH(ieee80211_rx_h_check_more_data); CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll); @@ -4152,7 +4261,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) { struct sk_buff_head reorder_release; - ieee80211_rx_result res = RX_DROP; + ieee80211_rx_result res; __skb_queue_head_init(&reorder_release); @@ -4777,8 +4886,8 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, * frame, so count MSDUs. */ u64_stats_update_begin(&stats->syncp); - stats->msdu[rx->seqno_idx]++; - stats->bytes += orig_len; + u64_stats_inc(&stats->msdu[rx->seqno_idx]); + u64_stats_add(&stats->bytes, orig_len); u64_stats_update_end(&stats->syncp); if (fast_rx->internal_forward) { @@ -4903,6 +5012,11 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* after this point, don't punt to the slowpath! */ + if (fast_rx->uses_rss) + stats = this_cpu_ptr(rx->link_sta->pcpu_rx_stats); + else + stats = &rx->link_sta->rx_stats; + if (rx->key && !(status->flag & RX_FLAG_MIC_STRIPPED) && pskb_trim(skb, skb->len - fast_rx->icv_len)) goto drop; @@ -4937,6 +5051,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb); switch (res) { case RX_QUEUED: + stats->last_rx = jiffies; + stats->last_rate = sta_stats_encode_rate(status); return true; case RX_CONTINUE: break; @@ -4950,11 +5066,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, drop: dev_kfree_skb(skb); - if (fast_rx->uses_rss) - stats = this_cpu_ptr(rx->link_sta->pcpu_rx_stats); - else - stats = &rx->link_sta->rx_stats; - stats->dropped++; return true; } @@ -5415,6 +5526,32 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss, status->eht.gi)) goto drop; break; + case RX_ENC_UHR: + if (WARN_ONCE(!(status->rate_idx <= 15 || + status->rate_idx == 17 || + status->rate_idx == 19 || + status->rate_idx == 20 || + status->rate_idx == 23) || + !status->nss || + status->nss > 8 || + status->uhr.gi > NL80211_RATE_INFO_EHT_GI_3_2, + "Rate marked as a UHR rate but data is invalid: MCS:%d, NSS:%d, GI:%d\n", + status->rate_idx, status->nss, status->uhr.gi)) + goto drop; + if (WARN_ONCE(status->uhr.elr && + (status->nss != 1 || status->rate_idx > 1 || + status->uhr.gi != NL80211_RATE_INFO_EHT_GI_1_6 || + status->bw != RATE_INFO_BW_20 || status->uhr.im), + "bad UHR ELR MCS MCS:%d, NSS:%d, GI:%d, BW:%d, IM:%d\n", + status->rate_idx, status->nss, status->uhr.gi, + status->bw, status->uhr.im)) + goto drop; + if (WARN_ONCE(status->uhr.im && + (status->nss != 1 || status->rate_idx == 15), + "bad UHR IM MCS MCS:%d, NSS:%d\n", + status->rate_idx, status->nss)) + goto drop; + break; default: WARN_ON_ONCE(1); fallthrough; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index bb9563f50e7b..4823c8d45639 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -76,7 +76,11 @@ void ieee80211_inform_bss(struct wiphy *wiphy, if (!update_data) return; - elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); + elems = ieee802_11_parse_elems(ies->data, ies->len, + update_data->beacon ? + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON : + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP, + NULL); if (!elems) return; @@ -343,8 +347,13 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) mgmt->da)) return; } else { - /* Beacons are expected only with broadcast address */ - if (!is_broadcast_ether_addr(mgmt->da)) + /* + * Non-S1G beacons are expected only with broadcast address. + * S1G beacons only carry the SA so no DA check is required + * nor possible. + */ + if (!ieee80211_is_s1g_beacon(mgmt->frame_control) && + !is_broadcast_ether_addr(mgmt->da)) return; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f4d3b67fda06..dd51a578fbc5 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/module.h> @@ -360,7 +360,9 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) struct link_sta_info *link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); struct ieee80211_link_data *link; + unsigned int start; int ac, tid; + u64 value; u32 thr; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { @@ -369,8 +371,13 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) sta->rem_link_stats.tx_bytes += link_sta->tx_stats.bytes[ac]; } + do { + start = u64_stats_fetch_begin(&link_sta->rx_stats.syncp); + value = u64_stats_read(&link_sta->rx_stats.bytes); + } while (u64_stats_fetch_retry(&link_sta->rx_stats.syncp, start)); + sta->rem_link_stats.rx_packets += link_sta->rx_stats.packets; - sta->rem_link_stats.rx_bytes += link_sta->rx_stats.bytes; + sta->rem_link_stats.rx_bytes += value; sta->rem_link_stats.tx_retries += link_sta->status_stats.retry_count; sta->rem_link_stats.tx_failed += link_sta->status_stats.retry_failed; sta->rem_link_stats.rx_dropped_misc += link_sta->rx_stats.dropped; @@ -380,8 +387,13 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) sta->rem_link_stats.expected_throughput += thr; for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { - sta->rem_link_stats.pertid_stats.rx_msdu += - link_sta->rx_stats.msdu[tid]; + do { + start = u64_stats_fetch_begin(&link_sta->rx_stats.syncp); + value = u64_stats_read(&link_sta->rx_stats.msdu[tid]); + } while (u64_stats_fetch_retry(&link_sta->rx_stats.syncp, + start)); + + sta->rem_link_stats.pertid_stats.rx_msdu += value; sta->rem_link_stats.pertid_stats.tx_msdu += link_sta->tx_stats.msdu[tid]; sta->rem_link_stats.pertid_stats.tx_msdu_retries += @@ -632,7 +644,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work); #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) { - sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); + sta->mesh = kzalloc_obj(*sta->mesh, gfp); if (!sta->mesh) goto free; sta->mesh->plink_sta = sta; @@ -891,7 +903,7 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) goto out_cleanup; } - sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); + sinfo = kzalloc_obj(struct station_info); if (!sinfo) { err = -ENOMEM; goto out_cleanup; @@ -1533,6 +1545,10 @@ static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) } } + sinfo = kzalloc_obj(*sinfo); + if (sinfo) + sta_set_sinfo(sta, sinfo, true); + if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); @@ -1541,9 +1557,6 @@ static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); - sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); - if (sinfo) - sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); @@ -2554,6 +2567,17 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; + case STA_STATS_RATE_TYPE_UHR: + rinfo->flags = RATE_INFO_FLAGS_UHR_MCS; + rinfo->mcs = STA_STATS_GET(UHR_MCS, rate); + rinfo->nss = STA_STATS_GET(UHR_NSS, rate); + rinfo->eht_gi = STA_STATS_GET(UHR_GI, rate); + rinfo->eht_ru_alloc = STA_STATS_GET(UHR_RU, rate); + if (STA_STATS_GET(UHR_ELR, rate)) + rinfo->flags |= RATE_INFO_FLAGS_UHR_ELR_MCS; + if (STA_STATS_GET(UHR_IM, rate)) + rinfo->flags |= RATE_INFO_FLAGS_UHR_IM; + break; } } @@ -2577,7 +2601,7 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, do { start = u64_stats_fetch_begin(&rxstats->syncp); - value = rxstats->msdu[tid]; + value = u64_stats_read(&rxstats->msdu[tid]); } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; @@ -2653,7 +2677,7 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) do { start = u64_stats_fetch_begin(&rxstats->syncp); - value = rxstats->bytes; + value = u64_stats_read(&rxstats->bytes); } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; @@ -2758,7 +2782,9 @@ static void sta_set_link_sinfo(struct sta_info *sta, } link_sinfo->inactive_time = - jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, link_id)); + jiffies_delta_to_msecs(jiffies - + ieee80211_sta_last_active(sta, + link_id)); if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { @@ -2991,7 +3017,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = - jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, -1)); + jiffies_delta_to_msecs(jiffies - + ieee80211_sta_last_active(sta, -1)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { @@ -3282,7 +3309,7 @@ int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) sta->link[link_id])) return -EBUSY; - alloc = kzalloc(sizeof(*alloc), GFP_KERNEL); + alloc = kzalloc_obj(*alloc); if (!alloc) return -ENOMEM; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 5288d5286651..2875ef7d7946 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -3,7 +3,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright(c) 2020-2024 Intel Corporation + * Copyright(c) 2020-2026 Intel Corporation */ #ifndef STA_INFO_H @@ -434,8 +434,8 @@ struct ieee80211_sta_rx_stats { s8 chain_signal_last[IEEE80211_MAX_CHAINS]; u32 last_rate; struct u64_stats_sync syncp; - u64 bytes; - u64 msdu[IEEE80211_NUM_TIDS + 1]; + u64_stats_t bytes; + u64_stats_t msdu[IEEE80211_NUM_TIDS + 1]; }; /* @@ -1009,25 +1009,49 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_HE, STA_STATS_RATE_TYPE_S1G, STA_STATS_RATE_TYPE_EHT, + STA_STATS_RATE_TYPE_UHR, }; -#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) -#define STA_STATS_FIELD_LEGACY_IDX GENMASK( 3, 0) -#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) -#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) -#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) -#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_EHT_MCS GENMASK( 3, 0) -#define STA_STATS_FIELD_EHT_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_BW GENMASK(12, 8) -#define STA_STATS_FIELD_SGI GENMASK(13, 13) -#define STA_STATS_FIELD_TYPE GENMASK(16, 14) -#define STA_STATS_FIELD_HE_RU GENMASK(19, 17) -#define STA_STATS_FIELD_HE_GI GENMASK(21, 20) -#define STA_STATS_FIELD_HE_DCM GENMASK(22, 22) -#define STA_STATS_FIELD_EHT_RU GENMASK(20, 17) -#define STA_STATS_FIELD_EHT_GI GENMASK(22, 21) +/* common */ +#define STA_STATS_FIELD_TYPE 0x0000000F +#define STA_STATS_FIELD_BW 0x000001F0 +#define STA_STATS_FIELD_RESERVED 0x00000E00 + +/* STA_STATS_RATE_TYPE_LEGACY */ +#define STA_STATS_FIELD_LEGACY_IDX 0x0000F000 +#define STA_STATS_FIELD_LEGACY_BAND 0x000F0000 + +/* STA_STATS_RATE_TYPE_HT */ +#define STA_STATS_FIELD_HT_MCS 0x000FF000 + +/* STA_STATS_RATE_TYPE_VHT */ +#define STA_STATS_FIELD_VHT_MCS 0x0000F000 +#define STA_STATS_FIELD_VHT_NSS 0x000F0000 + +/* HT & VHT */ +#define STA_STATS_FIELD_SGI 0x00100000 + +/* STA_STATS_RATE_TYPE_HE */ +#define STA_STATS_FIELD_HE_MCS 0x0000F000 +#define STA_STATS_FIELD_HE_NSS 0x000F0000 +#define STA_STATS_FIELD_HE_RU 0x00700000 +#define STA_STATS_FIELD_HE_GI 0x01800000 +#define STA_STATS_FIELD_HE_DCM 0x02000000 + +/* STA_STATS_RATE_TYPE_EHT */ +#define STA_STATS_FIELD_EHT_MCS 0x0000F000 +#define STA_STATS_FIELD_EHT_NSS 0x000F0000 +#define STA_STATS_FIELD_EHT_RU 0x00F00000 +#define STA_STATS_FIELD_EHT_GI 0x03000000 + +/* STA_STATS_RATE_TYPE_UHR */ +#define STA_STATS_FIELD_UHR_MCS 0x0001F000 +#define STA_STATS_FIELD_UHR_NSS 0x001E0000 +#define STA_STATS_FIELD_UHR_RU 0x01E00000 +#define STA_STATS_FIELD_UHR_GI 0x06000000 +#define STA_STATS_FIELD_UHR_ELR 0x08000000 +#define STA_STATS_FIELD_UHR_IM 0x10000000 + #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -1040,8 +1064,15 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r = STA_STATS_FIELD(BW, s->bw); - if (s->enc_flags & RX_ENC_FLAG_SHORT_GI) - r |= STA_STATS_FIELD(SGI, 1); + switch (s->encoding) { + case RX_ENC_HT: + case RX_ENC_VHT: + if (s->enc_flags & RX_ENC_FLAG_SHORT_GI) + r |= STA_STATS_FIELD(SGI, 1); + break; + default: + break; + } switch (s->encoding) { case RX_ENC_VHT: @@ -1073,6 +1104,15 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(EHT_GI, s->eht.gi); r |= STA_STATS_FIELD(EHT_RU, s->eht.ru); break; + case RX_ENC_UHR: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_UHR); + r |= STA_STATS_FIELD(UHR_NSS, s->nss); + r |= STA_STATS_FIELD(UHR_MCS, s->rate_idx); + r |= STA_STATS_FIELD(UHR_GI, s->uhr.gi); + r |= STA_STATS_FIELD(UHR_RU, s->uhr.ru); + r |= STA_STATS_FIELD(UHR_ELR, s->uhr.elr); + r |= STA_STATS_FIELD(UHR_IM, s->uhr.im); + break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index ba5fbacbeeda..1dca2fae05a5 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -6,7 +6,7 @@ * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH * Copyright 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2019, 2021-2024 Intel Corporation + * Copyright (C) 2019, 2021-2025 Intel Corporation */ #include <linux/ieee80211.h> @@ -1449,7 +1449,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, } sta = sta_info_get(sdata, peer); - if (!sta) + if (!sta || !sta->sta.tdls) return -ENOLINK; iee80211_tdls_recalc_chanctx(sdata, sta); @@ -1783,7 +1783,10 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, } elems = ieee802_11_parse_elems(tf->u.chan_switch_resp.variable, - skb->len - baselen, false, NULL); + skb->len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) { ret = -ENOMEM; goto out; @@ -1902,7 +1905,10 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, } elems = ieee802_11_parse_elems(tf->u.chan_switch_req.variable, - skb->len - baselen, false, NULL); + skb->len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return -ENOMEM; diff --git a/net/mac80211/tests/elems.c b/net/mac80211/tests/elems.c index a53c55a879a8..1039794a0183 100644 --- a/net/mac80211/tests/elems.c +++ b/net/mac80211/tests/elems.c @@ -2,7 +2,7 @@ /* * KUnit tests for element parsing * - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation */ #include <kunit/test.h> #include "../ieee80211_i.h" @@ -15,6 +15,8 @@ static void mle_defrag(struct kunit *test) .link_id = 12, .from_ap = true, .mode = IEEE80211_CONN_MODE_EHT, + /* type is not really relevant here */ + .type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON, }; struct ieee802_11_elems *parsed; struct sk_buff *skb; diff --git a/net/mac80211/tests/util.c b/net/mac80211/tests/util.c index 9c2d63a5cd2b..0b01264af577 100644 --- a/net/mac80211/tests/util.c +++ b/net/mac80211/tests/util.c @@ -195,16 +195,16 @@ int t_sdata_init(struct kunit_resource *resource, void *ctx) struct kunit *test = kunit_get_current_test(); struct t_sdata *t_sdata; - t_sdata = kzalloc(sizeof(*t_sdata), GFP_KERNEL); + t_sdata = kzalloc_obj(*t_sdata); KUNIT_ASSERT_NOT_NULL(test, t_sdata); resource->data = t_sdata; resource->name = "sdata"; - t_sdata->sdata = kzalloc(sizeof(*t_sdata->sdata), GFP_KERNEL); + t_sdata->sdata = kzalloc_obj(*t_sdata->sdata); KUNIT_ASSERT_NOT_NULL(test, t_sdata->sdata); - t_sdata->wiphy = kzalloc(sizeof(*t_sdata->wiphy), GFP_KERNEL); + t_sdata->wiphy = kzalloc_obj(*t_sdata->wiphy); KUNIT_ASSERT_NOT_NULL(test, t_sdata->wiphy); strscpy(t_sdata->sdata->name, "kunit"); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 0bfbce157486..c04d4547e8f4 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -3353,6 +3353,38 @@ TRACE_EVENT(drv_prep_add_interface, ) ); +TRACE_EVENT(drv_set_eml_op_mode, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + unsigned int link_id, + u8 control, u16 link_bitmap), + + TP_ARGS(local, sdata, sta, link_id, control, link_bitmap), + + TP_STRUCT__entry(LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u32, link_id) + __field(u8, control) + __field(u16, link_bitmap)), + + TP_fast_assign(LOCAL_ASSIGN; + VIF_ASSIGN; + STA_NAMED_ASSIGN(sta); + __entry->link_id = link_id; + __entry->control = control; + __entry->link_bitmap = link_bitmap; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT + " (link:%d control:%02x link_bitmap:%04x)", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id, + __entry->control, __entry->link_bitmap + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e7b141c55f7a..b7aedaab8483 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -640,7 +640,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, tx->skb) && - !ieee80211_is_group_privacy_action(tx->skb)) + !ieee80211_is_group_privacy_action(tx->skb) && + !ieee80211_require_encrypted_assoc(hdr->frame_control, + tx->sta)) tx->key = NULL; else skip_hw = (tx->key->conf.flags & @@ -1062,9 +1064,11 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_256_MIC_LEN); case WLAN_CIPHER_SUITE_AES_CMAC: - return ieee80211_crypto_aes_cmac_encrypt(tx); + return ieee80211_crypto_aes_cmac_encrypt( + tx, IEEE80211_CMAC_128_MIC_LEN); case WLAN_CIPHER_SUITE_BIP_CMAC_256: - return ieee80211_crypto_aes_cmac_256_encrypt(tx); + return ieee80211_crypto_aes_cmac_encrypt( + tx, IEEE80211_CMAC_256_MIC_LEN); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return ieee80211_crypto_aes_gmac_encrypt(tx); @@ -1607,8 +1611,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) local->cparams.target = MS2TIME(20); local->cparams.ecn = true; - local->cvars = kvcalloc(fq->flows_cnt, sizeof(local->cvars[0]), - GFP_KERNEL); + local->cvars = kvzalloc_objs(local->cvars[0], fq->flows_cnt); if (!local->cvars) { spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); @@ -1896,8 +1899,10 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_tx_data tx; struct sk_buff *skb2; - if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP) + if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP) { + kfree_skb(skb); return false; + } info->band = band; info->control.vif = vif; @@ -2395,6 +2400,8 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (chanctx_conf) chandef = &chanctx_conf->def; + else if (local->emulate_chanctx) + chandef = &local->hw.conf.chandef; else goto fail_rcu; @@ -5543,8 +5550,7 @@ ieee80211_beacon_get_ap_ema_list(struct ieee80211_hw *hw, if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt) return NULL; - ema = kzalloc(struct_size(ema, bcn, beacon->mbssid_ies->cnt), - GFP_ATOMIC); + ema = kzalloc_flex(*ema, bcn, beacon->mbssid_ies->cnt, GFP_ATOMIC); if (!ema) return NULL; diff --git a/net/mac80211/uhr.c b/net/mac80211/uhr.c new file mode 100644 index 000000000000..2d8f5e5480ef --- /dev/null +++ b/net/mac80211/uhr.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * UHR handling + * + * Copyright(c) 2025-2026 Intel Corporation + */ + +#include "ieee80211_i.h" + +void +ieee80211_uhr_cap_ie_to_sta_uhr_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_uhr_cap *uhr_cap, + u8 uhr_cap_len, + struct link_sta_info *link_sta) +{ + struct ieee80211_sta_uhr_cap *sta_uhr_cap = &link_sta->pub->uhr_cap; + bool from_ap; + + memset(sta_uhr_cap, 0, sizeof(*sta_uhr_cap)); + + if (!ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif)) + return; + + sta_uhr_cap->has_uhr = true; + + sta_uhr_cap->mac = uhr_cap->mac; + from_ap = sdata->vif.type == NL80211_IFTYPE_STATION; + sta_uhr_cap->phy = *ieee80211_uhr_phy_cap(uhr_cap, from_ap); +} diff --git a/net/mac80211/util.c b/net/mac80211/util.c index c9931537f9d2..b2e6c8b98381 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation * * utilities for mac80211 */ @@ -101,7 +101,6 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, return NULL; } -EXPORT_SYMBOL(ieee80211_get_bssid); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { @@ -800,20 +799,56 @@ void ieee80211_iterate_active_interfaces_atomic( } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); -void ieee80211_iterate_active_interfaces_mtx( - struct ieee80211_hw *hw, u32 iter_flags, - void (*iterator)(void *data, u8 *mac, - struct ieee80211_vif *vif), - void *data) +struct ieee80211_vif * +__ieee80211_iterate_interfaces(struct ieee80211_hw *hw, + struct ieee80211_vif *prev, + u32 iter_flags) { + bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; + struct ieee80211_sub_if_data *sdata = NULL, *monitor; struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(hw->wiphy); - __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, - iterator, data); + if (prev) + sdata = vif_to_sdata(prev); + + monitor = rcu_dereference_check(local->monitor_sdata, + lockdep_is_held(&hw->wiphy->mtx)); + if (monitor && monitor == sdata) + return NULL; + + sdata = list_prepare_entry(sdata, &local->interfaces, list); + list_for_each_entry_continue(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; + break; + case NL80211_IFTYPE_AP_VLAN: + continue; + default: + break; + } + if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && + active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) + continue; + if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && + !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) + continue; + if (ieee80211_sdata_running(sdata) || !active_only) + return &sdata->vif; + } + + if (monitor && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || + monitor->flags & IEEE80211_SDATA_IN_DRIVER)) + return &monitor->vif; + + return NULL; } -EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); +EXPORT_SYMBOL_GPL(__ieee80211_iterate_interfaces); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, @@ -844,18 +879,29 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); -void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, - void (*iterator)(void *data, - struct ieee80211_sta *sta), - void *data) +struct ieee80211_sta * +__ieee80211_iterate_stations(struct ieee80211_hw *hw, + struct ieee80211_sta *prev) { struct ieee80211_local *local = hw_to_local(hw); + struct sta_info *sta = NULL; lockdep_assert_wiphy(local->hw.wiphy); - __iterate_stations(local, iterator, data); + if (prev) + sta = container_of(prev, struct sta_info, sta); + + sta = list_prepare_entry(sta, &local->sta_list, list); + list_for_each_entry_continue(sta, &local->sta_list, list) { + if (!sta->uploaded) + continue; + + return &sta->sta; + } + + return NULL; } -EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_mtx); +EXPORT_SYMBOL_GPL(__ieee80211_iterate_stations); struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { @@ -1096,14 +1142,17 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, .ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC), .basic.len = sizeof(mle.basic), }; + bool add_mle; int err; - memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); + add_mle = (multi_link && + !cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, + extra, extra_len)); /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN + - multi_link * sizeof(mle)); + add_mle * sizeof(mle)); if (!skb) return; @@ -1120,8 +1169,11 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, mgmt->u.auth.status_code = cpu_to_le16(status); if (extra) skb_put_data(skb, extra, extra_len); - if (multi_link) + + if (add_mle) { + memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); skb_put_data(skb, &mle, sizeof(mle)); + } if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); @@ -1369,6 +1421,13 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, if (err) return err; + if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), + IEEE80211_CHAN_NO_UHR)) { + err = ieee80211_put_uhr_cap(skb, sdata, sband); + if (err) + return err; + } + /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. @@ -1683,9 +1742,7 @@ static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) if (WARN_ON(res)) return res; - funcs = kcalloc(sdata->local->hw.max_nan_de_entries + 1, - sizeof(*funcs), - GFP_KERNEL); + funcs = kzalloc_objs(*funcs, sdata->local->hw.max_nan_de_entries + 1); if (!funcs) return -ENOMEM; @@ -2206,9 +2263,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) } } + /* Passing NULL means an interface is picked for configuration */ if (local->virt_monitors > 0 && local->virt_monitors == local->open_count) - ieee80211_add_virtual_monitor(local); + ieee80211_add_virtual_monitor(local, NULL); if (!suspended) return 0; @@ -2347,7 +2405,7 @@ void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); - ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, chanctx); } } @@ -3544,7 +3602,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, if (ctx && &ctx->conf != chanctx_conf) continue; - wiphy_delayed_work_cancel(local->hw.wiphy, + wiphy_hrtimer_work_cancel(local->hw.wiphy, &link->dfs_cac_timer_work); if (!sdata->wdev.links[link_id].cac_started) @@ -4016,23 +4074,23 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return 0; - list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) - if (link->reserved_radar_required) - radar_detect |= BIT(link->reserved.oper.width); - - /* - * An in-place reservation context should not have any assigned vifs - * until it replaces the other context. - */ - WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && - !list_empty(&ctx->assigned_links)); + for_each_sdata_link(local, link) { + if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { + /* + * An in-place reservation context should not have any + * assigned links until it replaces the other context. + */ + WARN_ON(ctx->replace_state == + IEEE80211_CHANCTX_REPLACES_OTHER); - list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { - if (!link->radar_required) - continue; + if (link->radar_required) + radar_detect |= + BIT(link->conf->chanreq.oper.width); + } - radar_detect |= - BIT(link->conf->chanreq.oper.width); + if (link->reserved_chanctx == ctx && + link->reserved_radar_required) + radar_detect |= BIT(link->reserved.oper.width); } return radar_detect; @@ -4474,6 +4532,32 @@ int ieee80211_put_eht_cap(struct sk_buff *skb, return 0; } +int ieee80211_put_uhr_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband) +{ + const struct ieee80211_sta_uhr_cap *uhr_cap = + ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif); + int len; + + if (!uhr_cap) + return 0; + + len = 2 + 1 + sizeof(struct ieee80211_uhr_cap) + + sizeof(struct ieee80211_uhr_cap_phy); + + if (skb_tailroom(skb) < len) + return -ENOBUFS; + + skb_put_u8(skb, WLAN_EID_EXTENSION); + skb_put_u8(skb, len - 2); + skb_put_u8(skb, WLAN_EID_EXT_UHR_CAPA); + skb_put_data(skb, &uhr_cap->mac, sizeof(uhr_cap->mac)); + skb_put_data(skb, &uhr_cap->phy, sizeof(uhr_cap->phy)); + + return 0; +} + const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) { static const char * const modes[] = { @@ -4483,6 +4567,7 @@ const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) [IEEE80211_CONN_MODE_VHT] = "VHT", [IEEE80211_CONN_MODE_HE] = "HE", [IEEE80211_CONN_MODE_EHT] = "EHT", + [IEEE80211_CONN_MODE_UHR] = "UHR", }; if (WARN_ON(mode >= ARRAY_SIZE(modes))) diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 40d5d9e48479..fdf98c21d32c 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -527,7 +527,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && - !ieee80211_is_robust_mgmt_frame(skb)) + !ieee80211_is_robust_mgmt_frame(skb) && + !ieee80211_require_encrypted_assoc(hdr->frame_control, rx->sta)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { @@ -723,7 +724,8 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && - !ieee80211_is_robust_mgmt_frame(skb)) + !ieee80211_is_robust_mgmt_frame(skb) && + !ieee80211_require_encrypted_assoc(hdr->frame_control, rx->sta)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { @@ -828,12 +830,14 @@ static inline void bip_ipn_swap(u8 *d, const u8 *s) ieee80211_tx_result -ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx, + unsigned int mic_len) { struct sk_buff *skb; struct ieee80211_tx_info *info; struct ieee80211_key *key = tx->key; - struct ieee80211_mmie *mmie; + struct ieee80211_mmie_var *mmie; + size_t mmie_len; u8 aad[20]; u64 pn64; @@ -848,60 +852,14 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; - if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) - return TX_DROP; - - mmie = skb_put(skb, sizeof(*mmie)); - mmie->element_id = WLAN_EID_MMIE; - mmie->length = sizeof(*mmie) - 2; - mmie->key_id = cpu_to_le16(key->conf.keyidx); - - /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->conf.tx_pn); - - bip_ipn_set64(mmie->sequence_number, pn64); - - if (info->control.hw_key) - return TX_CONTINUE; - - bip_aad(skb, aad); - - /* - * MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64) - */ - ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mmie->mic); - - return TX_CONTINUE; -} - -ieee80211_tx_result -ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) -{ - struct sk_buff *skb; - struct ieee80211_tx_info *info; - struct ieee80211_key *key = tx->key; - struct ieee80211_mmie_16 *mmie; - u8 aad[20]; - u64 pn64; - - if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) - return TX_DROP; - - skb = skb_peek(&tx->skbs); - - info = IEEE80211_SKB_CB(skb); - - if (info->control.hw_key && - !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) - return TX_CONTINUE; + mmie_len = sizeof(*mmie) + mic_len; - if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) + if (WARN_ON(skb_tailroom(skb) < mmie_len)) return TX_DROP; - mmie = skb_put(skb, sizeof(*mmie)); + mmie = skb_put(skb, mmie_len); mmie->element_id = WLAN_EID_MMIE; - mmie->length = sizeof(*mmie) - 2; + mmie->length = mmie_len - 2; mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ @@ -914,86 +872,40 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) bip_aad(skb, aad); - /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128) - */ - ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mmie->mic); + if (ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, + mmie->mic, mic_len)) + return TX_DROP; return TX_CONTINUE; } ieee80211_rx_result -ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx, + unsigned int mic_len) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_key *key = rx->key; - struct ieee80211_mmie *mmie; - u8 aad[20], mic[8], ipn[6]; + struct ieee80211_mmie_var *mmie; + size_t mmie_len; + u8 aad[20], mic[IEEE80211_CMAC_256_MIC_LEN], ipn[6]; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; - /* management frames are already linear */ - - if (skb->len < 24 + sizeof(*mmie)) - return RX_DROP_U_SHORT_CMAC; - - mmie = (struct ieee80211_mmie *) - (skb->data + skb->len - sizeof(*mmie)); - if (mmie->element_id != WLAN_EID_MMIE || - mmie->length != sizeof(*mmie) - 2) - return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */ - - bip_ipn_swap(ipn, mmie->sequence_number); - - if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { - key->u.aes_cmac.replays++; - return RX_DROP_U_REPLAY; - } - - if (!(status->flag & RX_FLAG_DECRYPTED)) { - /* hardware didn't decrypt/verify MIC */ - bip_aad(skb, aad); - ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mic); - if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { - key->u.aes_cmac.icverrors++; - return RX_DROP_U_MIC_FAIL; - } - } - - memcpy(key->u.aes_cmac.rx_pn, ipn, 6); - - /* Remove MMIE */ - skb_trim(skb, skb->len - sizeof(*mmie)); - - return RX_CONTINUE; -} - -ieee80211_rx_result -ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) -{ - struct sk_buff *skb = rx->skb; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - struct ieee80211_key *key = rx->key; - struct ieee80211_mmie_16 *mmie; - u8 aad[20], mic[16], ipn[6]; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - - if (!ieee80211_is_mgmt(hdr->frame_control)) - return RX_CONTINUE; + mmie_len = sizeof(*mmie) + mic_len; /* management frames are already linear */ - if (skb->len < 24 + sizeof(*mmie)) - return RX_DROP_U_SHORT_CMAC256; + if (skb->len < 24 + mmie_len) + return mic_len == IEEE80211_CMAC_128_MIC_LEN ? + RX_DROP_U_SHORT_CMAC : RX_DROP_U_SHORT_CMAC256; - mmie = (struct ieee80211_mmie_16 *) - (skb->data + skb->len - sizeof(*mmie)); + mmie = (struct ieee80211_mmie_var *)(skb->data + skb->len - mmie_len); if (mmie->element_id != WLAN_EID_MMIE || - mmie->length != sizeof(*mmie) - 2) + mmie->length != mmie_len - 2) return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */ bip_ipn_swap(ipn, mmie->sequence_number); @@ -1006,9 +918,11 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) if (!(status->flag & RX_FLAG_DECRYPTED)) { /* hardware didn't decrypt/verify MIC */ bip_aad(skb, aad); - ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mic); - if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { + if (ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, + mic, mic_len)) + return RX_DROP_U_DECRYPT_FAIL; + if (crypto_memneq(mic, mmie->mic, mic_len)) { key->u.aes_cmac.icverrors++; return RX_DROP_U_MIC_FAIL; } @@ -1017,7 +931,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) memcpy(key->u.aes_cmac.rx_pn, ipn, 6); /* Remove MMIE */ - skb_trim(skb, skb->len - sizeof(*mmie)); + skb_trim(skb, skb->len - mmie_len); return RX_CONTINUE; } @@ -1113,7 +1027,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx) memcpy(nonce, hdr->addr2, ETH_ALEN); memcpy(nonce + ETH_ALEN, ipn, 6); - mic = kmalloc(GMAC_MIC_LEN, GFP_ATOMIC); + mic = kmalloc(IEEE80211_GMAC_MIC_LEN, GFP_ATOMIC); if (!mic) return RX_DROP_U_OOM; if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce, diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h index a9a81abb5479..6e8846dfe710 100644 --- a/net/mac80211/wpa.h +++ b/net/mac80211/wpa.h @@ -29,13 +29,11 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, unsigned int mic_len); ieee80211_tx_result -ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx); -ieee80211_tx_result -ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx); -ieee80211_rx_result -ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx); +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx, + unsigned int mic_len); ieee80211_rx_result -ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx); +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx, + unsigned int mic_len); ieee80211_tx_result ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index ef7f23af043f..e8484031a390 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -339,7 +339,7 @@ static int mac802154_associate(struct wpan_phy *wpan_phy, if (coord->mode == IEEE802154_SHORT_ADDRESSING) return -EINVAL; - parent = kzalloc(sizeof(*parent), GFP_KERNEL); + parent = kzalloc_obj(*parent); if (!parent) return -ENOMEM; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 9e4631fade90..000be60d9580 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -469,7 +469,9 @@ static int mac802154_header_create(struct sk_buff *skb, } static int -mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) +mac802154_header_parse(const struct sk_buff *skb, + const struct net_device *dev, + unsigned char *haddr) { struct ieee802154_hdr hdr; diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index f13b07ebfb98..e8512578398e 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -117,7 +117,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) struct mac802154_llsec_key *key; int i; - key = kzalloc(sizeof(*key), GFP_KERNEL); + key = kzalloc_obj(*key); if (!key) return NULL; @@ -241,7 +241,7 @@ int mac802154_llsec_key_add(struct mac802154_llsec *sec, break; } - new = kzalloc(sizeof(*new), GFP_KERNEL); + new = kzalloc_obj(*new); if (!new) return -ENOMEM; @@ -369,7 +369,7 @@ int mac802154_llsec_dev_add(struct mac802154_llsec *sec, llsec_dev_find_long(sec, dev->hwaddr)) return -EEXIST; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kmalloc_obj(*entry); if (!entry) return -ENOMEM; @@ -441,7 +441,7 @@ int mac802154_llsec_devkey_add(struct mac802154_llsec *sec, if (llsec_devkey_find(dev, &key->key_id)) return -EEXIST; - devkey = kmalloc(sizeof(*devkey), GFP_KERNEL); + devkey = kmalloc_obj(*devkey); if (!devkey) return -ENOMEM; @@ -500,7 +500,7 @@ int mac802154_llsec_seclevel_add(struct mac802154_llsec *sec, if (llsec_find_seclevel(sec, sl)) return -EEXIST; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kmalloc_obj(*entry); if (!entry) return -ENOMEM; @@ -925,7 +925,7 @@ llsec_update_devkey_record(struct mac802154_llsec_device *dev, if (!devkey) { struct mac802154_llsec_device_key *next; - next = kzalloc(sizeof(*devkey), GFP_ATOMIC); + next = kzalloc_obj(*devkey, GFP_ATOMIC); if (!next) return -ENOMEM; diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index aac359b5c71d..cd8f2a11920d 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -213,7 +213,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, if (!mac802154_is_scanning(sdata->local)) goto fail; - mac_pkt = kzalloc(sizeof(*mac_pkt), GFP_ATOMIC); + mac_pkt = kzalloc_obj(*mac_pkt, GFP_ATOMIC); if (!mac_pkt) goto fail; @@ -227,7 +227,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, case IEEE802154_FC_TYPE_MAC_CMD: dev_dbg(&sdata->dev->dev, "MAC COMMAND received\n"); - mac_pkt = kzalloc(sizeof(*mac_pkt), GFP_ATOMIC); + mac_pkt = kzalloc_obj(*mac_pkt, GFP_ATOMIC); if (!mac_pkt) goto fail; diff --git a/net/mac802154/scan.c b/net/mac802154/scan.c index a6dab3cc3ad8..0a31ac8d8415 100644 --- a/net/mac802154/scan.c +++ b/net/mac802154/scan.c @@ -798,7 +798,7 @@ int mac802154_process_association_req(struct ieee802154_sub_if_data *sdata, goto unlock; } - child = kzalloc(sizeof(*child), GFP_KERNEL); + child = kzalloc_obj(*child); if (!child) { ret = -ENOMEM; goto unlock; diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index b99ba14f39d2..209a963112e3 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -49,7 +49,7 @@ static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr) !addr->__smctp_pad0[2]; } -static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +static int mctp_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sock *sk = sock->sk; struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); @@ -128,7 +128,7 @@ out_release: /* Used to set a specific peer prior to bind. Not used for outbound * connections (Tag Owner set) since MCTP is a datagram protocol. */ -static int mctp_connect(struct socket *sock, struct sockaddr *addr, +static int mctp_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sock *sk = sock->sk; diff --git a/net/mctp/device.c b/net/mctp/device.c index 4d404edd7446..2c84df674669 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -70,6 +70,7 @@ static int mctp_fill_addrinfo(struct sk_buff *skb, return -EMSGSIZE; hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); hdr->ifa_family = AF_MCTP; hdr->ifa_prefixlen = 0; hdr->ifa_flags = 0; @@ -335,7 +336,7 @@ static struct mctp_dev *mctp_add_dev(struct net_device *dev) ASSERT_RTNL(); - mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + mdev = kzalloc_obj(*mdev); if (!mdev) return ERR_PTR(-ENOMEM); diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 05b899f22d90..87e40db0f078 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -40,7 +40,7 @@ static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, goto out; } - neigh = kzalloc(sizeof(*neigh), GFP_KERNEL); + neigh = kzalloc_obj(*neigh); if (!neigh) { rc = -ENOMEM; goto out; @@ -218,6 +218,7 @@ static int mctp_fill_neigh(struct sk_buff *skb, u32 portid, u32 seq, int event, return -EMSGSIZE; hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); hdr->ndm_family = AF_MCTP; hdr->ndm_ifindex = dev->ifindex; hdr->ndm_state = 0; // TODO other state bits? diff --git a/net/mctp/route.c b/net/mctp/route.c index 2ac4011a953f..59ad60b88563 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -227,7 +227,7 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, { struct mctp_sk_key *key; - key = kzalloc(sizeof(*key), gfp); + key = kzalloc_obj(*key, gfp); if (!key) return NULL; @@ -359,6 +359,7 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) { struct mctp_sk_key *key; struct mctp_flow *flow; + unsigned long flags; flow = skb_ext_find(skb, SKB_EXT_MCTP); if (!flow) @@ -366,12 +367,14 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) key = flow->key; - if (key->dev) { + spin_lock_irqsave(&key->lock, flags); + + if (!key->dev) + mctp_dev_set_key(dev, key); + else WARN_ON(key->dev != dev); - return; - } - mctp_dev_set_key(dev, key); + spin_unlock_irqrestore(&key->lock, flags); } #else static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {} @@ -675,7 +678,7 @@ static struct mctp_route *mctp_route_alloc(void) { struct mctp_route *rt; - rt = kzalloc(sizeof(*rt), GFP_KERNEL); + rt = kzalloc_obj(*rt); if (!rt) return NULL; @@ -1643,6 +1646,7 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, return -EMSGSIZE; hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); hdr->rtm_family = AF_MCTP; /* we use the _len fields as a number of EIDs, rather than diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 69a3ccfc6310..75ea96c10e49 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -20,7 +20,6 @@ struct mctp_frag_test { static void mctp_test_fragment(struct kunit *test) { const struct mctp_frag_test *params; - struct mctp_test_pktqueue tpq; int rc, i, n, mtu, msgsize; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -43,13 +42,12 @@ static void mctp_test_fragment(struct kunit *test) dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); - mctp_test_dst_setup(test, &dst, dev, &tpq, mtu); + mctp_test_dst_setup(test, &dst, dev, mtu); rc = mctp_do_fragment_route(&dst, skb, mtu, MCTP_TAG_OWNER); KUNIT_EXPECT_FALSE(test, rc); - n = tpq.pkts.qlen; - + n = dev->pkts.qlen; KUNIT_EXPECT_EQ(test, n, params->n_frags); for (i = 0;; i++) { @@ -61,8 +59,7 @@ static void mctp_test_fragment(struct kunit *test) first = i == 0; last = i == (n - 1); - skb2 = skb_dequeue(&tpq.pkts); - + skb2 = skb_dequeue(&dev->pkts); if (!skb2) break; @@ -99,7 +96,7 @@ static void mctp_test_fragment(struct kunit *test) kfree_skb(skb2); } - mctp_test_dst_release(&dst, &tpq); + mctp_dst_release(&dst); mctp_test_destroy_dev(dev); } @@ -130,13 +127,11 @@ struct mctp_rx_input_test { static void mctp_test_rx_input(struct kunit *test) { const struct mctp_rx_input_test *params; - struct mctp_test_pktqueue tpq; struct mctp_test_route *rt; struct mctp_test_dev *dev; struct sk_buff *skb; params = test->param_value; - test->priv = &tpq; dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); @@ -147,13 +142,10 @@ static void mctp_test_rx_input(struct kunit *test) skb = mctp_test_create_skb(¶ms->hdr, 1); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); - mctp_test_pktqueue_init(&tpq); - mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL); - KUNIT_EXPECT_EQ(test, !!tpq.pkts.qlen, params->input); + KUNIT_EXPECT_EQ(test, !!dev->pkts.qlen, params->input); - skb_queue_purge(&tpq.pkts); mctp_test_route_destroy(test, rt); mctp_test_destroy_dev(dev); } @@ -182,7 +174,6 @@ KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests, static void __mctp_route_test_init(struct kunit *test, struct mctp_test_dev **devp, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket **sockp, unsigned int netid) { @@ -196,7 +187,7 @@ static void __mctp_route_test_init(struct kunit *test, if (netid != MCTP_NET_ANY) WRITE_ONCE(dev->mdev->net, netid); - mctp_test_dst_setup(test, dst, dev, tpq, 68); + mctp_test_dst_setup(test, dst, dev, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -205,7 +196,7 @@ static void __mctp_route_test_init(struct kunit *test, addr.smctp_network = netid; addr.smctp_addr.s_addr = 8; addr.smctp_type = 0; - rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + rc = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); KUNIT_ASSERT_EQ(test, rc, 0); *devp = dev; @@ -215,11 +206,10 @@ static void __mctp_route_test_init(struct kunit *test, static void __mctp_route_test_fini(struct kunit *test, struct mctp_test_dev *dev, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket *sock) { sock_release(sock); - mctp_test_dst_release(dst, tpq); + mctp_dst_release(dst); mctp_test_destroy_dev(dev); } @@ -232,7 +222,6 @@ struct mctp_route_input_sk_test { static void mctp_test_route_input_sk(struct kunit *test) { const struct mctp_route_input_sk_test *params; - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -241,13 +230,12 @@ static void mctp_test_route_input_sk(struct kunit *test) params = test->param_value; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); skb = mctp_test_create_skb_data(¶ms->hdr, ¶ms->type); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); mctp_test_skb_set_dev(skb, dev); - mctp_test_pktqueue_init(&tpq); rc = mctp_dst_input(&dst, skb); @@ -266,7 +254,7 @@ static void mctp_test_route_input_sk(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } #define FL_S (MCTP_HDR_FLAG_SOM) @@ -303,7 +291,6 @@ struct mctp_route_input_sk_reasm_test { static void mctp_test_route_input_sk_reasm(struct kunit *test) { const struct mctp_route_input_sk_reasm_test *params; - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -313,7 +300,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) params = test->param_value; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); for (i = 0; i < params->n_hdrs; i++) { c = i; @@ -336,7 +323,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } #define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_TO | (f) | ((s) << MCTP_HDR_SEQ_SHIFT)) @@ -438,7 +425,6 @@ struct mctp_route_input_sk_keys_test { static void mctp_test_route_input_sk_keys(struct kunit *test) { const struct mctp_route_input_sk_keys_test *params; - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_sk_key *key; @@ -457,7 +443,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); net = READ_ONCE(dev->mdev->net); - mctp_test_dst_setup(test, &dst, dev, &tpq, 68); + mctp_test_dst_setup(test, &dst, dev, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -497,7 +483,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) skb_free_datagram(sock->sk, skb2); mctp_key_unref(key); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } static const struct mctp_route_input_sk_keys_test mctp_route_input_sk_keys_tests[] = { @@ -572,7 +558,6 @@ KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests, struct test_net { unsigned int netid; struct mctp_test_dev *dev; - struct mctp_test_pktqueue tpq; struct mctp_dst dst; struct socket *sock; struct sk_buff *skb; @@ -591,20 +576,18 @@ mctp_test_route_input_multiple_nets_bind_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, - t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->sock, t->netid); t->skb = mctp_test_create_skb_data(&hdr, &t->msg); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->skb); mctp_test_skb_set_dev(t->skb, t->dev); - mctp_test_pktqueue_init(&t->tpq); } static void mctp_test_route_input_multiple_nets_bind_fini(struct kunit *test, struct test_net *t) { - __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, t->sock); } /* Test that skbs from different nets (otherwise identical) get routed to their @@ -661,8 +644,7 @@ mctp_test_route_input_multiple_nets_key_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, - t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->sock, t->netid); msk = container_of(t->sock->sk, struct mctp_sock, sk); @@ -685,7 +667,7 @@ mctp_test_route_input_multiple_nets_key_fini(struct kunit *test, struct test_net *t) { mctp_key_unref(t->key); - __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, t->sock); } /* test that skbs from different nets (otherwise identical) get routed to their @@ -738,14 +720,13 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) static void mctp_test_route_input_sk_fail_single(struct kunit *test) { const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct mctp_dst dst; struct socket *sock; struct sk_buff *skb; int rc; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); /* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. @@ -768,7 +749,7 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); kfree_skb(skb); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } /* Input route to socket, using a fragmented message, where sock delivery fails. @@ -776,7 +757,6 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) static void mctp_test_route_input_sk_fail_frag(struct kunit *test) { const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skbs[2]; struct mctp_dst dst; @@ -784,7 +764,7 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) unsigned int i; int rc; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); lock_sock(sock->sk); WRITE_ONCE(sock->sk->sk_rcvbuf, 0); @@ -815,7 +795,7 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); kfree_skb(skbs[1]); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } /* Input route to socket, using a fragmented message created from clones. @@ -833,7 +813,6 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) const size_t data_len = 3; /* arbitrary */ u8 compare[3 * ARRAY_SIZE(hdrs)]; u8 flat[3 * ARRAY_SIZE(hdrs)]; - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skb[5]; struct sk_buff *rx_skb; @@ -845,7 +824,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) total = data_len + sizeof(struct mctp_hdr); - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); /* Create a single skb initially with concatenated packets */ skb[0] = mctp_test_create_skb(&hdrs[0], 5 * total); @@ -922,7 +901,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) kfree_skb(skb[i]); } - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } #if IS_ENABLED(CONFIG_MCTP_FLOWS) @@ -930,7 +909,6 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) static void mctp_test_flow_init(struct kunit *test, struct mctp_test_dev **devp, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket **sock, struct sk_buff **skbp, unsigned int len) @@ -944,7 +922,7 @@ static void mctp_test_flow_init(struct kunit *test, * mctp_local_output, which will call dst->output on whatever * route we provide */ - __mctp_route_test_init(test, &dev, dst, tpq, sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, dst, sock, MCTP_NET_ANY); /* Assign a single EID. ->addrs is freed on mctp netdev release */ dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); @@ -965,16 +943,14 @@ static void mctp_test_flow_init(struct kunit *test, static void mctp_test_flow_fini(struct kunit *test, struct mctp_test_dev *dev, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket *sock) { - __mctp_route_test_fini(test, dev, dst, tpq, sock); + __mctp_route_test_fini(test, dev, dst, sock); } /* test that an outgoing skb has the correct MCTP extension data set */ static void mctp_test_packet_flow(struct kunit *test) { - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -983,15 +959,15 @@ static void mctp_test_packet_flow(struct kunit *test) u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 30); + mctp_test_flow_init(test, &dev, &dst, &sock, &skb, 30); rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = tpq.pkts.qlen; + n = dev->pkts.qlen; KUNIT_ASSERT_EQ(test, n, 1); - skb2 = skb_dequeue(&tpq.pkts); + skb2 = skb_dequeue(&dev->pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb2); flow = skb_ext_find(skb2, SKB_EXT_MCTP); @@ -1000,7 +976,7 @@ static void mctp_test_packet_flow(struct kunit *test) KUNIT_ASSERT_PTR_EQ(test, flow->key->sk, sock->sk); kfree_skb(skb2); - mctp_test_flow_fini(test, dev, &dst, &tpq, sock); + mctp_test_flow_fini(test, dev, &dst, sock); } /* test that outgoing skbs, after fragmentation, all have the correct MCTP @@ -1008,7 +984,6 @@ static void mctp_test_packet_flow(struct kunit *test) */ static void mctp_test_fragment_flow(struct kunit *test) { - struct mctp_test_pktqueue tpq; struct mctp_flow *flows[2]; struct sk_buff *tx_skbs[2]; struct mctp_test_dev *dev; @@ -1018,17 +993,17 @@ static void mctp_test_fragment_flow(struct kunit *test) u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 100); + mctp_test_flow_init(test, &dev, &dst, &sock, &skb, 100); rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = tpq.pkts.qlen; + n = dev->pkts.qlen; KUNIT_ASSERT_EQ(test, n, 2); /* both resulting packets should have the same flow data */ - tx_skbs[0] = skb_dequeue(&tpq.pkts); - tx_skbs[1] = skb_dequeue(&tpq.pkts); + tx_skbs[0] = skb_dequeue(&dev->pkts); + tx_skbs[1] = skb_dequeue(&dev->pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[0]); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[1]); @@ -1044,7 +1019,7 @@ static void mctp_test_fragment_flow(struct kunit *test) kfree_skb(tx_skbs[0]); kfree_skb(tx_skbs[1]); - mctp_test_flow_fini(test, dev, &dst, &tpq, sock); + mctp_test_flow_fini(test, dev, &dst, sock); } #else @@ -1063,7 +1038,6 @@ static void mctp_test_fragment_flow(struct kunit *test) static void mctp_test_route_output_key_create(struct kunit *test) { const u8 dst_eid = 26, src_eid = 15; - struct mctp_test_pktqueue tpq; const unsigned int netid = 50; struct mctp_test_dev *dev; struct mctp_sk_key *key; @@ -1080,7 +1054,7 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); WRITE_ONCE(dev->mdev->net, netid); - mctp_test_dst_setup(test, &dst, dev, &tpq, 68); + mctp_test_dst_setup(test, &dst, dev, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -1127,14 +1101,13 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_EXPECT_FALSE(test, key->tag & MCTP_TAG_OWNER); sock_release(sock); - mctp_test_dst_release(&dst, &tpq); + mctp_dst_release(&dst); mctp_test_destroy_dev(dev); } static void mctp_test_route_extaddr_input(struct kunit *test) { static const unsigned char haddr[] = { 0xaa, 0x55 }; - struct mctp_test_pktqueue tpq; struct mctp_skb_cb *cb, *cb2; const unsigned int len = 40; struct mctp_test_dev *dev; @@ -1149,7 +1122,7 @@ static void mctp_test_route_extaddr_input(struct kunit *test) hdr.dest = 8; hdr.flags_seq_tag = FL_S | FL_E | FL_TO; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); skb = mctp_test_create_skb(&hdr, len); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); @@ -1178,7 +1151,7 @@ static void mctp_test_route_extaddr_input(struct kunit *test) KUNIT_EXPECT_MEMEQ(test, cb2->haddr, haddr, sizeof(haddr)); kfree_skb(skb2); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } static void mctp_test_route_gw_lookup(struct kunit *test) @@ -1530,14 +1503,13 @@ static void mctp_test_bind_lookup(struct kunit *test) struct socket *socks[ARRAY_SIZE(lookup_binds)]; struct sk_buff *skb_pkt = NULL, *skb_sock = NULL; struct socket *sock_ty0, *sock_expect = NULL; - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct mctp_dst dst; int rc; rx = test->param_value; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock_ty0, rx->net); + __mctp_route_test_init(test, &dev, &dst, &sock_ty0, rx->net); /* Create all binds */ for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) { mctp_test_bind_run(test, &lookup_binds[i], @@ -1557,7 +1529,6 @@ static void mctp_test_bind_lookup(struct kunit *test) skb_pkt = mctp_test_create_skb_data(&rx->hdr, &rx->ty); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb_pkt); mctp_test_skb_set_dev(skb_pkt, dev); - mctp_test_pktqueue_init(&tpq); rc = mctp_dst_input(&dst, skb_pkt); if (rx->expect) { @@ -1591,7 +1562,7 @@ cleanup: for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) sock_release(socks[i]); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock_ty0); + __mctp_route_test_fini(test, dev, &dst, sock_ty0); } static struct kunit_case mctp_test_cases[] = { diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index 953d41902771..97afe8cd2b05 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -13,7 +13,10 @@ static netdev_tx_t mctp_test_dev_tx(struct sk_buff *skb, struct net_device *ndev) { - kfree_skb(skb); + struct mctp_test_dev *dev = netdev_priv(ndev); + + skb_queue_tail(&dev->pkts, skb); + return NETDEV_TX_OK; } @@ -26,7 +29,7 @@ static void mctp_test_dev_setup(struct net_device *ndev) ndev->type = ARPHRD_MCTP; ndev->mtu = MCTP_DEV_TEST_MTU; ndev->hard_header_len = 0; - ndev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + ndev->tx_queue_len = 0; ndev->flags = IFF_NOARP; ndev->netdev_ops = &mctp_test_netdev_ops; ndev->needs_free_netdev = true; @@ -51,6 +54,7 @@ static struct mctp_test_dev *__mctp_test_create_dev(unsigned short lladdr_len, dev->ndev = ndev; ndev->addr_len = lladdr_len; dev_addr_set(ndev, lladdr); + skb_queue_head_init(&dev->pkts); rc = register_netdev(ndev); if (rc) { @@ -63,6 +67,11 @@ static struct mctp_test_dev *__mctp_test_create_dev(unsigned short lladdr_len, dev->mdev->net = mctp_default_net(dev_net(ndev)); rcu_read_unlock(); + /* bring the device up; we want to be able to TX immediately */ + rtnl_lock(); + dev_open(ndev, NULL); + rtnl_unlock(); + return dev; } @@ -79,26 +88,15 @@ struct mctp_test_dev *mctp_test_create_dev_lladdr(unsigned short lladdr_len, void mctp_test_destroy_dev(struct mctp_test_dev *dev) { + skb_queue_purge(&dev->pkts); mctp_dev_put(dev->mdev); unregister_netdev(dev->ndev); } -static const unsigned int test_pktqueue_magic = 0x5f713aef; - -void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq) -{ - tpq->magic = test_pktqueue_magic; - skb_queue_head_init(&tpq->pkts); -} - static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { - struct kunit *test = current->kunit_test; - struct mctp_test_pktqueue *tpq = test->priv; - - KUNIT_ASSERT_EQ(test, tpq->magic, test_pktqueue_magic); - - skb_queue_tail(&tpq->pkts, skb); + skb->dev = dst->dev->dev; + dev_queue_xmit(skb); return 0; } @@ -108,7 +106,7 @@ static struct mctp_test_route *mctp_route_test_alloc(void) { struct mctp_test_route *rt; - rt = kzalloc(sizeof(*rt), GFP_KERNEL); + rt = kzalloc_obj(*rt); if (!rt) return NULL; @@ -169,11 +167,9 @@ struct mctp_test_route *mctp_test_create_route_gw(struct net *net, return rt; } -/* Convenience function for our test dst; release with mctp_test_dst_release() - */ +/* Convenience function for our test dst; release with mctp_dst_release() */ void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, - struct mctp_test_dev *dev, - struct mctp_test_pktqueue *tpq, unsigned int mtu) + struct mctp_test_dev *dev, unsigned int mtu) { KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); @@ -183,15 +179,6 @@ void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, __mctp_dev_get(dst->dev->dev); dst->mtu = mtu; dst->output = mctp_test_dst_output; - mctp_test_pktqueue_init(tpq); - test->priv = tpq; -} - -void mctp_test_dst_release(struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq) -{ - mctp_dst_release(dst); - skb_queue_purge(&tpq->pkts); } void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt) @@ -279,7 +266,7 @@ void mctp_test_bind_run(struct kunit *test, addr.smctp_addr.s_addr = setup->peer_addr; /* connect() type must match bind() type */ addr.smctp_type = setup->bind_type; - rc = kernel_connect(*sock, (struct sockaddr *)&addr, + rc = kernel_connect(*sock, (struct sockaddr_unsized *)&addr, sizeof(addr), 0); KUNIT_EXPECT_EQ(test, rc, 0); } @@ -292,5 +279,6 @@ void mctp_test_bind_run(struct kunit *test, addr.smctp_type = setup->bind_type; *ret_bind_errno = - kernel_bind(*sock, (struct sockaddr *)&addr, sizeof(addr)); + kernel_bind(*sock, (struct sockaddr_unsized *)&addr, + sizeof(addr)); } diff --git a/net/mctp/test/utils.h b/net/mctp/test/utils.h index 06bdb6cb5eff..4cc90c9da4d1 100644 --- a/net/mctp/test/utils.h +++ b/net/mctp/test/utils.h @@ -18,6 +18,8 @@ struct mctp_test_dev { unsigned short lladdr_len; unsigned char lladdr[MAX_ADDR_LEN]; + + struct sk_buff_head pkts; }; struct mctp_test_dev; @@ -26,11 +28,6 @@ struct mctp_test_route { struct mctp_route rt; }; -struct mctp_test_pktqueue { - unsigned int magic; - struct sk_buff_head pkts; -}; - struct mctp_test_bind_setup { mctp_eid_t bind_addr; int bind_net; @@ -59,11 +56,7 @@ struct mctp_test_route *mctp_test_create_route_gw(struct net *net, mctp_eid_t gw, unsigned int mtu); void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, - struct mctp_test_dev *dev, - struct mctp_test_pktqueue *tpq, unsigned int mtu); -void mctp_test_dst_release(struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq); -void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq); + struct mctp_test_dev *dev, unsigned int mtu); void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt); void mctp_test_skb_set_dev(struct sk_buff *skb, struct mctp_test_dev *dev); struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 25c88cba5c48..d5417688f69e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -75,16 +75,23 @@ static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); -static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) +static struct mpls_route *mpls_route_input(struct net *net, unsigned int index) { - struct mpls_route *rt = NULL; + struct mpls_route __rcu **platform_label; - if (index < net->mpls.platform_labels) { - struct mpls_route __rcu **platform_label = - rcu_dereference_rtnl(net->mpls.platform_label); - rt = rcu_dereference_rtnl(platform_label[index]); - } - return rt; + platform_label = mpls_dereference(net, net->mpls.platform_label); + return mpls_dereference(net, platform_label[index]); +} + +static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned int index) +{ + struct mpls_route __rcu **platform_label; + + if (index >= net->mpls.platform_labels) + return NULL; + + platform_label = rcu_dereference(net->mpls.platform_label); + return rcu_dereference(platform_label[index]); } bool mpls_output_possible(const struct net_device *dev) @@ -129,25 +136,26 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); -void mpls_stats_inc_outucastpkts(struct net_device *dev, +void mpls_stats_inc_outucastpkts(struct net *net, + struct net_device *dev, const struct sk_buff *skb) { struct mpls_dev *mdev; if (skb->protocol == htons(ETH_P_MPLS_UC)) { - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (mdev) MPLS_INC_STATS_LEN(mdev, skb->len, tx_packets, tx_bytes); } else if (skb->protocol == htons(ETH_P_IP)) { - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { - struct inet6_dev *in6dev = __in6_dev_get(dev); + struct inet6_dev *in6dev = in6_dev_rcu(dev); if (in6dev) - IP6_UPD_PO_STATS(dev_net(dev), in6dev, + IP6_UPD_PO_STATS(net, in6dev, IPSTATS_MIB_OUT, skb->len); #endif } @@ -342,7 +350,7 @@ static bool mpls_egress(struct net *net, struct mpls_route *rt, static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct net *net = dev_net(dev); + struct net *net = dev_net_rcu(dev); struct mpls_shim_hdr *hdr; const struct mpls_nh *nh; struct mpls_route *rt; @@ -357,7 +365,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) goto drop; @@ -434,7 +442,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ - if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) + if (!mpls_egress(net, rt, skb, dec)) goto err; } else { bool bos; @@ -451,7 +459,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, } } - mpls_stats_inc_outucastpkts(out_dev, skb); + mpls_stats_inc_outucastpkts(net, out_dev, skb); /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) @@ -466,7 +474,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, return 0; tx_err: - out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; + out_mdev = out_dev ? mpls_dev_rcu(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); goto drop; @@ -530,10 +538,23 @@ static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen, u8 max_labels) return rt; } +static void mpls_rt_free_rcu(struct rcu_head *head) +{ + struct mpls_route *rt; + + rt = container_of(head, struct mpls_route, rt_rcu); + + change_nexthops(rt) { + netdev_put(nh->nh_dev, &nh->nh_dev_tracker); + } endfor_nexthops(rt); + + kfree(rt); +} + static void mpls_rt_free(struct mpls_route *rt) { if (rt) - kfree_rcu(rt, rt_rcu); + call_rcu(&rt->rt_rcu, mpls_rt_free_rcu); } static void mpls_notify_route(struct net *net, unsigned index, @@ -557,10 +578,8 @@ static void mpls_route_update(struct net *net, unsigned index, struct mpls_route __rcu **platform_label; struct mpls_route *rt; - ASSERT_RTNL(); - - platform_label = rtnl_dereference(net->mpls.platform_label); - rt = rtnl_dereference(platform_label[index]); + platform_label = mpls_dereference(net, net->mpls.platform_label); + rt = mpls_dereference(net, platform_label[index]); rcu_assign_pointer(platform_label[index], new); mpls_notify_route(net, index, rt, new, info); @@ -569,24 +588,23 @@ static void mpls_route_update(struct net *net, unsigned index, mpls_rt_free(rt); } -static unsigned find_free_label(struct net *net) +static unsigned int find_free_label(struct net *net) { - struct mpls_route __rcu **platform_label; - size_t platform_labels; - unsigned index; + unsigned int index; - platform_label = rtnl_dereference(net->mpls.platform_label); - platform_labels = net->mpls.platform_labels; - for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels; + for (index = MPLS_LABEL_FIRST_UNRESERVED; + index < net->mpls.platform_labels; index++) { - if (!rtnl_dereference(platform_label[index])) + if (!mpls_route_input(net, index)) return index; } + return LABEL_NOT_SPECIFIED; } #if IS_ENABLED(CONFIG_INET) static struct net_device *inet_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, const void *addr) { struct net_device *dev; @@ -599,14 +617,14 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, return ERR_CAST(rt); dev = rt->dst.dev; - dev_hold(dev); - + netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); ip_rt_put(rt); return dev; } #else static struct net_device *inet_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); @@ -615,6 +633,7 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, #if IS_ENABLED(CONFIG_IPV6) static struct net_device *inet6_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, const void *addr) { struct net_device *dev; @@ -631,13 +650,14 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, return ERR_CAST(dst); dev = dst->dev; - dev_hold(dev); + netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); dst_release(dst); return dev; } #else static struct net_device *inet6_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); @@ -653,16 +673,17 @@ static struct net_device *find_outdev(struct net *net, if (!oif) { switch (nh->nh_via_table) { case NEIGH_ARP_TABLE: - dev = inet_fib_lookup_dev(net, mpls_nh_via(rt, nh)); + dev = inet_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_ND_TABLE: - dev = inet6_fib_lookup_dev(net, mpls_nh_via(rt, nh)); + dev = inet6_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_LINK_TABLE: break; } } else { - dev = dev_get_by_index(net, oif); + dev = netdev_get_by_index(net, oif, + &nh->nh_dev_tracker, GFP_KERNEL); } if (!dev) @@ -671,8 +692,7 @@ static struct net_device *find_outdev(struct net *net, if (IS_ERR(dev)) return dev; - /* The caller is holding rtnl anyways, so release the dev reference */ - dev_put(dev); + nh->nh_dev = dev; return dev; } @@ -686,20 +706,17 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, dev = find_outdev(net, rt, nh, oif); if (IS_ERR(dev)) { err = PTR_ERR(dev); - dev = NULL; goto errout; } /* Ensure this is a supported device */ err = -EINVAL; - if (!mpls_dev_get(dev)) - goto errout; + if (!mpls_dev_get(net, dev)) + goto errout_put; if ((nh->nh_via_table == NEIGH_LINK_TABLE) && (dev->addr_len != nh->nh_via_alen)) - goto errout; - - nh->nh_dev = dev; + goto errout_put; if (!(dev->flags & IFF_UP)) { nh->nh_flags |= RTNH_F_DEAD; @@ -713,6 +730,9 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, return 0; +errout_put: + netdev_put(nh->nh_dev, &nh->nh_dev_tracker); + nh->nh_dev = NULL; errout: return err; } @@ -890,7 +910,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, struct nlattr *nla_via, *nla_newdst; int remaining = cfg->rc_mp_len; int err = 0; - u8 nhs = 0; + + rt->rt_nhn = 0; change_nexthops(rt) { int attrlen; @@ -926,11 +947,9 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, rt->rt_nhn_alive--; rtnh = rtnh_next(rtnh, &remaining); - nhs++; + rt->rt_nhn++; } endfor_nexthops(rt); - rt->rt_nhn = nhs; - return 0; errout: @@ -940,30 +959,28 @@ errout: static bool mpls_label_ok(struct net *net, unsigned int *index, struct netlink_ext_ack *extack) { - bool is_ok = true; - /* Reserved labels may not be set */ if (*index < MPLS_LABEL_FIRST_UNRESERVED) { NL_SET_ERR_MSG(extack, "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher"); - is_ok = false; + return false; } /* The full 20 bit range may not be supported. */ - if (is_ok && *index >= net->mpls.platform_labels) { + if (*index >= net->mpls.platform_labels) { NL_SET_ERR_MSG(extack, "Label >= configured maximum in platform_labels"); - is_ok = false; + return false; } *index = array_index_nospec(*index, net->mpls.platform_labels); - return is_ok; + + return true; } static int mpls_route_add(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { - struct mpls_route __rcu **platform_label; struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_route *rt, *old; int err = -EINVAL; @@ -991,8 +1008,7 @@ static int mpls_route_add(struct mpls_route_config *cfg, } err = -EEXIST; - platform_label = rtnl_dereference(net->mpls.platform_label); - old = rtnl_dereference(platform_label[index]); + old = mpls_route_input(net, index); if ((cfg->rc_nlflags & NLM_F_EXCL) && old) goto errout; @@ -1103,7 +1119,7 @@ static int mpls_fill_stats_af(struct sk_buff *skb, struct mpls_dev *mdev; struct nlattr *nla; - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) return -ENODATA; @@ -1123,7 +1139,7 @@ static size_t mpls_get_stats_af_size(const struct net_device *dev) { struct mpls_dev *mdev; - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) return 0; @@ -1264,23 +1280,32 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = -EINVAL; - if (!tb[NETCONFA_IFINDEX]) + if (!tb[NETCONFA_IFINDEX]) { + err = -EINVAL; goto errout; + } ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); - dev = __dev_get_by_index(net, ifindex); - if (!dev) - goto errout; - - mdev = mpls_dev_get(dev); - if (!mdev) - goto errout; - err = -ENOBUFS; skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); - if (!skb) + if (!skb) { + err = -ENOBUFS; goto errout; + } + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + err = -EINVAL; + goto errout_unlock; + } + + mdev = mpls_dev_rcu(dev); + if (!mdev) { + err = -EINVAL; + goto errout_unlock; + } err = mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(in_skb).portid, @@ -1289,12 +1314,19 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; + goto errout_unlock; } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + + rcu_read_unlock(); errout: return err; + +errout_unlock: + rcu_read_unlock(); + kfree_skb(skb); + goto errout; } static int mpls_netconf_dump_devconf(struct sk_buff *skb, @@ -1326,7 +1358,7 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) continue; err = mpls_netconf_fill_devconf(skb, mdev, @@ -1438,9 +1470,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) int err = -ENOMEM; int i; - ASSERT_RTNL(); - - mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + mdev = kzalloc_obj(*mdev); if (!mdev) return ERR_PTR(err); @@ -1481,16 +1511,15 @@ static void mpls_dev_destroy_rcu(struct rcu_head *head) static int mpls_ifdown(struct net_device *dev, int event) { - struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); - unsigned index; + unsigned int index; - platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { - struct mpls_route *rt = rtnl_dereference(platform_label[index]); + struct mpls_route *rt; bool nh_del = false; u8 alive = 0; + rt = mpls_route_input(net, index); if (!rt) continue; @@ -1524,8 +1553,12 @@ static int mpls_ifdown(struct net_device *dev, int event) change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; - if (nh->nh_dev != dev) + if (nh->nh_dev != dev) { + if (nh_del) + netdev_hold(nh->nh_dev, &nh->nh_dev_tracker, + GFP_KERNEL); goto next; + } switch (event) { case NETDEV_DOWN: @@ -1557,15 +1590,14 @@ next: static void mpls_ifup(struct net_device *dev, unsigned int flags) { - struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); - unsigned index; + unsigned int index; u8 alive; - platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { - struct mpls_route *rt = rtnl_dereference(platform_label[index]); + struct mpls_route *rt; + rt = mpls_route_input(net, index); if (!rt) continue; @@ -1592,28 +1624,33 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); struct mpls_dev *mdev; unsigned int flags; int err; + mutex_lock(&net->mpls.platform_mutex); + if (event == NETDEV_REGISTER) { mdev = mpls_add_dev(dev); - if (IS_ERR(mdev)) - return notifier_from_errno(PTR_ERR(mdev)); + if (IS_ERR(mdev)) { + err = PTR_ERR(mdev); + goto err; + } - return NOTIFY_OK; + goto out; } - mdev = mpls_dev_get(dev); + mdev = mpls_dev_get(net, dev); if (!mdev) - return NOTIFY_OK; + goto out; switch (event) { case NETDEV_DOWN: err = mpls_ifdown(dev, event); if (err) - return notifier_from_errno(err); + goto err; break; case NETDEV_UP: flags = netif_get_flags(dev); @@ -1629,14 +1666,15 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, } else { err = mpls_ifdown(dev, event); if (err) - return notifier_from_errno(err); + goto err; } break; case NETDEV_UNREGISTER: err = mpls_ifdown(dev, event); if (err) - return notifier_from_errno(err); - mdev = mpls_dev_get(dev); + goto err; + + mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); @@ -1644,16 +1682,23 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, } break; case NETDEV_CHANGENAME: - mdev = mpls_dev_get(dev); + mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); err = mpls_dev_sysctl_register(dev, mdev); if (err) - return notifier_from_errno(err); + goto err; } break; } + +out: + mutex_unlock(&net->mpls.platform_mutex); return NOTIFY_OK; + +err: + mutex_unlock(&net->mpls.platform_mutex); + return notifier_from_errno(err); } static struct notifier_block mpls_dev_notifier = { @@ -1928,10 +1973,11 @@ errout: static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; - cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + cfg = kzalloc_obj(*cfg); if (!cfg) return -ENOMEM; @@ -1939,7 +1985,9 @@ static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto out; + mutex_lock(&net->mpls.platform_mutex); err = mpls_route_del(cfg, extack); + mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); @@ -1950,10 +1998,11 @@ out: static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; - cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + cfg = kzalloc_obj(*cfg); if (!cfg) return -ENOMEM; @@ -1961,7 +2010,9 @@ static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto out; + mutex_lock(&net->mpls.platform_mutex); err = mpls_route_add(cfg, extack); + mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); @@ -2124,7 +2175,7 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, if (i == RTA_OIF) { ifindex = nla_get_u32(tb[i]); - filter->dev = __dev_get_by_index(net, ifindex); + filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; filter->filter_set = 1; @@ -2162,20 +2213,19 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; struct fib_dump_filter filter = { - .rtnl_held = true, + .rtnl_held = false, }; unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; + int err; - ASSERT_RTNL(); + rcu_read_lock(); if (cb->strict_check) { - int err; - err = mpls_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) - return err; + goto err; /* for MPLS, there is only 1 table with fixed type and flags. * If either are set in the filter then return nothing. @@ -2183,14 +2233,14 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) || (filter.rt_type && filter.rt_type != RTN_UNICAST) || filter.flags) - return skb->len; + goto unlock; } index = cb->args[0]; if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; - platform_label = rtnl_dereference(net->mpls.platform_label); + platform_label = rcu_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; if (filter.filter_set) @@ -2199,7 +2249,7 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) for (; index < platform_labels; index++) { struct mpls_route *rt; - rt = rtnl_dereference(platform_label[index]); + rt = rcu_dereference(platform_label[index]); if (!rt) continue; @@ -2214,7 +2264,13 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) } cb->args[0] = index; +unlock: + rcu_read_unlock(); return skb->len; + +err: + rcu_read_unlock(); + return err; } static inline size_t lfib_nlmsg_size(struct mpls_route *rt) @@ -2345,18 +2401,20 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, u32 portid = NETLINK_CB(in_skb).portid; u32 in_label = LABEL_NOT_SPECIFIED; struct nlattr *tb[RTA_MAX + 1]; + struct mpls_route *rt = NULL; u32 labels[MAX_NEW_LABELS]; struct mpls_shim_hdr *hdr; unsigned int hdr_size = 0; const struct mpls_nh *nh; struct net_device *dev; - struct mpls_route *rt; struct rtmsg *rtm, *r; struct nlmsghdr *nlh; struct sk_buff *skb; u8 n_labels; int err; + mutex_lock(&net->mpls.platform_mutex); + err = mpls_valid_getroute_req(in_skb, in_nlh, tb, extack); if (err < 0) goto errout; @@ -2378,7 +2436,8 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, } } - rt = mpls_route_input_rcu(net, in_label); + if (in_label < net->mpls.platform_labels) + rt = mpls_route_input(net, in_label); if (!rt) { err = -ENETUNREACH; goto errout; @@ -2399,7 +2458,8 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, goto errout_free; } - return rtnl_unicast(skb, net, portid); + err = rtnl_unicast(skb, net, portid); + goto errout; } if (tb[RTA_NEWDST]) { @@ -2491,12 +2551,14 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, err = rtnl_unicast(skb, net, portid); errout: + mutex_unlock(&net->mpls.platform_mutex); return err; nla_put_failure: nlmsg_cancel(skb, nlh); err = -EMSGSIZE; errout_free: + mutex_unlock(&net->mpls.platform_mutex); kfree_skb(skb); return err; } @@ -2519,10 +2581,13 @@ static int resize_platform_label_table(struct net *net, size_t limit) /* In case the predefined labels need to be populated */ if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; + rt0 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt0)) goto nort0; + rt0->rt_nh->nh_dev = lo; + netdev_hold(lo, &rt0->rt_nh->nh_dev_tracker, GFP_KERNEL); rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; @@ -2533,10 +2598,13 @@ static int resize_platform_label_table(struct net *net, size_t limit) } if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; + rt2 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt2)) goto nort2; + rt2->rt_nh->nh_dev = lo; + netdev_hold(lo, &rt2->rt_nh->nh_dev_tracker, GFP_KERNEL); rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; rt2->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; @@ -2546,9 +2614,10 @@ static int resize_platform_label_table(struct net *net, size_t limit) lo->addr_len); } - rtnl_lock(); + mutex_lock(&net->mpls.platform_mutex); + /* Remember the original table */ - old = rtnl_dereference(net->mpls.platform_label); + old = mpls_dereference(net, net->mpls.platform_label); old_limit = net->mpls.platform_labels; /* Free any labels beyond the new table */ @@ -2579,7 +2648,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) net->mpls.platform_labels = limit; rcu_assign_pointer(net->mpls.platform_label, labels); - rtnl_unlock(); + mutex_unlock(&net->mpls.platform_mutex); mpls_rt_free(rt2); mpls_rt_free(rt0); @@ -2652,12 +2721,13 @@ static const struct ctl_table mpls_table[] = { }, }; -static int mpls_net_init(struct net *net) +static __net_init int mpls_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(mpls_table); struct ctl_table *table; int i; + mutex_init(&net->mpls.platform_mutex); net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; net->mpls.ip_ttl_propagate = 1; @@ -2683,7 +2753,7 @@ static int mpls_net_init(struct net *net) return 0; } -static void mpls_net_exit(struct net *net) +static __net_exit void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; @@ -2703,16 +2773,20 @@ static void mpls_net_exit(struct net *net) * As such no additional rcu synchronization is necessary when * freeing the platform_label table. */ - rtnl_lock(); - platform_label = rtnl_dereference(net->mpls.platform_label); + mutex_lock(&net->mpls.platform_mutex); + + platform_label = mpls_dereference(net, net->mpls.platform_label); platform_labels = net->mpls.platform_labels; + for (index = 0; index < platform_labels; index++) { - struct mpls_route *rt = rtnl_dereference(platform_label[index]); - RCU_INIT_POINTER(platform_label[index], NULL); + struct mpls_route *rt; + + rt = mpls_dereference(net, platform_label[index]); mpls_notify_route(net, index, rt, NULL, NULL); mpls_rt_free(rt); } - rtnl_unlock(); + + mutex_unlock(&net->mpls.platform_mutex); kvfree(platform_label); } @@ -2729,12 +2803,15 @@ static struct rtnl_af_ops mpls_af_ops __read_mostly = { }; static const struct rtnl_msg_handler mpls_rtnl_msg_handlers[] __initdata_or_module = { - {THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0}, - {THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0}, - {THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, 0}, + {THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED}, + {THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED}, + {THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, + RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, - RTNL_FLAG_DUMP_UNLOCKED}, + RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init mpls_init(void) @@ -2777,6 +2854,7 @@ out_unregister_rtnl_af: rtnl_af_unregister(&mpls_af_ops); out_unregister_dev_type: dev_remove_pack(&mpls_packet_type); + unregister_netdevice_notifier(&mpls_dev_notifier); out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); goto out; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 83c629529b57..80cb5bbcd946 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -88,6 +88,7 @@ enum mpls_payload_type { struct mpls_nh { /* next hop label forwarding entry */ struct net_device *nh_dev; + netdevice_tracker nh_dev_tracker; /* nh_flags is accessed under RCU in the packet path; it is * modified handling netdev events with rtnl lock held @@ -184,9 +185,20 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } -static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +#define mpls_dereference(net, p) \ + rcu_dereference_protected( \ + (p), \ + lockdep_is_held(&(net)->mpls.platform_mutex)) + +static inline struct mpls_dev *mpls_dev_rcu(const struct net_device *dev) +{ + return rcu_dereference(dev->mpls_ptr); +} + +static inline struct mpls_dev *mpls_dev_get(const struct net *net, + const struct net_device *dev) { - return rcu_dereference_rtnl(dev->mpls_ptr); + return mpls_dereference(net, dev->mpls_ptr); } int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, @@ -196,7 +208,8 @@ int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); -void mpls_stats_inc_outucastpkts(struct net_device *dev, +void mpls_stats_inc_outucastpkts(struct net *net, + struct net_device *dev, const struct sk_buff *skb); #endif /* MPLS_INTERNAL_H */ diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 6e73da94af7f..1a1a0eb5b787 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -53,7 +53,7 @@ static int mpls_xmit(struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; - net = dev_net(out_dev); + net = dev_net_rcu(out_dev); if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) @@ -128,7 +128,7 @@ static int mpls_xmit(struct sk_buff *skb) bos = false; } - mpls_stats_inc_outucastpkts(out_dev, skb); + mpls_stats_inc_outucastpkts(net, out_dev, skb); if (rt) { if (rt->rt_gw_family == AF_INET6) @@ -153,7 +153,7 @@ static int mpls_xmit(struct sk_buff *skb) return LWTUNNEL_XMIT_DONE; drop: - out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; + out_mdev = out_dev ? mpls_dev_rcu(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 20328920f6ed..be71fc9b4638 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -4,7 +4,7 @@ config MPTCP depends on INET select SKB_EXTENSIONS select CRYPTO_LIB_SHA256 - select CRYPTO + select CRYPTO_LIB_UTILS help Multipath TCP (MPTCP) connections send and receive data over multiple subflows in order to utilize multiple network paths. Each subflow diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index b9e451197902..82ec15bcfd7f 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -32,7 +32,8 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf /* dequeue the skb from sk receive queue */ __skb_unlink(skb, &ssk->sk_receive_queue); skb_ext_reset(skb); - skb_orphan(skb); + + mptcp_subflow_lend_fwdmem(subflow, skb); /* We copy the fastopen data, but that don't belong to the mptcp sequence * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset() @@ -50,6 +51,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf mptcp_data_lock(sk); DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); + mptcp_borrow_fwdmem(sk, skb); skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); mptcp_sk(sk)->bytes_received += skb->len; diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 171643815076..f23fda0c55a7 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -71,7 +71,6 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), - SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index a1d3e9369fbb..812218b5ed2b 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -70,7 +70,6 @@ enum linux_mptcp_mib_field { MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ - MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index ac974299de71..136c2d05c0ee 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -195,7 +195,8 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_info *info = _info; - r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_rqueue = sk_rmem_alloc_get(sk) + + READ_ONCE(mptcp_sk(sk)->backlog_len); r->idiag_wqueue = sk_wmem_alloc_get(sk); if (inet_sk_state_load(sk) == TCP_LISTEN) { diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index dcffd847af33..c180930a8e0e 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h index e24258f6f819..b9280bcb43f5 100644 --- a/net/mptcp/mptcp_pm_gen.h +++ b/net/mptcp/mptcp_pm_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_MPTCP_PM_GEN_H #define _LINUX_MPTCP_PM_GEN_H diff --git a/net/mptcp/options.c b/net/mptcp/options.c index f24ae7d40e88..43df4293f58b 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -408,6 +408,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { + if (unlikely(subflow_simultaneous_connect(sk))) { + WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); + + /* Ensure mptcp_finish_connect() will not process the + * MPC handshake. + */ + subflow->request_mptcp = 0; + return false; + } + opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 9604b91902b8..57a456690406 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -212,9 +212,24 @@ void mptcp_pm_send_ack(struct mptcp_sock *msk, spin_lock_bh(&msk->pm.lock); } -void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +static bool subflow_in_rm_list(const struct mptcp_subflow_context *subflow, + const struct mptcp_rm_list *rm_list) +{ + u8 i, id = subflow_get_local_id(subflow); + + for (i = 0; i < rm_list->nr; i++) { + if (rm_list->ids[i] == id) + return true; + } + + return false; +} + +static void +mptcp_pm_addr_send_ack_avoid_list(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { - struct mptcp_subflow_context *subflow, *alt = NULL; + struct mptcp_subflow_context *subflow, *stale = NULL, *same_id = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); @@ -224,19 +239,35 @@ void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) return; mptcp_for_each_subflow(msk, subflow) { - if (__mptcp_subflow_active(subflow)) { - if (!subflow->stale) { - mptcp_pm_send_ack(msk, subflow, false, false); - return; - } + if (!__mptcp_subflow_active(subflow)) + continue; - if (!alt) - alt = subflow; + if (unlikely(subflow->stale)) { + if (!stale) + stale = subflow; + } else if (unlikely(rm_list && + subflow_in_rm_list(subflow, rm_list))) { + if (!same_id) + same_id = subflow; + } else { + goto send_ack; } } - if (alt) - mptcp_pm_send_ack(msk, alt, false, false); + if (same_id) + subflow = same_id; + else if (stale) + subflow = stale; + else + return; + +send_ack: + mptcp_pm_send_ack(msk, subflow, false, false); +} + +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +{ + mptcp_pm_addr_send_ack_avoid_list(msk, NULL); } int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, @@ -388,7 +419,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, goto reset_timer; } - add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); + add_entry = kmalloc_obj(*add_entry, GFP_ATOMIC); if (!add_entry) return false; @@ -470,7 +501,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); - mptcp_pm_addr_send_ack(msk); + mptcp_pm_addr_send_ack_avoid_list(msk, rm_list); return 0; } @@ -594,6 +625,7 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk) void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct mptcp_subflow_context *subflow) { + struct sock *sk = (struct sock *)msk; struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; @@ -617,7 +649,8 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, /* Even if this subflow is not really established, tell the PM to try * to pick the next ones, if possible. */ - if (mptcp_pm_nl_check_work_pending(msk)) + if (mptcp_is_fully_established(sk) && + mptcp_pm_nl_check_work_pending(msk)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 0a50fd5edc06..82e59f9c6dd9 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -22,6 +22,7 @@ struct pm_nl_pernet { u8 endp_signal_max; u8 endp_subflow_max; u8 endp_laminar_max; + u8 endp_fullmesh_max; u8 limit_add_addr_accepted; u8 limit_extra_subflows; u8 next_id; @@ -70,6 +71,14 @@ u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max); +u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_fullmesh_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_fullmesh_max); + u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -328,6 +337,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct mptcp_pm_local local; mptcp_mpc_endpoint_setup(msk); + if (!mptcp_is_fully_established(sk)) + return; pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, endp_subflow_max, @@ -407,6 +418,15 @@ subflow: } exit: + /* If an endpoint has both the signal and subflow flags, but it is not + * possible to create subflows -- the 'while' loop body above never + * executed -- then still mark the endp as used, which is somehow the + * case. This avoids issues later when removing the endpoint and calling + * __mark_subflow_endp_available(), which expects the increment here. + */ + if (signal_and_subflow && local.addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + mptcp_pm_nl_check_work_pending(msk); } @@ -600,12 +620,11 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk); - int i; /* If there is at least one MPTCP endpoint with a fullmesh flag */ - i = fill_local_addresses_vec_fullmesh(msk, remote, locals, c_flag_case); - if (i) - return i; + if (mptcp_pm_get_endp_fullmesh_max(msk)) + return fill_local_addresses_vec_fullmesh(msk, remote, locals, + c_flag_case); /* If there is at least one MPTCP endpoint with a laminar flag */ if (mptcp_pm_get_endp_laminar_max(msk)) @@ -790,6 +809,10 @@ find_next: addr_max = pernet->endp_laminar_max; WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + addr_max = pernet->endp_fullmesh_max; + WRITE_ONCE(pernet->endp_fullmesh_max, addr_max + 1); + } pernet->endpoints++; if (!entry->addr.port) @@ -815,7 +838,7 @@ static struct lock_class_key mptcp_keys[2]; static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { - bool is_ipv6 = sk->sk_family == AF_INET6; + bool is_ipv6 = entry->addr.family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct sock *newsk, *ssk; @@ -855,10 +878,10 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, addrlen = sizeof(struct sockaddr_in6); #endif if (ssk->sk_family == AF_INET) - err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); + err = inet_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ssk->sk_family == AF_INET6) - err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); + err = inet6_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen); #endif if (err) return err; @@ -1030,26 +1053,23 @@ out_free: return ret; } -static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, +static void mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) { struct mptcp_rm_list list = { .nr = 0 }; - bool ret; + bool announced; list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - ret = mptcp_remove_anno_list_by_saddr(msk, addr); - if (ret || force) { + announced = mptcp_remove_anno_list_by_saddr(msk, addr); + if (announced || force) { spin_lock_bh(&msk->pm.lock); - if (ret) { - __set_bit(addr->id, msk->pm.id_avail_bitmap); + if (announced) msk->pm.add_addr_signaled--; - } mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } - return ret; } static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) @@ -1083,17 +1103,15 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); list.ids[0] = mptcp_endp_get_local_id(msk, addr); - if (remove_subflow) { - spin_lock_bh(&msk->pm.lock); - mptcp_pm_rm_subflow(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - spin_lock_bh(&msk->pm.lock); + spin_lock_bh(&msk->pm.lock); + if (remove_subflow) + mptcp_pm_rm_subflow(msk, &list); + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) __mark_subflow_endp_available(msk, list.ids[0]); - spin_unlock_bh(&msk->pm.lock); - } + else /* mark endp ID as available, e.g. Signal or MPC endp */ + __set_bit(addr->id, msk->pm.id_avail_bitmap); + spin_unlock_bh(&msk->pm.lock); if (msk->mpc_endpoint_id == entry->addr.id) msk->mpc_endpoint_id = 0; @@ -1187,6 +1205,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) addr_max = pernet->endp_laminar_max; WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + addr_max = pernet->endp_fullmesh_max; + WRITE_ONCE(pernet->endp_fullmesh_max, addr_max - 1); + } pernet->endpoints--; list_del_rcu(&entry->list); @@ -1276,16 +1298,26 @@ static void __reset_counters(struct pm_nl_pernet *pernet) int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - LIST_HEAD(free_list); + struct list_head free_list; spin_lock_bh(&pernet->lock); - list_splice_init(&pernet->endp_list, &free_list); + free_list = pernet->endp_list; + INIT_LIST_HEAD_RCU(&pernet->endp_list); __reset_counters(pernet); pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); - mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); + + if (free_list.next == &pernet->endp_list) + return 0; + synchronize_rcu(); + + /* Adjust the pointers to free_list instead of pernet->endp_list */ + free_list.prev->next = &free_list; + free_list.next->prev = &free_list; + + mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); __flush_addrs(&free_list); return 0; } @@ -1502,6 +1534,18 @@ int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, changed = (local->flags ^ entry->flags) & mask; entry->flags = (entry->flags & ~mask) | (local->flags & mask); *local = *entry; + + if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) { + u8 addr_max = pernet->endp_fullmesh_max; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) + addr_max++; + else + addr_max--; + + WRITE_ONCE(pernet->endp_fullmesh_max, addr_max); + } + spin_unlock_bh(&pernet->lock); mptcp_pm_nl_set_flags_all(net, local, changed); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index d5b383870f79..7aa42de9c47b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -119,7 +119,8 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, } if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) - entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]) & + MPTCP_PM_ADDR_FLAGS_MASK; if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1e413426deee..cf1852b99963 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -28,6 +28,8 @@ #include "protocol.h" #include "mib.h" +static unsigned int mptcp_inq_hint(const struct sock *sk); + #define CREATE_TRACE_POINTS #include <trace/events/mptcp.h> @@ -224,9 +226,6 @@ static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval) do_div(grow, oldval); rcvwin += grow << 1; - if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) - rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; - cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap); @@ -350,15 +349,12 @@ merge_right: end: skb_condense(skb); skb_set_owner_r(skb, sk); - /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ - if (sk->sk_socket) - mptcp_rcvbuf_grow(sk, msk->rcvq_space.space); } static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, int copy_len) { - const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: @@ -383,11 +379,7 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *tail; - /* try to fetch required memory from subflow */ - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); - goto drop; - } + mptcp_borrow_fwdmem(sk, skb); if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -409,16 +401,13 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) * will retransmit as needed, if needed. */ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); -drop: mptcp_drop(sk, skb); return false; } static void mptcp_stop_rtx_timer(struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); - - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &sk->mptcp_retransmit_timer); mptcp_sk(sk)->timer_ival = 0; } @@ -524,7 +513,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? - icsk_timeout(inet_csk(ssk)) - jiffies : 0; + tcp_timeout_expires(ssk) - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) @@ -664,8 +653,50 @@ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) } } +static void __mptcp_add_backlog(struct sock *sk, + struct mptcp_subflow_context *subflow, + struct sk_buff *skb) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *tail = NULL; + struct sock *ssk = skb->sk; + bool fragstolen; + int delta; + + if (unlikely(sk->sk_state == TCP_CLOSE)) { + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); + return; + } + + /* Try to coalesce with the last skb in our backlog */ + if (!list_empty(&msk->backlog_list)) + tail = list_last_entry(&msk->backlog_list, struct sk_buff, list); + + if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq && + ssk == tail->sk && + __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) { + skb->truesize -= delta; + kfree_skb_partial(skb, fragstolen); + __mptcp_subflow_lend_fwdmem(subflow, delta); + goto account; + } + + list_add_tail(&skb->list, &msk->backlog_list); + mptcp_subflow_lend_fwdmem(subflow, skb); + delta = skb->truesize; + +account: + WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta); + + /* Possibly not accept()ed yet, keep track of memory not CG + * accounted, mptcp_graft_subflows() will handle it. + */ + if (!mem_cgroup_from_sk(ssk)) + msk->backlog_unaccounted += delta; +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, - struct sock *ssk) + struct sock *ssk, bool own_msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; @@ -681,9 +712,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sk_buff *skb; bool fin; - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) - break; - /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); @@ -710,8 +738,13 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, size_t len = skb->len - offset; mptcp_init_skb(ssk, skb, offset, len); - skb_orphan(skb); - ret = __mptcp_move_skb(sk, skb) || ret; + + if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) { + mptcp_subflow_lend_fwdmem(subflow, skb); + ret |= __mptcp_move_skb(sk, skb); + } else { + __mptcp_add_backlog(sk, subflow, skb); + } seq += len; if (unlikely(map_remaining < len)) { @@ -784,11 +817,8 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { - int err = sock_error(ssk); int ssk_state; - - if (!err) - return false; + int err; /* only propagate errors on fallen-back sockets or * on MPC connect @@ -796,6 +826,10 @@ static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) return false; + err = sock_error(ssk); + if (!err) + return false; + /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly @@ -830,7 +864,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) struct sock *sk = (struct sock *)msk; bool moved; - moved = __mptcp_move_skbs_from_subflow(msk, ssk); + moved = __mptcp_move_skbs_from_subflow(msk, ssk, true); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) __mptcp_subflow_error_report(sk, ssk); @@ -845,31 +879,26 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) return moved; } -static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - /* Wake-up the reader only for in-sequence data */ - if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) - sk->sk_data_ready(sk); -} - void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_sock *msk = mptcp_sk(sk); /* The peer can send data while we are shutting down this - * subflow at msk destruction time, but we must avoid enqueuing + * subflow at subflow destruction time, but we must avoid enqueuing * more data to the msk receive queue */ - if (unlikely(subflow->disposable)) + if (unlikely(subflow->closing)) return; mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) - __mptcp_data_ready(sk, ssk); - else - __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); + if (!sock_owned_by_user(sk)) { + /* Wake-up the reader only for in-sequence data */ + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) + sk->sk_data_ready(sk); + } else { + __mptcp_move_skbs_from_subflow(msk, ssk, false); + } mptcp_data_unlock(sk); } @@ -895,12 +924,6 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) mptcp_subflow_joined(msk, ssk); spin_unlock_bh(&msk->fallback_lock); - /* attach to msk socket only after we are sure we will deal with it - * at close time - */ - if (sk->sk_socket && !ssk->sk_socket) - mptcp_sock_graft(ssk, sk->sk_socket); - mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_stop_tout_timer(sk); @@ -926,12 +949,11 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list static bool mptcp_rtx_timer_pending(struct sock *sk) { - return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); + return timer_pending(&sk->mptcp_retransmit_timer); } static void mptcp_reset_rtx_timer(struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; /* prevent rescheduling on close */ @@ -939,7 +961,7 @@ static void mptcp_reset_rtx_timer(struct sock *sk) return; tout = mptcp_sk(sk)->timer_ival; - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); + sk_reset_timer(sk, &sk->mptcp_retransmit_timer, jiffies + tout); } bool mptcp_schedule_work(struct sock *sk) @@ -1088,11 +1110,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (first) + if (first && !ssk->sk_bypass_prot_mem) { tcp_enter_memory_pressure(ssk); - sk_stream_moderate_sndbuf(ssk); + first = false; + } - first = false; + sk_stream_moderate_sndbuf(ssk); } __mptcp_sync_sndbuf(sk); } @@ -1137,8 +1160,9 @@ struct mptcp_sendmsg_info { bool data_lock_held; }; -static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, - u64 data_seq, int avail_size) +static size_t mptcp_check_allowed_size(const struct mptcp_sock *msk, + struct sock *ssk, u64 data_seq, + size_t avail_size) { u64 window_end = mptcp_wnd_end(msk); u64 mptcp_snd_wnd; @@ -1147,7 +1171,7 @@ static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *s return avail_size; mptcp_snd_wnd = window_end - data_seq; - avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); + avail_size = min(mptcp_snd_wnd, avail_size); if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); @@ -1491,7 +1515,7 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (!ssk || !sk_stream_memory_free(ssk)) return NULL; - burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); + burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); if (!burst) return ssk; @@ -1597,7 +1621,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) struct mptcp_sendmsg_info info = { .flags = flags, }; - bool do_check_data_fin = false; + bool copied = false; int push_count = 1; while (mptcp_send_head(sk) && (push_count > 0)) { @@ -1639,7 +1663,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) push_count--; continue; } - do_check_data_fin = true; + copied = true; } } } @@ -1648,11 +1672,14 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) if (ssk) mptcp_push_release(ssk, &info); - /* ensure the rtx timer is running */ - if (!mptcp_rtx_timer_pending(sk)) - mptcp_reset_rtx_timer(sk); - if (do_check_data_fin) + /* Avoid scheduling the rtx timer if no data has been pushed; the timer + * will be updated on positive acks by __mptcp_cleanup_una(). + */ + if (copied) { + if (!mptcp_rtx_timer_pending(sk)) + mptcp_reset_rtx_timer(sk); mptcp_check_send_data_fin(sk); + } } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) @@ -1965,6 +1992,17 @@ do_error: static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); +static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) +{ + /* avoid the indirect call, we know the destructor is sock_rfree */ + skb->destructor = NULL; + skb->sk = NULL; + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, skb->truesize); + __skb_unlink(skb, &sk->sk_receive_queue); + skb_attempt_defer_free(skb); +} + static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, @@ -2019,13 +2057,7 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, break; } - /* avoid the indirect call, we know the destructor is sock_rfree */ - skb->destructor = NULL; - skb->sk = NULL; - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); - sk_mem_uncharge(sk, skb->truesize); - __skb_unlink(skb, &sk->sk_receive_queue); - skb_attempt_defer_free(skb); + mptcp_eat_recv_skb(sk, skb); } if (copied >= len) @@ -2036,6 +2068,21 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, return copied; } +static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) +{ + const struct tcp_sock *tp = tcp_sk(ssk); + + msk->rcvspace_init = 1; + msk->rcvq_space.copied = 0; + msk->rcvq_space.rtt_us = 0; + + /* initial rcv_space offering made to peer */ + msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, + TCP_INIT_CWND * tp->advmss); + if (msk->rcvq_space.space == 0) + msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; +} + /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. * * Only difference: Use highest rtt estimate of the subflows in use. @@ -2058,8 +2105,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) msk->rcvq_space.copied += copied; - mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); - time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); + mstamp = mptcp_stamp(); + time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time)); rtt_us = msk->rcvq_space.rtt_us; if (rtt_us && time < (rtt_us >> 3)) @@ -2089,6 +2136,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; + trace_mptcp_rcvbuf_grow(sk, time); if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to @@ -2113,60 +2161,80 @@ new_measure: msk->rcvq_space.time = mstamp; } -static struct mptcp_subflow_context * -__mptcp_first_ready_from(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) +static bool __mptcp_move_skbs(struct sock *sk, struct list_head *skbs, u32 *delta) { - struct mptcp_subflow_context *start_subflow = subflow; + struct sk_buff *skb = list_first_entry(skbs, struct sk_buff, list); + struct mptcp_sock *msk = mptcp_sk(sk); + bool moved = false; + + *delta = 0; + while (1) { + /* If the msk recvbuf is full stop, don't drop */ + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) + break; + + prefetch(skb->next); + list_del(&skb->list); + *delta += skb->truesize; - while (!READ_ONCE(subflow->data_avail)) { - subflow = mptcp_next_subflow(msk, subflow); - if (subflow == start_subflow) - return NULL; + moved |= __mptcp_move_skb(sk, skb); + if (list_empty(skbs)) + break; + + skb = list_first_entry(skbs, struct sk_buff, list); } - return subflow; + + __mptcp_ofo_queue(msk); + if (moved) + mptcp_check_data_fin((struct sock *)msk); + return moved; } -static bool __mptcp_move_skbs(struct sock *sk) +static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs) { - struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - bool ret = false; - if (list_empty(&msk->conn_list)) + /* After CG initialization, subflows should never add skb before + * gaining the CG themself. + */ + DEBUG_NET_WARN_ON_ONCE(msk->backlog_unaccounted && sk->sk_socket && + mem_cgroup_from_sk(sk)); + + /* Don't spool the backlog if the rcvbuf is full. */ + if (list_empty(&msk->backlog_list) || + sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) return false; - subflow = list_first_entry(&msk->conn_list, - struct mptcp_subflow_context, node); - for (;;) { - struct sock *ssk; - bool slowpath; + INIT_LIST_HEAD(skbs); + list_splice_init(&msk->backlog_list, skbs); + return true; +} - /* - * As an optimization avoid traversing the subflows list - * and ev. acquiring the subflow socket lock before baling out - */ - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) - break; +static void mptcp_backlog_spooled(struct sock *sk, u32 moved, + struct list_head *skbs) +{ + struct mptcp_sock *msk = mptcp_sk(sk); - subflow = __mptcp_first_ready_from(msk, subflow); - if (!subflow) - break; + WRITE_ONCE(msk->backlog_len, msk->backlog_len - moved); + list_splice(skbs, &msk->backlog_list); +} - ssk = mptcp_subflow_tcp_sock(subflow); - slowpath = lock_sock_fast(ssk); - ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; - if (unlikely(ssk->sk_err)) - __mptcp_error_report(sk); - unlock_sock_fast(ssk, slowpath); +static bool mptcp_move_skbs(struct sock *sk) +{ + struct list_head skbs; + bool enqueued = false; + u32 moved; - subflow = mptcp_next_subflow(msk, subflow); - } + mptcp_data_lock(sk); + while (mptcp_can_spool_backlog(sk, &skbs)) { + mptcp_data_unlock(sk); + enqueued |= __mptcp_move_skbs(sk, &skbs, &moved); - __mptcp_ofo_queue(msk); - if (ret) - mptcp_check_data_fin((struct sock *)msk); - return ret; + mptcp_data_lock(sk); + mptcp_backlog_spooled(sk, moved, &skbs); + } + mptcp_data_unlock(sk); + return enqueued; } static unsigned int mptcp_inq_hint(const struct sock *sk) @@ -2232,7 +2300,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) + if (!list_empty(&msk->backlog_list) && mptcp_move_skbs(sk)) continue; /* only the MPTCP socket status is relevant here. The exit @@ -2305,9 +2373,7 @@ out_err: static void mptcp_retransmit_timer(struct timer_list *t) { - struct inet_connection_sock *icsk = timer_container_of(icsk, t, - icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; + struct sock *sk = timer_container_of(sk, t, mptcp_retransmit_timer); struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); @@ -2325,7 +2391,9 @@ static void mptcp_retransmit_timer(struct timer_list *t) static void mptcp_tout_timer(struct timer_list *t) { - struct sock *sk = timer_container_of(sk, t, sk_timer); + struct inet_connection_sock *icsk = + timer_container_of(icsk, t, mptcp_tout_timer); + struct sock *sk = &icsk->icsk_inet.sk; mptcp_schedule_work(sk); sock_put(sk); @@ -2418,10 +2486,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) */ static void __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, - unsigned int flags) + bool fastclosing) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || - subflow->send_fastclose) { + fastclosing) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ @@ -2446,6 +2514,25 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, { struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false; + int fwd_remaining; + + /* Do not pass RX data to the msk, even if the subflow socket is not + * going to be freed (i.e. even for the first subflow on graceful + * subflow close. + */ + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + subflow->closing = 1; + + /* Borrow the fwd allocated page left-over; fwd memory for the subflow + * could be negative at this point, but will be reach zero soon - when + * the data allocated using such fragment will be freed. + */ + if (subflow->lent_mem_frag) { + fwd_remaining = PAGE_SIZE - subflow->lent_mem_frag; + sk_forward_alloc_add(sk, fwd_remaining); + sk_forward_alloc_add(ssk, -fwd_remaining); + subflow->lent_mem_frag = 0; + } /* If the first subflow moved to a close state before accept, e.g. due * to an incoming reset or listener shutdown, the subflow socket is @@ -2457,7 +2544,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, /* ensure later check in mptcp_worker() will dispose the msk */ sock_set_flag(sk, SOCK_DEAD); mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); - lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); goto out_release; } @@ -2466,14 +2552,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (dispose_it) list_del(&subflow->node); - lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE) tcp_set_state(ssk, TCP_CLOSE); need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { - __mptcp_subflow_disconnect(ssk, subflow, flags); + __mptcp_subflow_disconnect(ssk, subflow, msk->fastclosing); release_sock(ssk); goto out; @@ -2530,8 +2614,11 @@ out: void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { - /* The first subflow can already be closed and still in the list */ - if (subflow->close_event_done) + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + + /* The first subflow can already be closed or disconnected */ + if (subflow->close_event_done || READ_ONCE(subflow->local_id) < 0) return; subflow->close_event_done = true; @@ -2539,6 +2626,17 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); + /* Remove any reference from the backlog to this ssk; backlog skbs consume + * space in the msk receive queue, no need to touch sk->sk_rmem_alloc + */ + list_for_each_entry(skb, &msk->backlog_list, list) { + if (skb->sk != ssk) + continue; + + atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc); + skb->sk = NULL; + } + /* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow */ @@ -2690,10 +2788,13 @@ static void __mptcp_retrans(struct sock *sk) /* * make the whole retrans decision, xmit, disallow - * fallback atomic + * fallback atomic, note that we can't retrans even + * when an infinite fallback is in progress, i.e. new + * subflows are disallowed. */ spin_lock_bh(&msk->fallback_lock); - if (__mptcp_check_fallback(msk)) { + if (__mptcp_check_fallback(msk) || + !msk->allow_subflows) { spin_unlock_bh(&msk->fallback_lock); release_sock(ssk); goto clear_scheduled; @@ -2758,7 +2859,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) */ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; - sk_reset_timer(sk, &sk->sk_timer, timeout); + sk_reset_timer(sk, &inet_csk(sk)->mptcp_tout_timer, timeout); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) @@ -2777,12 +2878,32 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) unlock_sock_fast(ssk, slow); } +static void mptcp_backlog_purge(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *tmp, *skb; + LIST_HEAD(backlog); + + mptcp_data_lock(sk); + list_splice_init(&msk->backlog_list, &backlog); + msk->backlog_len = 0; + mptcp_data_unlock(sk); + + list_for_each_entry_safe(skb, tmp, &backlog, list) { + mptcp_borrow_fwdmem(sk, skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); + } + sk_mem_reclaim(sk); +} + static void mptcp_do_fastclose(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); + mptcp_backlog_purge(sk); + msk->fastclosing = 1; /* Explicitly send the fastclose reset as need */ if (__mptcp_check_fallback(msk)) @@ -2867,11 +2988,13 @@ static void __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); + INIT_LIST_HEAD(&msk->backlog_list); INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; + msk->backlog_len = 0; WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -2888,8 +3011,8 @@ static void __mptcp_init_sock(struct sock *sk) spin_lock_init(&msk->fallback_lock); /* re-use the csk retrans timer for MPTCP-level retrans */ - timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); - timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); + timer_setup(&sk->mptcp_retransmit_timer, mptcp_retransmit_timer, 0); + timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) @@ -2935,6 +3058,7 @@ static int mptcp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); + sk->sk_write_space = sk_stream_write_space; return 0; } @@ -3091,7 +3215,7 @@ static void __mptcp_destroy_sock(struct sock *sk) might_sleep(); mptcp_stop_rtx_timer(sk); - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); msk->pm.status = 0; mptcp_release_sched(msk); @@ -3242,6 +3366,28 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } +static void mptcp_destroy_common(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = (struct sock *)msk; + + __mptcp_clear_xmit(sk); + mptcp_backlog_purge(sk); + + /* join list will be eventually flushed (with rst) at sock lock release time */ + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); + + __skb_queue_purge(&sk->sk_receive_queue); + skb_rbtree_purge(&msk->out_of_order_queue); + + /* move all the rx fwd alloc into the sk_mem_reclaim_final in + * inet_sock_destruct() will dispose it + */ + mptcp_token_destroy(msk); + mptcp_pm_destroy(msk); +} + static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -3293,6 +3439,10 @@ static int mptcp_disconnect(struct sock *sk, int flags) msk->bytes_sent = 0; msk->bytes_retrans = 0; msk->rcvspace_init = 0; + msk->fastclosing = 0; + + /* for fallback's sake */ + WRITE_ONCE(msk->ack_seq, 0); WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); @@ -3418,6 +3568,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); + msk->rcvq_space.time = mptcp_stamp(); if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) __mptcp_subflow_fully_established(msk, subflow, mp_opt); @@ -3427,44 +3578,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, return nsk; } -void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) -{ - const struct tcp_sock *tp = tcp_sk(ssk); - - msk->rcvspace_init = 1; - msk->rcvq_space.copied = 0; - msk->rcvq_space.rtt_us = 0; - - msk->rcvq_space.time = tp->tcp_mstamp; - - /* initial rcv_space offering made to peer */ - msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, - TCP_INIT_CWND * tp->advmss); - if (msk->rcvq_space.space == 0) - msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; -} - -void mptcp_destroy_common(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = (struct sock *)msk; - - __mptcp_clear_xmit(sk); - - /* join list will be eventually flushed (with rst) at sock lock release time */ - mptcp_for_each_subflow_safe(msk, subflow, tmp) - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); - - __skb_queue_purge(&sk->sk_receive_queue); - skb_rbtree_purge(&msk->out_of_order_queue); - - /* move all the rx fwd alloc into the sk_mem_reclaim_final in - * inet_sock_destruct() will dispose it - */ - mptcp_token_destroy(msk); - mptcp_pm_destroy(msk); -} - static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -3493,8 +3606,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ - BIT(MPTCP_FLUSH_JOIN_LIST) | \ - BIT(MPTCP_DEQUEUE)) + BIT(MPTCP_FLUSH_JOIN_LIST)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) @@ -3504,9 +3616,12 @@ static void mptcp_release_cb(struct sock *sk) for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); - struct list_head join_list; + struct list_head join_list, skbs; + bool spool_bl; + u32 moved; - if (!flags) + spool_bl = mptcp_can_spool_backlog(sk, &skbs); + if (!flags && !spool_bl) break; INIT_LIST_HEAD(&join_list); @@ -3528,7 +3643,7 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); - if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { + if (spool_bl && __mptcp_move_skbs(sk, &skbs, &moved)) { /* notify ack seq update */ mptcp_cleanup_rbuf(msk, 0); sk->sk_data_ready(sk); @@ -3536,6 +3651,8 @@ static void mptcp_release_cb(struct sock *sk) cond_resched(); spin_lock_bh(&sk->sk_lock.slock); + if (spool_bl) + mptcp_backlog_spooled(sk, moved, &skbs); } if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) @@ -3649,6 +3766,7 @@ void mptcp_finish_connect(struct sock *ssk) * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); + WRITE_ONCE(msk->rcvq_space.time, mptcp_stamp()); mptcp_pm_new_connection(msk, ssk, 0); } @@ -3661,6 +3779,23 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } +/* Can be called without holding the msk socket lock; use the callback lock + * to avoid {READ_,WRITE_}ONCE annotations on sk_socket. + */ +static void mptcp_sock_check_graft(struct sock *sk, struct sock *ssk) +{ + struct socket *sock; + + write_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + write_unlock_bh(&sk->sk_callback_lock); + if (sock) { + mptcp_sock_graft(ssk, sock); + __mptcp_inherit_cgrp_data(sk, ssk); + __mptcp_inherit_memcg(sk, ssk, GFP_ATOMIC); + } +} + bool mptcp_finish_join(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -3676,7 +3811,9 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - /* active subflow, already present inside the conn_list */ + /* Active subflow, already present inside the conn_list; is grafted + * either by __mptcp_subflow_connect() or accept. + */ if (!list_empty(&subflow->node)) { spin_lock_bh(&msk->fallback_lock); if (!msk->allow_subflows) { @@ -3703,11 +3840,17 @@ bool mptcp_finish_join(struct sock *ssk) if (ret) { sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); + mptcp_sock_check_graft(parent, ssk); } } else { sock_hold(ssk); list_add_tail(&subflow->node, &msk->join_list); __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); + + /* In case of later failures, __mptcp_flush_join_list() will + * properly orphan the ssk via mptcp_close_ssk(). + */ + mptcp_sock_check_graft(parent, ssk); } mptcp_data_unlock(parent); @@ -3768,7 +3911,7 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return -EINVAL; lock_sock(sk); - if (__mptcp_move_skbs(sk)) + if (mptcp_move_skbs(sk)) mptcp_cleanup_rbuf(msk, 0); *karg = mptcp_inq_hint(sk); release_sock(sk); @@ -3790,7 +3933,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int mptcp_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); @@ -3900,7 +4044,7 @@ static struct proto mptcp_prot = { .no_autobind = true, }; -static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; @@ -3967,6 +4111,69 @@ unlock: return err; } +static void mptcp_graft_subflows(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + if (mem_cgroup_sockets_enabled) { + LIST_HEAD(join_list); + + /* Subflows joining after __inet_accept() will get the + * mem CG properly initialized at mptcp_finish_join() time, + * but subflows pending in join_list need explicit + * initialization before flushing `backlog_unaccounted` + * or MPTCP can later unexpectedly observe unaccounted memory. + */ + mptcp_data_lock(sk); + list_splice_init(&msk->join_list, &join_list); + mptcp_data_unlock(sk); + + __mptcp_flush_join_list(sk, &join_list); + } + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* Set ssk->sk_socket of accept()ed flows to mptcp socket. + * This is needed so NOSPACE flag can be set from tcp stack. + */ + if (!ssk->sk_socket) + mptcp_sock_graft(ssk, sk->sk_socket); + + if (!mem_cgroup_sk_enabled(sk)) + goto unlock; + + __mptcp_inherit_cgrp_data(sk, ssk); + __mptcp_inherit_memcg(sk, ssk, GFP_KERNEL); + +unlock: + release_sock(ssk); + } + + if (mem_cgroup_sk_enabled(sk)) { + gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; + int amt; + + /* Account the backlog memory; prior accept() is aware of + * fwd and rmem only. + */ + mptcp_data_lock(sk); + amt = sk_mem_pages(sk->sk_forward_alloc + + msk->backlog_unaccounted + + atomic_read(&sk->sk_rmem_alloc)) - + sk_mem_pages(sk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + msk->backlog_unaccounted = 0; + mptcp_data_unlock(sk); + + if (amt) + mem_cgroup_sk_charge(sk, amt, gfp); + } +} + static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { @@ -4014,26 +4221,17 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, msk = mptcp_sk(newsk); msk->in_accept_queue = 0; - /* set ssk->sk_socket of accept()ed flows to mptcp socket. - * This is needed so NOSPACE flag can be set from tcp stack. - */ - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - if (!ssk->sk_socket) - mptcp_sock_graft(ssk, newsock); - } - + mptcp_graft_subflows(newsk); mptcp_rps_record_subflows(msk); /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { - __mptcp_close_ssk(newsk, msk->first, - mptcp_subflow_ctx(msk->first), 0); if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); + mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first)); } } else { tcpfallback: @@ -4118,6 +4316,201 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } +static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + u32 offset; + + if (!list_empty(&msk->backlog_list)) + mptcp_move_skbs(sk); + + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + offset = MPTCP_SKB_CB(skb)->offset; + if (offset < skb->len) { + *off = offset; + return skb; + } + mptcp_eat_recv_skb(sk, skb); + } + return NULL; +} + +/* + * Note: + * - It is assumed that the socket was locked by the caller. + */ +static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + int copied = 0; + u32 offset; + + msk_owned_by_me(msk); + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) { + u32 data_len = skb->len - offset; + int count; + u32 size; + + size = min_t(size_t, data_len, INT_MAX); + count = recv_actor(desc, skb, offset, size); + if (count <= 0) { + if (!copied) + copied = count; + break; + } + + copied += count; + + msk->bytes_consumed += count; + if (count < data_len) { + MPTCP_SKB_CB(skb)->offset += count; + MPTCP_SKB_CB(skb)->map_seq += count; + break; + } + + mptcp_eat_recv_skb(sk, skb); + } + + if (noack) + goto out; + + mptcp_rcv_space_adjust(msk, copied); + + if (copied > 0) { + mptcp_recv_skb(sk, &offset); + mptcp_cleanup_rbuf(msk, copied); + } +out: + return copied; +} + +static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __mptcp_read_sock(sk, desc, recv_actor, false); +} + +static int __mptcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) +{ + /* Store TCP splice context information in read_descriptor_t. */ + read_descriptor_t rd_desc = { + .arg.data = tss, + .count = tss->len, + }; + + return mptcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); +} + +/** + * mptcp_splice_read - splice data from MPTCP socket to a pipe + * @sock: socket to splice from + * @ppos: position (not valid) + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given socket and fill them into a pipe. + * + * Return: + * Amount of bytes that have been spliced. + * + **/ +static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct tcp_splice_state tss = { + .pipe = pipe, + .len = len, + .flags = flags, + }; + struct sock *sk = sock->sk; + ssize_t spliced = 0; + int ret = 0; + long timeo; + + /* + * We can't seek on a socket input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + lock_sock(sk); + + mptcp_rps_record_subflows(mptcp_sk(sk)); + + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); + while (tss.len) { + ret = __mptcp_splice_read(sk, &tss); + if (ret < 0) { + break; + } else if (!ret) { + if (spliced) + break; + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + ret = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_state == TCP_CLOSE) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + ret = -ENOTCONN; + break; + } + if (!timeo) { + ret = -EAGAIN; + break; + } + /* if __mptcp_splice_read() got nothing while we have + * an skb in receive queue, we do not want to loop. + * This might happen with URG data. + */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) + break; + if (signal_pending(current)) { + ret = sock_intr_errno(timeo); + break; + } + continue; + } + tss.len -= ret; + spliced += ret; + + if (!tss.len || !timeo) + break; + release_sock(sk); + lock_sock(sk); + + if (sk->sk_err || sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current)) + break; + } + + release_sock(sk); + + if (spliced) + return spliced; + + return ret; +} + static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -4138,6 +4531,8 @@ static const struct proto_ops mptcp_stream_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, + .read_sock = mptcp_read_sock, + .splice_read = mptcp_splice_read, }; static struct inet_protosw mptcp_protosw = { @@ -4242,6 +4637,8 @@ static const struct proto_ops mptcp_v6_stream_ops = { .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, + .read_sock = mptcp_read_sock, + .splice_read = mptcp_splice_read, }; static struct proto mptcp_v6_prot; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 6ca97096607c..0bd1ee860316 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -124,7 +124,6 @@ #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 -#define MPTCP_DEQUEUE 8 struct mptcp_skb_cb { u64 map_seq; @@ -247,14 +246,14 @@ struct mptcp_pm_data { struct mptcp_pm_local { struct mptcp_addr_info addr; - u8 flags; + u32 flags; int ifindex; }; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; - u8 flags; + u32 flags; int ifindex; struct socket *lsk; }; @@ -321,7 +320,8 @@ struct mptcp_sock { fastopening:1, in_accept_queue:1, free_first:1, - rcvspace_init:1; + rcvspace_init:1, + fastclosing:1; u32 notsent_lowat; int keepalive_cnt; int keepalive_idle; @@ -357,6 +357,10 @@ struct mptcp_sock { * allow_infinite_fallback and * allow_join */ + + struct list_head backlog_list; /* protected by the data lock */ + u32 backlog_len; + u32 backlog_unaccounted; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) @@ -407,6 +411,7 @@ static inline int mptcp_space_from_win(const struct sock *sk, int win) static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - + READ_ONCE(mptcp_sk(sk)->backlog_len) - sk_rmem_alloc_get(sk)); } @@ -536,16 +541,18 @@ struct mptcp_subflow_context { send_infinite_map : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ + closing : 1, /* must not pass rx data to msk anymore */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ close_event_done : 1, /* has done the post-closed part */ mpc_drop : 1, /* the MPC option has been dropped in a rtx */ - __unused : 9; + __unused : 8; bool data_avail; bool scheduled; bool pm_listener; /* a listener managed by the kernel PM? */ bool fully_established; /* path validated */ + u32 lent_mem_frag; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -645,6 +652,42 @@ mptcp_send_active_reset_reason(struct sock *sk) tcp_send_active_reset(sk, GFP_ATOMIC, reason); } +/* Made the fwd mem carried by the given skb available to the msk, + * To be paired with a previous mptcp_subflow_lend_fwdmem() before freeing + * the skb or setting the skb ownership. + */ +static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb) +{ + struct sock *ssk = skb->sk; + + /* The subflow just lend the skb fwd memory; if the subflow meanwhile + * closed, mptcp_close_ssk() already released the ssk rcv memory. + */ + DEBUG_NET_WARN_ON_ONCE(skb->destructor); + sk_forward_alloc_add(sk, skb->truesize); + if (!ssk) + return; + + atomic_sub(skb->truesize, &ssk->sk_rmem_alloc); + skb->sk = NULL; +} + +static inline void +__mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, int size) +{ + int frag = (subflow->lent_mem_frag + size) & (PAGE_SIZE - 1); + + subflow->lent_mem_frag = frag; +} + +static inline void +mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, + struct sk_buff *skb) +{ + __mptcp_subflow_lend_fwdmem(subflow, skb->truesize); + skb->destructor = NULL; +} + static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { @@ -707,6 +750,9 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) return ret; } +void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp); +void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk); + int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); @@ -847,7 +893,7 @@ static inline void mptcp_stop_tout_timer(struct sock *sk) if (!inet_csk(sk)->icsk_mtup.probe_timestamp) return; - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); inet_csk(sk)->icsk_mtup.probe_timestamp = 0; } @@ -869,7 +915,11 @@ static inline bool mptcp_is_fully_established(struct sock *sk) READ_ONCE(mptcp_sk(sk)->fully_established); } -void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); +static inline u64 mptcp_stamp(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_USEC); +} + void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); @@ -925,7 +975,7 @@ static inline void mptcp_write_space(struct sock *sk) /* pairs with memory barrier in mptcp_poll */ smp_mb(); if (mptcp_stream_memory_free(sk, 1)) - sk_stream_write_space(sk); + INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } static inline void __mptcp_sync_sndbuf(struct sock *sk) @@ -977,8 +1027,6 @@ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) local_bh_enable(); } -void mptcp_destroy_common(struct mptcp_sock *msk); - #define MPTCP_TOKEN_MAX_RETRIES 4 void __init mptcp_token_init(void); @@ -1184,6 +1232,7 @@ void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); @@ -1293,10 +1342,8 @@ static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - return (1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) && - is_active_ssk(subflow) && - !subflow->conn_finished; + /* Note that the sk state implies !subflow->conn_finished. */ + return sk->sk_state == TCP_SYN_RECV && is_active_ssk(subflow); } #ifdef CONFIG_SYN_COOKIES diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a28a48385885..de90a2897d2d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -982,6 +982,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_endp_subflow_max(msk); info->mptcpi_endp_laminar_max = mptcp_pm_get_endp_laminar_max(msk); + info->mptcpi_endp_fullmesh_max = + mptcp_pm_get_endp_fullmesh_max(msk); } if (__mptcp_check_fallback(msk)) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index af707ce0f624..6716970693e9 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -462,8 +462,6 @@ void __mptcp_sync_state(struct sock *sk, int state) subflow = mptcp_subflow_ctx(ssk); __mptcp_propagate_sndbuf(sk, ssk); - if (!msk->rcvspace_init) - mptcp_rcv_space_init(msk, ssk); if (sk->sk_state == TCP_SYN_SENT) { /* subflow->idsn is always available is TCP_SYN_SENT state, @@ -491,6 +489,9 @@ static void subflow_set_remote_key(struct mptcp_sock *msk, mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); subflow->iasn++; + /* for fallback's sake */ + subflow->map_seq = subflow->iasn; + WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->iasn); WRITE_ONCE(msk->can_ack, true); @@ -807,7 +808,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, - bool *own_req) + bool *own_req, + void (*opt_child_init)(struct sock *newsk, + const struct sock *sk)) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; @@ -854,7 +857,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, - req_unhash, own_req); + req_unhash, own_req, opt_child_init); if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); @@ -1285,6 +1288,7 @@ static bool subflow_is_done(const struct sock *sk) /* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; if (likely(ssk->sk_state != TCP_CLOSE && @@ -1303,7 +1307,8 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss */ if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && msk->first == ssk && - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_update_rcv_data_fin(msk, subflow->map_seq + + subflow->map_data_len, true)) mptcp_schedule_work(sk); } @@ -1433,9 +1438,12 @@ reset: skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; - subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + subflow->map_seq = __mptcp_expand_seq(subflow->map_seq, + subflow->iasn + + TCP_SKB_CB(skb)->seq - + subflow->ssn_offset - 1); WRITE_ONCE(subflow->data_avail, true); return true; } @@ -1660,7 +1668,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = local->ifindex; - err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); + err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); pr_debug("msk=%p local=%d remote=%d bind error: %d\n", @@ -1680,7 +1688,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); - err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); + err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); pr_debug("msk=%p local=%d remote=%d connect error: %d\n", @@ -1712,21 +1720,35 @@ err_out: return err; } -static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp) +{ + /* Only if the msk has been accepted already (and not orphaned).*/ + if (!mem_cgroup_sockets_enabled || !sk->sk_socket) + return; + + mem_cgroup_sk_inherit(sk, ssk); + __sk_charge(ssk, gfp); +} + +void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk) { #ifdef CONFIG_SOCK_CGROUP_DATA - struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, - *child_skcd = &child->sk_cgrp_data; + struct sock_cgroup_data *sk_cd = &sk->sk_cgrp_data, + *ssk_cd = &ssk->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ - if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != - cgroup_id(sock_cgroup_ptr(child_skcd))) { - cgroup_sk_free(child_skcd); - *child_skcd = *parent_skcd; - cgroup_sk_clone(child_skcd); + if (cgroup_id(sock_cgroup_ptr(sk_cd)) != + cgroup_id(sock_cgroup_ptr(ssk_cd))) { + cgroup_sk_free(ssk_cd); + *ssk_cd = *sk_cd; + cgroup_sk_clone(sk_cd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ +} +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ + __mptcp_inherit_cgrp_data(parent, child); if (mem_cgroup_sockets_enabled) mem_cgroup_sk_inherit(parent, child); } @@ -1822,7 +1844,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; - ctx = kzalloc(sizeof(*ctx), priority); + ctx = kzalloc_obj(*ctx, priority); if (!ctx) return NULL; @@ -1856,12 +1878,6 @@ static void subflow_state_change(struct sock *sk) __subflow_state_change(sk); - if (subflow_simultaneous_connect(sk)) { - WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); - subflow->conn_finished = 1; - mptcp_propagate_state(parent, sk, subflow, NULL); - } - /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 5bb924534387..f1a50f367add 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -103,7 +103,7 @@ static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) * It creates a unique token to identify the new mptcp connection, * a secret local key and the initial data sequence number (idsn). * - * Returns 0 on success. + * Return: 0 on success. */ int mptcp_token_new_request(struct request_sock *req) { @@ -146,7 +146,7 @@ int mptcp_token_new_request(struct request_sock *req) * the computed token at a later time, this is needed to process * join requests. * - * returns 0 on success. + * Return: 0 on success. */ int mptcp_token_new_connect(struct sock *ssk) { @@ -241,7 +241,7 @@ found: * This function returns the mptcp connection structure with the given token. * A reference count on the mptcp socket returned is taken. * - * returns NULL if no connection with the given token value exists. + * Return: NULL if no connection with the given token value exists. */ struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { @@ -288,11 +288,13 @@ EXPORT_SYMBOL_GPL(mptcp_token_get_sock); * @s_slot: start slot number * @s_num: start number inside the given lock * - * This function returns the first mptcp connection structure found inside the - * token container starting from the specified position, or NULL. + * Description: + * On successful iteration, the iterator is moved to the next position and a + * reference to the returned socket is acquired. * - * On successful iteration, the iterator is moved to the next position and - * a reference to the returned socket is acquired. + * Return: + * The first mptcp connection structure found inside the token container + * starting from the specified position, or NULL. */ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num) diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 62fb1031763d..040a31557201 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -224,7 +224,8 @@ int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb) if (!nah) { netdev_warn(ndp->ndev.dev, "Invalid AEN (0x%x) received\n", h->type); - return -ENOENT; + ret = -ENOENT; + goto out; } ret = ncsi_validate_aen_pkt(h, nah->payload); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 446e4e3b9553..54d0df0a9efe 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -211,7 +211,7 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) int index; unsigned long flags; - nc = kzalloc(sizeof(*nc), GFP_ATOMIC); + nc = kzalloc_obj(*nc, GFP_ATOMIC); if (!nc) return NULL; @@ -285,7 +285,7 @@ struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, struct ncsi_package *np, *tmp; unsigned long flags; - np = kzalloc(sizeof(*np), GFP_ATOMIC); + np = kzalloc_obj(*np, GFP_ATOMIC); if (!np) return NULL; @@ -1697,7 +1697,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) return -ENOSPC; } - vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + vlan = kzalloc_obj(*vlan); if (!vlan) return -ENOMEM; @@ -1767,7 +1767,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, return nd; /* Create NCSI device */ - ndp = kzalloc(sizeof(*ndp), GFP_ATOMIC); + ndp = kzalloc_obj(*ndp, GFP_ATOMIC); if (!ndp) return NULL; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 271ec6c3929e..fbd84bc8026a 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1176,8 +1176,10 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, /* Find the NCSI device */ nd = ncsi_find_dev(orig_dev); ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; - if (!ndp) - return -ENODEV; + if (!ndp) { + ret = -ENODEV; + goto err_free_skb; + } /* Check if it is AEN packet */ hdr = (struct ncsi_pkt_hdr *)skb_network_header(skb); @@ -1199,7 +1201,8 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, if (!nrh) { netdev_err(nd->dev, "Received unrecognized packet (0x%x)\n", hdr->type); - return -ENOENT; + ret = -ENOENT; + goto err_free_skb; } /* Associate with the request */ @@ -1207,7 +1210,8 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, nr = &ndp->requests[hdr->id]; if (!nr->used) { spin_unlock_irqrestore(&ndp->lock, flags); - return -ENODEV; + ret = -ENODEV; + goto err_free_skb; } nr->rsp = skb; @@ -1261,4 +1265,8 @@ out_netlink: out: ncsi_free_request(nr); return ret; + +err_free_skb: + kfree_skb(skb); + return ret; } diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e43e20f529f8..6bfc250e474f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_path.o \ nf_flow_table_offload.o nf_flow_table_xdp.o nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o ifeq ($(CONFIG_NF_FLOW_TABLE),m) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index cc20e6d56807..a2fe711cb5e3 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1077,7 +1077,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, /* First, and without any locks, allocate and initialize * a normal base set structure. */ - set = kzalloc(sizeof(*set), GFP_KERNEL); + set = kzalloc_obj(*set); if (!set) return -ENOMEM; spin_lock_init(&set->lock); @@ -1135,7 +1135,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, /* Wraparound */ goto cleanup; - list = kvcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); + list = kvzalloc_objs(struct ip_set *, i); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -2377,7 +2377,7 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kvcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); + list = kvzalloc_objs(struct ip_set *, inst->ip_set_max); if (!list) return -ENOMEM; inst->is_deleted = false; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5e4453e9ef8e..181daa9c2019 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -998,7 +998,7 @@ resize: /* Resize is in process and kernel side add, save values */ struct mtype_resize_ad *x; - x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); + x = kzalloc_obj(struct mtype_resize_ad, GFP_ATOMIC); if (!x) /* Don't bother */ goto out; @@ -1086,8 +1086,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, /* Resize is in process and kernel side del, * save values */ - x = kzalloc(sizeof(struct mtype_resize_ad), - GFP_ATOMIC); + x = kzalloc_obj(struct mtype_resize_ad, GFP_ATOMIC); if (x) { x->ad = IPSET_DEL; memcpy(&x->d, value, diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 13c7a08aa868..98b91b34c314 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -598,7 +598,7 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; - map = kzalloc(sizeof(*map), GFP_KERNEL); + map = kzalloc_obj(*map); if (!map) return false; diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index fdacbc3c15be..d54d7da58334 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -13,8 +13,7 @@ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 37ebb0cb62b8..005c1134d756 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -17,8 +17,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/interrupt.h> #include <linux/in.h> @@ -1511,8 +1510,7 @@ int __init ip_vs_conn_init(void) */ tab_array_size = array_size(ip_vs_conn_tab_size, sizeof(*ip_vs_conn_tab)); - ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size, - sizeof(*ip_vs_conn_tab), GFP_KERNEL); + ip_vs_conn_tab = kvmalloc_objs(*ip_vs_conn_tab, ip_vs_conn_tab_size); if (!ip_vs_conn_tab) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5ea7ab8bf4dc..90d56f92c0f6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -19,8 +19,7 @@ * Harald Welte don't use nfcache */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4c8fa22be88a..35642de2a0fe 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -13,8 +13,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/init.h> @@ -939,7 +938,7 @@ int ip_vs_stats_init_alloc(struct ip_vs_stats *s) struct ip_vs_stats *ip_vs_stats_alloc(void) { - struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); + struct ip_vs_stats *s = kzalloc_obj(*s); if (s && ip_vs_stats_init_alloc(s) >= 0) return s; @@ -1080,7 +1079,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -EINVAL; } - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); + dest = kzalloc_obj(struct ip_vs_dest); if (dest == NULL) return -ENOMEM; @@ -1422,7 +1421,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ret_hooks = ret; } - svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); + svc = kzalloc_obj(struct ip_vs_service); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; @@ -4440,7 +4439,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); /* procfs stats */ - ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); + ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats); if (!ipvs->tot_stats) goto out; if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 75f4c231f4a0..e1f62f6b25e2 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -154,7 +153,7 @@ static int ip_vs_dh_init_svc(struct ip_vs_service *svc) struct ip_vs_dh_state *s; /* allocate the DH table for this service */ - s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + s = kzalloc_obj(struct ip_vs_dh_state); if (s == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 93a925f1ed9b..b17de33314da 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -12,8 +12,7 @@ * get_stats()) do the per cpu summing. */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/jiffies.h> @@ -326,7 +325,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) id = i; } - kd = kzalloc(sizeof(*kd), GFP_KERNEL); + kd = kzalloc_obj(*kd); if (!kd) goto out; kd->ipvs = ipvs; @@ -444,7 +443,7 @@ add_est: td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) { - td = kzalloc(sizeof(*td), GFP_KERNEL); + td = kzalloc_obj(*td); if (!td) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index ab117e5bc34e..d657b47c6511 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -8,8 +8,7 @@ * Kenny Mathis : added initial functionality based on weight */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 206c6700e200..b315c608fda4 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -16,8 +16,7 @@ * Author: Wouter Gadeyne */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/moduleparam.h> diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 156181a3bacd..15ccb2b2fa1f 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -34,8 +34,7 @@ * me to write this module. */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -205,7 +204,7 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, return en; ip_vs_lblc_del(en); } - en = kmalloc(sizeof(*en), GFP_ATOMIC); + en = kmalloc_obj(*en, GFP_ATOMIC); if (!en) return NULL; @@ -348,7 +347,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblc_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + tbl = kmalloc_obj(*tbl); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index a021e6aba3d7..c90ea897c3f7 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/module.h> @@ -108,7 +107,7 @@ static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, } } - e = kmalloc(sizeof(*e), GFP_ATOMIC); + e = kmalloc_obj(*e, GFP_ATOMIC); if (e == NULL) return; @@ -364,7 +363,7 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, en = ip_vs_lblcr_get(af, tbl, daddr); if (!en) { - en = kmalloc(sizeof(*en), GFP_ATOMIC); + en = kmalloc_obj(*en, GFP_ATOMIC); if (!en) return NULL; @@ -511,7 +510,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Allocate the ip_vs_lblcr_table for this service */ - tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + tbl = kmalloc_obj(*tbl); if (tbl == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index c2764505e380..38cc38c5d8bb 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -9,8 +9,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index e3d7f5c879ce..020863047562 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -17,8 +17,7 @@ https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -294,9 +293,8 @@ static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, return -EINVAL; if (svc->num_dests >= 1) { - s->dest_setup = kcalloc(svc->num_dests, - sizeof(struct ip_vs_mh_dest_setup), - GFP_KERNEL); + s->dest_setup = kzalloc_objs(struct ip_vs_mh_dest_setup, + svc->num_dests); if (!s->dest_setup) return -ENOMEM; } @@ -384,12 +382,11 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc) struct ip_vs_mh_state *s; /* Allocate the MH table for this service */ - s = kzalloc(sizeof(*s), GFP_KERNEL); + s = kzalloc_obj(*s); if (!s) return -ENOMEM; - s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), - GFP_KERNEL); + s->lookup = kzalloc_objs(struct ip_vs_mh_lookup, IP_VS_MH_TAB_SIZE); if (!s->lookup) { kfree(s); return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 08adcb222986..81974f69e5bb 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -30,8 +30,7 @@ * PASV response can not be NAT-ed) but Active FTP should work */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/types.h> diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index ed7f5c889b41..ada158c610ce 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -26,8 +26,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index c7708b809700..c5c67df80a0b 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -12,8 +12,7 @@ * active connections */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 166c669f0763..3035079ebd99 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index e4ce1d9a63f9..85f31d71e29a 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index a9fd1d3fc2cb..0046aed1fb38 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -8,8 +8,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -67,7 +66,7 @@ register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = - kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); + kzalloc_obj(struct ip_vs_proto_data); if (!pd) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 89602c16f6b6..44e14acc187e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -6,8 +6,7 @@ * Wensong Zhang <wensong@linuxvirtualserver.org> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 83e452916403..63c78a1f3918 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -10,7 +10,8 @@ #include <net/ip_vs.h> static int -sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int sctphoff); static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, @@ -108,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!sctp_csum_check(cp->af, skb, pp)) + if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; /* Call application helper if needed */ @@ -156,7 +157,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!sctp_csum_check(cp->af, skb, pp)) + if (!sctp_csum_check(cp->af, skb, pp, sctphoff)) return 0; /* Call application helper if needed */ @@ -185,19 +186,12 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, } static int -sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int sctphoff) { - unsigned int sctphoff; struct sctphdr *sh; __le32 cmp, val; -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - sctphoff = sizeof(struct ipv6hdr); - else -#endif - sctphoff = ip_hdrlen(skb); - sh = (struct sctphdr *)(skb->data + sctphoff); cmp = sh->checksum; val = sctp_compute_cksum(skb, sctphoff); diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 7da51390cea6..8cc0a8ce6241 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -13,8 +13,7 @@ * protocol ip_vs_proto_data and is handled by netns */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/ip.h> @@ -29,7 +28,8 @@ #include <net/ip_vs.h> static int -tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int tcphoff); static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, @@ -166,7 +166,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!tcp_csum_check(cp->af, skb, pp)) + if (!tcp_csum_check(cp->af, skb, pp, tcphoff)) return 0; /* Call application helper if needed */ @@ -244,7 +244,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!tcp_csum_check(cp->af, skb, pp)) + if (!tcp_csum_check(cp->af, skb, pp, tcphoff)) return 0; /* @@ -301,17 +301,9 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, static int -tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int tcphoff) { - unsigned int tcphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - tcphoff = sizeof(struct ipv6hdr); - else -#endif - tcphoff = ip_hdrlen(skb); - switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); @@ -322,7 +314,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - tcphoff, - ipv6_hdr(skb)->nexthdr, + IPPROTO_TCP, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 68260d91c988..f9de632e38cd 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,8 +9,7 @@ * Network name space (netns) aware. */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> @@ -25,7 +24,8 @@ #include <net/ip6_checksum.h> static int -udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int udphoff); static int udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, @@ -155,7 +155,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!udp_csum_check(cp->af, skb, pp)) + if (!udp_csum_check(cp->af, skb, pp, udphoff)) return 0; /* @@ -238,7 +238,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, int ret; /* Some checks before mangling */ - if (!udp_csum_check(cp->af, skb, pp)) + if (!udp_csum_check(cp->af, skb, pp, udphoff)) return 0; /* @@ -297,17 +297,10 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, static int -udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + unsigned int udphoff) { struct udphdr _udph, *uh; - unsigned int udphoff; - -#ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - udphoff = sizeof(struct ipv6hdr); - else -#endif - udphoff = ip_hdrlen(skb); uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); if (uh == NULL) @@ -325,7 +318,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - udphoff, - ipv6_hdr(skb)->nexthdr, + IPPROTO_UDP, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 6baa34dff9f0..4125ee561cdc 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index d4903723be7e..c6e421c4e299 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -12,8 +12,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index a46f99a56618..245a323c84cd 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 92e77d7a6b50..cd67066e3b26 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> @@ -230,7 +229,7 @@ static int ip_vs_sh_init_svc(struct ip_vs_service *svc) struct ip_vs_sh_state *s; /* allocate the SH table for this service */ - s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + s = kzalloc_obj(struct ip_vs_sh_state); if (s == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3402675bf521..b2ba3befbd55 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -32,8 +32,7 @@ * Persistence support, fwmark and time-out. */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/slab.h> @@ -330,7 +329,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; - if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + if (!(sb=kmalloc_obj(struct ip_vs_sync_buff, GFP_ATOMIC))) return NULL; len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), @@ -418,7 +417,7 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) struct ip_vs_sync_buff *sb; struct ip_vs_sync_mesg_v0 *mesg; - if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + if (!(sb=kmalloc_obj(struct ip_vs_sync_buff, GFP_ATOMIC))) return NULL; len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), @@ -1435,7 +1434,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1501,7 +1500,7 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr, salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); @@ -1542,7 +1541,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; @@ -1828,7 +1827,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, struct ipvs_master_sync_state *ms; result = -ENOMEM; - ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); + ipvs->ms = kzalloc_objs(ipvs->ms[0], count); if (!ipvs->ms) goto out; ms = ipvs->ms; @@ -1842,8 +1841,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, } } result = -ENOMEM; - ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), - GFP_KERNEL); + ti = kzalloc_objs(struct ip_vs_sync_thread_data, count); if (!ti) goto out; diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index 8d5419edde50..dbb7f5fd4688 100644 --- a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -4,8 +4,7 @@ * Authors: Darby Payne <darby.payne@applovin.com> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/module.h> diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 9fa500927c0a..9da445ca09a1 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 85ce0d04afac..2dcff1040da5 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -13,8 +13,7 @@ * with weight 0 when all weights are zero */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -110,7 +109,7 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) /* * Allocate the mark variable for WRR scheduling */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); + mark = kmalloc_obj(struct ip_vs_wrr_mark); if (mark == NULL) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 95af252b2939..3601eb86d025 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -21,8 +21,7 @@ * - the only place where we can see skb->sk != NULL */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/slab.h> @@ -58,7 +57,7 @@ enum { static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) { - return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); + return kmalloc_obj(struct ip_vs_dest_dst, GFP_ATOMIC); } static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) @@ -295,6 +294,12 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, return true; } +/* rt has device that is down */ +static bool rt_dev_is_down(const struct net_device *dev) +{ + return dev && !netif_running(dev); +} + /* Get route to destination or remote server */ static int __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, @@ -310,9 +315,11 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); - if (likely(dest_dst)) + if (likely(dest_dst)) { rt = dst_rtable(dest_dst->dst_cache); - else { + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + } else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); if (!dest_dst) { @@ -328,14 +335,22 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + /* It is forbidden to attach dest->dest_dst if + * device is going down. + */ + if (!rt_dev_is_down(dst_dev_rcu(&rt->dst))) + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + else + noref = 0; spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, rcuref_read(&rt->dst.__rcuref)); + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + if (!noref) + ip_vs_dest_dst_free(dest_dst); } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.ip; } else { noref = 0; @@ -409,6 +424,9 @@ err_put: return -1; err_unreach: + if (!skb->dev) + skb->dev = skb_dst(skb)->dev; + dst_link_failure(skb); return -1; } @@ -469,9 +487,11 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); - if (likely(dest_dst)) + if (likely(dest_dst)) { rt = dst_rt6_info(dest_dst->dst_cache); - else { + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + } else { u32 cookie; dest_dst = ip_vs_dest_dst_alloc(); @@ -492,14 +512,22 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, } rt = dst_rt6_info(dst); cookie = rt6_get_cookie(rt); - __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + /* It is forbidden to attach dest->dest_dst if + * device is going down. + */ + if (!rt_dev_is_down(dst_dev_rcu(&rt->dst))) + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + else + noref = 0; spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest_dst->dst_saddr.in6, rcuref_read(&rt->dst.__rcuref)); + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + if (!noref) + ip_vs_dest_dst_free(dest_dst); } - if (ret_saddr) - *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, @@ -947,7 +975,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, *next_protocol = IPPROTO_IPV6; if (payload_len) *payload_len = - ntohs(old_ipv6h->payload_len) + + ipv6_payload_len(skb, old_ipv6h) + sizeof(*old_ipv6h); old_dsfield = ipv6_get_dsfield(old_ipv6h); *ttl = old_ipv6h->hop_limit; diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 46e667a50d98..c20031891b86 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -170,7 +170,7 @@ static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog, static const struct bpf_link_ops bpf_nf_link_lops = { .release = bpf_nf_link_release, - .dealloc = bpf_nf_link_dealloc, + .dealloc_deferred = bpf_nf_link_dealloc, .detach = bpf_nf_link_detach, .show_fdinfo = bpf_nf_link_show_info, .fill_link_info = bpf_nf_link_fill_link_info, @@ -221,7 +221,7 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (err) return err; - link = kzalloc(sizeof(*link), GFP_USER); + link = kzalloc_obj(*link, GFP_USER); if (!link) return -ENOMEM; diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 913ede2f57f9..00eed5b4d1b1 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -34,8 +34,9 @@ #define CONNCOUNT_SLOTS 256U -#define CONNCOUNT_GC_MAX_NODES 8 -#define MAX_KEYLEN 5 +#define CONNCOUNT_GC_MAX_NODES 8 +#define CONNCOUNT_GC_MAX_COLLECT 64 +#define MAX_KEYLEN 5 /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { @@ -122,32 +123,94 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } +static bool get_ct_or_tuple_from_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conn **ct, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone **zone, + bool *refcounted) +{ + const struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *found_ct; + + found_ct = nf_ct_get(skb, &ctinfo); + if (found_ct && !nf_ct_is_template(found_ct)) { + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + *zone = nf_ct_zone(found_ct); + *ct = found_ct; + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) + return false; + + if (found_ct) + *zone = nf_ct_zone(found_ct); + + h = nf_conntrack_find_get(net, *zone, tuple); + if (!h) + return true; + + found_ct = nf_ct_tuplehash_to_ctrack(h); + *refcounted = true; + *ct = found_ct; + + return true; +} + static int __nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct = NULL; struct nf_conn *found_ct; unsigned int collect = 0; + bool refcounted = false; + int err = 0; + + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) + return -ENOENT; + + if (ct && nf_ct_is_confirmed(ct)) { + /* local connections are confirmed in postrouting so confirmation + * might have happened before hitting connlimit + */ + if (skb->skb_iif != LOOPBACK_IFINDEX) { + err = -EEXIST; + goto out_put; + } + + /* this is likely a local connection, skip optimization to avoid + * adding duplicates from a 'packet train' + */ + goto check_connections; + } - if ((u32)jiffies == list->last_gc) + if ((u32)jiffies == list->last_gc && + (list->count - list->last_gc_count) < CONNCOUNT_GC_MAX_COLLECT) goto add_new_node; +check_connections: /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { - if (collect > CONNCOUNT_GC_MAX_NODES) + if (collect > CONNCOUNT_GC_MAX_COLLECT) break; found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - return 0; /* already exists */ + goto out_put; /* already exists */ } else { collect++; } @@ -156,7 +219,7 @@ static int __nf_conncount_add(struct net *net, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -165,7 +228,7 @@ static int __nf_conncount_add(struct net *net, * Attempt to avoid a re-add in this case. */ nf_ct_put(found_ct); - return 0; + goto out_put; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -179,53 +242,63 @@ static int __nf_conncount_add(struct net *net, nf_ct_put(found_ct); } + list->last_gc = (u32)jiffies; + list->last_gc_count = list->count; add_new_node: - if (WARN_ON_ONCE(list->count > INT_MAX)) - return -EOVERFLOW; + if (WARN_ON_ONCE(list->count > INT_MAX)) { + err = -EOVERFLOW; + goto out_put; + } conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return -ENOMEM; + if (conn == NULL) { + err = -ENOMEM; + goto out_put; + } - conn->tuple = *tuple; + conn->tuple = tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; - list->last_gc = (u32)jiffies; - return 0; + +out_put: + if (refcounted) + nf_ct_put(ct); + return err; } -int nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +int nf_conncount_add_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); - ret = __nf_conncount_add(net, list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, list); spin_unlock_bh(&list->list_lock); return ret; } -EXPORT_SYMBOL_GPL(nf_conncount_add); +EXPORT_SYMBOL_GPL(nf_conncount_add_skb); void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; + list->last_gc_count = 0; list->last_gc = (u32)jiffies; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. */ -bool nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +static bool __nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -237,10 +310,6 @@ bool nf_conncount_gc_list(struct net *net, if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; - /* don't bother if other cpu is already doing GC */ - if (!spin_trylock(&list->list_lock)) - return false; - list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn); if (IS_ERR(found)) { @@ -262,14 +331,29 @@ bool nf_conncount_gc_list(struct net *net, } nf_ct_put(found_ct); - if (collected > CONNCOUNT_GC_MAX_NODES) + if (collected > CONNCOUNT_GC_MAX_COLLECT) break; } if (!list->count) ret = true; list->last_gc = (u32)jiffies; - spin_unlock(&list->list_lock); + list->last_gc_count = list->count; + + return ret; +} + +bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) +{ + bool ret; + + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock_bh(&list->list_lock)) + return false; + + ret = __nf_conncount_gc_list(net, list); + spin_unlock_bh(&list->list_lock); return ret; } @@ -309,19 +393,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree) static unsigned int insert_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + bool do_gc = true, refcounted = false; + unsigned int count = 0, gc_count = 0; struct rb_node **rbnode, *parent; - struct nf_conncount_rb *rbconn; + struct nf_conntrack_tuple tuple; struct nf_conncount_tuple *conn; - unsigned int count = 0, gc_count = 0; - bool do_gc = true; + struct nf_conncount_rb *rbconn; + struct nf_conn *ct = NULL; spin_lock_bh(&nf_conncount_locks[hash]); restart: @@ -340,8 +427,8 @@ restart: } else { int ret; - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); - if (ret) + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); + if (ret && ret != -EEXIST) count = 0; /* hotdrop */ else count = rbconn->list.count; @@ -364,41 +451,45 @@ restart: goto restart; } - /* expected case: match, insert new node */ - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - goto out_unlock; + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { + /* expected case: match, insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + goto out_unlock; - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(conncount_rb_cachep, rbconn); - goto out_unlock; - } + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + goto out_unlock; + } - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - memcpy(rbconn->key, key, sizeof(u32) * data->keylen); + conn->tuple = tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; - rbconn->list.count = count; + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); + count = 1; + rbconn->list.count = count; - rb_link_node_rcu(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); + rb_link_node_rcu(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + } out_unlock: + if (refcounted) + nf_ct_put(ct); spin_unlock_bh(&nf_conncount_locks[hash]); return count; } static unsigned int count_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct rb_root *root; struct rb_node *parent; @@ -422,7 +513,7 @@ count_tree(struct net *net, } else { int ret; - if (!tuple) { + if (!skb) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } @@ -437,19 +528,23 @@ count_tree(struct net *net, } /* same source network -> be counted! */ - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); spin_unlock_bh(&rbconn->list.list_lock); - if (ret) + if (ret && ret != -EEXIST) { return 0; /* hotdrop */ - else + } else { + /* -EEXIST means add was skipped, update the list */ + if (ret == -EEXIST) + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } } } - if (!tuple) + if (!skb) return 0; - return insert_tree(net, data, root, hash, key, tuple, zone); + return insert_tree(net, skb, l3num, data, root, hash, key); } static void tree_gc_worker(struct work_struct *work) @@ -511,18 +606,19 @@ next: } /* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. - * Call with RCU read lock. + * If 'skb' is not null, insert the corresponding tuple into the accounting + * data structure. Call with RCU read lock. */ -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key) { - return count_tree(net, data, key, tuple, zone); + return count_tree(net, skb, l3num, data, key); + } -EXPORT_SYMBOL_GPL(nf_conncount_count); +EXPORT_SYMBOL_GPL(nf_conncount_count_skb); struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { @@ -536,7 +632,7 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen net_get_random_once(&conncount_rnd, sizeof(conncount_rnd)); - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kmalloc_obj(*data); if (!data) return ERR_PTR(-ENOMEM); diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 7be4c35e4795..c0132559f6af 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -37,13 +37,13 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); module_param(ts_algo, charp, 0400); MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); -unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp) - __read_mostly; +unsigned int (__rcu *nf_nat_amanda_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) + __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_amanda_hook); enum amanda_strings { diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 4a136fc3a9c0..40c261cd0af3 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/btf_ids.h> #include <linux/net_namespace.h> +#include <net/sock.h> #include <net/xdp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> @@ -114,8 +115,6 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, struct nf_conn *ct; int err; - if (!opts || !bpf_tuple) - return ERR_PTR(-EINVAL); if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) return ERR_PTR(-EINVAL); if (opts_len == NF_BPF_CT_OPTS_SZ) { @@ -299,8 +298,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz, opts, opts__sz, 10); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } @@ -334,8 +332,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, caller_net = dev_net(ctx->rxq->dev); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } return nfct; @@ -367,8 +364,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } @@ -402,8 +398,7 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } return nfct; @@ -516,10 +511,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_timeout) +BTF_ID_FLAGS(func, bpf_ct_change_timeout) +BTF_ID_FLAGS(func, bpf_ct_set_status) +BTF_ID_FLAGS(func, bpf_ct_change_status) BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 344f88295976..27ce5fda8993 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -536,7 +536,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, if (tmpl != p) tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; } else { - tmpl = kzalloc(sizeof(*tmpl), flags); + tmpl = kzalloc_obj(*tmpl, flags); if (!tmpl) return NULL; } @@ -1668,7 +1668,7 @@ __nf_conntrack_alloc(struct net *net, /* We don't want any race condition at early drop stage */ ct_count = atomic_inc_return(&cnet->count); - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { + if (unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; @@ -2487,6 +2487,7 @@ void nf_conntrack_cleanup_net(struct net *net) void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) { struct nf_ct_iter_data iter_data = {}; + unsigned long start = jiffies; struct net *net; int busy; @@ -2507,6 +2508,8 @@ i_see_dead_people: busy = 1; } if (busy) { + DEBUG_NET_WARN_ONCE(time_after(jiffies, start + 60 * HZ), + "conntrack cleanup blocked for 60s"); schedule(); goto i_see_dead_people; } @@ -2532,7 +2535,7 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; - hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); + hash = kvzalloc_objs(struct hlist_nulls_head, nr_slots); if (hash && nulls) for (i = 0; i < nr_slots; i++) diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 617f744a2e3a..5e00f9123c38 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -43,13 +43,13 @@ module_param_array(ports, ushort, &ports_c, 0400); static bool loose; module_param(loose, bool, 0600); -unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - enum nf_ct_ftp_type type, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp); +unsigned int (__rcu *nf_nat_ftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp); EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 540d97715bd2..7b1497ed97d2 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -331,6 +331,8 @@ static int decode_int(struct bitstr *bs, const struct field_t *f, if (nf_h323_error_boundary(bs, 0, 2)) return H323_ERROR_BOUND; len = get_bits(bs, 2) + 1; + if (nf_h323_error_boundary(bs, len, 0)) + return H323_ERROR_BOUND; BYTE_ALIGN(bs); if (base && (f->attr & DECODE)) { /* timeToLive */ unsigned int v = get_uint(bs, len) + f->lb; @@ -796,7 +798,7 @@ static int decode_choice(struct bitstr *bs, const struct field_t *f, if (ext || (son->attr & OPEN)) { BYTE_ALIGN(bs); - if (nf_h323_error_boundary(bs, len, 0)) + if (nf_h323_error_boundary(bs, 2, 0)) return H323_ERROR_BOUND; len = get_len(bs); if (nf_h323_error_boundary(bs, len, 0)) @@ -922,6 +924,8 @@ int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) break; p++; len--; + if (len <= 0) + break; return DecodeH323_UserInformation(buf, p, len, &q931->UUIE); } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 14f73872f647..a2a0e22ccee1 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -23,6 +23,7 @@ #include <linux/skbuff.h> #include <net/route.h> #include <net/ip6_route.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> @@ -1186,13 +1187,13 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, { struct net *net = nf_ct_net(ct); struct nf_conntrack_expect *exp; - struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple tuple = { + .src.l3num = nf_ct_l3num(ct), + .dst.protonum = IPPROTO_TCP, + .dst.u.tcp.port = port, + }; - memset(&tuple.src.u3, 0, sizeof(tuple.src.u3)); - tuple.src.u.tcp.port = 0; memcpy(&tuple.dst.u3, addr, sizeof(tuple.dst.u3)); - tuple.dst.u.tcp.port = port; - tuple.dst.protonum = IPPROTO_TCP; exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); if (exp && exp->master == ct) diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 5703846bea3b..b8e6d724acd1 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -30,12 +30,13 @@ static unsigned int dcc_timeout __read_mostly = 300; static char *irc_buffer; static DEFINE_SPINLOCK(irc_buffer_lock); -unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conntrack_expect *exp) __read_mostly; +unsigned int (__rcu *nf_nat_irc_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) + __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_irc_hook); #define HELPER_NAME "irc" diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3a04665adf99..c156574e1273 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -32,6 +32,7 @@ #include <linux/siphash.h> #include <linux/netfilter.h> +#include <net/ipv6.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> @@ -998,7 +999,7 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) return ERR_PTR(-EOPNOTSUPP); #endif - filter = kzalloc(sizeof(*filter), GFP_KERNEL); + filter = kzalloc_obj(*filter); if (filter == NULL) return ERR_PTR(-ENOMEM); @@ -3211,7 +3212,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nf_conn *ct = cb->data; - struct nf_conn_help *help = nfct_help(ct); + struct nf_conn_help *help; u_int8_t l3proto = nfmsg->nfgen_family; unsigned long last_id = cb->args[1]; struct nf_conntrack_expect *exp; @@ -3219,6 +3220,10 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) if (cb->args[0]) return 0; + help = nfct_help(ct); + if (!help) + return 0; + rcu_read_lock(); restart: @@ -3248,6 +3253,24 @@ out: return skb->len; } +static int ctnetlink_dump_exp_ct_start(struct netlink_callback *cb) +{ + struct nf_conn *ct = cb->data; + + if (!refcount_inc_not_zero(&ct->ct_general.use)) + return -ENOENT; + return 0; +} + +static int ctnetlink_dump_exp_ct_done(struct netlink_callback *cb) +{ + struct nf_conn *ct = cb->data; + + if (ct) + nf_ct_put(ct); + return 0; +} + static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -3263,6 +3286,8 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, + .start = ctnetlink_dump_exp_ct_start, + .done = ctnetlink_dump_exp_ct_done, }; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, @@ -3464,7 +3489,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, #if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { - [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT_DIR] = NLA_POLICY_MAX(NLA_BE32, IP_CT_DIR_REPLY), [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; #endif diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c index 068e9489e1c2..a6988eeb1579 100644 --- a/net/netfilter/nf_conntrack_ovs.c +++ b/net/netfilter/nf_conntrack_ovs.c @@ -121,7 +121,7 @@ int nf_ct_skb_network_trim(struct sk_buff *skb, int family) len = skb_ip_totlen(skb); break; case NFPROTO_IPV6: - len = ntohs(ipv6_hdr(skb)->payload_len); + len = skb_ipv6_payload_len(skb); if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) { int err = nf_ip6_check_hbh_len(skb, &len); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e831637bc8ca..cb260eb3d012 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -67,6 +67,7 @@ void nf_conntrack_generic_init_net(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { .l4proto = 255, + .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index af369e686fc5..94c19bc4edc5 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -33,12 +33,14 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <net/dst.h> +#include <net/gre.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> +#include <net/pptp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> @@ -106,7 +108,7 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, return -EEXIST; } - km = kmalloc(sizeof(*km), GFP_ATOMIC); + km = kmalloc_obj(*km, GFP_ATOMIC); if (!km) return -ENOMEM; memcpy(&km->tuple, t, sizeof(*t)); diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index b38b7164acd5..32148a3a8509 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -365,6 +365,7 @@ void nf_conntrack_icmp_init_net(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l4proto = IPPROTO_ICMP, + .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 327b8059025d..e508b3aa370a 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -343,6 +343,7 @@ void nf_conntrack_icmpv6_init_net(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, + .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 7c6f7c9f7332..645d2c43ebf7 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -582,7 +582,8 @@ nla_put_failure: } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { - [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_SCTP_STATE] = NLA_POLICY_MAX(NLA_U8, + SCTP_CONNTRACK_HEARTBEAT_SENT), [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index ca748f8dbff1..4ab5ef71d96d 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1534,11 +1534,12 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, { struct tcphdr *th, _tcph; unsigned int dataoff, datalen; - unsigned int matchoff, matchlen, clen; + unsigned int matchoff, matchlen; unsigned int msglen, origlen; const char *dptr, *end; s16 diff, tdiff = 0; int ret = NF_ACCEPT; + unsigned long clen; bool term; if (ctinfo != IP_CT_ESTABLISHED && @@ -1573,6 +1574,9 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, if (dptr + matchoff == end) break; + if (clen > datalen) + break; + term = false; for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) { if (end[0] == '\r' && end[1] == '\n' && diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c index daacf2023fa5..387dd6e58f88 100644 --- a/net/netfilter/nf_conntrack_snmp.c +++ b/net/netfilter/nf_conntrack_snmp.c @@ -25,10 +25,10 @@ static unsigned int timeout __read_mostly = 30; module_param(timeout, uint, 0400); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); -int (*nf_nat_snmp_hook)(struct sk_buff *skb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo); +int (__rcu *nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 708b79380f04..207b240b14e5 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -648,7 +648,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { @@ -929,7 +929,7 @@ static struct ctl_table nf_ct_netfilter_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, }; diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 80ee53f29f68..89e9914e5d03 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -32,9 +32,10 @@ static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "Port numbers of TFTP servers"); -unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_expect *exp) __read_mostly; +unsigned int (__rcu *nf_nat_tftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp) + __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_tftp_hook); static int tftp_help(struct sk_buff *skb, diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c index 4a5f5195f2d2..cbd5b97a6329 100644 --- a/net/netfilter/nf_flow_table_bpf.c +++ b/net/netfilter/nf_flow_table_bpf.c @@ -105,7 +105,7 @@ __diag_pop() __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_ft_kfunc_set) -BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL) BTF_KFUNCS_END(nf_ft_kfunc_set) static const struct btf_kfunc_id_set nf_flow_kfunc_set = { diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9441ac3d8c1a..2c4140e6f53c 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -16,6 +16,7 @@ static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); +static __read_mostly struct kmem_cache *flow_offload_cachep; static void flow_offload_fill_dir(struct flow_offload *flow, @@ -56,7 +57,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) if (unlikely(nf_ct_is_dying(ct))) return NULL; - flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC); if (!flow) return NULL; @@ -118,7 +119,10 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->in_vlan_ingress |= BIT(j); j++; } + + flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: @@ -127,11 +131,11 @@ static int flow_offload_fill_route(struct flow_offload *flow, memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; - flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: + flow_tuple->ifidx = route->tuple[dir].out.ifindex; flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; @@ -809,9 +813,13 @@ static int __init nf_flow_table_module_init(void) { int ret; + flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN); + if (!flow_offload_cachep) + return -ENOMEM; + ret = register_pernet_subsys(&nf_flow_table_net_ops); if (ret < 0) - return ret; + goto out_pernet; ret = nf_flow_table_offload_init(); if (ret) @@ -827,6 +835,8 @@ out_bpf: nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); +out_pernet: + kmem_cache_destroy(flow_offload_cachep); return ret; } @@ -834,6 +844,7 @@ static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); unregister_pernet_subsys(&nf_flow_table_net_ops); + kmem_cache_destroy(flow_offload_cachep); } module_init(nf_flow_table_module_init); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8cd4cf7ae211..fd56d663cb5b 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -8,10 +8,13 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/gre.h> #include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/ip6_tunnel.h> #include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack_acct.h> @@ -142,11 +145,28 @@ static bool ip_has_options(unsigned int thoff) return thoff != sizeof(struct iphdr); } -static void nf_flow_tuple_encap(struct sk_buff *skb, +struct nf_flowtable_ctx { + const struct net_device *in; + u32 offset; + u32 hdrsize; + struct { + /* Tunnel IP header size */ + u32 hdr_size; + /* IP tunnel protocol */ + u8 proto; + } tun; +}; + +static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb, struct flow_offload_tuple *tuple) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; + struct ipv6hdr *ip6h; + struct iphdr *iph; + u16 offset = 0; int i = 0; if (skb_vlan_tag_present(skb)) { @@ -159,20 +179,39 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, veth = (struct vlan_ethhdr *)skb_mac_header(skb); tuple->encap[i].id = ntohs(veth->h_vlan_TCI); tuple->encap[i].proto = skb->protocol; + inner_proto = veth->h_vlan_encapsulated_proto; + offset += VLAN_HLEN; break; case htons(ETH_P_PPP_SES): phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; + inner_proto = *((__be16 *)(phdr + 1)); + offset += PPPOE_SES_HLEN; break; } -} -struct nf_flowtable_ctx { - const struct net_device *in; - u32 offset; - u32 hdrsize; -}; + switch (inner_proto) { + case htons(ETH_P_IP): + iph = (struct iphdr *)(skb_network_header(skb) + offset); + if (ctx->tun.proto == IPPROTO_IPIP) { + tuple->tun.dst_v4.s_addr = iph->daddr; + tuple->tun.src_v4.s_addr = iph->saddr; + tuple->tun.l3_proto = IPPROTO_IPIP; + } + break; + case htons(ETH_P_IPV6): + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + if (ctx->tun.proto == IPPROTO_IPV6) { + tuple->tun.dst_v6 = ip6h->daddr; + tuple->tun.src_v6 = ip6h->saddr; + tuple->tun.l3_proto = IPPROTO_IPV6; + } + break; + default: + break; + } +} static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, struct flow_offload_tuple *tuple) @@ -241,7 +280,7 @@ static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, tuple->l3proto = AF_INET; tuple->l4proto = ipproto; tuple->iifidx = ctx->in->ifindex; - nf_flow_tuple_encap(skb, tuple); + nf_flow_tuple_encap(ctx, skb, tuple); return 0; } @@ -277,11 +316,84 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, - u32 *offset) +static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb) +{ + struct iphdr *iph; + u16 size; + + if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset)) + return false; + + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); + size = iph->ihl << 2; + + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) + return false; + + if (iph->ttl <= 1) + return false; + + if (iph->protocol == IPPROTO_IPIP) { + ctx->tun.proto = IPPROTO_IPIP; + ctx->tun.hdr_size = size; + ctx->offset += size; + } + + return true; +} + +static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb) { +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *ip6h, _ip6h; + __be16 frag_off; + u8 nexthdr; + int hdrlen; + + ip6h = skb_header_pointer(skb, ctx->offset, sizeof(*ip6h), &_ip6h); + if (!ip6h) + return false; + + if (ip6h->hop_limit <= 1) + return false; + + nexthdr = ip6h->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr, + &frag_off); + if (hdrlen < 0) + return false; + + if (nexthdr == IPPROTO_IPV6) { + ctx->tun.hdr_size = hdrlen; + ctx->tun.proto = IPPROTO_IPV6; + } + ctx->offset += ctx->tun.hdr_size; + + return true; +#else + return false; +#endif /* IS_ENABLED(CONFIG_IPV6) */ +} + +static void nf_flow_ip_tunnel_pop(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb) +{ + if (ctx->tun.proto != IPPROTO_IPIP && + ctx->tun.proto != IPPROTO_IPV6) + return; + + skb_pull(skb, ctx->tun.hdr_size); + skb_reset_network_header(skb); +} + +static bool nf_flow_skb_encap_protocol(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb, __be16 proto) +{ + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; - __be16 inner_proto; + bool ret = false; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -290,23 +402,36 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { - *offset += VLAN_HLEN; - return true; + ctx->offset += VLAN_HLEN; + inner_proto = proto; + ret = true; } break; case htons(ETH_P_PPP_SES): if (nf_flow_pppoe_proto(skb, &inner_proto) && inner_proto == proto) { - *offset += PPPOE_SES_HLEN; - return true; + ctx->offset += PPPOE_SES_HLEN; + ret = true; } break; } - return false; + switch (inner_proto) { + case htons(ETH_P_IP): + ret = nf_flow_ip4_tunnel_proto(ctx, skb); + break; + case htons(ETH_P_IPV6): + ret = nf_flow_ip6_tunnel_proto(ctx, skb); + break; + default: + break; + } + + return ret; } -static void nf_flow_encap_pop(struct sk_buff *skb, +static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb, struct flow_offload_tuple_rhash *tuplehash) { struct vlan_hdr *vlan_hdr; @@ -331,21 +456,24 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } } + + if (skb->protocol == htons(ETH_P_IP) || + skb->protocol == htons(ETH_P_IPV6)) + nf_flow_ip_tunnel_pop(ctx, skb); } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; +}; + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - const struct flow_offload_tuple_rhash *tuplehash, - unsigned short type) + struct nf_flow_xmit *xmit) { - struct net_device *outdev; - - outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); - if (!outdev) - return NF_DROP; - - skb->dev = outdev; - dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, - tuplehash->tuple.out.h_source, skb->len); + skb->dev = xmit->outdev; + dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); dev_queue_xmit(skb); return NF_STOLEN; @@ -357,8 +485,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {}; - if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IP))) return NULL; if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) @@ -381,6 +508,9 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) + mtu -= sizeof(*iph); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; @@ -399,7 +529,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow_offload_refresh(flow_table, flow, false); - nf_flow_encap_pop(skb, tuplehash); + nf_flow_encap_pop(ctx, skb, tuplehash); thoff -= ctx->offset; iph = ip_hdr(skb); @@ -414,20 +544,231 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +{ + int data_len = skb->len + sizeof(__be16); + struct ppp_hdr { + struct pppoe_hdr hdr; + __be16 proto; + } *ph; + __be16 proto; + + if (skb_cow_head(skb, PPPOE_SES_HLEN)) + return -1; + + switch (skb->protocol) { + case htons(ETH_P_IP): + proto = htons(PPP_IP); + break; + case htons(ETH_P_IPV6): + proto = htons(PPP_IPV6); + break; + default: + return -1; + } + + __skb_push(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + + ph = (struct ppp_hdr *)(skb->data); + ph->hdr.ver = 1; + ph->hdr.type = 1; + ph->hdr.code = 0; + ph->hdr.sid = htons(id); + ph->hdr.length = htons(data_len); + ph->proto = proto; + skb->protocol = htons(ETH_P_PPP_SES); + + return 0; +} + +static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + struct rtable *rt = dst_rtable(tuple->dst_cache); + u8 tos = iph->tos, ttl = iph->ttl; + __be16 frag_off = iph->frag_off; + u32 headroom = sizeof(*iph); + int err; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); + if (err) + return err; + + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + err = skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + skb_clear_hash_if_not_l4(skb); + + /* Push down and install the IP header. */ + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off; + iph->protocol = tuple->tun.l3_proto; + iph->tos = tos; + iph->daddr = tuple->tun.src_v4.s_addr; + iph->saddr = tuple->tun.dst_v4.s_addr; + iph->ttl = ttl; + iph->tot_len = htons(skb->len); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); + ip_send_check(iph); + + *ip_daddr = tuple->tun.src_v4.s_addr; + + return 0; +} + +static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); + + return 0; +} + +struct ipv6_tel_txoption { + struct ipv6_txoptions ops; + __u8 dst_opt[8]; +}; + +static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + struct in6_addr **ip6_daddr, + int encap_limit) +{ + struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb); + u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6; + struct rtable *rt = dst_rtable(tuple->dst_cache); + __u8 dsfield = ipv6_get_dsfield(ip6h); + struct flowi6 fl6 = { + .daddr = tuple->tun.src_v6, + .saddr = tuple->tun.dst_v6, + .flowi6_proto = proto, + }; + int err, mtu; + u32 headroom; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6); + if (err) + return err; + + skb_set_inner_ipproto(skb, proto); + headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) + + rt->dst.header_len; + if (encap_limit) + headroom += 8; + err = skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + mtu = dst_mtu(&rt->dst) - sizeof(*ip6h); + if (encap_limit) + mtu -= 8; + mtu = max(mtu, IPV6_MIN_MTU); + skb_dst_update_pmtu_no_confirm(skb, mtu); + + if (encap_limit > 0) { + struct ipv6_tel_txoption opt = { + .dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT, + .dst_opt[3] = 1, + .dst_opt[4] = encap_limit, + .dst_opt[5] = IPV6_TLV_PADN, + .dst_opt[6] = 1, + }; + struct ipv6_opt_hdr *hopt; + + opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt; + opt.ops.opt_nflen = 8; + + hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt)); + memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt)); + hopt->nexthdr = IPPROTO_IPV6; + proto = NEXTHDR_DEST; + } + + skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, dsfield, + ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6)); + ip6h->hop_limit = hop_limit; + ip6h->nexthdr = proto; + ip6h->daddr = tuple->tun.src_v6; + ip6h->saddr = tuple->tun.dst_v6; + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h)); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + + *ip6_daddr = &tuple->tun.src_v6; + + return 0; +} + +static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + struct in6_addr **ip6_daddr, + int encap_limit) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr, + encap_limit); + + return 0; +} + +static int nf_flow_encap_push(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + int i; + + for (i = 0; i < tuple->encap_num; i++) { + switch (tuple->encap[i].proto) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + skb_reset_mac_header(skb); + if (skb_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id) < 0) + return -1; + break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + return -1; + break; + } + } + + return 0; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; + struct nf_flow_xmit xmit = {}; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rtable *rt; - __be32 nexthop; + __be32 ip_daddr; int ret; tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); @@ -450,29 +791,46 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip_daddr = other_tuple->src_v4.s_addr; + + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) + return NF_DROP; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); @@ -641,7 +999,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, tuple->l3proto = AF_INET6; tuple->l4proto = nexthdr; tuple->iifidx = ctx->in->ifindex; - nf_flow_tuple_encap(skb, tuple); + nf_flow_tuple_encap(ctx, skb, tuple); return 0; } @@ -649,7 +1007,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *tuplehash, - struct sk_buff *skb) + struct sk_buff *skb, int encap_limit) { enum flow_offload_tuple_dir dir; struct flow_offload *flow; @@ -660,6 +1018,12 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) { + mtu -= sizeof(*ip6h); + if (encap_limit > 0) + mtu -= 8; /* encap limit option */ + } + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; @@ -678,7 +1042,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow_offload_refresh(flow_table, flow, false); - nf_flow_encap_pop(skb, tuplehash); + nf_flow_encap_pop(ctx, skb, tuplehash); ip6h = ipv6_hdr(skb); nf_flow_nat_ipv6(flow, skb, dir, ip6h); @@ -699,8 +1063,7 @@ nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {}; - if (skb->protocol != htons(ETH_P_IPV6) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6))) return NULL; if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0) @@ -713,15 +1076,18 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + int encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT; struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; - const struct in6_addr *nexthop; + struct nf_flow_xmit xmit = {}; + struct in6_addr *ip6_daddr; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rt6_info *rt; int ret; @@ -729,7 +1095,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (tuplehash == NULL) return NF_ACCEPT; - ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); + ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb, + encap_limit); if (ret < 0) return NF_DROP; else if (ret == 0) @@ -745,28 +1112,46 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip6_daddr = &other_tuple->src_v6; + + if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple, + &ip6_daddr, encap_limit) < 0) + return NF_DROP; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e06bc36f49fe..9b677e116487 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -6,6 +6,7 @@ #include <linux/netdevice.h> #include <linux/tc_act/tc_csum.h> #include <net/flow_offload.h> +#include <net/ip_tunnels.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> @@ -555,7 +556,7 @@ static void flow_offload_redirect(struct net *net, switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: this_tuple = &flow->tuplehash[dir].tuple; - ifindex = this_tuple->out.hw_ifidx; + ifindex = this_tuple->out.ifidx; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; @@ -740,7 +741,7 @@ nf_flow_offload_rule_alloc(struct net *net, struct nf_flow_rule *flow_rule; int err = -ENOMEM; - flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); + flow_rule = kzalloc_obj(*flow_rule); if (!flow_rule) goto err_flow; @@ -1021,7 +1022,7 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags)) return NULL; - offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + offload = kmalloc_obj(struct flow_offload_work, GFP_ATOMIC); if (!offload) { clear_bit(NF_FLOW_HW_PENDING, &flow->flags); return NULL; diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c new file mode 100644 index 000000000000..6bb9579dcc2a --- /dev/null +++ b/net/netfilter/nf_flow_table_path.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/etherdevice.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_common.h> +#include <linux/netfilter/nf_tables.h> +#include <net/ip.h> +#include <net/inet_dscp.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_flow_table.h> + +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, u8 *ha, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + struct neighbour *n; + u8 nud_state; + + if (!nft_is_valid_ether_device(dev)) + goto out; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if (!(nud_state & NUD_VALID)) + return -1; + +out: + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; + const struct net_device *outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; + struct flow_offload_tunnel tun; + u8 num_tuns; + u8 ingress_vlans; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; +}; + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info, + unsigned char *ha, struct nf_flowtable *flowtable) +{ + const struct net_device_path *path; + int i; + + memcpy(info->h_dest, ha, ETH_ALEN); + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + case DEV_PATH_DSA: + case DEV_PATH_VLAN: + case DEV_PATH_PPPOE: + case DEV_PATH_TUN: + info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + if (path->type == DEV_PATH_DSA) { + i = stack->num_paths; + break; + } + + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ + if (path->type == DEV_PATH_TUN) { + if (info->num_tuns) { + info->indev = NULL; + break; + } + info->tun.src_v6 = path->tun.src_v6; + info->tun.dst_v6 = path->tun.dst_v6; + info->tun.l3_proto = path->tun.l3_proto; + info->num_tuns++; + } else { + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = + path->encap.id; + info->encap[info->num_encaps].proto = + path->encap.proto; + info->num_encaps++; + } + if (path->type == DEV_PATH_PPPOE) + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + break; + case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; + case DEV_PATH_BR_VLAN_TAG: + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = path->bridge.vlan_id; + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; + info->num_encaps++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + if (WARN_ON_ONCE(info->num_encaps-- == 0)) { + info->indev = NULL; + break; + } + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + default: + info->indev = NULL; + break; + } + } + info->outdev = info->indev; + + if (nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; +} + +static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (!nft_hook_find_ops_rcu(hook, dev)) + continue; + + found = true; + break; + } + + return found; +} + +static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, + struct flow_offload_tunnel *tun, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *cur_dst = route->tuple[dir].dst; + struct dst_entry *tun_dst = NULL; + struct flowi fl = {}; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = tun->dst_v4.s_addr; + fl.u.ip4.saddr = tun->src_v4.s_addr; + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = tun->dst_v6; + fl.u.ip6.saddr = tun->src_v6; + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); + if (!tun_dst) + return -ENOENT; + + route->tuple[dir].dst = tun_dst; + dst_release(cur_dst); + + return 0; +} + +static void nft_dev_forward_path(const struct nft_pktinfo *pkt, + struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; + int i; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha, &ft->data); + + if (info.outdev) + route->tuple[dir].out.ifindex = info.outdev->ifindex; + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + + if (info.num_tuns && + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; + route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; + route->tuple[!dir].in.num_tuns = info.num_tuns; + } + + route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].xmit_type = info.xmit_type; + } +} + +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + if (!dst_hold_safe(this_dst)) + return -ENOENT; + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) { + dst_release(this_dst); + return -ENOENT; + } + + nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); + + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, dir, ft); + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, !dir, ft); + + return 0; +} +EXPORT_SYMBOL_GPL(nft_flow_route); diff --git a/net/netfilter/nf_flow_table_xdp.c b/net/netfilter/nf_flow_table_xdp.c index e1252d042699..86ac65e9b579 100644 --- a/net/netfilter/nf_flow_table_xdp.c +++ b/net/netfilter/nf_flow_table_xdp.c @@ -54,7 +54,7 @@ static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft, unsigned long key = (unsigned long)dev; struct flow_offload_xdp_ft *ft_elem; - ft_elem = kzalloc(sizeof(*ft_elem), GFP_KERNEL_ACCOUNT); + ft_elem = kzalloc_obj(*ft_elem, GFP_KERNEL_ACCOUNT); if (!ft_elem) return -ENOMEM; @@ -70,7 +70,7 @@ static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft, } if (!elem) { - elem = kzalloc(sizeof(*elem), GFP_KERNEL_ACCOUNT); + elem = kzalloc_obj(*elem, GFP_KERNEL_ACCOUNT); if (!elem) goto err_unlock; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 74cef8bf554c..f4d80654dfe6 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -89,7 +89,7 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) if (pf == NFPROTO_UNSPEC) { for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { if (rcu_access_pointer(loggers[i][logger->type])) { - ret = -EEXIST; + ret = -EBUSY; goto unlock; } } @@ -97,7 +97,7 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) rcu_assign_pointer(loggers[i][logger->type], logger); } else { if (rcu_access_pointer(loggers[pf][logger->type])) { - ret = -EEXIST; + ret = -EBUSY; goto unlock; } rcu_assign_pointer(loggers[pf][logger->type], logger); @@ -317,7 +317,7 @@ EXPORT_SYMBOL_GPL(nf_log_buf_add); struct nf_log_buf *nf_log_buf_open(void) { - struct nf_log_buf *m = kmalloc(sizeof(*m), GFP_ATOMIC); + struct nf_log_buf *m = kmalloc_obj(*m, GFP_ATOMIC); if (unlikely(!m)) { local_bh_disable(); diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 86d5fc5d28e3..41503847d9d7 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -561,7 +561,7 @@ dump_ipv6_packet(struct net *net, struct nf_log_buf *m, /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", - ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + ipv6_payload_len(skb, ih) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, (ntohl(*(__be32 *)ih) & 0x000fffff)); diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 481be15609b1..f9dd85ccea01 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_nat_kfunc_set) -BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info) BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 78a61dac4ade..3b5434e4ec9c 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -294,25 +294,13 @@ nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, ct = nf_ct_tuplehash_to_ctrack(thash); - /* NB: IP_CT_DIR_ORIGINAL should be impossible because - * nf_nat_used_tuple() handles origin collisions. - * - * Handle remote chance other CPU confirmed its ct right after. - */ - if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) - goto out; - /* clashing connection subject to NAT? Retry with new tuple. */ if (READ_ONCE(ct->status) & uses_nat) goto out; if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) && - nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { + &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple)) taken = false; - goto out; - } out: nf_ct_put(ct); return taken; @@ -1225,7 +1213,7 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, } for (i = 0; i < ops_count; i++) { - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (priv) { nat_ops[i].priv = priv; continue; diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 1a506b0c6511..a5a23c03fda9 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -115,7 +115,7 @@ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, if (!try_module_get(THIS_MODULE)) goto err_module; - w = kzalloc(sizeof(*w), gfp_flags); + w = kzalloc_obj(*w, gfp_flags); if (w) { /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ atomic_inc(&masq_worker_count); diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c index 0f9a559f6207..31474e8c034a 100644 --- a/net/netfilter/nf_nat_ovs.c +++ b/net/netfilter/nf_nat_ovs.c @@ -2,6 +2,9 @@ /* Support nat functions for openvswitch and used by OVS and TC conntrack. */ #include <net/netfilter/nf_nat.h> +#include <net/ipv6.h> +#include <linux/ip.h> +#include <linux/if_vlan.h> /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index b14a434b9561..97c0f841fc96 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -25,6 +25,7 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> +#include <net/pptp.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 3fa3f5dfb264..57f57e2fc80a 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -10,6 +10,7 @@ #include <net/netns/generic.h> #include <linux/proc_fs.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/nf_synproxy.h> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index eed434e0a970..3922cff1bb3d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -14,6 +14,7 @@ #include <linux/rhashtable.h> #include <linux/audit.h> #include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> @@ -123,6 +124,29 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s table->validate_state = new_validate_state; } + +static bool nft_chain_vstate_valid(const struct nft_ctx *ctx, + const struct nft_chain *chain) +{ + const struct nft_base_chain *base_chain; + enum nft_chain_types type; + u8 hooknum; + + if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) + return false; + + base_chain = nft_base_chain(ctx->chain); + hooknum = base_chain->ops.hooknum; + type = base_chain->type->type; + + /* chain is already validated for this call depth */ + if (chain->vstate.depth >= ctx->level && + chain->vstate.hook_mask[type] & BIT(hooknum)) + return true; + + return false; +} + static void nf_tables_trans_destroy_work(struct work_struct *w); static void nft_trans_gc_work(struct work_struct *work); @@ -805,10 +829,14 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); - break; } } +/* Use NFT_ITER_UPDATE iterator even if this may be called from the preparation + * phase, the set clone might already exist from a previous command, or it might + * be a set that is going away and does not require a clone. The netns and + * netlink release paths also need to work on the live set. + */ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { @@ -1080,7 +1108,7 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, } } - req = kmalloc(sizeof(*req), GFP_KERNEL); + req = kmalloc_obj(*req); if (!req) return -ENOMEM; @@ -1600,7 +1628,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, } err = -ENOMEM; - table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT); + table = kzalloc_obj(*table, GFP_KERNEL_ACCOUNT); if (table == NULL) goto err_kzalloc; @@ -2324,7 +2352,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, struct nft_hook *hook; int err; - hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); + hook = kzalloc_obj(struct nft_hook, GFP_KERNEL_ACCOUNT); if (!hook) return ERR_PTR(-ENOMEM); @@ -2345,7 +2373,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) continue; - ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + ops = kzalloc_obj(struct nf_hook_ops, GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err_hook_free; @@ -2692,7 +2720,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, if (err < 0) return err; - basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT); + basechain = kzalloc_obj(*basechain, GFP_KERNEL_ACCOUNT); if (basechain == NULL) { nft_chain_release_hook(&hook); return -ENOMEM; @@ -2724,7 +2752,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, if (flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; - chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT); + chain = kzalloc_obj(*chain, GFP_KERNEL_ACCOUNT); if (chain == NULL) return -ENOMEM; @@ -2799,6 +2827,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, err_register_hook: nft_chain_del(chain); + synchronize_rcu(); err_chain_add: nft_trans_destroy(trans); err_trans: @@ -3877,23 +3906,6 @@ done: return skb->len; } -static int nf_tables_dumpreset_rules(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); - int ret; - - /* Mutex is held is to prevent that two concurrent dump-and-reset calls - * do not underrun counters and quotas. The commit_mutex is used for - * the lack a better lock, this is not transaction path. - */ - mutex_lock(&nft_net->commit_mutex); - ret = nf_tables_dump_rules(skb, cb); - mutex_unlock(&nft_net->commit_mutex); - - return ret; -} - static int nf_tables_dump_rules_start(struct netlink_callback *cb) { struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; @@ -3913,16 +3925,10 @@ static int nf_tables_dump_rules_start(struct netlink_callback *cb) return -ENOMEM; } } - return 0; -} + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + ctx->reset = true; -static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb) -{ - struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; - - ctx->reset = true; - - return nf_tables_dump_rules_start(cb); + return 0; } static int nf_tables_dump_rules_done(struct netlink_callback *cb) @@ -3988,6 +3994,8 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; + bool reset = false; + char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -4001,47 +4009,16 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - skb2 = nf_tables_getrule_single(portid, info, nla, false); - if (IS_ERR(skb2)) - return PTR_ERR(skb2); - - return nfnetlink_unicast(skb2, net, portid); -} - -static int nf_tables_getrule_reset(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) -{ - struct nftables_pernet *nft_net = nft_pernet(info->net); - u32 portid = NETLINK_CB(skb).portid; - struct net *net = info->net; - struct sk_buff *skb2; - char *buf; - - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start= nf_tables_dumpreset_rules_start, - .dump = nf_tables_dumpreset_rules, - .done = nf_tables_dump_rules_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - - if (!try_module_get(THIS_MODULE)) - return -EINVAL; - rcu_read_unlock(); - mutex_lock(&nft_net->commit_mutex); - skb2 = nf_tables_getrule_single(portid, info, nla, true); - mutex_unlock(&nft_net->commit_mutex); - rcu_read_lock(); - module_put(THIS_MODULE); + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; + skb2 = nf_tables_getrule_single(portid, info, nla, reset); if (IS_ERR(skb2)) return PTR_ERR(skb2); + if (!reset) + return nfnetlink_unicast(skb2, net, portid); + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), @@ -4079,6 +4056,29 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r nf_tables_rule_destroy(ctx, rule); } +static void nft_chain_vstate_update(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + const struct nft_base_chain *base_chain; + enum nft_chain_types type; + u8 hooknum; + + /* ctx->chain must hold the calling base chain. */ + if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) { + memset(&chain->vstate, 0, sizeof(chain->vstate)); + return; + } + + base_chain = nft_base_chain(ctx->chain); + hooknum = base_chain->ops.hooknum; + type = base_chain->type->type; + + BUILD_BUG_ON(BIT(NF_INET_NUMHOOKS) > U8_MAX); + + chain->vstate.hook_mask[type] |= BIT(hooknum); + if (chain->vstate.depth < ctx->level) + chain->vstate.depth = ctx->level; +} + /** nft_chain_validate - loop detection and hook validation * * @ctx: context containing call depth and base chain @@ -4088,15 +4088,25 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r * and set lookups until either the jump limit is hit or all reachable * chains have been validated. */ -int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) +int nft_chain_validate(const struct nft_ctx *ctx, struct nft_chain *chain) { struct nft_expr *expr, *last; struct nft_rule *rule; int err; + BUILD_BUG_ON(NFT_JUMP_STACK_SIZE > 255); if (ctx->level == NFT_JUMP_STACK_SIZE) return -EMLINK; + if (ctx->level > 0) { + /* jumps to base chains are not allowed. */ + if (nft_is_base_chain(chain)) + return -ELOOP; + + if (nft_chain_vstate_valid(ctx, chain)) + return 0; + } + list_for_each_entry(rule, &chain->rules, list) { if (fatal_signal_pending(current)) return -EINTR; @@ -4115,8 +4125,11 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (err < 0) return err; } + + cond_resched(); } + nft_chain_vstate_update(ctx, chain); return 0; } EXPORT_SYMBOL_GPL(nft_chain_validate); @@ -4128,7 +4141,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) .net = net, .family = table->family, }; - int err; + int err = 0; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain)) @@ -4137,12 +4150,14 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) ctx.chain = chain; err = nft_chain_validate(&ctx, chain); if (err < 0) - return err; - - cond_resched(); + goto err; } - return 0; +err: + list_for_each_entry(chain, &table->chains, list) + memset(&chain->vstate, 0, sizeof(chain->vstate)); + + return err; } int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, @@ -4304,9 +4319,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { - expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS, - sizeof(struct nft_expr_info), - GFP_KERNEL); + expr_info = kvmalloc_objs(struct nft_expr_info, + NFT_RULE_MAXEXPRS); if (!expr_info) return -ENOMEM; @@ -4378,7 +4392,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, if (!nft_use_inc(&chain->use)) { err = -EMFILE; - goto err_release_rule; + goto err_destroy_flow; } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { @@ -4428,6 +4442,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, err_destroy_flow_rule: nft_use_dec_restore(&chain->use); +err_destroy_flow: if (flow) nft_flow_rule_destroy(flow); err_release_rule: @@ -5770,7 +5785,11 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { struct nft_set_binding *i; - struct nft_set_iter iter; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nf_tables_bind_check_setelem, + }; if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; @@ -5785,13 +5804,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, goto bind; } - iter.genmask = nft_genmask_next(ctx->net); - iter.type = NFT_ITER_UPDATE; - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_bind_check_setelem; - set->ops->walk(ctx, set, &iter); if (!iter.err) iter.err = nft_set_catchall_bind_check(ctx, set); @@ -5855,12 +5867,11 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) + if (nft_set_elem_active(ext, genmask)) continue; nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); - break; } } @@ -6195,7 +6206,17 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; - struct nft_set_dump_args args; + struct nft_set_dump_args args = { + .cb = cb, + .skb = skb, + .reset = dump_ctx->reset, + .iter = { + .genmask = nft_genmask_cur(net), + .type = NFT_ITER_READ, + .skip = cb->args[0], + .fn = nf_tables_dump_setelem, + }, + }; bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; @@ -6246,15 +6267,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; - args.cb = cb; - args.skb = skb; - args.reset = dump_ctx->reset; - args.iter.genmask = nft_genmask_cur(net); - args.iter.type = NFT_ITER_READ; - args.iter.skip = cb->args[0]; - args.iter.count = 0; - args.iter.err = 0; - args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) @@ -6263,6 +6275,10 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_end(skb, nest); nlmsg_end(skb, nlh); + if (dump_ctx->reset && args.iter.count > args.iter.skip) + audit_log_nft_set_reset(table, cb->seq, + args.iter.count - args.iter.skip); + rcu_read_unlock(); if (args.iter.err && args.iter.err != -EMSGSIZE) @@ -6278,26 +6294,6 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_dumpreset_set(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); - struct nft_set_dump_ctx *dump_ctx = cb->data; - int ret, skip = cb->args[0]; - - mutex_lock(&nft_net->commit_mutex); - - ret = nf_tables_dump_set(skb, cb); - - if (cb->args[0] > skip) - audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq, - cb->args[0] - skip); - - mutex_unlock(&nft_net->commit_mutex); - - return ret; -} - static int nf_tables_dump_set_start(struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; @@ -6541,8 +6537,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb, { struct netlink_ext_ack *extack = info->extack; struct nft_set_dump_ctx dump_ctx; + int rem, err = 0, nelems = 0; + struct net *net = info->net; struct nlattr *attr; - int rem, err = 0; + bool reset = false; + + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -6552,7 +6553,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, .module = THIS_MODULE, }; - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, reset); if (err) return err; @@ -6563,75 +6564,21 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, reset); if (err) return err; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false); - if (err < 0) { - NL_SET_BAD_ATTR(extack, attr); - break; - } - } - - return err; -} - -static int nf_tables_getsetelem_reset(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) -{ - struct nftables_pernet *nft_net = nft_pernet(info->net); - struct netlink_ext_ack *extack = info->extack; - struct nft_set_dump_ctx dump_ctx; - int rem, err = 0, nelems = 0; - struct nlattr *attr; - - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start = nf_tables_dump_set_start, - .dump = nf_tables_dumpreset_set, - .done = nf_tables_dump_set_done, - .module = THIS_MODULE, - }; - - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); - if (err) - return err; - - c.data = &dump_ctx; - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - - if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) - return -EINVAL; - - if (!try_module_get(THIS_MODULE)) - return -EINVAL; - rcu_read_unlock(); - mutex_lock(&nft_net->commit_mutex); - rcu_read_lock(); - - err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); - if (err) - goto out_unlock; - - nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true); + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, reset); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); - -out_unlock: - rcu_read_unlock(); - mutex_unlock(&nft_net->commit_mutex); - rcu_read_lock(); - module_put(THIS_MODULE); + if (reset) + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(net), + nelems); return err; } @@ -6797,8 +6744,8 @@ static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } } -static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, - struct nft_set_elem_expr *elem_expr) +void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_set_elem_expr *elem_expr) { struct nft_expr *expr; u32 size; @@ -6984,7 +6931,7 @@ static int nft_setelem_catchall_insert(const struct net *net, } } - catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT); + catchall = kmalloc_obj(*catchall, GFP_KERNEL_ACCOUNT); if (!catchall) return -ENOMEM; @@ -7225,6 +7172,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; + bool set_full = false; u64 expiration; u64 timeout; int err, i; @@ -7511,10 +7459,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto err_elem_free; + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + unsigned int max = nft_set_maxsize(set), nelems; + + nelems = atomic_inc_return(&set->nelems); + if (nelems > max) + set_full = true; + } + trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err_elem_free; + goto err_set_size; } ext->genmask = nft_genmask_cur(ctx->net); @@ -7566,7 +7522,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ue->priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans); - goto err_elem_free; + goto err_set_size; } } } @@ -7575,27 +7531,25 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, * and an existing one. */ err = -EEXIST; + } else if (err == -ECANCELED) { + /* ECANCELED reports an existing nul-element in + * interval sets. + */ + err = 0; } goto err_element_clash; } - if (!(flags & NFT_SET_ELEM_CATCHALL)) { - unsigned int max = nft_set_maxsize(set); - - if (!atomic_add_unless(&set->nelems, 1, max)) { - err = -ENFILE; - goto err_set_full; - } - } - nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans); - return 0; -err_set_full: - nft_setelem_remove(ctx->net, set, elem.priv); + return set_full ? -ENFILE : 0; + err_element_clash: kfree(trans); +err_set_size: + if (!(flags & NFT_SET_ELEM_CATCHALL)) + atomic_dec(&set->nelems); err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: @@ -7746,7 +7700,8 @@ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, continue; } - if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) + if (!te->set->ops->abort_skip_removal || + nft_setelem_is_catchall(te->set, te->elems[i].priv)) nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) @@ -7939,9 +7894,12 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { + /* The set backend might need to clone the set, do it now from the + * preparation phase, use NFT_ITER_UPDATE_CLONE iterator type. + */ struct nft_set_iter iter = { .genmask = genmask, - .type = NFT_ITER_UPDATE, + .type = NFT_ITER_UPDATE_CLONE, .fn = nft_setelem_flush, }; @@ -8112,7 +8070,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, struct nft_object *obj; int err = -ENOMEM; - tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL); + tb = kmalloc_objs(*tb, type->maxattr + 1); if (!tb) goto err1; @@ -8484,19 +8442,6 @@ cont: return skb->len; } -static int nf_tables_dumpreset_obj(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); - int ret; - - mutex_lock(&nft_net->commit_mutex); - ret = nf_tables_dump_obj(skb, cb); - mutex_unlock(&nft_net->commit_mutex); - - return ret; -} - static int nf_tables_dump_obj_start(struct netlink_callback *cb) { struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; @@ -8513,16 +8458,10 @@ static int nf_tables_dump_obj_start(struct netlink_callback *cb) if (nla[NFTA_OBJ_TYPE]) ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - return 0; -} - -static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb) -{ - struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; - - ctx->reset = true; + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + ctx->reset = true; - return nf_tables_dump_obj_start(cb); + return 0; } static int nf_tables_dump_obj_done(struct netlink_callback *cb) @@ -8585,41 +8524,15 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { u32 portid = NETLINK_CB(skb).portid; - struct sk_buff *skb2; - - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start = nf_tables_dump_obj_start, - .dump = nf_tables_dump_obj, - .done = nf_tables_dump_obj_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - - skb2 = nf_tables_getobj_single(portid, info, nla, false); - if (IS_ERR(skb2)) - return PTR_ERR(skb2); - - return nfnetlink_unicast(skb2, info->net, portid); -} - -static int nf_tables_getobj_reset(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) -{ - struct nftables_pernet *nft_net = nft_pernet(info->net); - u32 portid = NETLINK_CB(skb).portid; struct net *net = info->net; struct sk_buff *skb2; + bool reset = false; char *buf; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { - .start = nf_tables_dumpreset_obj_start, - .dump = nf_tables_dumpreset_obj, + .start = nf_tables_dump_obj_start, + .dump = nf_tables_dump_obj, .done = nf_tables_dump_obj_done, .module = THIS_MODULE, .data = (void *)nla, @@ -8628,18 +8541,16 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } - if (!try_module_get(THIS_MODULE)) - return -EINVAL; - rcu_read_unlock(); - mutex_lock(&nft_net->commit_mutex); - skb2 = nf_tables_getobj_single(portid, info, nla, true); - mutex_unlock(&nft_net->commit_mutex); - rcu_read_lock(); - module_put(THIS_MODULE); + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + reset = true; + skb2 = nf_tables_getobj_single(portid, info, nla, reset); if (IS_ERR(skb2)) return PTR_ERR(skb2); + if (!reset) + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), @@ -9228,7 +9139,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb, if (!nft_use_inc(&table->use)) return -EMFILE; - flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT); + flowtable = kzalloc_obj(*flowtable, GFP_KERNEL_ACCOUNT); if (!flowtable) { err = -ENOMEM; goto flowtable_alloc; @@ -9292,6 +9203,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb, return 0; err_flowtable_hooks: + synchronize_rcu(); nft_trans_destroy(trans); err_flowtable_trans: nft_hooks_destroy(&flowtable->hook_list); @@ -9548,7 +9460,7 @@ static int nf_tables_dump_flowtable_start(struct netlink_callback *cb) struct nft_flowtable_filter *filter = NULL; if (nla[NFTA_FLOWTABLE_TABLE]) { - filter = kzalloc(sizeof(*filter), GFP_ATOMIC); + filter = kzalloc_obj(*filter, GFP_ATOMIC); if (!filter) return -ENOMEM; @@ -9762,11 +9674,11 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev, break; case NETDEV_REGISTER: /* NOP if not matching or already registered */ - if (!match || (changename && ops)) + if (!match || ops) continue; - ops = kzalloc(sizeof(struct nf_hook_ops), - GFP_KERNEL_ACCOUNT); + ops = kzalloc_obj(struct nf_hook_ops, + GFP_KERNEL_ACCOUNT); if (!ops) return 1; @@ -9957,7 +9869,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_GETRULE_RESET] = { - .call = nf_tables_getrule_reset, + .call = nf_tables_getrule, .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, @@ -10011,7 +9923,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM_RESET] = { - .call = nf_tables_getsetelem_reset, + .call = nf_tables_getsetelem, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, @@ -10057,7 +9969,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call = nf_tables_getobj_reset, + .call = nf_tables_getobj, .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, @@ -10536,7 +10448,7 @@ struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, struct net *net = read_pnet(&set->net); struct nft_trans_gc *trans; - trans = kzalloc(sizeof(*trans), gfp); + trans = kzalloc_obj(*trans, gfp); if (!trans) return NULL; @@ -10567,11 +10479,6 @@ static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) schedule_work(&trans_gc_work); } -static int nft_trans_gc_space(struct nft_trans_gc *trans) -{ - return NFT_TRANS_GC_BATCHCOUNT - trans->count; -} - struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { @@ -10772,7 +10679,7 @@ static int nf_tables_commit_audit_alloc(struct list_head *adl, if (adp->table == table) return 0; } - adp = kzalloc(sizeof(*adp), GFP_KERNEL); + adp = kzalloc_obj(*adp); if (!adp) return -ENOMEM; adp->table = table; @@ -11476,6 +11383,13 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + if (action == NFNL_ABORT_NONE) { + struct nft_table *table; + + list_for_each_entry(table, &nft_net->tables, list) + table->validate_state = NFT_VALIDATE_SKIP; + } + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); /* module autoload needs to happen after GC sequence update because it @@ -11678,21 +11592,10 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_data_types type, unsigned int len) { - int err; - switch (reg) { case NFT_REG_VERDICT: if (type != NFT_DATA_VERDICT) return -EINVAL; - - if (data != NULL && - (data->verdict.code == NFT_GOTO || - data->verdict.code == NFT_JUMP)) { - err = nft_chain_validate(ctx, data->verdict.chain); - if (err < 0) - return err; - } - break; default: if (type != NFT_DATA_VALUE) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index fd30e205de84..9101b1703b52 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -11,7 +11,7 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; - flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); + flow = kzalloc_obj(struct nft_flow_rule); if (!flow) return NULL; @@ -111,7 +111,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_first(rule); - ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); + ctx = kzalloc_obj(struct nft_offload_ctx); if (!ctx) { err = -ENOMEM; goto err_out; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 811d02b4c4f7..e62a0dea24ea 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -325,7 +325,7 @@ static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err, { struct nfnl_err *nfnl_err; - nfnl_err = kmalloc(sizeof(struct nfnl_err), GFP_KERNEL); + nfnl_err = kmalloc_obj(struct nfnl_err); if (nfnl_err == NULL) return -ENOMEM; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 505f46a32173..2bfaa773d82f 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -260,7 +260,7 @@ static int nfnl_acct_start(struct netlink_callback *cb) if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE]) return -EINVAL; - filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); + filter = kzalloc_obj(struct nfacct_filter); if (!filter) return -ENOMEM; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 97248963a7d3..d545fa459455 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -192,9 +192,8 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, if (class_max > NF_CT_MAX_EXPECT_CLASSES) return -EOVERFLOW; - expect_policy = kcalloc(class_max, - sizeof(struct nf_conntrack_expect_policy), - GFP_KERNEL); + expect_policy = kzalloc_objs(struct nf_conntrack_expect_policy, + class_max); if (expect_policy == NULL) return -ENOMEM; @@ -228,7 +227,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) return -EINVAL; - nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL); + nfcth = kzalloc_obj(*nfcth); if (nfcth == NULL) return -ENOMEM; helper = &nfcth->helper; @@ -323,8 +322,7 @@ static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], struct nf_conntrack_expect_policy *policy; int i, ret = 0; - new_policy = kmalloc_array(helper->expect_class_max + 1, - sizeof(*new_policy), GFP_KERNEL); + new_policy = kmalloc_objs(*new_policy, helper->expect_class_max + 1); if (!new_policy) return -ENOMEM; @@ -603,10 +601,10 @@ restart: goto out; } } - } - if (cb->args[1]) { - cb->args[1] = 0; - goto restart; + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } } out: rcu_read_unlock(); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 38d75484e531..fd8652aa7e88 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -74,8 +74,7 @@ ctnl_timeout_parse_policy(void *timeout, struct nlattr **tb; int ret = 0; - tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), - GFP_KERNEL); + tb = kzalloc_objs(*tb, l4proto->ctnl_timeout.nlattr_max + 1); if (!tb) return -ENOMEM; diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 92d869317cba..531706982859 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -408,7 +408,7 @@ static int nfnl_hook_dump_start(struct netlink_callback *cb) if (head && IS_ERR(head)) return PTR_ERR(head); - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index bfcb9cd335bf..b35a90955e2e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -179,7 +179,7 @@ instance_create(struct net *net, u_int16_t group_num, goto out_unlock; } - inst = kzalloc(sizeof(*inst), GFP_ATOMIC); + inst = kzalloc_obj(*inst, GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index c0fc431991e8..45d9ad231a92 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -302,7 +302,9 @@ static int nfnl_osf_add_callback(struct sk_buff *skb, { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; + unsigned int tot_opt_len = 0; int err = 0; + int i; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -318,12 +320,23 @@ static int nfnl_osf_add_callback(struct sk_buff *skb, if (f->opt_num > ARRAY_SIZE(f->opt)) return -EINVAL; + for (i = 0; i < f->opt_num; i++) { + if (!f->opt[i].length || f->opt[i].length > MAX_IPOPTLEN) + return -EINVAL; + if (f->opt[i].kind == OSFOPT_MSS && f->opt[i].length < 4) + return -EINVAL; + + tot_opt_len += f->opt[i].length; + if (tot_opt_len > MAX_IPOPTLEN) + return -EINVAL; + } + if (!memchr(f->genre, 0, MAXGENRELEN) || !memchr(f->subtype, 0, MAXGENRELEN) || !memchr(f->version, 0, MAXGENRELEN)) return -EINVAL; - kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); + kf = kmalloc_obj(struct nf_osf_finger); if (!kf) return -ENOMEM; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8b7b39d8a109..47f7f62906e2 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -30,6 +30,8 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <linux/cgroup-defs.h> +#include <linux/rhashtable.h> +#include <linux/jhash.h> #include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> @@ -47,6 +49,8 @@ #endif #define NFQNL_QMAX_DEFAULT 1024 +#define NFQNL_HASH_MIN 1024 +#define NFQNL_HASH_MAX 1048576 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. Thus, the maximum packet length that we @@ -56,6 +60,26 @@ */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) +/* Composite key for packet lookup: (net, queue_num, packet_id) */ +struct nfqnl_packet_key { + possible_net_t net; + u32 packet_id; + u16 queue_num; +} __aligned(sizeof(u32)); /* jhash2 requires 32-bit alignment */ + +/* Global rhashtable - one for entire system, all netns */ +static struct rhashtable nfqnl_packet_map __read_mostly; + +/* Helper to initialize composite key */ +static inline void nfqnl_init_key(struct nfqnl_packet_key *key, + struct net *net, u32 packet_id, u16 queue_num) +{ + memset(key, 0, sizeof(*key)); + write_pnet(&key->net, net); + key->packet_id = packet_id; + key->queue_num = queue_num; +} + struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; @@ -100,6 +124,39 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num) return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } +/* Extract composite key from nf_queue_entry for hashing */ +static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct nf_queue_entry *entry = data; + struct nfqnl_packet_key key; + + nfqnl_init_key(&key, entry->state.net, entry->id, entry->queue_num); + + return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed); +} + +/* Compare stack-allocated key against entry */ +static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct nfqnl_packet_key *key = arg->key; + const struct nf_queue_entry *entry = obj; + + return !net_eq(entry->state.net, read_pnet(&key->net)) || + entry->queue_num != key->queue_num || + entry->id != key->packet_id; +} + +static const struct rhashtable_params nfqnl_rhashtable_params = { + .head_offset = offsetof(struct nf_queue_entry, hash_node), + .key_len = sizeof(struct nfqnl_packet_key), + .obj_hashfn = nfqnl_packet_obj_hashfn, + .obj_cmpfn = nfqnl_packet_obj_cmpfn, + .automatic_shrinking = true, + .min_size = NFQNL_HASH_MIN, + .max_size = NFQNL_HASH_MAX, +}; + static struct nfqnl_instance * instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { @@ -121,17 +178,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) unsigned int h; int err; - spin_lock(&q->instances_lock); - if (instance_lookup(q, queue_num)) { - err = -EEXIST; - goto out_unlock; - } - - inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) { - err = -ENOMEM; - goto out_unlock; - } + inst = kzalloc_obj(*inst, GFP_KERNEL_ACCOUNT); + if (!inst) + return ERR_PTR(-ENOMEM); inst->queue_num = queue_num; inst->peer_portid = portid; @@ -141,9 +190,15 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { + err = -EEXIST; + goto out_unlock; + } + if (!try_module_get(THIS_MODULE)) { err = -EAGAIN; - goto out_free; + goto out_unlock; } h = instance_hashfn(queue_num); @@ -153,10 +208,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) return inst; -out_free: - kfree(inst); out_unlock: spin_unlock(&q->instances_lock); + kfree(inst); return ERR_PTR(err); } @@ -191,33 +245,45 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) spin_unlock(&q->instances_lock); } -static inline void +static int __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { - list_add_tail(&entry->list, &queue->queue_list); - queue->queue_total++; + int err; + + entry->queue_num = queue->queue_num; + + err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node, + nfqnl_rhashtable_params); + if (unlikely(err)) + return err; + + list_add_tail(&entry->list, &queue->queue_list); + queue->queue_total++; + + return 0; } static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { + rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node, + nfqnl_rhashtable_params); list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id, + struct net *net) { - struct nf_queue_entry *entry = NULL, *i; + struct nfqnl_packet_key key; + struct nf_queue_entry *entry; - spin_lock_bh(&queue->lock); + nfqnl_init_key(&key, net, id, queue->queue_num); - list_for_each_entry(i, &queue->queue_list, list) { - if (i->id == id) { - entry = i; - break; - } - } + spin_lock_bh(&queue->lock); + entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key, + nfqnl_rhashtable_params); if (entry) __dequeue_entry(queue, entry); @@ -369,6 +435,34 @@ next_hook: nf_queue_entry_free(entry); } +/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us + * exclusively. + */ +static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct nf_conn *ct = (void *)skb_nfct(entry->skb); + + if (!ct || nf_ct_is_confirmed(ct)) + return false; + + if (is_unconfirmed) + *is_unconfirmed = true; + + /* in some cases skb_clone() can occur after initial conntrack + * pickup, but conntrack assumes exclusive skb->_nfct ownership for + * unconfirmed entries. + * + * This happens for br_netfilter and with ip multicast routing. + * This can't be solved with serialization here because one clone + * could have been queued for local delivery or could be transmitted + * in parallel on another CPU. + */ + return refcount_read(&ct->ct_general.use) > 1; +#endif + return false; +} + static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; @@ -396,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) break; } } + + if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) { + /* If first queued segment was already reinjected then + * there is a good chance the ct entry is now confirmed. + * + * Handle the rare cases: + * - out-of-order verdict + * - threaded userspace reinjecting in parallel + * - first segment was dropped + * + * In all of those cases we can't handle this packet + * because we can't be sure that another CPU won't modify + * nf_conn->ext in parallel which isn't allowed. + */ + if (nf_ct_drop_unconfirmed(entry, NULL)) + verdict = NF_DROP; + } + nf_reinject(entry, verdict); } @@ -407,8 +519,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, next, &queue->queue_list, list) { if (!cmpfn || cmpfn(entry, data)) { - list_del(&entry->list); - queue->queue_total--; + __dequeue_entry(queue, entry); nfqnl_reinject(entry, NF_DROP); } } @@ -826,49 +937,6 @@ nlmsg_failure: return NULL; } -static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) -{ -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; - struct nf_conn *ct = (void *)skb_nfct(entry->skb); - unsigned long status; - unsigned int use; - - if (!ct) - return false; - - status = READ_ONCE(ct->status); - if ((status & flags) == IPS_DYING) - return true; - - if (status & IPS_CONFIRMED) - return false; - - /* in some cases skb_clone() can occur after initial conntrack - * pickup, but conntrack assumes exclusive skb->_nfct ownership for - * unconfirmed entries. - * - * This happens for br_netfilter and with ip multicast routing. - * We can't be solved with serialization here because one clone could - * have been queued for local delivery. - */ - use = refcount_read(&ct->ct_general.use); - if (likely(use == 1)) - return false; - - /* Can't decrement further? Exclusive ownership. */ - if (!refcount_dec_not_one(&ct->ct_general.use)) - return false; - - skb_set_nfct(entry->skb, 0); - /* No nf_ct_put(): we already decremented .use and it cannot - * drop down to 0. - */ - return true; -#endif - return false; -} - static int __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry) @@ -885,26 +953,23 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, } spin_lock_bh(&queue->lock); - if (nf_ct_drop_unconfirmed(entry)) - goto err_out_free_nskb; + if (queue->queue_total >= queue->queue_maxlen) + goto err_out_queue_drop; - if (queue->queue_total >= queue->queue_maxlen) { - if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { - failopen = 1; - err = 0; - } else { - queue->queue_dropped++; - net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", - queue->queue_total); - } - goto err_out_free_nskb; - } entry->id = ++queue->id_sequence; *packet_id_ptr = htonl(entry->id); + /* Insert into hash BEFORE unicast. If failure don't send to userspace. */ + err = __enqueue_entry(queue, entry); + if (unlikely(err)) + goto err_out_queue_drop; + /* nfnetlink_unicast will either free the nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { + /* Unicast failed - remove entry we just inserted */ + __dequeue_entry(queue, entry); + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; @@ -914,12 +979,22 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, goto err_out_unlock; } - __enqueue_entry(queue, entry); - spin_unlock_bh(&queue->lock); return 0; -err_out_free_nskb: +err_out_queue_drop: + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_dropped++; + + if (queue->queue_total >= queue->queue_maxlen) + net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", + queue->queue_total); + else + net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err); + } kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); @@ -998,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, static int nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { - unsigned int queued; - struct nfqnl_instance *queue; struct sk_buff *skb, *segs, *nskb; + bool ct_is_unconfirmed = false; + struct nfqnl_instance *queue; + unsigned int queued; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); @@ -1024,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) break; } + /* Check if someone already holds another reference to + * unconfirmed ct. If so, we cannot queue the skb: + * concurrent modifications of nf_conn->ext are not + * allowed and we can't know if another CPU isn't + * processing the same nf_conn entry in parallel. + */ + if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed)) + return -EINVAL; + if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb))) return __nfqnl_enqueue_packet(net, queue, entry); @@ -1037,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) goto out_err; queued = 0; err = 0; + skb_list_walk_safe(segs, segs, nskb) { + if (ct_is_unconfirmed && queued > 0) { + /* skb_gso_segment() increments the ct refcount. + * This is a problem for unconfirmed (not in hash) + * entries, those can race when reinjections happen + * in parallel. + * + * Annotate this for all queued entries except the + * first one. + * + * As long as the first one is reinjected first it + * will do the confirmation for us. + */ + entry->nf_ct_is_unconfirmed = ct_is_unconfirmed; + } + if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); @@ -1430,7 +1531,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, verdict = ntohl(vhdr->verdict); - entry = find_dequeue_entry(queue, ntohl(vhdr->id)); + entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net); if (entry == NULL) return -ENOENT; @@ -1445,8 +1546,10 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, if (entry->state.pf == PF_BRIDGE) { err = nfqa_parse_bridge(entry, nfqa); - if (err < 0) + if (err < 0) { + nfqnl_reinject(entry, NF_DROP); return err; + } } if (nfqa[NFQA_PAYLOAD]) { @@ -1498,7 +1601,8 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; - int ret = 0; + + WARN_ON_ONCE(!lockdep_nfnl_is_held(NFNL_SUBSYS_QUEUE)); if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); @@ -1544,47 +1648,44 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, } } + /* Lookup queue under RCU. After peer_portid check (or for new queue + * in BIND case), the queue is owned by the socket sending this message. + * A socket cannot simultaneously send a message and close, so while + * processing this CONFIG message, nfqnl_rcv_nl_event() (triggered by + * socket close) cannot destroy this queue. Safe to use without RCU. + */ rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { - ret = -EPERM; - goto err_out_unlock; + rcu_read_unlock(); + return -EPERM; } + rcu_read_unlock(); if (cmd != NULL) { switch (cmd->command) { case NFQNL_CFG_CMD_BIND: - if (queue) { - ret = -EBUSY; - goto err_out_unlock; - } - queue = instance_create(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) { - ret = PTR_ERR(queue); - goto err_out_unlock; - } + if (queue) + return -EBUSY; + queue = instance_create(q, queue_num, NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); break; case NFQNL_CFG_CMD_UNBIND: - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } + if (!queue) + return -ENODEV; instance_destroy(q, queue); - goto err_out_unlock; + return 0; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: - ret = -ENOTSUPP; - goto err_out_unlock; + return -EOPNOTSUPP; } } - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } + if (!queue) + return -ENODEV; if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params = @@ -1609,9 +1710,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, spin_unlock_bh(&queue->lock); } -err_out_unlock: - rcu_read_unlock(); - return ret; + return 0; } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { @@ -1781,10 +1880,14 @@ static int __init nfnetlink_queue_init(void) { int status; + status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params); + if (status < 0) + return status; + status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); - goto out; + goto cleanup_rhashtable; } netlink_register_notifier(&nfqnl_rtnl_notifier); @@ -1809,7 +1912,8 @@ cleanup_netlink_subsys: cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); -out: +cleanup_rhashtable: + rhashtable_destroy(&nfqnl_packet_map); return status; } @@ -1821,6 +1925,8 @@ static void __exit nfnetlink_queue_fini(void) netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); + rhashtable_destroy(&nfqnl_packet_map); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b16185e9a6dd..041426e3bdbf 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -344,7 +344,7 @@ static int nft_netdev_event(unsigned long event, struct net_device *dev, break; case NETDEV_REGISTER: /* NOP if not matching or already registered */ - if (!match || (changename && ops)) + if (!match || ops) continue; ops = kmemdup(&basechain->ops, diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 72711d62fddf..27cc983a7cdf 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -134,7 +134,8 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, } static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { - [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING, + .len = XT_EXTENSION_MAXNAMELEN, }, [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; @@ -434,7 +435,8 @@ static void nft_match_eval(const struct nft_expr *expr, } static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { - [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING, + .len = XT_EXTENSION_MAXNAMELEN }, [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; @@ -693,7 +695,12 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, name = nla_data(tb[NFTA_COMPAT_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); - target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); + /* x_tables api checks for 'target == 1' to mean target, + * everything else means 'match'. + * In x_tables world, the number is set by kernel, not + * userspace. + */ + target = nla_get_be32(tb[NFTA_COMPAT_TYPE]) == htonl(1); switch(family) { case AF_INET: @@ -808,7 +815,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); + ops = kzalloc_obj(struct nft_expr_ops, GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; @@ -898,7 +905,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, goto err; } - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); + ops = kzalloc_obj(struct nft_expr_ops, GFP_KERNEL_ACCOUNT); if (!ops) { err = -ENOMEM; goto err; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index fc35a11cdca2..43357f99eb00 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, const struct nft_pktinfo *pkt, const struct nft_set_ext *ext) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; - const struct nf_conntrack_tuple *tuple_ptr; - struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; unsigned int count; + int err; - tuple_ptr = &tuple; - - ct = nf_ct_get(pkt->skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), - nft_pf(pkt), nft_net(pkt), &tuple)) { - regs->verdict.code = NF_DROP; - return; - } - - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { - regs->verdict.code = NF_DROP; - return; + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); + if (err) { + if (err == -EEXIST) { + /* Call gc to update the list count if any connection has + * been closed already. This is useful for softlimit + * connections like limiting bandwidth based on a number + * of open connections. + */ + nf_conncount_gc_list(nft_net(pkt), priv->list); + } else { + regs->verdict.code = NF_DROP; + return; + } } count = READ_ONCE(priv->list->count); - if ((count > priv->limit) ^ priv->invert) { + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { regs->verdict.code = NFT_BREAK; return; } @@ -77,7 +71,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT); + priv->list = kmalloc_obj(*priv->list, GFP_KERNEL_ACCOUNT); if (!priv->list) return -ENOMEM; @@ -137,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx, return nft_connlimit_do_init(ctx, tb, priv); } +static void nft_connlimit_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_connlimit *newpriv = nft_obj_data(newobj); + struct nft_connlimit *priv = nft_obj_data(obj); + + WRITE_ONCE(priv->limit, newpriv->limit); + WRITE_ONCE(priv->invert, newpriv->invert); +} + static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { @@ -166,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = { .init = nft_connlimit_obj_init, .destroy = nft_connlimit_obj_destroy, .dump = nft_connlimit_obj_dump, + .update = nft_connlimit_obj_update, }; static struct nft_object_type nft_connlimit_obj_type __read_mostly = { @@ -215,7 +220,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp); + priv_dst->list = kmalloc_obj(*priv_dst->list, gfp); if (!priv_dst->list) return -ENOMEM; @@ -238,13 +243,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - - local_bh_disable(); - ret = nf_conncount_gc_list(net, priv->list); - local_bh_enable(); - return ret; + return nf_conncount_gc_list(net, priv->list); } static struct nft_expr_type nft_connlimit_type; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index cc7325329496..169ae93688bc 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -32,6 +32,9 @@ struct nft_counter_percpu_priv { static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); +/* control plane only: sync fetch+reset */ +static DEFINE_SPINLOCK(nft_counter_lock); + static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -117,8 +120,8 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv, nft_sync = this_cpu_ptr(&nft_counter_sync); u64_stats_update_begin(nft_sync); - u64_stats_add(&this_cpu->packets, -total->packets); - u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_sub(&this_cpu->packets, total->packets); + u64_stats_sub(&this_cpu->bytes, total->bytes); u64_stats_update_end(nft_sync); local_bh_enable(); @@ -148,13 +151,25 @@ static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, } } +static void nft_counter_fetch_and_reset(struct nft_counter_percpu_priv *priv, + struct nft_counter_tot *total) +{ + spin_lock(&nft_counter_lock); + nft_counter_fetch(priv, total); + nft_counter_reset(priv, total); + spin_unlock(&nft_counter_lock); +} + static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { struct nft_counter_tot total; - nft_counter_fetch(priv, &total); + if (unlikely(reset)) + nft_counter_fetch_and_reset(priv, &total); + else + nft_counter_fetch(priv, &total); if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), NFTA_COUNTER_PAD) || @@ -162,9 +177,6 @@ static int nft_counter_do_dump(struct sk_buff *skb, NFTA_COUNTER_PAD)) goto nla_put_failure; - if (reset) - nft_counter_reset(priv, &total); - return 0; nla_put_failure: diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6f2ae7cad731..128ff8155b5d 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -23,6 +23,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_seqadj.h> +#include "nf_internals.h" struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; @@ -543,6 +544,7 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: + nf_queue_nf_hook_drop(ctx->net); mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); @@ -894,8 +896,7 @@ nft_ct_timeout_parse_policy(void *timeouts, struct nlattr **tb; int ret = 0; - tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), - GFP_KERNEL); + tb = kzalloc_objs(*tb, l4proto->ctnl_timeout.nlattr_max + 1); if (!tb) return -ENOMEM; @@ -1016,6 +1017,7 @@ static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_ct_timeout *timeout = priv->timeout; + nf_queue_nf_hook_drop(ctx->net); nf_ct_untimeout(ctx->net, timeout); nf_ct_netns_put(ctx->net, ctx->family); kfree(priv->timeout); @@ -1148,6 +1150,7 @@ static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, { struct nft_ct_helper_obj *priv = nft_obj_data(obj); + nf_queue_nf_hook_drop(ctx->net); if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 7807d8129664..9123277be03c 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -30,18 +30,26 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, const struct nft_set_ext *ext) { struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext); + struct nft_ctx ctx = { + .net = read_pnet(&priv->set->net), + .family = priv->set->table->family, + }; struct nft_expr *expr; int i; for (i = 0; i < priv->num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0) - return -1; + goto err_out; elem_expr->size += priv->expr_array[i]->ops->size; } return 0; +err_out: + nft_set_elem_expr_destroy(&ctx, elem_expr); + + return -1; } struct nft_elem_priv *nft_dynset_new(struct nft_set *set, diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 14dd1c0698c3..179d0e59e2b5 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -20,258 +21,6 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; -static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) -{ - if (dst_xfrm(dst)) - return FLOW_OFFLOAD_XMIT_XFRM; - - return FLOW_OFFLOAD_XMIT_NEIGH; -} - -static void nft_default_forward_path(struct nf_flow_route *route, - struct dst_entry *dst_cache, - enum ip_conntrack_dir dir) -{ - route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; - route->tuple[dir].dst = dst_cache; - route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); -} - -static bool nft_is_valid_ether_device(const struct net_device *dev) -{ - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) - return false; - - return true; -} - -static int nft_dev_fill_forward_path(const struct nf_flow_route *route, - const struct dst_entry *dst_cache, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, u8 *ha, - struct net_device_path_stack *stack) -{ - const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; - struct net_device *dev = dst_cache->dev; - struct neighbour *n; - u8 nud_state; - - if (!nft_is_valid_ether_device(dev)) - goto out; - - n = dst_neigh_lookup(dst_cache, daddr); - if (!n) - return -1; - - read_lock_bh(&n->lock); - nud_state = n->nud_state; - ether_addr_copy(ha, n->ha); - read_unlock_bh(&n->lock); - neigh_release(n); - - if (!(nud_state & NUD_VALID)) - return -1; - -out: - return dev_fill_forward_path(dev, ha, stack); -} - -struct nft_forward_info { - const struct net_device *indev; - const struct net_device *outdev; - const struct net_device *hw_outdev; - struct id { - __u16 id; - __be16 proto; - } encap[NF_FLOW_TABLE_ENCAP_MAX]; - u8 num_encaps; - u8 ingress_vlans; - u8 h_source[ETH_ALEN]; - u8 h_dest[ETH_ALEN]; - enum flow_offload_xmit_type xmit_type; -}; - -static void nft_dev_path_info(const struct net_device_path_stack *stack, - struct nft_forward_info *info, - unsigned char *ha, struct nf_flowtable *flowtable) -{ - const struct net_device_path *path; - int i; - - memcpy(info->h_dest, ha, ETH_ALEN); - - for (i = 0; i < stack->num_paths; i++) { - path = &stack->path[i]; - switch (path->type) { - case DEV_PATH_ETHERNET: - case DEV_PATH_DSA: - case DEV_PATH_VLAN: - case DEV_PATH_PPPOE: - info->indev = path->dev; - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - if (path->type == DEV_PATH_ETHERNET) - break; - if (path->type == DEV_PATH_DSA) { - i = stack->num_paths; - break; - } - - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { - info->indev = NULL; - break; - } - if (!info->outdev) - info->outdev = path->dev; - info->encap[info->num_encaps].id = path->encap.id; - info->encap[info->num_encaps].proto = path->encap.proto; - info->num_encaps++; - if (path->type == DEV_PATH_PPPOE) - memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); - break; - case DEV_PATH_BRIDGE: - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - switch (path->bridge.vlan_mode) { - case DEV_PATH_BR_VLAN_UNTAG_HW: - info->ingress_vlans |= BIT(info->num_encaps - 1); - break; - case DEV_PATH_BR_VLAN_TAG: - info->encap[info->num_encaps].id = path->bridge.vlan_id; - info->encap[info->num_encaps].proto = path->bridge.vlan_proto; - info->num_encaps++; - break; - case DEV_PATH_BR_VLAN_UNTAG: - info->num_encaps--; - break; - case DEV_PATH_BR_VLAN_KEEP: - break; - } - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; - break; - default: - info->indev = NULL; - break; - } - } - if (!info->outdev) - info->outdev = info->indev; - - info->hw_outdev = info->indev; - - if (nf_flowtable_hw_offload(flowtable) && - nft_is_valid_ether_device(info->indev)) - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; -} - -static bool nft_flowtable_find_dev(const struct net_device *dev, - struct nft_flowtable *ft) -{ - struct nft_hook *hook; - bool found = false; - - list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (!nft_hook_find_ops_rcu(hook, dev)) - continue; - - found = true; - break; - } - - return found; -} - -static void nft_dev_forward_path(struct nf_flow_route *route, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - const struct dst_entry *dst = route->tuple[dir].dst; - struct net_device_path_stack stack; - struct nft_forward_info info = {}; - unsigned char ha[ETH_ALEN]; - int i; - - if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) - nft_dev_path_info(&stack, &info, ha, &ft->data); - - if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) - return; - - route->tuple[!dir].in.ifindex = info.indev->ifindex; - for (i = 0; i < info.num_encaps; i++) { - route->tuple[!dir].in.encap[i].id = info.encap[i].id; - route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; - } - route->tuple[!dir].in.num_encaps = info.num_encaps; - route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; - - if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { - memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); - memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); - route->tuple[dir].out.ifindex = info.outdev->ifindex; - route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; - route->tuple[dir].xmit_type = info.xmit_type; - } -} - -static int nft_flow_route(const struct nft_pktinfo *pkt, - const struct nf_conn *ct, - struct nf_flow_route *route, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - struct dst_entry *this_dst = skb_dst(pkt->skb); - struct dst_entry *other_dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; - fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; - fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; - fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); - fl.u.ip4.flowi4_mark = pkt->skb->mark; - fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; - break; - case NFPROTO_IPV6: - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; - fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; - fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; - fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); - fl.u.ip6.flowi6_mark = pkt->skb->mark; - fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; - break; - } - - if (!dst_hold_safe(this_dst)) - return -ENOENT; - - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) { - dst_release(this_dst); - return -ENOENT; - } - - nft_default_forward_path(route, this_dst, dir); - nft_default_forward_path(route, other_dst, !dir); - - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { - nft_dev_forward_path(route, ct, dir, ft); - nft_dev_forward_path(route, ct, !dir, ft); - } - - return 0; -} - static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index de1b6066bfa8..20706be12807 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -30,7 +30,7 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr, u64 last_jiffies; int err; - last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT); + last = kzalloc_obj(*last, GFP_KERNEL_ACCOUNT); if (!last) return -ENOMEM; @@ -107,7 +107,7 @@ static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_ struct nft_last_priv *priv_dst = nft_expr_priv(dst); struct nft_last_priv *priv_src = nft_expr_priv(src); - priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); + priv_dst->last = kzalloc_obj(*priv_dst->last, gfp); if (!priv_dst->last) return -ENOMEM; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 21d26b79b460..f3b1f791942b 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -110,7 +110,7 @@ static int nft_limit_init(struct nft_limit_priv *priv, invert = true; } - priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); + priv->limit = kmalloc_obj(*priv->limit, GFP_KERNEL_ACCOUNT); if (!priv->limit) return -ENOMEM; @@ -158,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst, priv_dst->burst = priv_src->burst; priv_dst->invert = priv_src->invert; - priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); + priv_dst->limit = kmalloc_obj(*priv_dst->limit, gfp); if (!priv_dst->limit) return -ENOMEM; diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index e35588137995..bf01cf8a8907 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -26,46 +26,10 @@ struct nft_log { char *prefix; }; -static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) -{ - struct iphdr _iph; - const struct iphdr *ih; - - ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); - if (!ih) - return false; - - audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", - &ih->saddr, &ih->daddr, ih->protocol); - - return true; -} - -static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) -{ - struct ipv6hdr _ip6h; - const struct ipv6hdr *ih; - u8 nexthdr; - __be16 frag_off; - - ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); - if (!ih) - return false; - - nexthdr = ih->nexthdr; - ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); - - audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", - &ih->saddr, &ih->daddr, nexthdr); - - return true; -} - static void nft_log_eval_audit(const struct nft_pktinfo *pkt) { struct sk_buff *skb = pkt->skb; struct audit_buffer *ab; - int fam = -1; if (!audit_enabled) return; @@ -76,27 +40,7 @@ static void nft_log_eval_audit(const struct nft_pktinfo *pkt) audit_log_format(ab, "mark=%#x", skb->mark); - switch (nft_pf(pkt)) { - case NFPROTO_BRIDGE: - switch (eth_hdr(skb)->h_proto) { - case htons(ETH_P_IP): - fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; - break; - case htons(ETH_P_IPV6): - fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; - break; - } - break; - case NFPROTO_IPV4: - fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; - break; - case NFPROTO_IPV6: - fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; - break; - } - - if (fam == -1) - audit_log_format(ab, " saddr=? daddr=? proto=-1"); + audit_log_nf_skb(ab, skb, nft_pf(pkt)); audit_log_end(ab); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 58c5b14889c4..fc2d7c5d83c8 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -246,19 +246,16 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); - struct nft_set_iter iter; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_setelem_validate, + }; if (!(priv->set->flags & NFT_SET_MAP) || priv->set->dtype != NFT_DATA_VERDICT) return 0; - iter.genmask = nft_genmask_next(ctx->net); - iter.type = NFT_ITER_UPDATE; - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nft_setelem_validate; - priv->set->ops->walk(ctx, priv->set, &iter); if (!iter.err) iter.err = nft_set_catchall_validate(ctx, priv->set); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index bd058babfc82..0a39e51ec9b7 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -66,7 +66,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT); + priv->counter = kmalloc_obj(*priv->counter, GFP_KERNEL_ACCOUNT); if (!priv->counter) return -ENOMEM; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index df0798da2329..2390a993aed9 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -96,7 +96,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[], return -EOPNOTSUPP; } - priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT); + priv->consumed = kmalloc_obj(*priv->consumed, GFP_KERNEL_ACCOUNT); if (!priv->consumed) return -ENOMEM; @@ -140,11 +140,16 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, u64 consumed, consumed_cap, quota; u32 flags = priv->flags; - /* Since we inconditionally increment consumed quota for each packet + /* Since we unconditionally increment consumed quota for each packet * that we see, don't go over the quota boundary in what we send to * userspace. */ - consumed = atomic64_read(priv->consumed); + if (reset) { + consumed = atomic64_xchg(priv->consumed, 0); + clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); + } else { + consumed = atomic64_read(priv->consumed); + } quota = atomic64_read(&priv->quota); if (consumed >= quota) { consumed_cap = quota; @@ -160,10 +165,6 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) goto nla_put_failure; - if (reset) { - atomic64_sub(consumed, priv->consumed); - clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); - } return 0; nla_put_failure: @@ -247,7 +248,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp priv_dst->quota = priv_src->quota; priv_dst->flags = priv_src->flags; - priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp); + priv_dst->consumed = kmalloc_obj(*priv_dst->consumed, gfp); if (!priv_dst->consumed) return -ENOMEM; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index ba01ce75d6de..b0e571c8e3f3 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -374,6 +374,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, { switch (iter->type) { case NFT_ITER_UPDATE: + case NFT_ITER_UPDATE_CLONE: /* only relevant for netlink dumps which use READ type */ WARN_ON_ONCE(iter->skip != 0); @@ -619,15 +620,20 @@ static struct nft_elem_priv * nft_hash_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { + const u32 *key = (const u32 *)&elem->key.val; struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); struct nft_hash_elem *he; u32 hash; - hash = jhash(elem->key.val.data, set->klen, priv->seed); + if (set->klen == 4) + hash = jhash_1word(*key, priv->seed); + else + hash = jhash(key, set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry_rcu(he, &priv->table[hash], node) { - if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && + if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) && nft_set_elem_active(&he->ext, genmask)) return &he->priv; } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 112fe46788b6..7fd24e0cc428 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -652,7 +652,7 @@ static int pipapo_realloc_mt(struct nft_pipapo_field *f, if (rules_alloc > (INT_MAX / sizeof(*new_mt))) return -ENOMEM; - new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); + new_mt = kvmalloc_objs(*new_mt, rules_alloc, GFP_KERNEL_ACCOUNT); if (!new_mt) return -ENOMEM; @@ -1317,8 +1317,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else dup_end = dup_key; - if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && - !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { + if (!memcmp(start, dup_key->data, set->klen) && + !memcmp(end, dup_end->data, set->klen)) { *elem_priv = &dup->priv; return -EEXIST; } @@ -1413,7 +1413,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) struct nft_pipapo_match *new; int i; - new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); + new = kmalloc_flex(*new, f, old->field_count, GFP_KERNEL_ACCOUNT); if (!new) return NULL; @@ -1460,9 +1460,8 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) goto out_mt; - dst->mt = kvmalloc_array(src->rules_alloc, - sizeof(*src->mt), - GFP_KERNEL_ACCOUNT); + dst->mt = kvmalloc_objs(*src->mt, src->rules_alloc, + GFP_KERNEL_ACCOUNT); if (!dst->mt) goto out_mt; @@ -1641,6 +1640,7 @@ static void pipapo_drop(struct nft_pipapo_match *m, int i; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; int g; for (g = 0; g < f->groups; g++) { @@ -1660,7 +1660,7 @@ static void pipapo_drop(struct nft_pipapo_match *m, } pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, - rulemap[i + 1].n, i == m->field_count - 1); + last ? 0 : rulemap[i + 1].n, last); if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { /* We can ignore this, a failure to shrink tables down * doesn't make tables invalid. @@ -1681,11 +1681,11 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, } /** - * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * pipapo_gc_scan() - Drop expired entries from set and link them to gc list * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc_scan(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); @@ -1698,6 +1698,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) if (!gc) return; + list_add(&gc->list, &priv->gc_head); + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const struct nft_pipapo_field *f; @@ -1725,9 +1727,13 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) * NFT_SET_ELEM_DEAD_BIT. */ if (__nft_set_elem_expired(&e->ext, tstamp)) { - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); - if (!gc) - return; + if (!nft_trans_gc_space(gc)) { + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; + + list_add(&gc->list, &priv->gc_head); + } nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); @@ -1741,10 +1747,30 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) } } - gc = nft_trans_gc_catchall_sync(gc); + priv->last_gc = jiffies; +} + +/** + * pipapo_gc_queue() - Free expired elements + * @set: nftables API set representation + */ +static void pipapo_gc_queue(struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_trans_gc *gc, *next; + + /* always do a catchall cycle: */ + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (gc) { + gc = nft_trans_gc_catchall_sync(gc); + if (gc) + nft_trans_gc_queue_sync_done(gc); + } + + /* always purge queued gc elements. */ + list_for_each_entry_safe(gc, next, &priv->gc_head, list) { + list_del(&gc->list); nft_trans_gc_queue_sync_done(gc); - priv->last_gc = jiffies; } } @@ -1798,6 +1824,10 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * * We also need to create a new working copy for subsequent insertions and * deletions. + * + * After the live copy has been replaced by the clone, we can safely queue + * expired elements that have been collected by pipapo_gc_scan() for + * memory reclaim. */ static void nft_pipapo_commit(struct nft_set *set) { @@ -1808,7 +1838,7 @@ static void nft_pipapo_commit(struct nft_set *set) return; if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); + pipapo_gc_scan(set, priv->clone); old = rcu_replace_pointer(priv->match, priv->clone, nft_pipapo_transaction_mutex_held(set)); @@ -1816,6 +1846,8 @@ static void nft_pipapo_commit(struct nft_set *set) if (old) call_rcu(&old->rcu, pipapo_reclaim_match); + + pipapo_gc_queue(set); } static void nft_pipapo_abort(const struct nft_set *set) @@ -2145,13 +2177,20 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_match *m; switch (iter->type) { - case NFT_ITER_UPDATE: + case NFT_ITER_UPDATE_CLONE: m = pipapo_maybe_clone(set); if (!m) { iter->err = -ENOMEM; return; } - + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_UPDATE: + if (priv->clone) + m = priv->clone; + else + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); nft_pipapo_do_walk(ctx, set, m, iter); break; case NFT_ITER_READ: @@ -2237,7 +2276,7 @@ static int nft_pipapo_init(const struct nft_set *set, if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; - m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); + m = kmalloc_flex(*m, f, field_count); if (!m) return -ENOMEM; @@ -2273,6 +2312,7 @@ static int nft_pipapo_init(const struct nft_set *set, f->mt = NULL; } + INIT_LIST_HEAD(&priv->gc_head); rcu_assign_pointer(priv->match, m); return 0; @@ -2322,6 +2362,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; + WARN_ON_ONCE(!list_empty(&priv->gc_head)); + m = rcu_dereference_protected(priv->match, true); if (priv->clone) { @@ -2370,6 +2412,7 @@ const struct nft_set_type nft_set_pipapo_type = { .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, + .abort_skip_removal = true, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2394,6 +2437,7 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, + .abort_skip_removal = true, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index eaab422aa56a..9aee9a9eaeb7 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -156,12 +156,14 @@ struct nft_pipapo_match { * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding * @last_gc: Timestamp of last garbage collection run, jiffies + * @gc_head: list of nft_trans_gc to queue up for mem reclaim */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; unsigned long last_gc; + struct list_head gc_head; }; struct nft_pipapo_elem; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index ca594161b840..fe8bd497d74a 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -10,21 +10,41 @@ #include <linux/module.h> #include <linux/list.h> #include <linux/rbtree.h> +#include <linux/bsearch.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +struct nft_array_interval { + struct nft_set_ext *from; + struct nft_set_ext *to; +}; + +struct nft_array { + u32 max_intervals; + u32 num_intervals; + struct nft_array_interval *intervals; + struct rcu_head rcu_head; +}; + struct nft_rbtree { struct rb_root root; rwlock_t lock; - seqcount_rwlock_t count; + struct nft_array __rcu *array; + struct nft_array *array_next; + unsigned long start_rbe_cookie; unsigned long last_gc; + struct list_head expired; + u64 last_tstamp; }; struct nft_rbtree_elem { struct nft_elem_priv priv; - struct rb_node node; + union { + struct rb_node node; + struct list_head list; + }; struct nft_set_ext ext; }; @@ -39,6 +59,13 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } +static bool nft_rbtree_interval_null(const struct nft_set *set, + const struct nft_rbtree_elem *rbe) +{ + return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) && + nft_rbtree_interval_end(rbe)); +} + static int nft_rbtree_cmp(const struct nft_set *set, const struct nft_rbtree_elem *e1, const struct nft_rbtree_elem *e2) @@ -47,67 +74,33 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } -static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) -{ - return nft_set_elem_expired(&rbe->ext); -} +struct nft_array_lookup_ctx { + const u32 *key; + u32 klen; +}; -static const struct nft_set_ext * -__nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, unsigned int seq) +static int nft_array_lookup_cmp(const void *pkey, const void *entry) { - struct nft_rbtree *priv = nft_set_priv(set); - const struct nft_rbtree_elem *rbe, *interval = NULL; - u8 genmask = nft_genmask_cur(net); - const struct rb_node *parent; - int d; + const struct nft_array_interval *interval = entry; + const struct nft_array_lookup_ctx *ctx = pkey; + int a, b; - parent = rcu_dereference_raw(priv->root.rb_node); - while (parent != NULL) { - if (read_seqcount_retry(&priv->count, seq)) - return NULL; + if (!interval->from) + return 1; - rbe = rb_entry(parent, struct nft_rbtree_elem, node); - - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); - if (d < 0) { - parent = rcu_dereference_raw(parent->rb_left); - if (interval && - !nft_rbtree_cmp(set, rbe, interval) && - nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(interval)) - continue; - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_rbtree_elem_expired(rbe)) - interval = rbe; - } else if (d > 0) - parent = rcu_dereference_raw(parent->rb_right); - else { - if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = rcu_dereference_raw(parent->rb_left); - continue; - } - - if (nft_rbtree_elem_expired(rbe)) - return NULL; - - if (nft_rbtree_interval_end(rbe)) { - if (nft_set_is_anonymous(set)) - return NULL; - parent = rcu_dereference_raw(parent->rb_left); - interval = NULL; - continue; - } + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); + if (!interval->to) + b = -1; + else + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); - return &rbe->ext; - } - } + if (a >= 0 && b < 0) + return 0; - if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_rbtree_interval_start(interval)) - return &interval->ext; + if (a < 0) + return -1; - return NULL; + return 1; } INDIRECT_CALLABLE_SCOPE @@ -116,83 +109,57 @@ nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_rbtree *priv = nft_set_priv(set); - unsigned int seq = read_seqcount_begin(&priv->count); - const struct nft_set_ext *ext; - - ext = __nft_rbtree_lookup(net, set, key, seq); - if (ext || !read_seqcount_retry(&priv->count, seq)) - return ext; - - read_lock_bh(&priv->lock); - seq = read_seqcount_begin(&priv->count); - ext = __nft_rbtree_lookup(net, set, key, seq); - read_unlock_bh(&priv->lock); - - return ext; + struct nft_array *array = rcu_dereference(priv->array); + const struct nft_array_interval *interval; + struct nft_array_lookup_ctx ctx = { + .key = key, + .klen = set->klen, + }; + + if (!array) + return NULL; + + interval = bsearch(&ctx, array->intervals, array->num_intervals, + sizeof(struct nft_array_interval), + nft_array_lookup_cmp); + if (!interval || nft_set_elem_expired(interval->from)) + return NULL; + + return interval->from; } -static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, - const u32 *key, struct nft_rbtree_elem **elem, - unsigned int seq, unsigned int flags, u8 genmask) -{ - struct nft_rbtree_elem *rbe, *interval = NULL; - struct nft_rbtree *priv = nft_set_priv(set); - const struct rb_node *parent; - const void *this; - int d; - - parent = rcu_dereference_raw(priv->root.rb_node); - while (parent != NULL) { - if (read_seqcount_retry(&priv->count, seq)) - return false; - - rbe = rb_entry(parent, struct nft_rbtree_elem, node); - - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); - if (d < 0) { - parent = rcu_dereference_raw(parent->rb_left); - if (!(flags & NFT_SET_ELEM_INTERVAL_END)) - interval = rbe; - } else if (d > 0) { - parent = rcu_dereference_raw(parent->rb_right); - if (flags & NFT_SET_ELEM_INTERVAL_END) - interval = rbe; - } else { - if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = rcu_dereference_raw(parent->rb_left); - continue; - } +struct nft_array_get_ctx { + const u32 *key; + unsigned int flags; + u32 klen; +}; - if (nft_set_elem_expired(&rbe->ext)) - return false; +static int nft_array_get_cmp(const void *pkey, const void *entry) +{ + const struct nft_array_interval *interval = entry; + const struct nft_array_get_ctx *ctx = pkey; + int a, b; - if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || - (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == - (flags & NFT_SET_ELEM_INTERVAL_END)) { - *elem = rbe; - return true; - } + if (!interval->from) + return 1; - if (nft_rbtree_interval_end(rbe)) - interval = NULL; + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); + if (!interval->to) + b = -1; + else + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); - parent = rcu_dereference_raw(parent->rb_left); - } + if (a >= 0) { + if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0) + return 0; + else if (b < 0) + return 0; } - if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && - ((!nft_rbtree_interval_end(interval) && - !(flags & NFT_SET_ELEM_INTERVAL_END)) || - (nft_rbtree_interval_end(interval) && - (flags & NFT_SET_ELEM_INTERVAL_END)))) { - *elem = interval; - return true; - } + if (a < 0) + return -1; - return false; + return 1; } static struct nft_elem_priv * @@ -200,34 +167,41 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); - unsigned int seq = read_seqcount_begin(&priv->count); - struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); - const u32 *key = (const u32 *)&elem->key.val; - u8 genmask = nft_genmask_cur(net); - bool ret; - - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - if (ret || !read_seqcount_retry(&priv->count, seq)) - return &rbe->priv; - - read_lock_bh(&priv->lock); - seq = read_seqcount_begin(&priv->count); - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - read_unlock_bh(&priv->lock); - - if (!ret) + struct nft_array *array = rcu_dereference(priv->array); + const struct nft_array_interval *interval; + struct nft_array_get_ctx ctx = { + .key = (const u32 *)&elem->key.val, + .flags = flags, + .klen = set->klen, + }; + struct nft_rbtree_elem *rbe; + + if (!array) return ERR_PTR(-ENOENT); + interval = bsearch(&ctx, array->intervals, array->num_intervals, + sizeof(struct nft_array_interval), nft_array_get_cmp); + if (!interval || nft_set_elem_expired(interval->from)) + return ERR_PTR(-ENOENT); + + if (flags & NFT_SET_ELEM_INTERVAL_END) + rbe = container_of(interval->to, struct nft_rbtree_elem, ext); + else + rbe = container_of(interval->from, struct nft_rbtree_elem, ext); + return &rbe->priv; } -static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) +static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) { lockdep_assert_held_write(&priv->lock); nft_setelem_data_deactivate(net, set, &rbe->priv); rb_erase(&rbe->node, &priv->root); + + /* collected later on in commit callback */ + list_add(&rbe->list, &priv->expired); } static const struct nft_rbtree_elem * @@ -238,11 +212,6 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct rb_node *prev = rb_prev(&rbe->node); struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_trans_gc *gc; - - gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); - if (!gc) - return ERR_PTR(-ENOMEM); /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they @@ -260,28 +229,10 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); - nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); - - /* There is always room in this trans gc for this element, - * memory allocation never actually happens, hence, the warning - * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, - * this is synchronous gc which never fails. - */ - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - if (WARN_ON_ONCE(!gc)) - return ERR_PTR(-ENOMEM); - - nft_trans_gc_elem_add(gc, rbe_prev); + nft_rbtree_gc_elem_move(net, set, priv, rbe_prev); } - nft_rbtree_gc_elem_remove(net, set, priv, rbe); - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - if (WARN_ON_ONCE(!gc)) - return ERR_PTR(-ENOMEM); - - nft_trans_gc_elem_add(gc, rbe); - - nft_trans_gc_queue_sync_done(gc); + nft_rbtree_gc_elem_move(net, set, priv, rbe); return rbe_prev; } @@ -302,16 +253,97 @@ static bool nft_rbtree_update_first(const struct nft_set *set, return false; } +/* Only for anonymous sets which do not allow updates, all element are active. */ +static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe) +{ + struct rb_node *node; + + node = rb_prev(&rbe->node); + if (!node) + return NULL; + + return rb_entry(node, struct nft_rbtree_elem, node); +} + +static struct nft_rbtree_elem * +__nft_rbtree_next_active(struct rb_node *node, u8 genmask) +{ + struct nft_rbtree_elem *next_rbe; + + while (node) { + next_rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_set_elem_active(&next_rbe->ext, genmask)) { + node = rb_next(node); + continue; + } + + return next_rbe; + } + + return NULL; +} + +static struct nft_rbtree_elem * +nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask) +{ + return __nft_rbtree_next_active(rb_next(&rbe->node), genmask); +} + +static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv, + u64 tstamp) +{ + if (priv->last_tstamp != tstamp) { + priv->start_rbe_cookie = 0; + priv->last_tstamp = tstamp; + } +} + +static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv, + const struct nft_rbtree_elem *rbe) +{ + priv->start_rbe_cookie = (unsigned long)rbe; +} + +static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv, + const struct nft_rbtree_elem *rbe) +{ + return priv->start_rbe_cookie == (unsigned long)rbe; +} + +static bool nft_rbtree_insert_same_interval(const struct net *net, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + u8 genmask = nft_genmask_next(net); + struct nft_rbtree_elem *next_rbe; + + if (!priv->start_rbe_cookie) + return true; + + next_rbe = nft_rbtree_next_active(rbe, genmask); + if (next_rbe) { + /* Closest start element differs from last element added. */ + if (nft_rbtree_interval_start(next_rbe) && + nft_rbtree_cmp_start_cookie(priv, next_rbe)) { + priv->start_rbe_cookie = 0; + return true; + } + } + + priv->start_rbe_cookie = 0; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, - struct nft_elem_priv **elem_priv) + struct nft_elem_priv **elem_priv, u64 tstamp) { - struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - u64 tstamp = nft_net_tstamp(net); int d; /* Descend the tree to search for an existing element greater than the @@ -417,12 +449,18 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } } + if (nft_rbtree_interval_null(set, new)) + priv->start_rbe_cookie = 0; + else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) + priv->start_rbe_cookie = 0; + /* - new start element matching existing start element: full overlap * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. */ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { *elem_priv = &rbe_ge->priv; + nft_rbtree_set_start_cookie(priv, rbe_ge); return -EEXIST; } @@ -431,18 +469,37 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + /* - ignore null interval, otherwise NLM_F_CREATE bogusly + * reports EEXIST. + */ + if (nft_rbtree_interval_null(set, new)) + return -ECANCELED; + *elem_priv = &rbe_le->priv; + + /* - start and end element belong to the same interval. */ + if (!nft_rbtree_insert_same_interval(net, priv, rbe_le)) + return -ENOTEMPTY; + return -EEXIST; } /* - new start element with existing closest, less or equal key value * being a start element: partial overlap, reported as -ENOTEMPTY. * Anonymous sets allow for two consecutive start element since they - * are constant, skip them to avoid bogus overlap reports. + * are constant, but validate that this new start element does not + * sit in between an existing start and end elements: partial overlap, + * reported as -ENOTEMPTY. */ - if (!nft_set_is_anonymous(set) && rbe_le && - nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) - return -ENOTEMPTY; + if (rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) { + if (!nft_set_is_anonymous(set)) + return -ENOTEMPTY; + + rbe_prev = nft_rbtree_prev_active(rbe_le); + if (rbe_prev && nft_rbtree_interval_end(rbe_prev)) + return -ENOTEMPTY; + } /* - new end element with existing closest, less or equal key value * being a end element: partial overlap, reported as -ENOTEMPTY. @@ -481,14 +538,101 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, return 0; } +static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals) +{ + struct nft_array_interval *intervals; + + intervals = kvzalloc_objs(struct nft_array_interval, max_intervals, + GFP_KERNEL_ACCOUNT); + if (!intervals) + return -ENOMEM; + + if (array->intervals) + kvfree(array->intervals); + + array->intervals = intervals; + array->max_intervals = max_intervals; + + return 0; +} + +static struct nft_array *nft_array_alloc(u32 max_intervals) +{ + struct nft_array *array; + + array = kzalloc_obj(*array, GFP_KERNEL_ACCOUNT); + if (!array) + return NULL; + + if (nft_array_intervals_alloc(array, max_intervals) < 0) { + kfree(array); + return NULL; + } + + return array; +} + +#define NFT_ARRAY_EXTRA_SIZE 10240 + +/* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider + * packed representation coming from userspace for anonymous sets too. + */ +static u32 nft_array_elems(const struct nft_set *set) +{ + u32 nelems = atomic_read(&set->nelems); + + /* Adjacent intervals are represented with a single start element in + * anonymous sets, use the current element counter as is. + */ + if (nft_set_is_anonymous(set)) + return nelems; + + /* Add extra room for never matching interval at the beginning and open + * interval at the end which only use a single element to represent it. + * The conversion to array will compact intervals, this allows reduce + * memory consumption. + */ + return (nelems / 2) + 2; +} + +static int nft_array_may_resize(const struct nft_set *set) +{ + u32 nelems = nft_array_elems(set), new_max_intervals; + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_array *array; + + if (!priv->array_next) { + array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE); + if (!array) + return -ENOMEM; + + priv->array_next = array; + } + + if (nelems < priv->array_next->max_intervals) + return 0; + + new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE; + if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) + return -ENOMEM; + + return 0; +} + static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); + u64 tstamp = nft_net_tstamp(net); int err; + nft_rbtree_maybe_reset_start_cookie(priv, tstamp); + + if (nft_array_may_resize(set) < 0) + return -ENOMEM; + do { if (fatal_signal_pending(current)) return -EINTR; @@ -496,9 +640,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, cond_resched(); write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, elem_priv); - write_seqcount_end(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp); write_unlock_bh(&priv->lock); } while (err == -EAGAIN); @@ -508,9 +650,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); rb_erase(&rbe->node, &priv->root); - write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } @@ -533,6 +673,48 @@ static void nft_rbtree_activate(const struct net *net, nft_clear(net, &rbe->ext); } +static struct nft_rbtree_elem * +nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask) +{ + struct nft_rbtree_elem *next_rbe; + struct rb_node *node; + + node = rb_next(&rbe->node); + if (node) { + next_rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_start(next_rbe) && + !nft_set_elem_active(&next_rbe->ext, genmask)) + return next_rbe; + } + + return NULL; +} + +static bool nft_rbtree_deactivate_same_interval(const struct net *net, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + u8 genmask = nft_genmask_next(net); + struct nft_rbtree_elem *next_rbe; + + if (!priv->start_rbe_cookie) + return true; + + next_rbe = nft_rbtree_next_inactive(rbe, genmask); + if (next_rbe) { + /* Closest start element differs from last element added. */ + if (nft_rbtree_interval_start(next_rbe) && + nft_rbtree_cmp_start_cookie(priv, next_rbe)) { + priv->start_rbe_cookie = 0; + return true; + } + } + + priv->start_rbe_cookie = 0; + + return false; +} + static void nft_rbtree_flush(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -547,12 +729,21 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); - const struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; u8 genmask = nft_genmask_next(net); u64 tstamp = nft_net_tstamp(net); int d; + nft_rbtree_maybe_reset_start_cookie(priv, tstamp); + + if (nft_rbtree_interval_start(this) || + nft_rbtree_interval_null(set, this)) + priv->start_rbe_cookie = 0; + + if (nft_array_may_resize(set) < 0) + return NULL; + while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -577,6 +768,12 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, parent = parent->rb_left; continue; } + + if (nft_rbtree_interval_start(rbe)) + nft_rbtree_set_start_cookie(priv, rbe); + else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe)) + return NULL; + nft_rbtree_flush(net, set, &rbe->priv); return &rbe->priv; } @@ -613,8 +810,15 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_rbtree *priv = nft_set_priv(set); switch (iter->type) { + case NFT_ITER_UPDATE_CLONE: + if (nft_array_may_resize(set) < 0) { + iter->err = -ENOMEM; + break; + } + fallthrough; case NFT_ITER_UPDATE: lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); break; case NFT_ITER_READ: @@ -629,29 +833,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, } } -static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) -{ - nft_setelem_data_deactivate(net, set, &rbe->priv); - nft_rbtree_erase(priv, rbe); -} - -static void nft_rbtree_gc(struct nft_set *set) +static void nft_rbtree_gc_scan(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; struct net *net = read_pnet(&set->net); u64 tstamp = nft_net_tstamp(net); struct rb_node *node, *next; - struct nft_trans_gc *gc; - - set = nft_set_container_of(priv); - net = read_pnet(&set->net); - - gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); - if (!gc) - return; for (node = rb_first(&priv->root); node ; node = next) { next = rb_next(node); @@ -669,34 +857,46 @@ static void nft_rbtree_gc(struct nft_set *set) if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); - if (!gc) - goto try_later; - /* end element needs to be removed first, it has * no timeout extension. */ + write_lock_bh(&priv->lock); if (rbe_end) { - nft_rbtree_gc_remove(net, set, priv, rbe_end); - nft_trans_gc_elem_add(gc, rbe_end); + nft_rbtree_gc_elem_move(net, set, priv, rbe_end); rbe_end = NULL; } - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); - if (!gc) - goto try_later; - - nft_rbtree_gc_remove(net, set, priv, rbe); - nft_trans_gc_elem_add(gc, rbe); + nft_rbtree_gc_elem_move(net, set, priv, rbe); + write_unlock_bh(&priv->lock); } -try_later: + priv->last_gc = jiffies; +} + +static void nft_rbtree_gc_queue(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe, *rbe_end; + struct nft_trans_gc *gc; + + if (list_empty(&priv->expired)) + return; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; + + list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) { + list_del(&rbe->list); + nft_trans_gc_elem_add(gc, rbe); - if (gc) { - gc = nft_trans_gc_catchall_sync(gc); - nft_trans_gc_queue_sync_done(gc); - priv->last_gc = jiffies; + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return; } + + gc = nft_trans_gc_catchall_sync(gc); + nft_trans_gc_queue_sync_done(gc); } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], @@ -714,24 +914,45 @@ static int nft_rbtree_init(const struct nft_set *set, BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); rwlock_init(&priv->lock); - seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; + INIT_LIST_HEAD(&priv->expired); + + priv->array = NULL; + priv->array_next = NULL; return 0; } +static void __nft_array_free(struct nft_array *array) +{ + kvfree(array->intervals); + kfree(array); +} + static void nft_rbtree_destroy(const struct nft_ctx *ctx, const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe, *next; + struct nft_array *array; struct rb_node *node; + list_for_each_entry_safe(rbe, next, &priv->expired, list) { + list_del(&rbe->list); + nf_tables_set_elem_destroy(ctx, set, &rbe->priv); + } + while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } + + array = rcu_dereference_protected(priv->array, true); + if (array) + __nft_array_free(array); + if (priv->array_next) + __nft_array_free(priv->array_next); } static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, @@ -752,12 +973,105 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static void nft_array_free_rcu(struct rcu_head *rcu_head) +{ + struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head); + + __nft_array_free(array); +} + static void nft_rbtree_commit(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe, *prev_rbe; + struct nft_array *old; + u32 num_intervals = 0; + struct rb_node *node; + + /* No changes, skip, eg. elements updates only. */ + if (!priv->array_next) + return; + /* GC can be performed if the binary search blob is going + * to be rebuilt. It has to be done in two phases: first + * scan tree and move all expired elements to the expired + * list. + * + * Then, after blob has been re-built and published to other + * CPUs, queue collected entries for freeing. + */ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - nft_rbtree_gc(set); + nft_rbtree_gc_scan(set); + + /* Reverse walk to create an array from smaller to largest interval. */ + node = rb_last(&priv->root); + if (node) + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); + else + prev_rbe = NULL; + + while (prev_rbe) { + rbe = prev_rbe; + + if (nft_rbtree_interval_start(rbe)) + priv->array_next->intervals[num_intervals].from = &rbe->ext; + else if (nft_rbtree_interval_end(rbe)) + priv->array_next->intervals[num_intervals++].to = &rbe->ext; + + if (num_intervals >= priv->array_next->max_intervals) { + pr_warn_once("malformed interval set from userspace?"); + goto err_out; + } + + node = rb_prev(node); + if (!node) + break; + + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); + + /* For anonymous sets, when adjacent ranges are found, + * the end element is not added to the set to pack the set + * representation. Use next start element to complete this + * interval. + */ + if (nft_rbtree_interval_start(rbe) && + nft_rbtree_interval_start(prev_rbe) && + priv->array_next->intervals[num_intervals].from) + priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext; + + if (num_intervals >= priv->array_next->max_intervals) { + pr_warn_once("malformed interval set from userspace?"); + goto err_out; + } + } + + if (priv->array_next->intervals[num_intervals].from) + num_intervals++; +err_out: + priv->array_next->num_intervals = num_intervals; + old = rcu_replace_pointer(priv->array, priv->array_next, + lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex)); + priv->array_next = NULL; + if (old) + call_rcu(&old->rcu_head, nft_array_free_rcu); + + /* New blob is public, queue collected entries for freeing. + * call_rcu ensures elements stay around until readers are done. + */ + nft_rbtree_gc_queue(set); +} + +static void nft_rbtree_abort(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_array *array_next; + + if (!priv->array_next) + return; + + array_next = priv->array_next; + priv->array_next = NULL; + __nft_array_free(array_next); } static void nft_rbtree_gc_init(const struct nft_set *set) @@ -821,6 +1135,7 @@ const struct nft_set_type nft_set_rbtree_type = { .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, .commit = nft_rbtree_commit, + .abort = nft_rbtree_abort, .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 5d3e51825985..b71ef18b0e8c 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -7,6 +7,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_synproxy.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter/nf_synproxy.h> @@ -48,7 +49,7 @@ static void nft_synproxy_eval_v4(const struct nft_synproxy *priv, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nf_synproxy_info info = priv->info; + struct nf_synproxy_info info = READ_ONCE(priv->info); struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *skb = pkt->skb; @@ -79,7 +80,7 @@ static void nft_synproxy_eval_v6(const struct nft_synproxy *priv, struct tcphdr *_tcph, struct synproxy_options *opts) { - struct nf_synproxy_info info = priv->info; + struct nf_synproxy_info info = READ_ONCE(priv->info); struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *skb = pkt->skb; @@ -340,7 +341,7 @@ static void nft_synproxy_obj_update(struct nft_object *obj, struct nft_synproxy *newpriv = nft_obj_data(newobj); struct nft_synproxy *priv = nft_obj_data(obj); - priv->info = newpriv->info; + WRITE_ONCE(priv->info, newpriv->info); } static struct nft_object_type nft_synproxy_obj_type; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 90b7630421c4..e594b3b7ad82 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1742,7 +1742,7 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) if (!num_hooks) return ERR_PTR(-EINVAL); - ops = kcalloc(num_hooks, sizeof(*ops), GFP_KERNEL); + ops = kzalloc_objs(*ops, num_hooks); if (ops == NULL) return ERR_PTR(-ENOMEM); @@ -1764,7 +1764,7 @@ EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); int xt_register_template(const struct xt_table *table, int (*table_init)(struct net *net)) { - int ret = -EEXIST, af = table->af; + int ret = -EBUSY, af = table->af; struct xt_template *t; mutex_lock(&xt[af].mutex); @@ -1775,7 +1775,7 @@ int xt_register_template(const struct xt_table *table, } ret = -ENOMEM; - t = kzalloc(sizeof(*t), GFP_KERNEL); + t = kzalloc_obj(*t); if (!t) goto out_unlock; @@ -1993,7 +1993,7 @@ static int __init xt_init(void) } } - xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); + xt = kzalloc_objs(struct xt_af, NFPROTO_NUMPROTO); if (!xt) return -ENOMEM; diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index b6a015aee0ce..4c18606b8654 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -28,46 +28,10 @@ MODULE_ALIAS("ip6t_AUDIT"); MODULE_ALIAS("ebt_AUDIT"); MODULE_ALIAS("arpt_AUDIT"); -static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) -{ - struct iphdr _iph; - const struct iphdr *ih; - - ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); - if (!ih) - return false; - - audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", - &ih->saddr, &ih->daddr, ih->protocol); - - return true; -} - -static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) -{ - struct ipv6hdr _ip6h; - const struct ipv6hdr *ih; - u8 nexthdr; - __be16 frag_off; - - ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); - if (!ih) - return false; - - nexthdr = ih->nexthdr; - ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); - - audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", - &ih->saddr, &ih->daddr, nexthdr); - - return true; -} - static unsigned int audit_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct audit_buffer *ab; - int fam = -1; if (audit_enabled == AUDIT_OFF) goto errout; @@ -77,27 +41,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par) audit_log_format(ab, "mark=%#x", skb->mark); - switch (xt_family(par)) { - case NFPROTO_BRIDGE: - switch (eth_hdr(skb)->h_proto) { - case htons(ETH_P_IP): - fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; - break; - case htons(ETH_P_IPV6): - fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; - break; - } - break; - case NFPROTO_IPV4: - fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; - break; - case NFPROTO_IPV6: - fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; - break; - } - - if (fam == -1) - audit_log_format(ab, " saddr=? daddr=? proto=-1"); + audit_log_nf_skb(ab, skb, xt_family(par)); audit_log_end(ab); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 3ba94c34297c..498f5871c84a 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -16,6 +16,7 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> +#include "nf_internals.h" static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) { @@ -283,6 +284,9 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, struct nf_conn_help *help; if (ct) { + if (info->helper[0] || info->timeout[0]) + nf_queue_nf_hook_drop(par->net); + help = nfct_help(ct); xt_ct_put_helper(help); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index d73957592c9d..517106165ad2 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -135,7 +135,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; - info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); + info->timer = kzalloc_obj(*info->timer); if (!info->timer) { ret = -ENOMEM; goto out; @@ -184,7 +184,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; - info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + info->timer = kmalloc_obj(*info->timer); if (!info->timer) { ret = -ENOMEM; goto out; @@ -318,6 +318,12 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); + mutex_unlock(&list_mutex); + return -EINVAL; + } + info->timer->refcnt++; mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 90dcf088071a..caaaf4d2c584 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -111,7 +111,7 @@ static int led_tg_check(const struct xt_tgchk_param *par) } err = -ENOMEM; - ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL); + ledinternal = kzalloc_obj(struct xt_led_info_internal); if (!ledinternal) goto exit_mutex_only; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 4f49cfc27831..91270d467ffd 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -139,7 +139,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } ret = -ENOMEM; - est = kzalloc(sizeof(*est), GFP_KERNEL); + est = kzalloc_obj(*est); if (!est) goto err1; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index a5ebd5640457..5d34ceb893ed 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -106,7 +106,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par) if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc_obj(*priv); if (priv == NULL) return -ENOMEM; diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 0189f8b6b0bd..848287ab79cf 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; - struct nf_conntrack_tuple tuple; - const struct nf_conntrack_tuple *tuple_ptr = &tuple; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; @@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) u32 key[5]; ct = nf_ct_get(skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if (ct) zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - xt_family(par), net, &tuple)) { - goto hotdrop; - } if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -69,10 +62,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) key[1] = zone->id; } - connections = nf_conncount_count(net, info->data, key, tuple_ptr, - zone); + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); if (connections == 0) - /* kmalloc failed, drop it entirely */ + /* kmalloc failed or tuple couldn't be found, drop it entirely */ goto hotdrop; return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index e5a13ecbe67a..037ab93e25d0 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -62,10 +62,10 @@ dccp_find_option(u_int8_t option, return true; } - if (op[i] < 2) + if (op[i] < 2 || i == optlen - 1) i++; else - i += op[i+1]?:1; + i += op[i + 1] ? : 1; } spin_unlock_bh(&dccp_buflock); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 3b507694e81e..3bd127bfc114 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -293,7 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, if (size < 16) size = 16; } - hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL); + hinfo = kvmalloc_flex(*hinfo, hash, size); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 8b4fd27857f2..87d74da14c0b 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -115,7 +115,7 @@ static int limit_mt_check(const struct xt_mtchk_param *par) return -ERANGE; } - priv = kmalloc(sizeof(*priv), GFP_KERNEL); + priv = kmalloc_obj(*priv); if (priv == NULL) return -ENOMEM; diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 4452cc93b990..b05c5c8dac78 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -50,7 +50,7 @@ static int quota_mt_check(const struct xt_mtchk_param *par) if (q->flags & ~XT_QUOTA_MASK) return -EINVAL; - q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); + q->master = kmalloc_obj(*q->master); if (q->master == NULL) return -ENOMEM; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 588a5e6ad899..f72752fa4374 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -188,7 +188,7 @@ recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, } nstamps_max += 1; - e = kmalloc(struct_size(e, stamps, nstamps_max), GFP_ATOMIC); + e = kmalloc_flex(*e, stamps, nstamps_max, GFP_ATOMIC); if (e == NULL) return NULL; memcpy(&e->addr, addr, sizeof(e->addr)); @@ -391,7 +391,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, goto out; } - t = kvzalloc(struct_size(t, iphash, ip_list_hash_size), GFP_KERNEL); + t = kvzalloc_flex(*t, iphash, ip_list_hash_size); if (t == NULL) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index b26c1dcfc27b..334e09771abf 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -58,7 +58,7 @@ static int statistic_mt_check(const struct xt_mtchk_param *par) info->flags & ~XT_STATISTIC_MASK) return -EINVAL; - info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); + info->master = kzalloc_obj(*info->master); if (info->master == NULL) return -ENOMEM; atomic_set(&info->master->count, info->u.nth.count); diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 37704ab01799..0d32d4841cb3 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -61,7 +61,7 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) return (mssval >= info->mss_min && mssval <= info->mss_max) ^ info->invert; } - if (op[i] < 2) + if (op[i] < 2 || i == optlen - 1) i++; else i += op[i+1] ? : 1; diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index e8991130a3de..f76cf18f1a24 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -59,8 +59,10 @@ tcp_find_option(u_int8_t option, for (i = 0; i < optlen; ) { if (op[i] == option) return !invert; - if (op[i] < 2) i++; - else i += op[i+1]?:1; + if (op[i] < 2 || i == optlen - 1) + i++; + else + i += op[i + 1] ? : 1; } return invert; diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 6aa12d0f54e2..d9d74011bb64 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -14,6 +14,7 @@ #include <linux/ktime.h> #include <linux/module.h> +#include <linux/rtc.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/netfilter/x_tables.h> @@ -64,11 +65,6 @@ static const u_int16_t days_since_epoch[] = { 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0, }; -static inline bool is_leap(unsigned int y) -{ - return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); -} - /* * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp. * Since we match against days and daytime, the SSTE value needs to be @@ -138,7 +134,7 @@ static void localtime_3(struct xtm *r, time64_t time) * (A different approach to use would be to subtract a monthlength * from w repeatedly while counting.) */ - if (is_leap(year)) { + if (is_leap_year(year)) { /* use days_since_leapyear[] in a leap year */ for (i = ARRAY_SIZE(days_since_leapyear) - 1; i > 0 && days_since_leapyear[i] > w; --i) @@ -227,13 +223,13 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) localtime_2(¤t_time, stamp); - if (!(info->weekdays_match & (1 << current_time.weekday))) + if (!(info->weekdays_match & (1U << current_time.weekday))) return false; /* Do not spend time computing monthday if all days match anyway */ if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) { localtime_3(¤t_time, stamp); - if (!(info->monthdays_match & (1 << current_time.monthday))) + if (!(info->monthdays_match & (1U << current_time.monthday))) return false; } diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index a07c2216d28b..e1efd888b4a2 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -95,7 +95,7 @@ static int netlbl_calipso_add_pass(struct genl_info *info, int ret_val; struct calipso_doi *doi_def = NULL; - doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + doi_def = kmalloc_obj(*doi_def); if (!doi_def) return -ENOMEM; doi_def->type = CALIPSO_MAP_PASS; diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index fa08ee75ac06..b080e666523f 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -139,10 +139,10 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, NULL) != 0) return -EINVAL; - doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + doi_def = kmalloc_obj(*doi_def); if (doi_def == NULL) return -ENOMEM; - doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); + doi_def->map.std = kzalloc_obj(*doi_def->map.std); if (doi_def->map.std == NULL) { kfree(doi_def); return -ENOMEM; @@ -332,7 +332,7 @@ static int netlbl_cipsov4_add_pass(struct genl_info *info, if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) return -EINVAL; - doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + doi_def = kmalloc_obj(*doi_def); if (doi_def == NULL) return -ENOMEM; doi_def->type = CIPSO_V4_MAP_PASS; @@ -371,7 +371,7 @@ static int netlbl_cipsov4_add_local(struct genl_info *info, if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) return -EINVAL; - doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + doi_def = kmalloc_obj(*doi_def); if (doi_def == NULL) return -ENOMEM; doi_def->type = CIPSO_V4_MAP_LOCAL; diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index 8158a25972b4..3fe31648ba2d 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -367,13 +367,11 @@ int __init netlbl_domhsh_init(u32 size) if (size == 0) return -EINVAL; - hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); + hsh_tbl = kmalloc_obj(*hsh_tbl); if (hsh_tbl == NULL) return -ENOMEM; hsh_tbl->size = 1 << size; - hsh_tbl->tbl = kcalloc(hsh_tbl->size, - sizeof(struct list_head), - GFP_KERNEL); + hsh_tbl->tbl = kzalloc_objs(struct list_head, hsh_tbl->size); if (hsh_tbl->tbl == NULL) { kfree(hsh_tbl); return -ENOMEM; @@ -453,7 +451,7 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, ret_val = -EINVAL; goto add_return; } - entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC); + entry_b = kzalloc_obj(*entry_b, GFP_ATOMIC); if (entry_b == NULL) { ret_val = -ENOMEM; goto add_return; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 33b77084a4e5..3583fa63dd01 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -104,7 +104,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain, struct netlbl_domaddr4_map *map4 = NULL; struct netlbl_domaddr6_map *map6 = NULL; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (entry == NULL) return -ENOMEM; if (domain != NULL) { @@ -117,7 +117,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain, if (addr == NULL && mask == NULL) entry->def.type = NETLBL_NLTYPE_UNLABELED; else if (addr != NULL && mask != NULL) { - addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + addrmap = kzalloc_obj(*addrmap, GFP_ATOMIC); if (addrmap == NULL) goto cfg_unlbl_map_add_failure; INIT_LIST_HEAD(&addrmap->list4); @@ -127,7 +127,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain, case AF_INET: { const struct in_addr *addr4 = addr; const struct in_addr *mask4 = mask; - map4 = kzalloc(sizeof(*map4), GFP_ATOMIC); + map4 = kzalloc_obj(*map4, GFP_ATOMIC); if (map4 == NULL) goto cfg_unlbl_map_add_failure; map4->def.type = NETLBL_NLTYPE_UNLABELED; @@ -144,7 +144,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain, case AF_INET6: { const struct in6_addr *addr6 = addr; const struct in6_addr *mask6 = mask; - map6 = kzalloc(sizeof(*map6), GFP_ATOMIC); + map6 = kzalloc_obj(*map6, GFP_ATOMIC); if (map6 == NULL) goto cfg_unlbl_map_add_failure; map6->def.type = NETLBL_NLTYPE_UNLABELED; @@ -336,7 +336,7 @@ int netlbl_cfg_cipsov4_map_add(u32 doi, if (doi_def == NULL) return -ENOENT; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (entry == NULL) goto out_entry; entry->family = AF_INET; @@ -350,13 +350,13 @@ int netlbl_cfg_cipsov4_map_add(u32 doi, entry->def.cipso = doi_def; entry->def.type = NETLBL_NLTYPE_CIPSOV4; } else if (addr != NULL && mask != NULL) { - addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + addrmap = kzalloc_obj(*addrmap, GFP_ATOMIC); if (addrmap == NULL) goto out_addrmap; INIT_LIST_HEAD(&addrmap->list4); INIT_LIST_HEAD(&addrmap->list6); - addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); + addrinfo = kzalloc_obj(*addrinfo, GFP_ATOMIC); if (addrinfo == NULL) goto out_addrinfo; addrinfo->def.cipso = doi_def; @@ -462,7 +462,7 @@ int netlbl_cfg_calipso_map_add(u32 doi, if (doi_def == NULL) return -ENOENT; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (entry == NULL) goto out_entry; entry->family = AF_INET6; @@ -476,13 +476,13 @@ int netlbl_cfg_calipso_map_add(u32 doi, entry->def.calipso = doi_def; entry->def.type = NETLBL_NLTYPE_CALIPSO; } else if (addr != NULL && mask != NULL) { - addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + addrmap = kzalloc_obj(*addrmap, GFP_ATOMIC); if (addrmap == NULL) goto out_addrmap; INIT_LIST_HEAD(&addrmap->list4); INIT_LIST_HEAD(&addrmap->list6); - addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); + addrinfo = kzalloc_obj(*addrinfo, GFP_ATOMIC); if (addrinfo == NULL) goto out_addrinfo; addrinfo->def.calipso = doi_def; diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 079fe72a6384..3e89c3b3a1a4 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -84,7 +84,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, struct calipso_doi *calipso = NULL; #endif u32 tmp_val; - struct netlbl_dom_map *entry = kzalloc(sizeof(*entry), GFP_KERNEL); + struct netlbl_dom_map *entry = kzalloc_obj(*entry); if (!entry) return -ENOMEM; @@ -148,7 +148,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, struct in_addr *mask; struct netlbl_domaddr4_map *map; - addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); + addrmap = kzalloc_obj(*addrmap); if (addrmap == NULL) { ret_val = -ENOMEM; goto add_doi_put_def; @@ -169,7 +169,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, addr = nla_data(info->attrs[NLBL_MGMT_A_IPV4ADDR]); mask = nla_data(info->attrs[NLBL_MGMT_A_IPV4MASK]); - map = kzalloc(sizeof(*map), GFP_KERNEL); + map = kzalloc_obj(*map); if (map == NULL) { ret_val = -ENOMEM; goto add_free_addrmap; @@ -195,7 +195,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, struct in6_addr *mask; struct netlbl_domaddr6_map *map; - addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL); + addrmap = kzalloc_obj(*addrmap); if (addrmap == NULL) { ret_val = -ENOMEM; goto add_doi_put_def; @@ -216,7 +216,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, addr = nla_data(info->attrs[NLBL_MGMT_A_IPV6ADDR]); mask = nla_data(info->attrs[NLBL_MGMT_A_IPV6MASK]); - map = kzalloc(sizeof(*map), GFP_KERNEL); + map = kzalloc_obj(*map); if (map == NULL) { ret_val = -ENOMEM; goto add_free_addrmap; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index dfda9ea61971..ca7a9e2a3de7 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -236,7 +236,7 @@ static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, int ret_val; struct netlbl_unlhsh_addr4 *entry; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (entry == NULL) return -ENOMEM; @@ -276,7 +276,7 @@ static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, int ret_val; struct netlbl_unlhsh_addr6 *entry; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (entry == NULL) return -ENOMEM; @@ -314,7 +314,7 @@ static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) u32 bkt; struct netlbl_unlhsh_iface *iface; - iface = kzalloc(sizeof(*iface), GFP_ATOMIC); + iface = kzalloc_obj(*iface, GFP_ATOMIC); if (iface == NULL) return NULL; @@ -1413,13 +1413,11 @@ int __init netlbl_unlabel_init(u32 size) if (size == 0) return -EINVAL; - hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); + hsh_tbl = kmalloc_obj(*hsh_tbl); if (hsh_tbl == NULL) return -ENOMEM; hsh_tbl->size = 1 << size; - hsh_tbl->tbl = kcalloc(hsh_tbl->size, - sizeof(struct list_head), - GFP_KERNEL); + hsh_tbl->tbl = kzalloc_objs(struct list_head, hsh_tbl->size); if (hsh_tbl->tbl == NULL) { kfree(hsh_tbl); return -ENOMEM; @@ -1534,7 +1532,7 @@ int __init netlbl_unlabel_defconf(void) audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; - entry = kzalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc_obj(*entry); if (entry == NULL) return -ENOMEM; entry->family = AF_UNSPEC; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2b46c0cd752a..4d609d5cf406 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -596,10 +596,8 @@ static void netlink_remove(struct sock *sk) table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, - netlink_rhashtable_params)) { - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + netlink_rhashtable_params)) __sock_put(sk); - } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { @@ -968,7 +966,7 @@ static void netlink_undo_bind(int group, long unsigned int groups, nlk->netlink_unbind(sock_net(sk), undo + 1); } -static int netlink_bind(struct socket *sock, struct sockaddr *addr, +static int netlink_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sock *sk = sock->sk; @@ -1056,7 +1054,7 @@ unlock: return err; } -static int netlink_connect(struct socket *sock, struct sockaddr *addr, +static int netlink_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { int err = 0; @@ -2929,7 +2927,7 @@ static int __init netlink_proto_init(void) BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); - nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); + nl_table = kzalloc_objs(*nl_table, MAX_LINKS); if (!nl_table) goto panic; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index b8e58132e8af..1dfc340736b8 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -107,7 +107,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, num--; if (!hti) { - hti = kmalloc(sizeof(*hti), GFP_KERNEL); + hti = kmalloc_obj(*hti); if (!hti) return -ENOMEM; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 978c129c6095..a23d4c51c089 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -659,7 +659,7 @@ static int genl_sk_privs_alloc(struct genl_family *family) if (!family->sock_priv_size) return 0; - family->sock_privs = kzalloc(sizeof(*family->sock_privs), GFP_KERNEL); + family->sock_privs = kzalloc_obj(*family->sock_privs); if (!family->sock_privs) return -ENOMEM; xa_init(family->sock_privs); @@ -912,7 +912,7 @@ EXPORT_SYMBOL(genlmsg_put); static struct genl_dumpit_info *genl_dumpit_info_alloc(void) { - return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); + return kmalloc_obj(struct genl_dumpit_info); } static void genl_dumpit_info_free(const struct genl_dumpit_info *info) @@ -937,8 +937,7 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, if (!ops->maxattr) return NULL; - attrbuf = kmalloc_array(ops->maxattr + 1, - sizeof(struct nlattr *), GFP_KERNEL); + attrbuf = kmalloc_objs(struct nlattr *, ops->maxattr + 1); if (!attrbuf) return ERR_PTR(-ENOMEM); @@ -1591,7 +1590,7 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) return 0; } - ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL); + ctx->op_iter = kmalloc_obj(*ctx->op_iter); if (!ctx->op_iter) return -ENOMEM; diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 99458da6be32..f39cd7cc4fb5 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -102,8 +102,7 @@ static struct netlink_policy_dump_state *alloc_state(void) { struct netlink_policy_dump_state *state; - state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC), - GFP_KERNEL); + state = kzalloc_flex(*state, policies, INITIAL_POLICIES_ALLOC); if (!state) return ERR_PTR(-ENOMEM); state->n_alloc = INITIAL_POLICIES_ALLOC; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 3331669d8e33..b816c56124ab 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -561,7 +561,7 @@ static int nr_release(struct socket *sock) return 0; } -static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int nr_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); @@ -632,8 +632,8 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return 0; } -static int nr_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) +static int nr_connect(struct socket *sock, struct sockaddr_unsized *uaddr, + int addr_len, int flags) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); @@ -1401,7 +1401,7 @@ static int __init nr_proto_init(void) goto unregister_proto; } - dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL); + dev_nr = kzalloc_objs(struct net_device *, nr_ndevs); if (!dev_nr) { pr_err("NET/ROM: %s - unable to allocate device array\n", __func__); diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c index 5e531394a724..2b3cbceb0b52 100644 --- a/net/netrom/nr_out.c +++ b/net/netrom/nr_out.c @@ -43,8 +43,10 @@ void nr_output(struct sock *sk, struct sk_buff *skb) frontlen = skb_headroom(skb); while (skb->len > 0) { - if ((skbn = sock_alloc_send_skb(sk, frontlen + NR_MAX_PACKET_SIZE, 0, &err)) == NULL) + if ((skbn = sock_alloc_send_skb(sk, frontlen + NR_MAX_PACKET_SIZE, 0, &err)) == NULL) { + kfree_skb(skb); return; + } skb_reserve(skbn, frontlen); diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index b94cb2ffbaf8..9cc29ae85b06 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -752,7 +752,7 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) unsigned char *dptr; ax25_cb *ax25s; int ret; - struct sk_buff *skbn; + struct sk_buff *nskb, *oskb; /* * Reject malformed packets early. Check that it contains at least 2 @@ -811,14 +811,16 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) /* We are going to change the netrom headers so we should get our own skb, we also did not know until now how much header space we had to reserve... - RXQ */ - if ((skbn=skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC)) == NULL) { + nskb = skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC); + + if (!nskb) { nr_node_unlock(nr_node); nr_node_put(nr_node); dev_put(dev); return 0; } - kfree_skb(skb); - skb=skbn; + oskb = skb; + skb = nskb; skb->data[14]--; dptr = skb_push(skb, 1); @@ -837,6 +839,9 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) nr_node_unlock(nr_node); nr_node_put(nr_node); + if (ret) + kfree_skb(oskb); + return ret; } diff --git a/net/nfc/core.c b/net/nfc/core.c index ae1c842f9c64..a92a6566e6a0 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -879,7 +879,7 @@ int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type) if (se) return -EALREADY; - se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL); + se = kzalloc_obj(struct nfc_se); if (!se) return -ENOMEM; @@ -1062,7 +1062,7 @@ struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, if (!supported_protocols) return NULL; - dev = kzalloc(sizeof(struct nfc_dev), GFP_KERNEL); + dev = kzalloc_obj(struct nfc_dev); if (!dev) return NULL; @@ -1147,12 +1147,13 @@ int nfc_register_device(struct nfc_dev *dev) EXPORT_SYMBOL(nfc_register_device); /** - * nfc_unregister_device - unregister a nfc device in the nfc subsystem + * nfc_unregister_rfkill - unregister a nfc device in the rfkill subsystem * * @dev: The nfc device to unregister */ -void nfc_unregister_device(struct nfc_dev *dev) +void nfc_unregister_rfkill(struct nfc_dev *dev) { + struct rfkill *rfk = NULL; int rc; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); @@ -1164,13 +1165,26 @@ void nfc_unregister_device(struct nfc_dev *dev) device_lock(&dev->dev); if (dev->rfkill) { - rfkill_unregister(dev->rfkill); - rfkill_destroy(dev->rfkill); + rfk = dev->rfkill; dev->rfkill = NULL; } dev->shutting_down = true; device_unlock(&dev->dev); + if (rfk) { + rfkill_unregister(rfk); + rfkill_destroy(rfk); + } +} +EXPORT_SYMBOL(nfc_unregister_rfkill); + +/** + * nfc_remove_device - remove a nfc device in the nfc subsystem + * + * @dev: The nfc device to remove + */ +void nfc_remove_device(struct nfc_dev *dev) +{ if (dev->ops->check_presence) { timer_delete_sync(&dev->check_pres_timer); cancel_work_sync(&dev->check_pres_work); @@ -1183,6 +1197,18 @@ void nfc_unregister_device(struct nfc_dev *dev) device_del(&dev->dev); mutex_unlock(&nfc_devlist_mutex); } +EXPORT_SYMBOL(nfc_remove_device); + +/** + * nfc_unregister_device - unregister a nfc device in the nfc subsystem + * + * @dev: The nfc device to unregister + */ +void nfc_unregister_device(struct nfc_dev *dev) +{ + nfc_unregister_rfkill(dev); + nfc_remove_device(dev); +} EXPORT_SYMBOL(nfc_unregister_device); static int __init nfc_init(void) diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index dae378f1d52b..7cb1e6aaae90 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -231,7 +231,7 @@ int digital_send_cmd(struct nfc_digital_dev *ddev, u8 cmd_type, { struct digital_cmd *cmd; - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + cmd = kzalloc_obj(*cmd); if (!cmd) return -ENOMEM; @@ -279,7 +279,7 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) struct digital_tg_mdaa_params *params; int rc; - params = kzalloc(sizeof(*params), GFP_KERNEL); + params = kzalloc_obj(*params); if (!params) return -ENOMEM; @@ -706,9 +706,11 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, struct digital_data_exch *data_exch; int rc; - data_exch = kzalloc(sizeof(*data_exch), GFP_KERNEL); - if (!data_exch) + data_exch = kzalloc_obj(*data_exch); + if (!data_exch) { + kfree_skb(skb); return -ENOMEM; + } data_exch->cb = cb; data_exch->cb_context = cb_context; @@ -731,8 +733,10 @@ static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target, data_exch); exit: - if (rc) + if (rc) { + kfree_skb(skb); kfree(data_exch); + } return rc; } @@ -762,7 +766,7 @@ struct nfc_digital_dev *nfc_digital_allocate_device(const struct nfc_digital_ops !ops->switch_rf || (ops->tg_listen_md && !ops->tg_get_rf_tech)) return NULL; - ddev = kzalloc(sizeof(*ddev), GFP_KERNEL); + ddev = kzalloc_obj(*ddev); if (!ddev) return NULL; diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index 3adf4589852a..63f1b721c71d 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -490,7 +490,7 @@ static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg, goto exit; } - target = kzalloc(sizeof(struct nfc_target), GFP_KERNEL); + target = kzalloc_obj(struct nfc_target); if (!target) { rc = -ENOMEM; goto exit; @@ -688,7 +688,7 @@ static void digital_in_recv_sensb_res(struct nfc_digital_dev *ddev, void *arg, else ddev->target_fsc = digital_ats_fsc[fsci]; - target = kzalloc(sizeof(struct nfc_target), GFP_KERNEL); + target = kzalloc_obj(struct nfc_target); if (!target) { rc = -ENOMEM; goto exit; @@ -863,7 +863,7 @@ static void digital_in_recv_iso15693_inv_res(struct nfc_digital_dev *ddev, goto out_free_skb; } - target = kzalloc(sizeof(*target), GFP_KERNEL); + target = kzalloc_obj(*target); if (!target) { rc = -ENOMEM; goto out_free_skb; diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 8618d57c23da..0d33c81a15fe 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -291,7 +291,7 @@ int nfc_hci_target_discovered(struct nfc_hci_dev *hdev, u8 gate) pr_debug("from gate %d\n", gate); - targets = kzalloc(sizeof(struct nfc_target), GFP_KERNEL); + targets = kzalloc_obj(struct nfc_target); if (targets == NULL) return -ENOMEM; @@ -964,7 +964,7 @@ struct nfc_hci_dev *nfc_hci_allocate_device(const struct nfc_hci_ops *ops, if (protocols == 0) return NULL; - hdev = kzalloc(sizeof(struct nfc_hci_dev), GFP_KERNEL); + hdev = kzalloc_obj(struct nfc_hci_dev); if (hdev == NULL) return NULL; diff --git a/net/nfc/hci/hcp.c b/net/nfc/hci/hcp.c index 4902f5064098..4d19c697d6ec 100644 --- a/net/nfc/hci/hcp.c +++ b/net/nfc/hci/hcp.c @@ -30,7 +30,7 @@ int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe, int hci_len, err; bool firstfrag = true; - cmd = kzalloc(sizeof(struct hci_msg), GFP_KERNEL); + cmd = kzalloc_obj(struct hci_msg); if (cmd == NULL) return -ENOMEM; diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index e6cf4eb06b46..51dfa0d5dfa2 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -49,7 +49,7 @@ int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) { struct nfc_llc_engine *llc_engine; - llc_engine = kzalloc(sizeof(struct nfc_llc_engine), GFP_KERNEL); + llc_engine = kzalloc_obj(struct nfc_llc_engine); if (llc_engine == NULL) return -ENOMEM; @@ -90,7 +90,7 @@ struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev, if (llc_engine == NULL) return NULL; - llc = kzalloc(sizeof(struct nfc_llc), GFP_KERNEL); + llc = kzalloc_obj(struct nfc_llc); if (llc == NULL) return NULL; diff --git a/net/nfc/hci/llc_nop.c b/net/nfc/hci/llc_nop.c index a58716f16954..f9c826a138e5 100644 --- a/net/nfc/hci/llc_nop.c +++ b/net/nfc/hci/llc_nop.c @@ -28,7 +28,7 @@ static void *llc_nop_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, *rx_headroom = 0; *rx_tailroom = 0; - llc_nop = kzalloc(sizeof(struct llc_nop), GFP_KERNEL); + llc_nop = kzalloc_obj(struct llc_nop); if (llc_nop == NULL) return NULL; diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 4fc37894860c..6819695d993c 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -728,7 +728,7 @@ static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, *rx_headroom = SHDLC_LLC_HEAD_ROOM; *rx_tailroom = 0; - shdlc = kzalloc(sizeof(struct llc_shdlc), GFP_KERNEL); + shdlc = kzalloc_obj(struct llc_shdlc); if (shdlc == NULL) return NULL; @@ -762,6 +762,14 @@ static void llc_shdlc_deinit(struct nfc_llc *llc) { struct llc_shdlc *shdlc = nfc_llc_get_data(llc); + timer_shutdown_sync(&shdlc->connect_timer); + timer_shutdown_sync(&shdlc->t1_timer); + timer_shutdown_sync(&shdlc->t2_timer); + shdlc->t1_active = false; + shdlc->t2_active = false; + + cancel_work_sync(&shdlc->sm_work); + skb_queue_purge(&shdlc->rcv_q); skb_queue_purge(&shdlc->send_q); skb_queue_purge(&shdlc->ack_pending_q); diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index e2680a3bef79..291f26facbf3 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -108,7 +108,7 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) struct nfc_llcp_sdp_tlv *sdres; u8 value[2]; - sdres = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); + sdres = kzalloc_obj(struct nfc_llcp_sdp_tlv); if (sdres == NULL) return NULL; @@ -141,7 +141,7 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, if (WARN_ON_ONCE(uri_len > U8_MAX - 4)) return NULL; - sdreq = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); + sdreq = kzalloc_obj(struct nfc_llcp_sdp_tlv); if (sdreq == NULL) return NULL; @@ -778,8 +778,23 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, if (likely(frag_len > 0)) skb_put_data(pdu, msg_ptr, frag_len); + spin_lock(&local->tx_queue.lock); + + if (list_empty(&local->list)) { + spin_unlock(&local->tx_queue.lock); + + kfree_skb(pdu); + + len -= remaining_len; + if (len == 0) + len = -ENXIO; + break; + } + /* No need to check for the peer RW for UI frames */ - skb_queue_tail(&local->tx_queue, pdu); + __skb_queue_tail(&local->tx_queue, pdu); + + spin_unlock(&local->tx_queue.lock); remaining_len -= frag_len; msg_ptr += frag_len; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index beeb3b4d28ca..366d7566308c 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -316,7 +316,9 @@ static struct nfc_llcp_local *nfc_llcp_remove_local(struct nfc_dev *dev) spin_lock(&llcp_devices_lock); list_for_each_entry_safe(local, tmp, &llcp_devices, list) if (local->dev == dev) { - list_del(&local->list); + spin_lock(&local->tx_queue.lock); + list_del_init(&local->list); + spin_unlock(&local->tx_queue.lock); spin_unlock(&llcp_devices_lock); return local; } @@ -1619,7 +1621,7 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) { struct nfc_llcp_local *local; - local = kzalloc(sizeof(struct nfc_llcp_local), GFP_KERNEL); + local = kzalloc_obj(struct nfc_llcp_local); if (local == NULL) return -ENOMEM; diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 57a2f97004e1..f1be1e84f665 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -56,7 +56,7 @@ static struct proto llcp_sock_proto = { .obj_size = sizeof(struct nfc_llcp_sock), }; -static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +static int llcp_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -146,7 +146,7 @@ error: return ret; } -static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr, +static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; @@ -648,7 +648,7 @@ out: return err; } -static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, +static int llcp_sock_connect(struct socket *sock, struct sockaddr_unsized *_addr, int len, int flags) { struct sock *sk = sock->sk; diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index fc921cd2cdff..43d871525dbc 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -567,6 +567,10 @@ static int nci_close_device(struct nci_dev *ndev) flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); timer_delete_sync(&ndev->data_timer); + if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + nci_data_exchange_complete(ndev, NULL, + ndev->cur_conn_id, + -ENODEV); mutex_unlock(&ndev->req_lock); return 0; } @@ -598,6 +602,11 @@ static int nci_close_device(struct nci_dev *ndev) flush_workqueue(ndev->cmd_wq); timer_delete_sync(&ndev->cmd_timer); + timer_delete_sync(&ndev->data_timer); + + if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + nci_data_exchange_complete(ndev, NULL, ndev->cur_conn_id, + -ENODEV); /* Clear flags except NCI_UNREG */ ndev->flags &= BIT(NCI_UNREG); @@ -1035,18 +1044,23 @@ static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target, struct nci_conn_info *conn_info; conn_info = ndev->rf_conn_info; - if (!conn_info) + if (!conn_info) { + kfree_skb(skb); return -EPROTO; + } pr_debug("target_idx %d, len %d\n", target->idx, skb->len); if (!ndev->target_active_prot) { pr_err("unable to exchange data, no active target\n"); + kfree_skb(skb); return -EINVAL; } - if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags)) + if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags)) { + kfree_skb(skb); return -EBUSY; + } /* store cb and context to be used on receiving data */ conn_info->data_exchange_cb = cb; @@ -1171,7 +1185,7 @@ struct nci_dev *nci_allocate_device(const struct nci_ops *ops, if (!supported_protocols) return NULL; - ndev = kzalloc(sizeof(struct nci_dev), GFP_KERNEL); + ndev = kzalloc_obj(struct nci_dev); if (!ndev) return NULL; @@ -1303,6 +1317,8 @@ void nci_unregister_device(struct nci_dev *ndev) { struct nci_conn_info *conn_info, *n; + nfc_unregister_rfkill(ndev->nfc_dev); + /* This set_bit is not protected with specialized barrier, * However, it is fine because the mutex_lock(&ndev->req_lock); * in nci_close_device() will help to emit one. @@ -1320,7 +1336,7 @@ void nci_unregister_device(struct nci_dev *ndev) /* conn_info is allocated with devm_kzalloc */ } - nfc_unregister_device(ndev->nfc_dev); + nfc_remove_device(ndev->nfc_dev); } EXPORT_SYMBOL(nci_unregister_device); @@ -1480,10 +1496,20 @@ static bool nci_valid_size(struct sk_buff *skb) unsigned int hdr_size = NCI_CTRL_HDR_SIZE; if (skb->len < hdr_size || - !nci_plen(skb->data) || skb->len < hdr_size + nci_plen(skb->data)) { return false; } + + if (!nci_plen(skb->data)) { + /* Allow zero length in proprietary notifications (0x20 - 0x3F). */ + if (nci_opcode_oid(nci_opcode(skb->data)) >= 0x20 && + nci_mt(skb->data) == NCI_MT_NTF_PKT) + return true; + + /* Disallow zero length otherwise. */ + return false; + } + return true; } diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 78f4131af3cf..5f98c73db5af 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -33,7 +33,8 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { kfree_skb(skb); - goto exit; + clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); + return; } cb = conn_info->data_exchange_cb; @@ -45,6 +46,12 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, timer_delete_sync(&ndev->data_timer); clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); + /* Mark the exchange as done before calling the callback. + * The callback (e.g. rawsock_data_exchange_complete) may + * want to immediately queue another data exchange. + */ + clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); + if (cb) { /* forward skb to nfc core */ cb(cb_context, skb, err); @@ -54,9 +61,6 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, /* no waiting callback, free skb */ kfree_skb(skb); } - -exit: - clear_bit(NCI_DATA_EXCHANGE, &ndev->flags); } /* ----------------- NCI TX Data ----------------- */ diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 082ab66f120b..40ae8e5a7ec7 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -777,7 +777,7 @@ struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev) { struct nci_hci_dev *hdev; - hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); + hdev = kzalloc_obj(*hdev); if (!hdev) return NULL; diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 418b84e2b260..c96512bb8653 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -58,7 +58,7 @@ static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, struct nci_conn_info *conn_info; int i; - if (skb->len < sizeof(struct nci_core_conn_credit_ntf)) + if (skb->len < offsetofend(struct nci_core_conn_credit_ntf, num_entries)) return -EINVAL; ntf = (struct nci_core_conn_credit_ntf *)skb->data; @@ -68,6 +68,10 @@ static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, if (ntf->num_entries > NCI_MAX_NUM_CONN) ntf->num_entries = NCI_MAX_NUM_CONN; + if (skb->len < offsetofend(struct nci_core_conn_credit_ntf, num_entries) + + ntf->num_entries * sizeof(struct conn_credit_entry)) + return -EINVAL; + /* update the credits */ for (i = 0; i < ntf->num_entries; i++) { ntf->conn_entries[i].conn_id = @@ -138,23 +142,48 @@ static int nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfca_poll *nfca_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for sens_res (2 bytes) */ + if (data_len < 2) + return ERR_PTR(-EINVAL); + nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data)); data += 2; + data_len -= 2; + + /* Check if we have enough data for nfcid1_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); nfca_poll->nfcid1_len = min_t(__u8, *data++, NFC_NFCID1_MAXSIZE); + data_len--; pr_debug("sens_res 0x%x, nfcid1_len %d\n", nfca_poll->sens_res, nfca_poll->nfcid1_len); + /* Check if we have enough data for nfcid1 */ + if (data_len < nfca_poll->nfcid1_len) + return ERR_PTR(-EINVAL); + memcpy(nfca_poll->nfcid1, data, nfca_poll->nfcid1_len); data += nfca_poll->nfcid1_len; + data_len -= nfca_poll->nfcid1_len; + + /* Check if we have enough data for sel_res_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); nfca_poll->sel_res_len = *data++; + data_len--; + + if (nfca_poll->sel_res_len != 0) { + /* Check if we have enough data for sel_res (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); - if (nfca_poll->sel_res_len != 0) nfca_poll->sel_res = *data++; + } pr_debug("sel_res_len %d, sel_res 0x%x\n", nfca_poll->sel_res_len, @@ -166,12 +195,21 @@ nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcb_poll *nfcb_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for sensb_res_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE); + data_len--; pr_debug("sensb_res_len %d\n", nfcb_poll->sensb_res_len); + /* Check if we have enough data for sensb_res */ + if (data_len < nfcb_poll->sensb_res_len) + return ERR_PTR(-EINVAL); + memcpy(nfcb_poll->sensb_res, data, nfcb_poll->sensb_res_len); data += nfcb_poll->sensb_res_len; @@ -181,14 +219,29 @@ nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcf_poll *nfcf_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for bit_rate (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcf_poll->bit_rate = *data++; + data_len--; + + /* Check if we have enough data for sensf_res_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE); + data_len--; pr_debug("bit_rate %d, sensf_res_len %d\n", nfcf_poll->bit_rate, nfcf_poll->sensf_res_len); + /* Check if we have enough data for sensf_res */ + if (data_len < nfcf_poll->sensf_res_len) + return ERR_PTR(-EINVAL); + memcpy(nfcf_poll->sensf_res, data, nfcf_poll->sensf_res_len); data += nfcf_poll->sensf_res_len; @@ -198,22 +251,49 @@ nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, static const __u8 * nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, struct rf_tech_specific_params_nfcv_poll *nfcv_poll, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Skip 1 byte (reserved) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + ++data; + data_len--; + + /* Check if we have enough data for dsfid (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcv_poll->dsfid = *data++; + data_len--; + + /* Check if we have enough data for uid (8 bytes) */ + if (data_len < NFC_ISO15693_UID_MAXSIZE) + return ERR_PTR(-EINVAL); + memcpy(nfcv_poll->uid, data, NFC_ISO15693_UID_MAXSIZE); data += NFC_ISO15693_UID_MAXSIZE; + return data; } static const __u8 * nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, struct rf_tech_specific_params_nfcf_listen *nfcf_listen, - const __u8 *data) + const __u8 *data, ssize_t data_len) { + /* Check if we have enough data for local_nfcid2_len (1 byte) */ + if (data_len < 1) + return ERR_PTR(-EINVAL); + nfcf_listen->local_nfcid2_len = min_t(__u8, *data++, NFC_NFCID2_MAXSIZE); + data_len--; + + /* Check if we have enough data for local_nfcid2 */ + if (data_len < nfcf_listen->local_nfcid2_len) + return ERR_PTR(-EINVAL); + memcpy(nfcf_listen->local_nfcid2, data, nfcf_listen->local_nfcid2_len); data += nfcf_listen->local_nfcid2_len; @@ -364,7 +444,7 @@ static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, const __u8 *data; bool add_target = true; - if (skb->len < sizeof(struct nci_rf_discover_ntf)) + if (skb->len < offsetofend(struct nci_rf_discover_ntf, rf_tech_specific_params_len)) return -EINVAL; data = skb->data; @@ -380,26 +460,42 @@ static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, pr_debug("rf_tech_specific_params_len %d\n", ntf.rf_tech_specific_params_len); + if (skb->len < (data - skb->data) + + ntf.rf_tech_specific_params_len + sizeof(ntf.ntf_type)) + return -EINVAL; + if (ntf.rf_tech_specific_params_len > 0) { switch (ntf.rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfca_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfca_poll), data); + &(ntf.rf_tech_specific_params.nfca_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; case NCI_NFC_B_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcb_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcb_poll), data); + &(ntf.rf_tech_specific_params.nfcb_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; case NCI_NFC_F_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcf_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcf_poll), data); + &(ntf.rf_tech_specific_params.nfcf_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; case NCI_NFC_V_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcv_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcv_poll), data); + &(ntf.rf_tech_specific_params.nfcv_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return PTR_ERR(data); break; default: @@ -596,7 +692,7 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, const __u8 *data; int err = NCI_STATUS_OK; - if (skb->len < sizeof(struct nci_rf_intf_activated_ntf)) + if (skb->len < offsetofend(struct nci_rf_intf_activated_ntf, rf_tech_specific_params_len)) return -EINVAL; data = skb->data; @@ -628,26 +724,41 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, if (ntf.rf_interface == NCI_RF_INTERFACE_NFCEE_DIRECT) goto listen; + if (skb->len < (data - skb->data) + ntf.rf_tech_specific_params_len) + return -EINVAL; + if (ntf.rf_tech_specific_params_len > 0) { switch (ntf.activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfca_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfca_poll), data); + &(ntf.rf_tech_specific_params.nfca_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_B_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcb_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcb_poll), data); + &(ntf.rf_tech_specific_params.nfcb_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_F_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcf_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcf_poll), data); + &(ntf.rf_tech_specific_params.nfcf_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_V_PASSIVE_POLL_MODE: data = nci_extract_rf_params_nfcv_passive_poll(ndev, - &(ntf.rf_tech_specific_params.nfcv_poll), data); + &(ntf.rf_tech_specific_params.nfcv_poll), data, + ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; case NCI_NFC_A_PASSIVE_LISTEN_MODE: @@ -657,7 +768,9 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, case NCI_NFC_F_PASSIVE_LISTEN_MODE: data = nci_extract_rf_params_nfcf_passive_listen(ndev, &(ntf.rf_tech_specific_params.nfcf_listen), - data); + data, ntf.rf_tech_specific_params_len); + if (IS_ERR(data)) + return -EINVAL; break; default: @@ -668,6 +781,13 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, } } + if (skb->len < (data - skb->data) + + sizeof(ntf.data_exch_rf_tech_and_mode) + + sizeof(ntf.data_exch_tx_bit_rate) + + sizeof(ntf.data_exch_rx_bit_rate) + + sizeof(ntf.activation_params_len)) + return -EINVAL; + ntf.data_exch_rf_tech_and_mode = *data++; ntf.data_exch_tx_bit_rate = *data++; ntf.data_exch_rx_bit_rate = *data++; @@ -679,6 +799,9 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, pr_debug("data_exch_rx_bit_rate 0x%x\n", ntf.data_exch_rx_bit_rate); pr_debug("activation_params_len %d\n", ntf.activation_params_len); + if (skb->len < (data - skb->data) + ntf.activation_params_len) + return -EINVAL; + if (ntf.activation_params_len > 0) { switch (ntf.rf_interface) { case NCI_RF_INTERFACE_ISO_DEP: diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index aab107727f18..5a3aed7f84f5 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -113,7 +113,7 @@ static int nci_uart_set_driver(struct tty_struct *tty, unsigned int driver) if (!nci_uart_drivers[driver]) return -ENOENT; - nu = kzalloc(sizeof(*nu), GFP_KERNEL); + nu = kzalloc_obj(*nu); if (!nu) return -ENOMEM; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index a18e2c503da6..0c58824cb150 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -604,7 +604,7 @@ static int nfc_genl_dump_devices(struct sk_buff *skb, if (!iter) { first_call = true; - iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); + iter = kmalloc_obj(struct class_dev_iter); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; @@ -1370,7 +1370,7 @@ static int nfc_genl_dump_ses(struct sk_buff *skb, if (!iter) { first_call = true; - iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); + iter = kmalloc_obj(struct class_dev_iter); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; @@ -1541,7 +1541,7 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); - ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); + ctx = kzalloc_obj(struct se_io_ctx); if (!ctx) { rc = -ENOMEM; goto put_dev; @@ -1875,7 +1875,7 @@ static int nfc_genl_rcv_nl_event(struct notifier_block *this, pr_debug("NETLINK_URELEASE event from id %d\n", n->portid); - w = kmalloc(sizeof(*w), GFP_ATOMIC); + w = kmalloc_obj(*w, GFP_ATOMIC); if (w) { INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 5125392bb68e..f7d7a599fade 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -67,13 +67,24 @@ static int rawsock_release(struct socket *sock) if (sock->type == SOCK_RAW) nfc_sock_unlink(&raw_sk_list, sk); + if (sk->sk_state == TCP_ESTABLISHED) { + /* Prevent rawsock_tx_work from starting new transmits and + * wait for any in-progress work to finish. This must happen + * before the socket is orphaned to avoid a race where + * rawsock_tx_work runs after the NCI device has been freed. + */ + sk->sk_shutdown |= SEND_SHUTDOWN; + cancel_work_sync(&nfc_rawsock(sk)->tx_work); + rawsock_write_queue_purge(sk); + } + sock_orphan(sk); sock_put(sk); return 0; } -static int rawsock_connect(struct socket *sock, struct sockaddr *_addr, +static int rawsock_connect(struct socket *sock, struct sockaddr_unsized *_addr, int len, int flags) { struct sock *sk = sock->sk; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e573e9221302..7c9256572284 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -928,8 +928,8 @@ static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone) } static int ovs_ct_check_limit(struct net *net, - const struct ovs_conntrack_info *info, - const struct nf_conntrack_tuple *tuple) + const struct sk_buff *skb, + const struct ovs_conntrack_info *info) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; @@ -942,8 +942,9 @@ static int ovs_ct_check_limit(struct net *net, if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) return 0; - connections = nf_conncount_count(net, ct_limit_info->data, - &conncount_key, tuple, &info->zone); + connections = nf_conncount_count_skb(net, skb, info->family, + ct_limit_info->data, + &conncount_key); if (connections > per_zone_limit) return -ENOMEM; @@ -972,8 +973,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) if (static_branch_unlikely(&ovs_ct_limit_enabled)) { if (!nf_ct_is_confirmed(ct)) { - err = ovs_ct_check_limit(net, info, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + err = ovs_ct_check_limit(net, skb, info); if (err) { net_warn_ratelimited("openvswitch: zone: %u " "exceeds conntrack limit\n", @@ -1586,15 +1586,13 @@ static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net) { int i, err; - ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info), - GFP_KERNEL); + ovs_net->ct_limit_info = kmalloc_obj(*ovs_net->ct_limit_info); if (!ovs_net->ct_limit_info) return -ENOMEM; ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT; ovs_net->ct_limit_info->limits = - kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head), - GFP_KERNEL); + kmalloc_objs(struct hlist_head, CT_LIMIT_HASH_BUCKETS); if (!ovs_net->ct_limit_info->limits) { kfree(ovs_net->ct_limit_info); return -ENOMEM; @@ -1688,8 +1686,7 @@ static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit, } else { struct ovs_ct_limit *ct_limit; - ct_limit = kmalloc(sizeof(*ct_limit), - GFP_KERNEL_ACCOUNT); + ct_limit = kmalloc_obj(*ct_limit, GFP_KERNEL_ACCOUNT); if (!ct_limit) return -ENOMEM; @@ -1770,8 +1767,8 @@ static int __ovs_ct_limit_get_zone_limit(struct net *net, zone_limit.limit = limit; nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, - &ct_zone); + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, + &conncount_key); return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d5b6e2002bc1..e209099218b4 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1029,7 +1029,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - key = kzalloc(sizeof(*key), GFP_KERNEL); + key = kzalloc_obj(*key); if (!key) { error = -ENOMEM; goto err_kfree_flow; @@ -1797,9 +1797,7 @@ static int ovs_dp_vport_init(struct datapath *dp) { int i; - dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, - sizeof(struct hlist_head), - GFP_KERNEL); + dp->ports = kmalloc_objs(struct hlist_head, DP_VPORT_HASH_BUCKETS); if (!dp->ports) return -ENOMEM; @@ -1828,7 +1826,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; err = -ENOMEM; - dp = kzalloc(sizeof(*dp), GFP_KERNEL); + dp = kzalloc_obj(*dp); if (dp == NULL) goto err_destroy_reply; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 1cb4f97335d8..67fbf6e48a30 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1890,7 +1890,7 @@ int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, return 0; /* If UFID was not provided, use unmasked key. */ - new_key = kmalloc(sizeof(*new_key), GFP_KERNEL); + new_key = kmalloc_obj(*new_key); if (!new_key) return -ENOMEM; memcpy(new_key, key, sizeof(*key)); @@ -2802,13 +2802,20 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } -static bool validate_push_nsh(const struct nlattr *attr, bool log) +static bool validate_push_nsh(const struct nlattr *a, bool log) { + struct nlattr *nsh_key = nla_data(a); struct sw_flow_match match; struct sw_flow_key key; + /* There must be one and only one NSH header. */ + if (!nla_ok(nsh_key, nla_len(a)) || + nla_total_size(nla_len(nsh_key)) != nla_len(a) || + nla_type(nsh_key) != OVS_KEY_ATTR_NSH) + return false; + ovs_match_init(&match, &key, true, NULL); - return !nsh_key_put_from_nlattr(attr, &match, false, true, log); + return !nsh_key_put_from_nlattr(nsh_key, &match, false, true, log); } /* Return false if there are any non-masked bits set. @@ -3389,7 +3396,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; } mac_proto = MAC_PROTO_NONE; - if (!validate_push_nsh(nla_data(a), log)) + if (!validate_push_nsh(a, log)) return -EINVAL; break; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index ffc72a741a50..61c6a5f77c2e 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -150,14 +150,13 @@ static void __table_instance_destroy(struct table_instance *ti) static struct table_instance *table_instance_alloc(int new_size) { - struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + struct table_instance *ti = kmalloc_obj(*ti); int i; if (!ti) return NULL; - ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head), - GFP_KERNEL); + ti->buckets = kvmalloc_objs(struct hlist_head, new_size); if (!ti->buckets) { kfree(ti); return NULL; @@ -367,7 +366,7 @@ static struct mask_cache *tbl_mask_cache_alloc(u32 size) (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) return NULL; - new = kzalloc(sizeof(*new), GFP_KERNEL); + new = kzalloc_obj(*new); if (!new) return NULL; @@ -965,7 +964,7 @@ static struct sw_flow_mask *mask_alloc(void) { struct sw_flow_mask *mask; - mask = kmalloc(sizeof(*mask), GFP_KERNEL); + mask = kmalloc_obj(*mask); if (mask) mask->ref_count = 1; @@ -1110,8 +1109,7 @@ void ovs_flow_masks_rebalance(struct flow_table *table) int i; /* Build array of all current entries with use counters. */ - masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count), - GFP_KERNEL); + masks_and_count = kmalloc_objs(*masks_and_count, ma->max); if (!masks_and_count) return; diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index cc08e0403909..a02c47277337 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -69,7 +69,7 @@ static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) { struct dp_meter_instance *ti; - ti = kvzalloc(struct_size(ti, dp_meters, size), GFP_KERNEL); + ti = kvzalloc_flex(*ti, dp_meters, size); if (!ti) return NULL; @@ -341,7 +341,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) return ERR_PTR(-EINVAL); /* Allocate and set up the meter before locking anything. */ - meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT); + meter = kzalloc_flex(*meter, bands, n_bands, GFP_KERNEL_ACCOUNT); if (!meter) return ERR_PTR(-ENOMEM); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 91a11067e458..6574f9bcdc02 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -160,10 +160,19 @@ void ovs_netdev_detach_dev(struct vport *vport) static void netdev_destroy(struct vport *vport) { - rtnl_lock(); - if (netif_is_ovs_port(vport->dev)) - ovs_netdev_detach_dev(vport); - rtnl_unlock(); + /* When called from ovs_db_notify_wq() after a dp_device_event(), the + * port has already been detached, so we can avoid taking the RTNL by + * checking this first. + */ + if (netif_is_ovs_port(vport->dev)) { + rtnl_lock(); + /* Check again while holding the lock to ensure we don't race + * with the netdev notifier and detach twice. + */ + if (netif_is_ovs_port(vport->dev)) + ovs_netdev_detach_dev(vport); + rtnl_unlock(); + } call_rcu(&vport->rcu, vport_netdev_free); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6bbbc16ab778..23f629e94a36 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -34,8 +34,7 @@ static struct hlist_head *dev_table; */ int ovs_vport_init(void) { - dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head), - GFP_KERNEL); + dev_table = kzalloc_objs(struct hlist_head, VPORT_HASH_BUCKETS); if (!dev_table) return -ENOMEM; @@ -310,22 +309,23 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) */ int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) { + u64 tx_success = 0, tx_fail = 0; struct nlattr *nla; int i; - __u64 tx_success = 0; - __u64 tx_fail = 0; - for_each_possible_cpu(i) { const struct vport_upcall_stats_percpu *stats; + u64 n_success, n_fail; unsigned int start; stats = per_cpu_ptr(vport->upcall_stats, i); do { start = u64_stats_fetch_begin(&stats->syncp); - tx_success += u64_stats_read(&stats->n_success); - tx_fail += u64_stats_read(&stats->n_fail); + n_success = u64_stats_read(&stats->n_success); + n_fail = u64_stats_read(&stats->n_fail); } while (u64_stats_fetch_retry(&stats->syncp, start)); + tx_success += n_success; + tx_fail += n_fail; } nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 173e6edda08f..72d0935139f0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -572,8 +572,9 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) __be16 proto = skb->protocol; if (unlikely(eth_type_vlan(proto))) - proto = __vlan_get_protocol_offset(skb, proto, - skb_mac_offset(skb), NULL); + proto = vlan_get_protocol_offset_inline(skb, proto, + skb_mac_offset(skb), + NULL); return proto; } @@ -1710,7 +1711,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; - rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); + rollover = kzalloc_obj(*rollover); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); @@ -1753,8 +1754,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; - match = kvzalloc(struct_size(match, arr, args->max_num_members), - GFP_KERNEL); + match = kvzalloc_flex(*match, arr, args->max_num_members); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); @@ -3279,11 +3279,12 @@ out_unlock: * Bind a packet socket to a device */ -static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, +static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data_min) + 1]; + struct sockaddr *sa = (struct sockaddr *)uaddr; + char name[sizeof(sa->sa_data) + 1]; /* * Check legality @@ -3294,13 +3295,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); - name[sizeof(uaddr->sa_data_min)] = 0; + memcpy(name, sa->sa_data, sizeof(sa->sa_data)); + name[sizeof(sa->sa_data)] = 0; return packet_do_bind(sk, name, 0, 0); } -static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; @@ -3580,11 +3581,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); return sizeof(*uaddr); @@ -3691,7 +3692,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) goto done; err = -ENOBUFS; - i = kmalloc(sizeof(*i), GFP_KERNEL); + i = kmalloc_obj(*i); if (i == NULL) goto done; @@ -4388,7 +4389,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) struct pgv *pg_vec; int i; - pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); + pg_vec = kzalloc_objs(struct pgv, block_nr, GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 238a9638d2b0..d89225d6bfd3 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -129,9 +129,12 @@ static int pn_header_create(struct sk_buff *skb, struct net_device *dev, return 1; } -static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) +static int pn_header_parse(const struct sk_buff *skb, + const struct net_device *dev, + unsigned char *haddr) { const u8 *media = skb_mac_header(skb); + *haddr = *media; return 1; } diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 4db564d9d522..120e711ea78c 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -882,7 +882,8 @@ drop: return newsk; } -static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len) +static int pep_sock_connect(struct sock *sk, struct sockaddr_unsized *addr, + int len) { struct pep_sock *pn = pep_sk(sk); int err; diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 5c36bae37b8f..86325b7fc1b6 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -48,7 +48,7 @@ struct phonet_device_list *phonet_device_list(struct net *net) static struct phonet_device *__phonet_device_alloc(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); - struct phonet_device *pnd = kmalloc(sizeof(*pnd), GFP_ATOMIC); + struct phonet_device *pnd = kmalloc_obj(*pnd, GFP_ATOMIC); if (pnd == NULL) return NULL; pnd->netdev = dev; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index db2d552e9b32..4423d483c630 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(pn_sock_unhash); static DEFINE_MUTEX(port_mutex); -static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) +static int pn_socket_bind(struct socket *sock, struct sockaddr_unsized *addr, int len) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); @@ -206,16 +206,16 @@ static int pn_socket_autobind(struct socket *sock) memset(&sa, 0, sizeof(sa)); sa.spn_family = AF_PHONET; - err = pn_socket_bind(sock, (struct sockaddr *)&sa, - sizeof(struct sockaddr_pn)); + err = pn_socket_bind(sock, (struct sockaddr_unsized *)&sa, + sizeof(struct sockaddr_pn)); if (err != -EINVAL) return err; BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); return 0; /* socket was already bound */ } -static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, - int len, int flags) +static int pn_socket_connect(struct socket *sock, struct sockaddr_unsized *addr, + int len, int flags) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); diff --git a/net/psample/psample.c b/net/psample/psample.c index 25f92ba0840c..7763662036fb 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -143,7 +143,7 @@ static struct psample_group *psample_group_create(struct net *net, { struct psample_group *group; - group = kzalloc(sizeof(*group), GFP_ATOMIC); + group = kzalloc_obj(*group, GFP_ATOMIC); if (!group) return NULL; diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 371e8771f3bd..84d6b0f25460 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -6,6 +6,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET select SKB_DECRYPTED + select SKB_EXTENSIONS select SOCK_VALIDATE_XMIT help Enable kernel support for the PSP Security Protocol (PSP). diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 9fdd6f831803..22a48d0fa378 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/psp.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -47,6 +48,11 @@ static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, }; +/* PSP_CMD_GET_STATS - do */ +static const struct nla_policy psp_get_stats_nl_policy[PSP_A_STATS_DEV_ID + 1] = { + [PSP_A_STATS_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -99,6 +105,20 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_ASSOC_SOCK_FD, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_GET_STATS, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_get_stats_doit, + .post_doit = psp_device_unlock, + .policy = psp_get_stats_nl_policy, + .maxattr = PSP_A_STATS_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_GET_STATS, + .dumpit = psp_nl_get_stats_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index 25268ed11fb5..599c5f1c82f2 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/psp.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_PSP_GEN_H #define _LINUX_PSP_GEN_H @@ -28,6 +29,8 @@ int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_get_stats_dumpit(struct sk_buff *skb, struct netlink_callback *cb); enum { PSP_NLGRP_MGMT, diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 481aaf0fc9fc..d4c04c923c5a 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -60,10 +60,11 @@ psp_dev_create(struct net_device *netdev, !psd_ops->key_rotate || !psd_ops->rx_spi_alloc || !psd_ops->tx_key_add || - !psd_ops->tx_key_del)) + !psd_ops->tx_key_del || + !psd_ops->get_stats)) return ERR_PTR(-EINVAL); - psd = kzalloc(sizeof(*psd), GFP_KERNEL); + psd = kzalloc_obj(*psd); if (!psd) return ERR_PTR(-ENOMEM); @@ -165,9 +166,46 @@ static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, { struct udphdr *uh = udp_hdr(skb); struct psphdr *psph = (struct psphdr *)(uh + 1); + const struct sock *sk = skb->sk; uh->dest = htons(PSP_DEFAULT_UDP_PORT); - uh->source = udp_flow_src_port(net, skb, 0, 0, false); + + /* A bit of theory: Selection of the source port. + * + * We need some entropy, so that multiple flows use different + * source ports for better RSS spreading at the receiver. + * + * We also need that all packets belonging to one TCP flow + * use the same source port through their duration, + * so that all these packets land in the same receive queue. + * + * udp_flow_src_port() is using sk_txhash, inherited from + * skb_set_hash_from_sk() call in __tcp_transmit_skb(). + * This field is subject to reshuffling, thanks to + * sk_rethink_txhash() calls in various TCP functions. + * + * Instead, use sk->sk_hash which is constant through + * the whole flow duration. + */ + if (likely(sk)) { + u32 hash = sk->sk_hash; + int min, max; + + /* These operations are cheap, no need to cache the result + * in another socket field. + */ + inet_get_local_port_range(net, &min, &max); + /* Since this is being sent on the wire obfuscate hash a bit + * to minimize possibility that any useful information to an + * attacker is leaked. Only upper 16 bits are relevant in the + * computation for 16 bit port value because we use a + * reciprocal divide. + */ + hash ^= hash << 16; + uh->source = htons((((u64)hash * (max - min)) >> 32) + min); + } else { + uh->source = udp_flow_src_port(net, skb, 0, 0, false); + } uh->check = 0; uh->len = htons(udp_len); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 8aaca62744c3..6afd7707ec12 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/ethtool.h> #include <linux/skbuff.h> #include <linux/xarray.h> #include <net/genetlink.h> @@ -262,6 +263,7 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) psd->generation & ~PSP_GEN_VALID_MASK); psp_assocs_key_rotated(psd); + psd->stats.rotations++; nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, @@ -503,3 +505,94 @@ err_free_msg: nlmsg_free(rsp); return err; } + +static int +psp_nl_stats_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + unsigned int required_cnt = sizeof(struct psp_dev_stats) / sizeof(u64); + struct psp_dev_stats stats; + void *hdr; + int i; + + memset(&stats, 0xff, sizeof(stats)); + psd->ops->get_stats(psd, &stats); + + for (i = 0; i < required_cnt; i++) + if (WARN_ON_ONCE(stats.required[i] == ETHTOOL_STAT_NOT_SET)) + return -EOPNOTSUPP; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_STATS_DEV_ID, psd->id) || + nla_put_uint(rsp, PSP_A_STATS_KEY_ROTATIONS, + psd->stats.rotations) || + nla_put_uint(rsp, PSP_A_STATS_STALE_EVENTS, psd->stats.stales) || + nla_put_uint(rsp, PSP_A_STATS_RX_PACKETS, stats.rx_packets) || + nla_put_uint(rsp, PSP_A_STATS_RX_BYTES, stats.rx_bytes) || + nla_put_uint(rsp, PSP_A_STATS_RX_AUTH_FAIL, stats.rx_auth_fail) || + nla_put_uint(rsp, PSP_A_STATS_RX_ERROR, stats.rx_error) || + nla_put_uint(rsp, PSP_A_STATS_RX_BAD, stats.rx_bad) || + nla_put_uint(rsp, PSP_A_STATS_TX_PACKETS, stats.tx_packets) || + nla_put_uint(rsp, PSP_A_STATS_TX_BYTES, stats.tx_bytes) || + nla_put_uint(rsp, PSP_A_STATS_TX_ERROR, stats.tx_error)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_stats_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_stats_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_stats_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_get_stats_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_stats_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index a931d825d1cc..a85b0ed88842 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -50,8 +50,8 @@ struct psp_assoc *psp_assoc_create(struct psp_dev *psd) lockdep_assert_held(&psd->lock); - pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), - GFP_KERNEL_ACCOUNT); + pas = kzalloc_flex(*pas, drv_data, psd->caps->assoc_drv_spc, + GFP_KERNEL_ACCOUNT); if (!pas) return NULL; @@ -253,8 +253,10 @@ void psp_assocs_key_rotated(struct psp_dev *psd) /* Mark the stale associations as invalid, they will no longer * be able to Rx any traffic. */ - list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) + list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) { pas->generation |= ~PSP_GEN_VALID_MASK; + psd->stats.stales++; + } list_splice_init(&psd->prev_assocs, &psd->stale_assocs); list_splice_init(&psd->active_assocs, &psd->prev_assocs); diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 00c51cf693f3..55fd2dd37588 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -271,7 +271,7 @@ static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port, mutex_lock(&node->qrtr_tx_lock); flow = radix_tree_lookup(&node->qrtr_tx_flow, key); if (!flow) { - flow = kzalloc(sizeof(*flow), GFP_KERNEL); + flow = kzalloc_obj(*flow); if (flow) { init_waitqueue_head(&flow->resume_tx); if (radix_tree_insert(&node->qrtr_tx_flow, key, flow)) { @@ -589,7 +589,7 @@ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) if (!ep || !ep->xmit) return -EINVAL; - node = kzalloc(sizeof(*node), GFP_KERNEL); + node = kzalloc_obj(*node); if (!node) return -ENOMEM; @@ -824,7 +824,7 @@ static int qrtr_autobind(struct socket *sock) } /* Bind socket to specified sockaddr. */ -static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) +static int qrtr_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); @@ -1084,7 +1084,7 @@ out: return rc; } -static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, +static int qrtr_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index 69f53625a049..80e341d2f8a4 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -24,13 +24,25 @@ static void qcom_mhi_qrtr_dl_callback(struct mhi_device *mhi_dev, struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); int rc; - if (!qdev || mhi_res->transaction_status) + if (!qdev || (mhi_res->transaction_status && mhi_res->transaction_status != -ENOTCONN)) return; + /* Channel got reset. So just free the buffer */ + if (mhi_res->transaction_status == -ENOTCONN) { + devm_kfree(&mhi_dev->dev, mhi_res->buf_addr); + return; + } + rc = qrtr_endpoint_post(&qdev->ep, mhi_res->buf_addr, mhi_res->bytes_xferd); if (rc == -EINVAL) dev_err(qdev->dev, "invalid ipcrouter packet\n"); + + /* Done with the buffer, now recycle it for future use */ + rc = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, mhi_res->buf_addr, + mhi_dev->mhi_cntrl->buffer_len, MHI_EOT); + if (rc) + dev_err(&mhi_dev->dev, "Failed to recycle the buffer: %d\n", rc); } /* From QRTR to MHI */ @@ -72,6 +84,29 @@ free_skb: return rc; } +static int qcom_mhi_qrtr_queue_dl_buffers(struct mhi_device *mhi_dev) +{ + u32 free_desc; + void *buf; + int ret; + + free_desc = mhi_get_free_desc_count(mhi_dev, DMA_FROM_DEVICE); + while (free_desc--) { + buf = devm_kmalloc(&mhi_dev->dev, mhi_dev->mhi_cntrl->buffer_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, buf, mhi_dev->mhi_cntrl->buffer_len, + MHI_EOT); + if (ret) { + dev_err(&mhi_dev->dev, "Failed to queue buffer: %d\n", ret); + return ret; + } + } + + return 0; +} + static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) { @@ -87,20 +122,30 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, qdev->ep.xmit = qcom_mhi_qrtr_send; dev_set_drvdata(&mhi_dev->dev, qdev); - rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); - if (rc) - return rc; /* start channels */ - rc = mhi_prepare_for_transfer_autoqueue(mhi_dev); - if (rc) { - qrtr_endpoint_unregister(&qdev->ep); + rc = mhi_prepare_for_transfer(mhi_dev); + if (rc) return rc; - } + + rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); + if (rc) + goto err_unprepare; + + rc = qcom_mhi_qrtr_queue_dl_buffers(mhi_dev); + if (rc) + goto err_unregister; dev_dbg(qdev->dev, "Qualcomm MHI QRTR driver probed\n"); return 0; + +err_unregister: + qrtr_endpoint_unregister(&qdev->ep); +err_unprepare: + mhi_unprepare_from_transfer(mhi_dev); + + return rc; } static void qcom_mhi_qrtr_remove(struct mhi_device *mhi_dev) @@ -151,11 +196,13 @@ static int __maybe_unused qcom_mhi_qrtr_pm_resume_early(struct device *dev) if (state == MHI_STATE_M3) return 0; - rc = mhi_prepare_for_transfer_autoqueue(mhi_dev); - if (rc) + rc = mhi_prepare_for_transfer(mhi_dev); + if (rc) { dev_err(dev, "failed to prepare for autoqueue transfer %d\n", rc); + return rc; + } - return rc; + return qcom_mhi_qrtr_queue_dl_buffers(mhi_dev); } static const struct dev_pm_ops qcom_mhi_qrtr_pm_ops = { diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 3de9350cbf30..3203b2220860 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -78,7 +78,7 @@ static struct qrtr_node *node_get(unsigned int node_id) return node; /* If node didn't exist, allocate and insert it to the tree */ - node = kzalloc(sizeof(*node), GFP_KERNEL); + node = kzalloc_obj(*node); if (!node) return NULL; @@ -229,7 +229,7 @@ static struct qrtr_server *server_add(unsigned int service, if (!service || !port) return NULL; - srv = kzalloc(sizeof(*srv), GFP_KERNEL); + srv = kzalloc_obj(*srv); if (!srv) return NULL; @@ -534,7 +534,7 @@ static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, if (from->sq_node != qrtr_ns.local_node) return -EINVAL; - lookup = kzalloc(sizeof(*lookup), GFP_KERNEL); + lookup = kzalloc_obj(*lookup); if (!lookup) return -ENOMEM; @@ -714,7 +714,7 @@ int qrtr_ns_init(void) sq.sq_port = QRTR_PORT_CTRL; qrtr_ns.local_node = sq.sq_node; - ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq)); + ret = kernel_bind(qrtr_ns.sock, (struct sockaddr_unsized *)&sq, sizeof(sq)); if (ret < 0) { pr_err("failed to bind to socket\n"); goto err_wq; diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c index 304b41fea5ab..9726db31fc7e 100644 --- a/net/qrtr/tun.c +++ b/net/qrtr/tun.c @@ -33,7 +33,7 @@ static int qrtr_tun_open(struct inode *inode, struct file *filp) struct qrtr_tun *tun; int ret; - tun = kzalloc(sizeof(*tun), GFP_KERNEL); + tun = kzalloc_obj(*tun); if (!tun) return -ENOMEM; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 4a7217fbeab6..b396c673dfaf 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -533,7 +533,7 @@ out: } -static int rds_connect(struct socket *sock, struct sockaddr *uaddr, +static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/rds/bind.c b/net/rds/bind.c index 97a29172a8ee..f800d920d969 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -160,7 +160,7 @@ void rds_remove_bound(struct rds_sock *rs) rs->rs_bound_addr = in6addr_any; } -int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); diff --git a/net/rds/cong.c b/net/rds/cong.c index 8b689ebbd5b5..3133b91f9e69 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -143,7 +143,7 @@ static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) unsigned long i; unsigned long flags; - map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); + map = kzalloc_obj(struct rds_cong_map); if (!map) return NULL; @@ -242,7 +242,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); } rcu_read_unlock(); } diff --git a/net/rds/connection.c b/net/rds/connection.c index 68bc88cce84e..412441aaa298 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -169,6 +169,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; + struct rds_conn_path *free_cp = NULL; unsigned long flags; int ret, i; int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); @@ -196,7 +197,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn = ERR_PTR(-ENOMEM); goto out; } - conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp); + conn->c_path = kzalloc_objs(struct rds_conn_path, npaths, gfp); if (!conn->c_path) { kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-ENOMEM); @@ -269,6 +270,11 @@ static struct rds_connection *__rds_conn_create(struct net *net, __rds_conn_path_init(conn, &conn->c_path[i], is_outgoing); conn->c_path[i].cp_index = i; + conn->c_path[i].cp_wq = + alloc_ordered_workqueue("krds_cp_wq#%lu/%d", 0, + rds_conn_count, i); + if (!conn->c_path[i].cp_wq) + conn->c_path[i].cp_wq = rds_wq; } rcu_read_lock(); if (rds_destroy_pending(conn)) @@ -277,7 +283,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); - kfree(conn->c_path); + free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; @@ -300,7 +306,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, /* Creating passive conn */ if (parent->c_passive) { trans->conn_free(conn->c_path[0].cp_transport_data); - kfree(conn->c_path); + free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { @@ -327,7 +333,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (cp->cp_transport_data) trans->conn_free(cp->cp_transport_data); } - kfree(conn->c_path); + free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = found; } else { @@ -342,6 +348,13 @@ static struct rds_connection *__rds_conn_create(struct net *net, rcu_read_unlock(); out: + if (free_cp) { + for (i = 0; i < npaths; i++) + if (free_cp[i].cp_wq != rds_wq) + destroy_workqueue(free_cp[i].cp_wq); + kfree(free_cp); + } + return conn; } @@ -382,6 +395,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp) if (!rds_conn_path_transition(cp, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_DISCONNECTING) && + !rds_conn_path_transition(cp, RDS_CONN_RESETTING, RDS_CONN_DISCONNECTING)) { rds_conn_path_error(cp, "shutdown called in state %d\n", @@ -427,13 +442,24 @@ void rds_conn_shutdown(struct rds_conn_path *cp) * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); + + clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); + if (conn->c_trans->t_mp_capable && + cp->cp_index == 0) + rds_send_ping(conn, 0); rds_queue_reconnect(cp); } else { rcu_read_unlock(); } + + /* we do not hold the socket lock here but it is safe because + * fan-out is disabled when calling conn_slots_available() + */ + if (conn->c_trans->conn_slots_available) + conn->c_trans->conn_slots_available(conn, false); } /* destroy a single rds_conn_path. rds_conn_destroy() iterates over @@ -469,6 +495,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) WARN_ON(delayed_work_pending(&cp->cp_conn_w)); WARN_ON(work_pending(&cp->cp_down_w)); + if (cp->cp_wq != rds_wq) { + destroy_workqueue(cp->cp_wq); + cp->cp_wq = NULL; + } + cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } @@ -884,7 +915,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) rcu_read_unlock(); return; } - queue_work(rds_wq, &cp->cp_down_w); + queue_work(cp->cp_wq, &cp->cp_down_w); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); @@ -909,7 +940,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) - queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); diff --git a/net/rds/ib.c b/net/rds/ib.c index 9826fe7f9d00..ac6affa33ce7 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -172,9 +172,7 @@ static int rds_ib_add_one(struct ib_device *device) rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; - rds_ibdev->vector_load = kcalloc(device->num_comp_vectors, - sizeof(int), - GFP_KERNEL); + rds_ibdev->vector_load = kzalloc_objs(int, device->num_comp_vectors); if (!rds_ibdev->vector_load) { pr_err("RDS/IB: %s failed to allocate vector memory\n", __func__); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 26b069e1999d..0c64c504f79d 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1203,7 +1203,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) int ret; /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_ib_connection), gfp); + ic = kzalloc_obj(struct rds_ib_connection, gfp); if (!ic) return -ENOMEM; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 6585164c7059..077f7041df15 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -67,7 +67,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; - i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); + i_ipaddr = kmalloc_obj(*i_ipaddr); if (!i_ipaddr) return -ENOMEM; @@ -585,7 +585,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, if (key_ret) *key_ret = ib_mr->rkey; - ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + ibmr = kzalloc_obj(*ibmr); if (!ibmr) { ib_dereg_mr(ib_mr); ret = -ENOMEM; @@ -641,7 +641,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, { struct rds_ib_mr_pool *pool; - pool = kzalloc(sizeof(*pool), GFP_KERNEL); + pool = kzalloc_obj(*pool); if (!pool) return ERR_PTR(-ENOMEM); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 4248dfa816eb..357128d34a54 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -457,7 +457,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) (must_wake || (can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || rds_ib_ring_empty(&ic->i_recv_ring))) { - queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + queue_delayed_work(conn->c_path->cp_wq, &conn->c_recv_w, 1); } if (can_wait) cond_resched(); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 4190b90ff3b1..fcd04c29f543 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -297,7 +297,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0); /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { @@ -419,7 +419,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0); WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); @@ -577,16 +577,42 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ if (rm->rdma.op_active) { - struct rds_ext_header_rdma ext_hdr; + struct rds_ext_header_rdma ext_hdr = {}; + struct rds_ext_header_rdma_bytes + rdma_bytes_ext_hdr = {}; ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); - rds_message_add_extension(&rm->m_inc.i_hdr, - RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + if (rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, + &ext_hdr)) { + /* prepare the rdma bytes ext header */ + rdma_bytes_ext_hdr.h_rflags = + rm->rdma.op_write ? + RDS_FLAG_RDMA_WR_BYTES : + RDS_FLAG_RDMA_RD_BYTES; + rdma_bytes_ext_hdr.h_rdma_bytes = + cpu_to_be32(rm->rdma.op_bytes); + } else { + rdsdebug("RDS_EXTHDR_RDMA dropped"); + } + + if (rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA_BYTES, + &rdma_bytes_ext_hdr)) { + /* rdma bytes ext header was added successfully, + * notify the remote side via flag in header + */ + rm->m_inc.i_hdr.h_flags |= + RDS_FLAG_EXTHDR_EXTENSION; + } else { + rdsdebug("RDS_EXTHDR_RDMA_BYTES dropped"); + } } - if (rm->m_rdma_cookie) { - rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, - rds_rdma_cookie_key(rm->m_rdma_cookie), - rds_rdma_cookie_offset(rm->m_rdma_cookie)); + if (rm->m_rdma_cookie && + !rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie))) { + rdsdebug("RDS_EXTHDR_RDMA_DEST dropped\n"); } /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so diff --git a/net/rds/info.c b/net/rds/info.c index b6b46a8214a0..f1b29994934a 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -187,7 +187,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) >> PAGE_SHIFT; - pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kmalloc_objs(struct page *, nr_pages); if (!pages) { ret = -ENOMEM; goto out; diff --git a/net/rds/loop.c b/net/rds/loop.c index 1d73ad79c847..ac9295a766b1 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -137,7 +137,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) struct rds_loop_connection *lc; unsigned long flags; - lc = kzalloc(sizeof(struct rds_loop_connection), gfp); + lc = kzalloc_obj(struct rds_loop_connection, gfp); if (!lc) return -ENOMEM; diff --git a/net/rds/message.c b/net/rds/message.c index 199a899a43e9..eaa6f22601a4 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -44,8 +44,10 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes), [RDS_EXTHDR_NPATHS] = sizeof(__be16), [RDS_EXTHDR_GEN_NUM] = sizeof(__be32), +[RDS_EXTHDR_SPORT_IDX] = 1, }; void rds_message_addref(struct rds_message *rm) @@ -191,31 +193,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport, hdr->h_sport = sport; hdr->h_dport = dport; hdr->h_sequence = cpu_to_be64(seq); - hdr->h_exthdr[0] = RDS_EXTHDR_NONE; + /* see rds_find_next_ext_space for reason why we memset the + * ext header + */ + memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE); } EXPORT_SYMBOL_GPL(rds_message_populate_header); -int rds_message_add_extension(struct rds_header *hdr, unsigned int type, - const void *data, unsigned int len) +/* + * Find the next place we can add an RDS header extension with + * specific length. Extension headers are pushed one after the + * other. In the following, the number after the colon is the number + * of bytes: + * + * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE + * + * If the extension headers fill the complete extension header space + * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted. + */ +static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len, + u8 **ext_start) { - unsigned int ext_len = sizeof(u8) + len; - unsigned char *dst; + unsigned int ext_len; + unsigned int type; + int ind = 0; + + while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) { + if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) { + *ext_start = hdr->h_exthdr + ind; + return 0; + } - /* For now, refuse to add more than one extension header */ - if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) - return 0; + type = hdr->h_exthdr[ind]; + + ext_len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0; + WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type); + if (!ext_len) + return -EINVAL; + + /* ind points to a valid ext hdr with known length */ + ind += 1 + ext_len; + } + + /* no room for extension */ + return -ENOSPC; +} + +/* The ext hdr space is prefilled with zero from the kzalloc() */ +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data) +{ + unsigned char *dst; + unsigned int len; - if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) + len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0; + if (!len) return 0; - if (ext_len >= RDS_HEADER_EXT_SPACE) + if (rds_find_next_ext_space(hdr, len, &dst)) return 0; - dst = hdr->h_exthdr; *dst++ = type; memcpy(dst, data, len); - dst[len] = RDS_EXTHDR_NONE; return 1; } EXPORT_SYMBOL_GPL(rds_message_add_extension); @@ -272,7 +312,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); ext_hdr.h_rdma_offset = cpu_to_be32(offset); - return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); + return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr); } EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); @@ -375,7 +415,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * */ sg = rm->data.op_sg; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc_obj(*info); if (!info) return -ENOMEM; INIT_LIST_HEAD(&info->rs_zcookie_next); diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 00dbcd4d28e6..aa6465dc742c 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -228,13 +228,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, args->vec.addr, args->vec.bytes, nr_pages); /* XXX clamp nr_pages to limit the size of this alloc? */ - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kzalloc_objs(struct page *, nr_pages); if (!pages) { ret = -ENOMEM; goto out; } - mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); + mr = kzalloc_obj(struct rds_mr); if (!mr) { ret = -ENOMEM; goto out; @@ -269,7 +269,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } else { nents = ret; - sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); + sg = kmalloc_objs(*sg, nents); if (!sg) { ret = -ENOMEM; goto out; @@ -571,9 +571,7 @@ int rds_rdma_extra_size(struct rds_rdma_args *args, if (args->nr_local > UIO_MAXIOV) return -EMSGSIZE; - iov->iov = kcalloc(args->nr_local, - sizeof(struct rds_iovec), - GFP_KERNEL); + iov->iov = kzalloc_objs(struct rds_iovec, args->nr_local); if (!iov->iov) return -ENOMEM; @@ -654,7 +652,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out_ret; } - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kzalloc_objs(struct page *, nr_pages); if (!pages) { ret = -ENOMEM; goto out_ret; @@ -681,7 +679,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ - op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + op->op_notifier = kmalloc_obj(struct rds_notifier); if (!op->op_notifier) { ret = -ENOMEM; goto out_pages; @@ -730,8 +728,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = -EOPNOTSUPP; goto out_pages; } - local_odp_mr = - kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); + local_odp_mr = kzalloc_obj(*local_odp_mr); if (!local_odp_mr) { ret = -ENOMEM; goto out_pages; @@ -937,7 +934,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ - rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + rm->atomic.op_notifier = kmalloc_obj(*rm->atomic.op_notifier); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; diff --git a/net/rds/rds.h b/net/rds/rds.h index 5b1c072e2e7f..6e0790e4b570 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -118,6 +118,7 @@ struct rds_conn_path { void *cp_transport_data; + struct workqueue_struct *cp_wq; atomic_t cp_state; unsigned long cp_send_gen; unsigned long cp_flags; @@ -146,6 +147,7 @@ struct rds_connection { c_ping_triggered:1, c_pad_to_32:29; int c_npaths; + bool c_with_sport_idx; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -168,6 +170,8 @@ struct rds_connection { u32 c_my_gen_num; u32 c_peer_gen_num; + + u64 c_cp0_mprds_catchup_tx_seq; }; static inline @@ -182,10 +186,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net) write_pnet(&conn->c_net, net); } -#define RDS_FLAG_CONG_BITMAP 0x01 -#define RDS_FLAG_ACK_REQUIRED 0x02 -#define RDS_FLAG_RETRANSMITTED 0x04 -#define RDS_MAX_ADV_CREDIT 255 +#define RDS_FLAG_CONG_BITMAP 0x01 +#define RDS_FLAG_ACK_REQUIRED 0x02 +#define RDS_FLAG_RETRANSMITTED 0x04 +#define RDS_FLAG_EXTHDR_EXTENSION 0x20 +#define RDS_MAX_ADV_CREDIT 255 /* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping * probe to exchange control information before establishing a connection. @@ -257,13 +262,29 @@ struct rds_ext_header_rdma_dest { __be32 h_rdma_offset; }; +/* + * This extension header tells the peer about delivered RDMA byte count. + */ +#define RDS_EXTHDR_RDMA_BYTES 4 + +struct rds_ext_header_rdma_bytes { + __be32 h_rdma_bytes; /* byte count */ + u8 h_rflags; /* direction of RDMA, write or read */ + u8 h_pad[3]; +}; + +#define RDS_FLAG_RDMA_WR_BYTES 0x01 +#define RDS_FLAG_RDMA_RD_BYTES 0x02 + /* Extension header announcing number of paths. * Implicit length = 2 bytes. */ #define RDS_EXTHDR_NPATHS 5 #define RDS_EXTHDR_GEN_NUM 6 +#define RDS_EXTHDR_SPORT_IDX 8 #define __RDS_EXTHDR_MAX 16 /* for now */ + #define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) #define RDS_MSG_RX_HDR 0 #define RDS_MSG_RX_START 1 @@ -505,33 +526,6 @@ struct rds_notifier { */ #define RDS_TRANS_LOOP 3 -/** - * struct rds_transport - transport specific behavioural hooks - * - * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send - * part of a message. The caller serializes on the send_sem so this - * doesn't need to be reentrant for a given conn. The header must be - * sent before the data payload. .xmit must be prepared to send a - * message with no data payload. .xmit should return the number of - * bytes that were sent down the connection, including header bytes. - * Returning 0 tells the caller that it doesn't need to perform any - * additional work now. This is usually the case when the transport has - * filled the sending queue for its connection and will handle - * triggering the rds thread to continue the send when space becomes - * available. Returning -EAGAIN tells the caller to retry the send - * immediately. Returning -ENOMEM tells the caller to retry the send at - * some point in the future. - * - * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once - * it returns the connection can not call rds_recv_incoming(). - * This will only be called once after conn_connect returns - * non-zero success and will The caller serializes this with - * the send and connecting paths (xmit_* and conn_*). The - * transport is responsible for other serialization, including - * rds_recv_incoming(). This is called in process context but - * should try hard not to block. - */ - struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; @@ -544,10 +538,49 @@ struct rds_transport { __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); + + /* + * conn_slots_available is invoked when a previously unavailable + * connection slot becomes available again. rds_tcp_accept_one_path may + * return -ENOBUFS if it cannot find an available slot, and then stashes + * the new socket in "rds_tcp_accepted_sock". This function re-issues + * `rds_tcp_accept_one_path`, which picks up the stashed socket and + * continuing where it left with "-ENOBUFS" last time. This ensures + * messages received on the new socket are not discarded when no + * connection path was available at the time. + */ + void (*conn_slots_available)(struct rds_connection *conn, bool fan_out); int (*conn_path_connect)(struct rds_conn_path *cp); + + /* + * conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rds_recv_incoming(). + * This will only be called once after conn_connect returns + * non-zero success and will The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rds_recv_incoming(). This is called in process context but + * should try hard not to block. + */ void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_path_complete)(struct rds_conn_path *cp); + + /* + * .xmit is called by rds_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + */ int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); @@ -682,42 +715,43 @@ static inline int rds_sk_rcvbuf(struct rds_sock *rs) } struct rds_statistics { - uint64_t s_conn_reset; - uint64_t s_recv_drop_bad_checksum; - uint64_t s_recv_drop_old_seq; - uint64_t s_recv_drop_no_sock; - uint64_t s_recv_drop_dead_sock; - uint64_t s_recv_deliver_raced; - uint64_t s_recv_delivered; - uint64_t s_recv_queued; - uint64_t s_recv_immediate_retry; - uint64_t s_recv_delayed_retry; - uint64_t s_recv_ack_required; - uint64_t s_recv_rdma_bytes; - uint64_t s_recv_ping; - uint64_t s_send_queue_empty; - uint64_t s_send_queue_full; - uint64_t s_send_lock_contention; - uint64_t s_send_lock_queue_raced; - uint64_t s_send_immediate_retry; - uint64_t s_send_delayed_retry; - uint64_t s_send_drop_acked; - uint64_t s_send_ack_required; - uint64_t s_send_queued; - uint64_t s_send_rdma; - uint64_t s_send_rdma_bytes; - uint64_t s_send_pong; - uint64_t s_page_remainder_hit; - uint64_t s_page_remainder_miss; - uint64_t s_copy_to_user; - uint64_t s_copy_from_user; - uint64_t s_cong_update_queued; - uint64_t s_cong_update_received; - uint64_t s_cong_send_error; - uint64_t s_cong_send_blocked; - uint64_t s_recv_bytes_added_to_socket; - uint64_t s_recv_bytes_removed_from_socket; - uint64_t s_send_stuck_rm; + u64 s_conn_reset; + u64 s_recv_drop_bad_checksum; + u64 s_recv_drop_old_seq; + u64 s_recv_drop_no_sock; + u64 s_recv_drop_dead_sock; + u64 s_recv_deliver_raced; + u64 s_recv_delivered; + u64 s_recv_queued; + u64 s_recv_immediate_retry; + u64 s_recv_delayed_retry; + u64 s_recv_ack_required; + u64 s_recv_rdma_bytes; + u64 s_recv_ping; + u64 s_send_queue_empty; + u64 s_send_queue_full; + u64 s_send_lock_contention; + u64 s_send_lock_queue_raced; + u64 s_send_immediate_retry; + u64 s_send_delayed_retry; + u64 s_send_drop_acked; + u64 s_send_ack_required; + u64 s_send_queued; + u64 s_send_rdma; + u64 s_send_rdma_bytes; + u64 s_send_pong; + u64 s_page_remainder_hit; + u64 s_page_remainder_miss; + u64 s_copy_to_user; + u64 s_copy_from_user; + u64 s_cong_update_queued; + u64 s_cong_update_received; + u64 s_cong_send_error; + u64 s_cong_send_blocked; + u64 s_recv_bytes_added_to_socket; + u64 s_recv_bytes_removed_from_socket; + u64 s_send_stuck_rm; + u64 s_mprds_catchup_tx0_retries; }; /* af_rds.c */ @@ -735,7 +769,7 @@ extern wait_queue_head_t rds_poll_waitq; /* bind.c */ -int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id); @@ -858,7 +892,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); int rds_message_add_extension(struct rds_header *hdr, - unsigned int type, const void *data, unsigned int len); + unsigned int type, const void *data); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); diff --git a/net/rds/recv.c b/net/rds/recv.c index 66205d6924bf..4b3f9e4a8bfd 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -204,8 +204,14 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, struct rds_ext_header_version version; __be16 rds_npaths; __be32 rds_gen_num; + u8 dummy; } buffer; + bool new_with_sport_idx = false; u32 new_peer_gen_num = 0; + int new_npaths; + bool fan_out; + + new_npaths = conn->c_npaths; while (1) { len = sizeof(buffer); @@ -215,21 +221,48 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, /* Process extension header here */ switch (type) { case RDS_EXTHDR_NPATHS: - conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, - be16_to_cpu(buffer.rds_npaths)); + new_npaths = min_t(int, RDS_MPATH_WORKERS, + be16_to_cpu(buffer.rds_npaths)); break; case RDS_EXTHDR_GEN_NUM: new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num); break; + case RDS_EXTHDR_SPORT_IDX: + new_with_sport_idx = true; + break; default: pr_warn_ratelimited("ignoring unknown exthdr type " "0x%x\n", type); } } + + conn->c_with_sport_idx = new_with_sport_idx; + + if (new_npaths > 1 && new_npaths != conn->c_npaths) { + /* We're about to fan-out. + * Make sure that messages from cp_index#0 + * are sent prior to handling other lanes. + */ + struct rds_conn_path *cp0 = conn->c_path; + unsigned long flags; + + spin_lock_irqsave(&cp0->cp_lock, flags); + conn->c_cp0_mprds_catchup_tx_seq = cp0->cp_next_tx_seq; + spin_unlock_irqrestore(&cp0->cp_lock, flags); + fan_out = true; + } else { + fan_out = false; + } + /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ - conn->c_npaths = max_t(int, conn->c_npaths, 1); + conn->c_npaths = max_t(int, new_npaths, 1); + conn->c_ping_triggered = 0; rds_conn_peer_gen_update(conn, new_peer_gen_num); + + if (conn->c_npaths > 1 && + conn->c_trans->conn_slots_available) + conn->c_trans->conn_slots_available(conn, fan_out); } /* rds_start_mprds() will synchronously start multiple paths when appropriate. diff --git a/net/rds/send.c b/net/rds/send.c index 0b3d0ef2f008..a1039e422a38 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -120,6 +120,57 @@ static void release_in_xmit(struct rds_conn_path *cp) } /* + * Helper function for multipath fanout to ensure lane 0 transmits queued + * messages before other lanes to prevent out-of-order delivery. + * + * Returns true if lane 0 still has messages or false otherwise + */ +static bool rds_mprds_cp0_catchup(struct rds_connection *conn) +{ + struct rds_conn_path *cp0 = conn->c_path; + struct rds_message *rm0; + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&cp0->cp_lock, flags); + + /* the oldest / first message in the retransmit queue + * has to be at or beyond c_cp0_mprds_catchup_tx_seq + */ + if (!list_empty(&cp0->cp_retrans)) { + rm0 = list_entry(cp0->cp_retrans.next, struct rds_message, + m_conn_item); + if (be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) < + conn->c_cp0_mprds_catchup_tx_seq) { + /* the retransmit queue of cp_index#0 has not + * quite caught up yet + */ + ret = true; + goto unlock; + } + } + + /* the oldest / first message of the send queue + * has to be at or beyond c_cp0_mprds_catchup_tx_seq + */ + rm0 = cp0->cp_xmit_rm; + if (!rm0 && !list_empty(&cp0->cp_send_queue)) + rm0 = list_entry(cp0->cp_send_queue.next, struct rds_message, + m_conn_item); + if (rm0 && be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) < + conn->c_cp0_mprds_catchup_tx_seq) { + /* the send queue of cp_index#0 has not quite + * caught up yet + */ + ret = true; + } + +unlock: + spin_unlock_irqrestore(&cp0->cp_lock, flags); + return ret; +} + +/* * We're making the conscious trade-off here to only send one message * down the connection at a time. * Pro: @@ -248,6 +299,14 @@ restart: if (batch_count >= send_batch_count) goto over_batch; + /* make sure cp_index#0 caught up during fan-out in + * order to avoid lane races + */ + if (cp->cp_index > 0 && rds_mprds_cp0_catchup(conn)) { + rds_stats_inc(s_mprds_catchup_tx0_retries); + goto over_batch; + } + spin_lock_irqsave(&cp->cp_lock, flags); if (!list_empty(&cp->cp_send_queue)) { @@ -458,7 +517,8 @@ over_batch: if (rds_destroy_pending(cp->cp_conn)) ret = -ENETUNREACH; else - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + queue_delayed_work(cp->cp_wq, + &cp->cp_send_w, 1); rcu_read_unlock(); } else if (raced) { rds_stats_inc(s_send_lock_queue_raced); @@ -1041,39 +1101,6 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, return ret; } -static int rds_send_mprds_hash(struct rds_sock *rs, - struct rds_connection *conn, int nonblock) -{ - int hash; - - if (conn->c_npaths == 0) - hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); - else - hash = RDS_MPATH_HASH(rs, conn->c_npaths); - if (conn->c_npaths == 0 && hash != 0) { - rds_send_ping(conn, 0); - - /* The underlying connection is not up yet. Need to wait - * until it is up to be sure that the non-zero c_path can be - * used. But if we are interrupted, we have to use the zero - * c_path in case the connection ends up being non-MP capable. - */ - if (conn->c_npaths == 0) { - /* Cannot wait for the connection be made, so just use - * the base c_path. - */ - if (nonblock) - return 0; - if (wait_event_interruptible(conn->c_hs_waitq, - conn->c_npaths != 0)) - hash = 0; - } - if (conn->c_npaths == 1) - hash = 0; - } - return hash; -} - static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) { struct rds_rdma_args *args; @@ -1303,10 +1330,32 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) rs->rs_conn = conn; } - if (conn->c_trans->t_mp_capable) - cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)]; - else + if (conn->c_trans->t_mp_capable) { + /* Use c_path[0] until we learn that + * the peer supports more (c_npaths > 1) + */ + cpath = &conn->c_path[RDS_MPATH_HASH(rs, conn->c_npaths ? : 1)]; + } else { cpath = &conn->c_path[0]; + } + + /* If we're multipath capable and path 0 is down, queue reconnect + * and send a ping. This initiates the multipath handshake through + * rds_send_probe(), which sends RDS_EXTHDR_NPATHS to the peer, + * starting multipath capability negotiation. + */ + if (conn->c_trans->t_mp_capable && + !rds_conn_path_up(&conn->c_path[0])) { + /* Ensures that only one request is queued. And + * rds_send_ping() ensures that only one ping is + * outstanding. + */ + if (!test_and_set_bit(RDS_RECONNECT_PENDING, + &conn->c_path[0].cp_flags)) + queue_delayed_work(conn->c_path[0].cp_wq, + &conn->c_path[0].cp_conn_w, 0); + rds_send_ping(conn, 0); + } rm->m_conn_path = cpath; @@ -1380,11 +1429,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (rds_destroy_pending(cpath->cp_conn)) ret = -ENETUNREACH; else - queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); + queue_delayed_work(cpath->cp_wq, &cpath->cp_send_w, 1); rcu_read_unlock(); + + if (ret) + goto out; } - if (ret) - goto out; + rds_message_put(rm); for (ind = 0; ind < vct.indx; ind++) @@ -1456,24 +1507,26 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, cp->cp_conn->c_trans->t_mp_capable) { __be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); + u8 dummy = 0; rds_message_add_extension(&rm->m_inc.i_hdr, - RDS_EXTHDR_NPATHS, &npaths, - sizeof(npaths)); + RDS_EXTHDR_NPATHS, &npaths); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_GEN_NUM, - &my_gen_num, - sizeof(u32)); + &my_gen_num); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_SPORT_IDX, + &dummy); } spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - /* schedule the send work on rds_wq */ + /* schedule the send work on cp_wq */ rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 1); rcu_read_unlock(); rds_message_put(rm); diff --git a/net/rds/stats.c b/net/rds/stats.c index cb2e3d2cdf73..24ee22d09e8c 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -79,6 +79,7 @@ static const char *const rds_stat_names[] = { "recv_bytes_added_to_sock", "recv_bytes_freed_fromsock", "send_stuck_rm", + "mprds_catchup_tx0_retries", }; void rds_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 3cc2f303bf78..654e23d13e3d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -213,6 +213,8 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) sock->sk->sk_data_ready = sock->sk->sk_user_data; tc->t_sock = sock; + if (!tc->t_rtn) + tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid); tc->t_cpath = cp; tc->t_orig_data_ready = sock->sk->sk_data_ready; tc->t_orig_write_space = sock->sk->sk_write_space; @@ -371,16 +373,18 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) int ret = 0; for (i = 0; i < RDS_MPATH_WORKERS; i++) { - tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); + tc = kmem_cache_zalloc(rds_tcp_conn_slab, gfp); if (!tc) { ret = -ENOMEM; goto fail; } mutex_init(&tc->t_conn_path_lock); tc->t_sock = NULL; + tc->t_rtn = NULL; tc->t_tinc = NULL; tc->t_tinc_hdr_rem = sizeof(struct rds_header); tc->t_tinc_data_rem = 0; + init_waitqueue_head(&tc->t_recv_done_waitq); conn->c_path[i].cp_transport_data = tc; tc->t_cpath = &conn->c_path[i]; @@ -458,6 +462,7 @@ struct rds_transport rds_tcp_transport = { .recv_path = rds_tcp_recv_path, .conn_alloc = rds_tcp_conn_alloc, .conn_free = rds_tcp_conn_free, + .conn_slots_available = rds_tcp_conn_slots_available, .conn_path_connect = rds_tcp_conn_path_connect, .conn_path_shutdown = rds_tcp_conn_path_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, @@ -473,17 +478,7 @@ struct rds_transport rds_tcp_transport = { .t_unloading = rds_tcp_is_unloading, }; -static unsigned int rds_tcp_netid; - -/* per-network namespace private data for this module */ -struct rds_tcp_net { - struct socket *rds_tcp_listen_sock; - struct work_struct rds_tcp_accept_w; - struct ctl_table_header *rds_tcp_sysctl; - struct ctl_table *ctl_table; - int sndbuf_size; - int rcvbuf_size; -}; +int rds_tcp_netid; /* All module specific customizations to the RDS-TCP socket should be done in * rds_tcp_tune() and applied after socket creation. @@ -495,18 +490,24 @@ bool rds_tcp_tune(struct socket *sock) struct rds_tcp_net *rtn; tcp_sock_set_nodelay(sock->sk); - lock_sock(sk); /* TCP timer functions might access net namespace even after * a process which created this net namespace terminated. */ if (!sk->sk_net_refcnt) { - if (!maybe_get_net(net)) { - release_sock(sk); + if (!maybe_get_net(net)) return false; - } + /* + * sk_net_refcnt_upgrade() must be called before lock_sock() + * because it does a GFP_KERNEL allocation, which can trigger + * fs_reclaim and create a circular lock dependency with the + * socket lock. The fields it modifies (sk_net_refcnt, + * ns_tracker) are not accessed by any concurrent code path + * at this point. + */ sk_net_refcnt_upgrade(sk); put_net(net); } + lock_sock(sk); rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; @@ -526,15 +527,12 @@ static void rds_tcp_accept_worker(struct work_struct *work) struct rds_tcp_net, rds_tcp_accept_w); - while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0) + while (rds_tcp_accept_one(rtn) == 0) cond_resched(); } -void rds_tcp_accept_work(struct sock *sk) +void rds_tcp_accept_work(struct rds_tcp_net *rtn) { - struct net *net = sock_net(sk); - struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - queue_work(rds_wq, &rtn->rds_tcp_accept_w); } @@ -546,6 +544,8 @@ static __net_init int rds_tcp_init_net(struct net *net) memset(rtn, 0, sizeof(*rtn)); + mutex_init(&rtn->rds_tcp_accept_lock); + /* {snd, rcv}buf_size default to 0, which implies we let the * stack pick the value, and permit auto-tuning of buffer size. */ @@ -609,6 +609,8 @@ static void rds_tcp_kill_sock(struct net *net) rtn->rds_tcp_listen_sock = NULL; rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); + if (rtn->rds_tcp_accepted_sock) + sock_release(rtn->rds_tcp_accepted_sock); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 053aa7da87ef..39c86347188c 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -4,6 +4,21 @@ #define RDS_TCP_PORT 16385 +/* per-network namespace private data for this module */ +struct rds_tcp_net { + /* serialize "rds_tcp_accept_one" with "rds_tcp_accept_lock" + * to protect "rds_tcp_accepted_sock" + */ + struct mutex rds_tcp_accept_lock; + struct socket *rds_tcp_listen_sock; + struct socket *rds_tcp_accepted_sock; + struct work_struct rds_tcp_accept_w; + struct ctl_table_header *rds_tcp_sysctl; + const struct ctl_table *ctl_table; + int sndbuf_size; + int rcvbuf_size; +}; + struct rds_tcp_incoming { struct rds_incoming ti_inc; struct sk_buff_head ti_skb_list; @@ -19,6 +34,8 @@ struct rds_tcp_connection { */ struct mutex t_conn_path_lock; struct socket *t_sock; + u32 t_client_port_group; + struct rds_tcp_net *t_rtn; void *t_orig_write_space; void *t_orig_data_ready; void *t_orig_state_change; @@ -38,6 +55,9 @@ struct rds_tcp_connection { u32 t_last_sent_nxt; u32 t_last_expected_una; u32 t_last_seen_una; + + /* for rds_tcp_conn_path_shutdown */ + wait_queue_head_t t_recv_done_waitq; }; struct rds_tcp_statistics { @@ -49,6 +69,7 @@ struct rds_tcp_statistics { }; /* tcp.c */ +extern int rds_tcp_netid; bool rds_tcp_tune(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); @@ -57,7 +78,7 @@ void rds_tcp_restore_callbacks(struct socket *sock, u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); extern struct rds_transport rds_tcp_transport; -void rds_tcp_accept_work(struct sock *sk); +void rds_tcp_accept_work(struct rds_tcp_net *rtn); int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, __u32 scope_id); /* tcp_connect.c */ @@ -69,7 +90,8 @@ void rds_tcp_state_change(struct sock *sk); struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); -int rds_tcp_accept_one(struct socket *sock); +void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out); +int rds_tcp_accept_one(struct rds_tcp_net *rtn); void rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); @@ -86,6 +108,7 @@ void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp); void rds_tcp_xmit_path_complete(struct rds_conn_path *cp); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); +int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack); void rds_tcp_write_space(struct sock *sk); /* tcp_stats.c */ diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index a0046e99d6df..b77c88ffb199 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -75,8 +75,16 @@ void rds_tcp_state_change(struct sock *sk) rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } break; + case TCP_CLOSING: + case TCP_TIME_WAIT: + if (wq_has_sleeper(&tc->t_recv_done_waitq)) + wake_up(&tc->t_recv_done_waitq); + break; case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: case TCP_CLOSE: + if (wq_has_sleeper(&tc->t_recv_done_waitq)) + wake_up(&tc->t_recv_done_waitq); rds_conn_path_drop(cp, false); break; default: @@ -93,6 +101,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) struct sockaddr_in6 sin6; struct sockaddr_in sin; struct sockaddr *addr; + int port_low, port_high, port; + int port_groups, groups_left; int addrlen; bool isv6; int ret; @@ -145,7 +155,26 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) addrlen = sizeof(sin); } - ret = kernel_bind(sock, addr, addrlen); + /* encode cp->cp_index in lowest bits of source-port */ + inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high); + port_low = ALIGN(port_low, RDS_MPATH_WORKERS); + port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS; + ret = -EADDRINUSE; + groups_left = port_groups; + while (groups_left-- > 0 && ret) { + if (++tc->t_client_port_group >= port_groups) + tc->t_client_port_group = 0; + port = port_low + + tc->t_client_port_group * RDS_MPATH_WORKERS + + cp->cp_index; + + if (isv6) + sin6.sin6_port = htons(port); + else + sin.sin_port = htons(port); + ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, + addrlen); + } if (ret) { rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); @@ -173,7 +202,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); + ret = kernel_connect(sock, (struct sockaddr_unsized *)addr, addrlen, O_NONBLOCK); rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) @@ -205,18 +234,58 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; + struct sock *sk; + unsigned int rounds; rdsdebug("shutting down conn %p tc %p sock %p\n", cp->cp_conn, tc, sock); if (sock) { + sk = sock->sk; if (rds_destroy_pending(cp->cp_conn)) - sock_no_linger(sock->sk); - sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); - lock_sock(sock->sk); + sock_no_linger(sk); + + sock->ops->shutdown(sock, SHUT_WR); + + /* after sending FIN, + * wait until we processed all incoming messages + * and we're sure that there won't be any more: + * i.e. state CLOSING, TIME_WAIT, CLOSE_WAIT, + * LAST_ACK, or CLOSE (RFC 793). + * + * Give up waiting after 5 seconds and allow messages + * to theoretically get dropped, if the TCP transition + * didn't happen. + */ + rounds = 0; + do { + /* we need to ensure messages are dequeued here + * since "rds_recv_worker" only dispatches messages + * while the connection is still in RDS_CONN_UP + * and there is no guarantee that "rds_tcp_data_ready" + * was called nor that "sk_data_ready" still points to + * it. + */ + rds_tcp_recv_path(cp); + } while (!wait_event_timeout(tc->t_recv_done_waitq, + (sk->sk_state == TCP_CLOSING || + sk->sk_state == TCP_TIME_WAIT || + sk->sk_state == TCP_CLOSE_WAIT || + sk->sk_state == TCP_LAST_ACK || + sk->sk_state == TCP_CLOSE) && + skb_queue_empty_lockless(&sk->sk_receive_queue), + msecs_to_jiffies(100)) && + ++rounds < 50); + lock_sock(sk); + + /* discard messages that the peer received already */ + tc->t_last_seen_una = rds_tcp_snd_una(tc); + rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), + rds_tcp_is_acked); + rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ - release_sock(sock->sk); + release_sock(sk); sock_release(sock); } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 91e34af3fe5d..08a506aa7ce7 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -35,6 +35,8 @@ #include <linux/in.h> #include <net/tcp.h> #include <trace/events/sock.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include "rds.h" #include "tcp.h" @@ -54,49 +56,102 @@ void rds_tcp_keepalive(struct socket *sock) tcp_sock_set_keepintvl(sock->sk, keepidle); } +static int +rds_tcp_get_peer_sport(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + return -1; + + return ntohs(READ_ONCE(inet_sk(sk)->inet_dport)); +} + /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the * client's ipaddr < server's ipaddr. Otherwise, close the accepted * socket and force a reconneect from smaller -> larger ip addr. The reason * we special case cp_index 0 is to allow the rds probe ping itself to itself * get through efficiently. - * Since reconnects are only initiated from the node with the numerically - * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side - * by moving them to CONNECTING in this function. */ -static -struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) +static struct rds_tcp_connection * +rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock) { - int i; - int npaths = max_t(int, 1, conn->c_npaths); + int sport, npaths, i_min, i_max, i; - /* for mprds, all paths MUST be initiated by the peer - * with the smaller address. - */ - if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { - /* Make sure we initiate at least one path if this - * has not already been done; rds_start_mprds() will - * take care of additional paths, if necessary. - */ - if (npaths == 1) - rds_conn_path_connect_if_down(&conn->c_path[0]); - return NULL; + if (conn->c_with_sport_idx) + /* cp->cp_index is encoded in lowest bits of source-port */ + sport = rds_tcp_get_peer_sport(sock); + else + sport = -1; + + npaths = max_t(int, 1, conn->c_npaths); + + if (sport >= 0) { + i_min = sport % npaths; + i_max = i_min; + } else { + i_min = 0; + i_max = npaths - 1; } - for (i = 0; i < npaths; i++) { + for (i = i_min; i <= i_max; i++) { struct rds_conn_path *cp = &conn->c_path[i]; if (rds_conn_path_transition(cp, RDS_CONN_DOWN, - RDS_CONN_CONNECTING) || - rds_conn_path_transition(cp, RDS_CONN_ERROR, - RDS_CONN_CONNECTING)) { + RDS_CONN_CONNECTING)) return cp->cp_transport_data; - } } + return NULL; } -int rds_tcp_accept_one(struct socket *sock) +void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out) +{ + struct rds_tcp_connection *tc; + struct rds_tcp_net *rtn; + struct socket *sock; + int sport, npaths; + + if (rds_destroy_pending(conn)) + return; + + tc = conn->c_path->cp_transport_data; + rtn = tc->t_rtn; + if (!rtn) + return; + + sock = tc->t_sock; + + /* During fan-out, check that the connection we already + * accepted in slot#0 carried the proper source port modulo. + */ + if (fan_out && conn->c_with_sport_idx && sock && + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) > 0) { + /* cp->cp_index is encoded in lowest bits of source-port */ + sport = rds_tcp_get_peer_sport(sock); + npaths = max_t(int, 1, conn->c_npaths); + if (sport >= 0 && sport % npaths != 0) + /* peer initiated with a non-#0 lane first */ + rds_conn_path_drop(conn->c_path, 0); + } + + /* As soon as a connection went down, + * it is safe to schedule a "rds_tcp_accept_one" + * attempt even if there are no connections pending: + * Function "rds_tcp_accept_one" won't block + * but simply return -EAGAIN in that case. + * + * Doing so is necessary to address the case where an + * incoming connection on "rds_tcp_listen_sock" is ready + * to be acccepted prior to a free slot being available: + * the -ENOBUFS case in "rds_tcp_accept_one". + */ + rds_tcp_accept_work(rtn); +} + +int rds_tcp_accept_one(struct rds_tcp_net *rtn) { + struct socket *listen_sock = rtn->rds_tcp_listen_sock; struct socket *new_sock = NULL; struct rds_connection *conn; int ret; @@ -104,23 +159,30 @@ int rds_tcp_accept_one(struct socket *sock) struct rds_tcp_connection *rs_tcp = NULL; int conn_state; struct rds_conn_path *cp; + struct sock *sk; struct in6_addr *my_addr, *peer_addr; #if !IS_ENABLED(CONFIG_IPV6) struct in6_addr saddr, daddr; #endif int dev_if = 0; - if (!sock) /* module unload or netns delete in progress */ + if (!listen_sock) /* module unload or netns delete in progress */ return -ENETUNREACH; - ret = kernel_accept(sock, &new_sock, O_NONBLOCK); - if (ret) - return ret; + mutex_lock(&rtn->rds_tcp_accept_lock); + new_sock = rtn->rds_tcp_accepted_sock; + rtn->rds_tcp_accepted_sock = NULL; - rds_tcp_keepalive(new_sock); - if (!rds_tcp_tune(new_sock)) { - ret = -EINVAL; - goto out; + if (!new_sock) { + ret = kernel_accept(listen_sock, &new_sock, O_NONBLOCK); + if (ret) + goto out; + + rds_tcp_keepalive(new_sock); + if (!rds_tcp_tune(new_sock)) { + ret = -EINVAL; + goto out; + } } inet = inet_sk(new_sock->sk); @@ -135,7 +197,7 @@ int rds_tcp_accept_one(struct socket *sock) peer_addr = &daddr; #endif rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n", - sock->sk->sk_family, + listen_sock->sk->sk_family, my_addr, ntohs(inet->inet_sport), peer_addr, ntohs(inet->inet_dport)); @@ -155,13 +217,13 @@ int rds_tcp_accept_one(struct socket *sock) } #endif - if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) { + if (!rds_tcp_laddr_check(sock_net(listen_sock->sk), peer_addr, dev_if)) { /* local address connection is only allowed via loopback */ ret = -EOPNOTSUPP; goto out; } - conn = rds_conn_create(sock_net(sock->sk), + conn = rds_conn_create(sock_net(listen_sock->sk), my_addr, peer_addr, &rds_tcp_transport, 0, GFP_KERNEL, dev_if); @@ -174,15 +236,62 @@ int rds_tcp_accept_one(struct socket *sock) * If the client reboots, this conn will need to be cleaned up. * rds_tcp_state_change() will do that cleanup */ - rs_tcp = rds_tcp_accept_one_path(conn); - if (!rs_tcp) + if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) < 0) { + /* Try to obtain a free connection slot. + * If unsuccessful, we need to preserve "new_sock" + * that we just accepted, since its "sk_receive_queue" + * may contain messages already that have been acknowledged + * to and discarded by the sender. + * We must not throw those away! + */ + rs_tcp = rds_tcp_accept_one_path(conn, new_sock); + if (!rs_tcp) { + /* It's okay to stash "new_sock", since + * "rds_tcp_conn_slots_available" triggers + * "rds_tcp_accept_one" again as soon as one of the + * connection slots becomes available again + */ + rtn->rds_tcp_accepted_sock = new_sock; + new_sock = NULL; + ret = -ENOBUFS; + goto out; + } + } else { + /* This connection request came from a peer with + * a larger address. + * Function "rds_tcp_state_change" makes sure + * that the connection doesn't transition + * to state "RDS_CONN_UP", and therefore + * we should not have received any messages + * on this socket yet. + * This is the only case where it's okay to + * not dequeue messages from "sk_receive_queue". + */ + if (conn->c_npaths <= 1) + rds_conn_path_connect_if_down(&conn->c_path[0]); + rs_tcp = NULL; goto rst_nsk; + } + mutex_lock(&rs_tcp->t_conn_path_lock); cp = rs_tcp->t_cpath; conn_state = rds_conn_path_state(cp); WARN_ON(conn_state == RDS_CONN_UP); - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) { + rds_conn_path_drop(cp, 0); goto rst_nsk; + } + /* Save a local pointer to sk and hold a reference before setting + * callbacks. Once callbacks are set, a concurrent + * rds_tcp_conn_path_shutdown() may call sock_release(), which + * sets new_sock->sk to NULL and drops a reference on sk. + * The local pointer lets us safely access sk_state below even + * if new_sock->sk has been nulled, and sock_hold() keeps sk + * itself valid until we are done. + */ + sk = new_sock->sk; + sock_hold(sk); + if (rs_tcp->t_sock) { /* Duelling SYN has been handled in rds_tcp_accept_one() */ rds_tcp_reset_callbacks(new_sock, cp); @@ -192,6 +301,24 @@ int rds_tcp_accept_one(struct socket *sock) rds_tcp_set_callbacks(new_sock, cp); rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } + + /* Since "rds_tcp_set_callbacks" happens this late + * the connection may already have been closed without + * "rds_tcp_state_change" doing its due diligence. + * + * If that's the case, we simply drop the path, + * knowing that "rds_tcp_conn_path_shutdown" will + * dequeue pending messages. + */ + if (READ_ONCE(sk->sk_state) == TCP_CLOSE_WAIT || + READ_ONCE(sk->sk_state) == TCP_LAST_ACK || + READ_ONCE(sk->sk_state) == TCP_CLOSE) + rds_conn_path_drop(cp, 0); + else + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); + + sock_put(sk); + new_sock = NULL; ret = 0; if (conn->c_npaths == 0) @@ -212,6 +339,9 @@ out: mutex_unlock(&rs_tcp->t_conn_path_lock); if (new_sock) sock_release(new_sock); + + mutex_unlock(&rtn->rds_tcp_accept_lock); + return ret; } @@ -239,7 +369,7 @@ void rds_tcp_listen_data_ready(struct sock *sk) * the listen socket is being torn down. */ if (sk->sk_state == TCP_LISTEN) - rds_tcp_accept_work(sk); + rds_tcp_accept_work(net_generic(sock_net(sk), rds_tcp_netid)); else ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); @@ -290,7 +420,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) addr_len = sizeof(*sin); } - ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); + ret = kernel_bind(sock, (struct sockaddr_unsized *)&ss, addr_len); if (ret < 0) { rdsdebug("could not bind %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 7997a19d1da3..49f96ee0c40f 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -278,6 +278,10 @@ static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, desc.error); + if (skb_queue_empty_lockless(&sock->sk->sk_receive_queue) && + wq_has_sleeper(&tc->t_recv_done_waitq)) + wake_up(&tc->t_recv_done_waitq); + return desc.error; } @@ -327,7 +331,7 @@ void rds_tcp_data_ready(struct sock *sk) if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); rcu_read_unlock(); } out: diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 7d284ac7e81a..7c52acc749cf 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -169,7 +169,7 @@ out: * unacked byte of the TCP sequence space. We have to do very careful * wrapping 32bit comparisons here. */ -static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) +int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) { if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) return 0; @@ -201,7 +201,7 @@ void rds_tcp_write_space(struct sock *sk) rcu_read_lock(); if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf && !rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); rcu_read_unlock(); out: diff --git a/net/rds/threads.c b/net/rds/threads.c index 1f424cbfcbb4..639302bab51e 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -89,8 +89,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) set_bit(0, &cp->cp_conn->c_map_queued); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) { - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); } rcu_read_unlock(); cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; @@ -140,7 +140,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); return; } @@ -151,7 +151,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) conn, &conn->c_laddr, &conn->c_faddr); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_conn_w, + queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, rand % cp->cp_reconnect_jiffies); rcu_read_unlock(); @@ -203,11 +203,11 @@ void rds_send_worker(struct work_struct *work) switch (ret) { case -EAGAIN: rds_stats_inc(s_send_immediate_retry); - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); break; case -ENOMEM: rds_stats_inc(s_send_delayed_retry); - queue_delayed_work(rds_wq, &cp->cp_send_w, 2); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 2); break; default: break; @@ -228,11 +228,11 @@ void rds_recv_worker(struct work_struct *work) switch (ret) { case -EAGAIN: rds_stats_inc(s_recv_immediate_retry); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); break; case -ENOMEM: rds_stats_inc(s_recv_delayed_retry); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 2); break; default: break; diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 7d3e82e4c2fc..2444237bc36a 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -279,7 +279,7 @@ static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) struct rfkill_int_event *ev; list_for_each_entry(data, &rfkill_fds, list) { - ev = kzalloc(sizeof(*ev), GFP_KERNEL); + ev = kzalloc_obj(*ev); if (!ev) continue; rfkill_fill_event(&ev->ev, rfkill, op); @@ -1165,7 +1165,7 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) struct rfkill *rfkill; struct rfkill_int_event *ev, *tmp; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc_obj(*data); if (!data) return -ENOMEM; @@ -1182,7 +1182,7 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) */ list_for_each_entry(rfkill, &rfkill_list, node) { - ev = kzalloc(sizeof(*ev), GFP_KERNEL); + ev = kzalloc_obj(*ev); if (!ev) goto free; rfkill_sync(rfkill); diff --git a/net/rfkill/input.c b/net/rfkill/input.c index 53d286b10843..f7cb24ce7754 100644 --- a/net/rfkill/input.c +++ b/net/rfkill/input.c @@ -221,7 +221,7 @@ static int rfkill_connect(struct input_handler *handler, struct input_dev *dev, struct input_handle *handle; int error; - handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + handle = kzalloc_obj(struct input_handle); if (!handle) return -ENOMEM; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 543f9e8ebb69..ba56213e0a2a 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -205,7 +205,7 @@ start: spin_unlock_bh(&rose_list_lock); for (i = 0; i < cnt; i++) { - sk = array[cnt]; + sk = array[i]; rose = rose_sk(sk); lock_sock(sk); spin_lock_bh(&rose_list_lock); @@ -693,7 +693,7 @@ static int rose_release(struct socket *sock) return 0; } -static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int rose_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -765,7 +765,8 @@ out_release: return err; } -static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +static int rose_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, + int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -810,6 +811,11 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le goto out_release; } + if (sk->sk_state == TCP_SYN_SENT) { + err = -EALREADY; + goto out_release; + } + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; @@ -1570,8 +1576,7 @@ static int __init rose_proto_init(void) rose_callsign = null_ax25_address; - dev_rose = kcalloc(rose_ndevs, sizeof(struct net_device *), - GFP_KERNEL); + dev_rose = kzalloc_objs(struct net_device *, rose_ndevs); if (dev_rose == NULL) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n"); rc = -ENOMEM; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index a1e9b05ef6f5..e31842e6b3c8 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -82,7 +82,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, } if (rose_neigh == NULL) { - rose_neigh = kmalloc(sizeof(*rose_neigh), GFP_ATOMIC); + rose_neigh = kmalloc_obj(*rose_neigh, GFP_ATOMIC); if (rose_neigh == NULL) { res = -ENOMEM; goto out; @@ -106,7 +106,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, if (rose_route->ndigis != 0) { rose_neigh->digipeat = - kmalloc(sizeof(ax25_digi), GFP_ATOMIC); + kmalloc_obj(ax25_digi, GFP_ATOMIC); if (rose_neigh->digipeat == NULL) { kfree(rose_neigh); res = -ENOMEM; @@ -148,7 +148,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, } /* create new node */ - rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC); + rose_node = kmalloc_obj(*rose_node, GFP_ATOMIC); if (rose_node == NULL) { res = -ENOMEM; goto out; @@ -368,7 +368,7 @@ void rose_add_loopback_neigh(void) { struct rose_neigh *sn; - rose_loopback_neigh = kmalloc(sizeof(struct rose_neigh), GFP_KERNEL); + rose_loopback_neigh = kmalloc_obj(struct rose_neigh); if (!rose_loopback_neigh) return; sn = rose_loopback_neigh; @@ -417,7 +417,7 @@ int rose_add_loopback_node(const rose_address *address) if (rose_node != NULL) goto out; - if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) { + if ((rose_node = kmalloc_obj(*rose_node, GFP_ATOMIC)) == NULL) { err = -ENOMEM; goto out; } @@ -1055,7 +1055,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) goto put_neigh; } - if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { + if ((rose_route = kmalloc_obj(*rose_route, GFP_ATOMIC)) == NULL) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); goto put_neigh; } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 36df0274d7b7..0f90272ac254 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -127,7 +127,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, /* * bind a local address to an RxRPC socket */ -static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) +static int rxrpc_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct rxrpc_local *local; @@ -267,12 +267,13 @@ static int rxrpc_listen(struct socket *sock, int backlog) * Lookup or create a remote transport endpoint record for the specified * address. * - * Return: The peer record found with a reference, %NULL if no record is found - * or a negative error code if the address is invalid or unsupported. + * Return: The peer record found with a reference or a negative error code if + * the address is invalid or unsupported. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) { + struct rxrpc_peer *peer; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; @@ -280,7 +281,8 @@ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, if (ret < 0) return ERR_PTR(ret); - return rxrpc_lookup_peer(rx->local, srx, gfp); + peer = rxrpc_lookup_peer(rx->local, srx, gfp); + return peer ?: ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); @@ -481,7 +483,7 @@ EXPORT_SYMBOL(rxrpc_kernel_set_notifications); * - this just targets it at a specific destination; no actual connection * negotiation takes place */ -static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, +static int rxrpc_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5b7342d43486..36d6ca0d1089 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -387,7 +387,7 @@ struct rxrpc_peer { struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ unsigned long app_data; /* Application data (e.g. afs_server) */ - time64_t last_tx_at; /* Last time packet sent here */ + unsigned int last_tx_at; /* Last time packet sent here (time64_t LSW) */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ int debug_id; /* debug ID for printks */ @@ -1379,6 +1379,13 @@ void rxrpc_peer_keepalive_worker(struct work_struct *); void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, bool sendmsg_fail); +/* Update the last transmission time on a peer for keepalive purposes. */ +static inline void rxrpc_peer_mark_tx(struct rxrpc_peer *peer) +{ + /* To avoid tearing on 32-bit systems, we only keep the LSW. */ + WRITE_ONCE(peer->last_tx_at, ktime_get_seconds()); +} + /* * peer_object.c */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 00982a030744..ee2d1319e69a 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -164,7 +164,7 @@ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) struct rxrpc_backlog *b = rx->backlog; if (!b) { - b = kzalloc(sizeof(struct rxrpc_backlog), gfp); + b = kzalloc_obj(struct rxrpc_backlog, gfp); if (!b) return -ENOMEM; rx->backlog = b; diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 63bbcc567f59..9b757798dedd 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -76,7 +76,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle; - bundle = kzalloc(sizeof(*bundle), gfp); + bundle = kzalloc_obj(*bundle, gfp); if (bundle) { bundle->local = call->local; bundle->peer = rxrpc_get_peer(call->peer, rxrpc_peer_get_bundle); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 232b6986da83..98ad9b51ca2c 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -194,7 +194,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, } ret = kernel_sendmsg(conn->local->socket, &msg, iov, ioc, len); - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); if (ret < 0) trace_rxrpc_tx_fail(chan->call_debug_id, serial, ret, rxrpc_tx_point_call_final_resend); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 37340becb224..0ece717db0f8 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -59,7 +59,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, _enter(""); - conn = kzalloc(sizeof(struct rxrpc_connection), gfp); + conn = kzalloc_obj(struct rxrpc_connection, gfp); if (conn) { INIT_LIST_HEAD(&conn->cache_link); timer_setup(&conn->timer, &rxrpc_connection_timer, 0); diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 9fdc1f031c9d..85078114b2dd 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -75,7 +75,7 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, prep->quotalen = datalen + plen; plen -= sizeof(*token); - token = kzalloc(sizeof(*token), GFP_KERNEL); + token = kzalloc_obj(*token); if (!token) return -ENOMEM; @@ -202,7 +202,7 @@ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, prep->quotalen = datalen + plen; plen -= sizeof(*token); - token = kzalloc(sizeof(*token), GFP_KERNEL); + token = kzalloc_obj(*token); if (!token) goto nomem; @@ -500,7 +500,7 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep) prep->quotalen = plen + sizeof(*token); ret = -ENOMEM; - token = kzalloc(sizeof(*token), GFP_KERNEL); + token = kzalloc_obj(*token); if (!token) goto error; token->kad = kzalloc(plen, GFP_KERNEL); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index a74a4b43904f..111f574fe667 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -112,7 +112,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct net *net, struct rxrpc_local *local; u32 tmp; - local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); + local = kzalloc_obj(struct rxrpc_local); if (local) { refcount_set(&local->ref, 1); atomic_set(&local->active_users, 1); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8b5903b6e481..d70db367e358 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -275,7 +275,7 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); ret = do_udp_sendmsg(conn->local->socket, &msg, len); - call->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(call->peer); if (ret < 0) { trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); @@ -411,7 +411,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, sizeof(pkt)); ret = do_udp_sendmsg(conn->local->socket, &msg, sizeof(pkt)); - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_abort); @@ -698,7 +698,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req ret = 0; trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, rxrpc_txdata_inject_loss); - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); goto done; } } @@ -711,7 +711,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req */ rxrpc_inc_stat(call->rxnet, stat_tx_data_send); ret = do_udp_sendmsg(conn->local->socket, &msg, len); - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); if (ret == -EMSGSIZE) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); @@ -797,7 +797,7 @@ void rxrpc_send_conn_abort(struct rxrpc_connection *conn) trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort); - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); } /* @@ -917,7 +917,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) trace_rxrpc_tx_packet(peer->debug_id, &whdr, rxrpc_tx_point_version_keepalive); - peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(peer); _leave(""); } @@ -973,7 +973,7 @@ void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response if (ret < 0) goto fail; - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); return; fail: diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 7f4729234957..9d02448ac062 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -238,6 +238,21 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, } /* + * Reconstruct the last transmission time. The difference calculated should be + * valid provided no more than ~68 years elapsed since the last transmission. + */ +static time64_t rxrpc_peer_get_tx_mark(const struct rxrpc_peer *peer, time64_t base) +{ + s32 last_tx_at = READ_ONCE(peer->last_tx_at); + s32 base_lsw = base; + s32 diff = last_tx_at - base_lsw; + + diff = clamp(diff, -RXRPC_KEEPALIVE_TIME, RXRPC_KEEPALIVE_TIME); + + return diff + base; +} + +/* * Perform keep-alive pings. */ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, @@ -265,7 +280,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, spin_unlock_bh(&rxnet->peer_hash_lock); if (use) { - keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; + keepalive_at = rxrpc_peer_get_tx_mark(peer, base) + RXRPC_KEEPALIVE_TIME; slot = keepalive_at - base; _debug("%02x peer %u t=%d {%pISp}", cursor, peer->debug_id, slot, &peer->srx.transport); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 366431b0736c..fa9a406e1168 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -226,7 +226,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, _enter(""); - peer = kzalloc(sizeof(struct rxrpc_peer), gfp); + peer = kzalloc_obj(struct rxrpc_peer, gfp); if (peer) { refcount_set(&peer->ref, 1); peer->local = rxrpc_get_local(local, rxrpc_local_get_peer); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index d803562ca0ac..59292f7f9205 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -296,13 +296,13 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6ds %8d %8d\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, peer->max_data, - now - peer->last_tx_at, + (s32)now - (s32)READ_ONCE(peer->last_tx_at), READ_ONCE(peer->recent_srtt_us), READ_ONCE(peer->recent_rto_us)); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 7fa7e77f6bb9..e1f7513a46db 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -518,7 +518,8 @@ try_again: if (rxrpc_call_has_failed(call)) goto call_failed; - if (!skb_queue_empty(&call->recvmsg_queue)) + if (!(flags & MSG_PEEK) && + !skb_queue_empty(&call->recvmsg_queue)) rxrpc_notify_socket(call); goto not_yet_complete; @@ -549,11 +550,21 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { spin_lock_irq(&rx->recvmsg_lock); - list_add(&call->recvmsg_link, &rx->recvmsg_q); - spin_unlock_irq(&rx->recvmsg_lock); + if (list_empty(&call->recvmsg_link)) { + list_add(&call->recvmsg_link, &rx->recvmsg_q); + rxrpc_see_call(call, rxrpc_call_see_recvmsg_requeue); + spin_unlock_irq(&rx->recvmsg_lock); + } else if (list_is_first(&call->recvmsg_link, &rx->recvmsg_q)) { + spin_unlock_irq(&rx->recvmsg_lock); + rxrpc_put_call(call, rxrpc_call_see_recvmsg_requeue_first); + } else { + list_move(&call->recvmsg_link, &rx->recvmsg_q); + spin_unlock_irq(&rx->recvmsg_lock); + rxrpc_put_call(call, rxrpc_call_see_recvmsg_requeue_move); + } trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { - rxrpc_put_call(call, rxrpc_call_put_recvmsg); + rxrpc_put_call(call, rxrpc_call_put_recvmsg_peek_nowait); } error_no_call: release_sock(&rx->sk); diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index dce5a3d8a964..f9f5a2dc62ed 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -351,7 +351,7 @@ static int rxgk_secure_packet_integrity(const struct rxrpc_call *call, _enter(""); - hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + hdr = kzalloc_obj(*hdr, GFP_NOFS); if (!hdr) goto error_gk; @@ -483,7 +483,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, &data_offset, &data_len); - hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + hdr = kzalloc_obj(*hdr, GFP_NOFS); if (!hdr) goto put_gk; @@ -678,7 +678,7 @@ static int rxgk_issue_challenge(struct rxrpc_connection *conn) ret = do_udp_sendmsg(conn->local->socket, &msg, len); if (ret > 0) - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); __free_page(page); if (ret < 0) { diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c index b4db5aa30e5b..677981d1ad79 100644 --- a/net/rxrpc/rxgk_kdf.c +++ b/net/rxrpc/rxgk_kdf.c @@ -213,7 +213,7 @@ struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, _enter(""); - gk = kzalloc(sizeof(*gk), GFP_KERNEL); + gk = kzalloc_obj(*gk); if (!gk) return ERR_PTR(-ENOMEM); refcount_set(&gk->usage, 1); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 3657c0661cdc..e923d6829008 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -511,7 +511,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, if (nsg <= 4) { nsg = 4; } else { - sg = kmalloc_array(nsg, sizeof(*sg), GFP_NOIO); + sg = kmalloc_objs(*sg, nsg, GFP_NOIO); if (!sg) return -ENOMEM; } @@ -694,7 +694,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) return -EAGAIN; } - conn->peer->last_tx_at = ktime_get_seconds(); + rxrpc_peer_mark_tx(conn->peer); trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_rxkad_challenge); _leave(" = 0"); @@ -1139,7 +1139,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, } ret = -ENOMEM; - response = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); + response = kzalloc_obj(struct rxkad_response, GFP_NOFS); if (!response) goto temporary_error; diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 2ea71e3831f7..b8df6d22314d 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -151,7 +151,7 @@ static void rxperf_charge_preallocation(struct work_struct *work) struct rxperf_call *call; for (;;) { - call = kzalloc(sizeof(*call), GFP_KERNEL); + call = kzalloc_obj(*call); if (!call) break; @@ -211,7 +211,7 @@ static int rxperf_open_socket(void) ret = rxrpc_sock_set_security_keyring(socket->sk, rxperf_sec_keyring); - ret = kernel_bind(socket, (struct sockaddr *)&srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *)&srx, sizeof(srx)); if (ret < 0) goto error_2; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index ebbb78b842de..04f9c5f2dc24 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -282,7 +282,7 @@ static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) { struct rxrpc_txqueue *tq; - tq = kzalloc(sizeof(*tq), sk->sk_allocation); + tq = kzalloc_obj(*tq, sk->sk_allocation); if (!tq) return -ENOMEM; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 29767038691a..55ef7a04852e 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -23,7 +23,7 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); void *buf; - txb = kzalloc(sizeof(*txb), gfp); + txb = kzalloc_obj(*txb, gfp); if (!txb) return NULL; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index ff6be5cfe2b0..332fd9695e54 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -940,6 +940,8 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops, int ret; idr_for_each_entry_ul(idr, p, tmp, id) { + if (IS_ERR(p)) + continue; if (tc_act_in_hw(p) && !mutex_taken) { rtnl_lock(); mutex_taken = true; @@ -983,7 +985,7 @@ static int tcf_pernet_add_id_list(unsigned int id) } } - id_ptr = kzalloc(sizeof(*id_ptr), GFP_KERNEL); + id_ptr = kzalloc_obj(*id_ptr); if (!id_ptr) { ret = -ENOMEM; goto err_out; @@ -1270,7 +1272,7 @@ errout: static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) { - struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL); + struct tc_cookie *c = kzalloc_obj(*c); if (!c) return NULL; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 26ba8c2d20ab..c634af5edfcd 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!tb[TCA_CONNMARK_PARMS]) return -EINVAL; - nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + nparms = kzalloc_obj(*nparms); if (!nparms) return -ENOMEM; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 0939e6b2ba4d..213e1ce9d2da 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -93,7 +93,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, p = to_tcf_csum(*a); - params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + params_new = kzalloc_obj(*params_new); if (unlikely(!params_new)) { err = -ENOMEM; goto put_chain; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6749a4a9a9cd..7d5e50c921a0 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -13,9 +13,11 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/pkt_cls.h> +#include <linux/if_tunnel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/rhashtable.h> +#include <net/gre.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -330,7 +332,7 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) goto out_unlock; - ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); + ct_ft = kzalloc_obj(*ct_ft); if (!ct_ft) goto err_alloc; refcount_set(&ct_ft->ref, 1); @@ -948,9 +950,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb, return err & NF_VERDICT_MASK; if (action & BIT(NF_NAT_MANIP_SRC)) - tc_skb_cb(skb)->post_ct_snat = 1; + qdisc_skb_cb(skb)->post_ct_snat = 1; if (action & BIT(NF_NAT_MANIP_DST)) - tc_skb_cb(skb)->post_ct_dnat = 1; + qdisc_skb_cb(skb)->post_ct_dnat = 1; return err; #else @@ -986,7 +988,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tcf_action_update_bstats(&c->common, skb); if (clear) { - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_ct_put(ct); @@ -1097,7 +1099,7 @@ do_nat: out_push: skb_push_rcsum(skb, nh_ofs); - tc_skb_cb(skb)->post_ct = true; + qdisc_skb_cb(skb)->post_ct = true; tc_skb_cb(skb)->zone = p->zone; out_clear: if (defrag) @@ -1358,6 +1360,12 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return -EINVAL; } + if (bind && !(flags & TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT)) { + NL_SET_ERR_MSG_MOD(extack, + "Attaching ct to a non ingress/clsact qdisc is unsupported"); + return -EOPNOTSUPP; + } + err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack); if (err < 0) return err; @@ -1395,7 +1403,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, c = to_ct(*a); - params = kzalloc(sizeof(*params), GFP_KERNEL); + params = kzalloc_obj(*params); if (unlikely(!params)) { err = -ENOMEM; goto cleanup; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 71efe04d00b5..1886ffd2ca95 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -16,6 +16,7 @@ #include <net/pkt_sched.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/inet_ecn.h> #include <uapi/linux/tc_act/tc_ctinfo.h> #include <net/tc_act/tc_ctinfo.h> #include <net/tc_wrapper.h> @@ -235,7 +236,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, ci = to_ctinfo(*a); - cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL); + cp_new = kzalloc_obj(*cp_new); if (unlikely(!cp_new)) { err = -ENOMEM; goto put_chain; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index c1f75f272757..fdbfcaa3e2ab 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -32,9 +32,12 @@ static ktime_t gate_get_time(struct tcf_gate *gact) return KTIME_MAX; } -static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +static void tcf_gate_params_free_rcu(struct rcu_head *head); + +static void gate_get_start_time(struct tcf_gate *gact, + const struct tcf_gate_params *param, + ktime_t *start) { - struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; u64 n; @@ -69,12 +72,14 @@ static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate *gact = container_of(timer, struct tcf_gate, hitimer); - struct tcf_gate_params *p = &gact->param; struct tcfg_gate_entry *next; + struct tcf_gate_params *p; ktime_t close_time, now; spin_lock(&gact->tcf_lock); + p = rcu_dereference_protected(gact->param, + lockdep_is_held(&gact->tcf_lock)); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ @@ -225,6 +230,35 @@ static void release_entry_list(struct list_head *entries) } } +static int tcf_gate_copy_entries(struct tcf_gate_params *dst, + const struct tcf_gate_params *src, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + int i = 0; + + list_for_each_entry(entry, &src->entries, list) { + struct tcfg_gate_entry *new; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + return -ENOMEM; + } + + new->index = entry->index; + new->gate_state = entry->gate_state; + new->interval = entry->interval; + new->ipv = entry->ipv; + new->maxoctets = entry->maxoctets; + list_add_tail(&new->list, &dst->entries); + i++; + } + + dst->num_entries = i; + return 0; +} + static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) @@ -243,7 +277,7 @@ static int parse_gate_list(struct nlattr *list_attr, continue; } - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); err = -ENOMEM; @@ -270,24 +304,44 @@ release_list: return err; } -static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, - enum tk_offsets tko, s32 clockid, - bool do_init) +static bool gate_timer_needs_cancel(u64 basetime, u64 old_basetime, + enum tk_offsets tko, + enum tk_offsets old_tko, + s32 clockid, s32 old_clockid) { - if (!do_init) { - if (basetime == gact->param.tcfg_basetime && - tko == gact->tk_offset && - clockid == gact->param.tcfg_clockid) - return; + return basetime != old_basetime || + clockid != old_clockid || + tko != old_tko; +} - spin_unlock_bh(&gact->tcf_lock); - hrtimer_cancel(&gact->hitimer); - spin_lock_bh(&gact->tcf_lock); +static int gate_clock_resolve(s32 clockid, enum tk_offsets *tko, + struct netlink_ext_ack *extack) +{ + switch (clockid) { + case CLOCK_REALTIME: + *tko = TK_OFFS_REAL; + return 0; + case CLOCK_MONOTONIC: + *tko = TK_OFFS_MAX; + return 0; + case CLOCK_BOOTTIME: + *tko = TK_OFFS_BOOT; + return 0; + case CLOCK_TAI: + *tko = TK_OFFS_TAI; + return 0; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + return -EINVAL; } - gact->param.tcfg_basetime = basetime; - gact->param.tcfg_clockid = clockid; - gact->tk_offset = tko; - hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); +} + +static void gate_setup_timer(struct tcf_gate *gact, s32 clockid, + enum tk_offsets tko) +{ + WRITE_ONCE(gact->tk_offset, tko); + hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, + HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, @@ -296,15 +350,22 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); - enum tk_offsets tk_offset = TK_OFFS_TAI; + u64 cycletime = 0, basetime = 0, cycletime_ext = 0; + struct tcf_gate_params *p = NULL, *old_p = NULL; + enum tk_offsets old_tk_offset = TK_OFFS_TAI; + const struct tcf_gate_params *cur_p = NULL; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; + enum tk_offsets tko = TK_OFFS_TAI; struct tcf_chain *goto_ch = NULL; - u64 cycletime = 0, basetime = 0; - struct tcf_gate_params *p; + s32 timer_clockid = CLOCK_TAI; + bool use_old_entries = false; + s32 old_clockid = CLOCK_TAI; + bool need_cancel = false; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; + u64 old_basetime = 0; int ret = 0, err; u32 gflags = 0; s32 prio = -1; @@ -321,26 +382,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (!tb[TCA_GATE_PARMS]) return -EINVAL; - if (tb[TCA_GATE_CLOCKID]) { + if (tb[TCA_GATE_CLOCKID]) clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); - switch (clockid) { - case CLOCK_REALTIME: - tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - return -EINVAL; - } - } parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; @@ -366,6 +409,60 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, return -EEXIST; } + gact = to_gate(*a); + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + err = -ENOMEM; + goto chain_put; + } + INIT_LIST_HEAD(&p->entries); + + use_old_entries = !tb[TCA_GATE_ENTRY_LIST]; + if (!use_old_entries) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto err_free; + use_old_entries = !err; + } + + if (ret == ACT_P_CREATED && use_old_entries) { + NL_SET_ERR_MSG(extack, "The entry list is empty"); + err = -EINVAL; + goto err_free; + } + + if (ret != ACT_P_CREATED) { + rcu_read_lock(); + cur_p = rcu_dereference(gact->param); + + old_basetime = cur_p->tcfg_basetime; + old_clockid = cur_p->tcfg_clockid; + old_tk_offset = READ_ONCE(gact->tk_offset); + + basetime = old_basetime; + cycletime_ext = cur_p->tcfg_cycletime_ext; + prio = cur_p->tcfg_priority; + gflags = cur_p->tcfg_flags; + + if (!tb[TCA_GATE_CLOCKID]) + clockid = old_clockid; + + err = 0; + if (use_old_entries) { + err = tcf_gate_copy_entries(p, cur_p, extack); + if (!err && !tb[TCA_GATE_CYCLE_TIME]) + cycletime = cur_p->tcfg_cycletime; + } + rcu_read_unlock(); + if (err) + goto err_free; + } + if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -375,25 +472,26 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); - gact = to_gate(*a); - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&gact->param.entries); + if (tb[TCA_GATE_CYCLE_TIME]) + cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); - err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); - if (err < 0) - goto release_idr; + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); - spin_lock_bh(&gact->tcf_lock); - p = &gact->param; + err = gate_clock_resolve(clockid, &tko, extack); + if (err) + goto err_free; + timer_clockid = clockid; - if (tb[TCA_GATE_CYCLE_TIME]) - cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + need_cancel = ret != ACT_P_CREATED && + gate_timer_needs_cancel(basetime, old_basetime, + tko, old_tk_offset, + timer_clockid, old_clockid); - if (tb[TCA_GATE_ENTRY_LIST]) { - err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); - if (err < 0) - goto chain_put; - } + if (need_cancel) + hrtimer_cancel(&gact->hitimer); + + spin_lock_bh(&gact->tcf_lock); if (!cycletime) { struct tcfg_gate_entry *entry; @@ -402,22 +500,20 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; - if (!cycletime) { - err = -EINVAL; - goto chain_put; - } } p->tcfg_cycletime = cycletime; + p->tcfg_cycletime_ext = cycletime_ext; - if (tb[TCA_GATE_CYCLE_TIME_EXT]) - p->tcfg_cycletime_ext = - nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); - - gate_setup_timer(gact, basetime, tk_offset, clockid, - ret == ACT_P_CREATED); + if (need_cancel || ret == ACT_P_CREATED) + gate_setup_timer(gact, timer_clockid, tko); p->tcfg_priority = prio; p->tcfg_flags = gflags; - gate_get_start_time(gact, &start); + p->tcfg_basetime = basetime; + p->tcfg_clockid = timer_clockid; + gate_get_start_time(gact, p, &start); + + old_p = rcu_replace_pointer(gact->param, p, + lockdep_is_held(&gact->tcf_lock)); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; @@ -434,11 +530,15 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); + if (old_p) + call_rcu(&old_p->rcu, tcf_gate_params_free_rcu); + return ret; +err_free: + release_entry_list(&p->entries); + kfree(p); chain_put: - spin_unlock_bh(&gact->tcf_lock); - if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: @@ -446,21 +546,29 @@ release_idr: * without taking tcf_lock. */ if (ret == ACT_P_CREATED) - gate_setup_timer(gact, gact->param.tcfg_basetime, - gact->tk_offset, gact->param.tcfg_clockid, - true); + gate_setup_timer(gact, timer_clockid, tko); + tcf_idr_release(*a, bind); return err; } +static void tcf_gate_params_free_rcu(struct rcu_head *head) +{ + struct tcf_gate_params *p = container_of(head, struct tcf_gate_params, rcu); + + release_entry_list(&p->entries); + kfree(p); +} + static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; - p = &gact->param; hrtimer_cancel(&gact->hitimer); - release_entry_list(&p->entries); + p = rcu_dereference_protected(gact->param, 1); + if (p) + call_rcu(&p->rcu, tcf_gate_params_free_rcu); } static int dumping_entry(struct sk_buff *skb, @@ -509,10 +617,9 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, struct nlattr *entry_list; struct tcf_t t; - spin_lock_bh(&gact->tcf_lock); - opt.action = gact->tcf_action; - - p = &gact->param; + rcu_read_lock(); + opt.action = READ_ONCE(gact->tcf_action); + p = rcu_dereference(gact->param); if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -552,12 +659,12 @@ static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; - spin_unlock_bh(&gact->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&gact->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 7c6975632fc2..d5e8a91bb4eb 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -293,13 +293,13 @@ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held) /* called when adding new meta information */ static int __add_metainfo(const struct tcf_meta_ops *ops, - struct tcf_ife_info *ife, u32 metaid, void *metaval, - int len, bool atomic, bool exists) + struct tcf_ife_params *p, u32 metaid, void *metaval, + int len, bool atomic) { struct tcf_meta_info *mi = NULL; int ret = 0; - mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL); + mi = kzalloc_obj(*mi, atomic ? GFP_ATOMIC : GFP_KERNEL); if (!mi) return -ENOMEM; @@ -313,45 +313,40 @@ static int __add_metainfo(const struct tcf_meta_ops *ops, } } - if (exists) - spin_lock_bh(&ife->tcf_lock); - list_add_tail(&mi->metalist, &ife->metalist); - if (exists) - spin_unlock_bh(&ife->tcf_lock); + list_add_tail(&mi->metalist, &p->metalist); return ret; } static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops, - struct tcf_ife_info *ife, u32 metaid, - bool exists) + struct tcf_ife_params *p, u32 metaid) { int ret; if (!try_module_get(ops->owner)) return -ENOENT; - ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists); + ret = __add_metainfo(ops, p, metaid, NULL, 0, true); if (ret) module_put(ops->owner); return ret; } -static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, - int len, bool exists) +static int add_metainfo(struct tcf_ife_params *p, u32 metaid, void *metaval, + int len) { const struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret; if (!ops) return -ENOENT; - ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists); + ret = __add_metainfo(ops, p, metaid, metaval, len, false); if (ret) /*put back what find_ife_oplist took */ module_put(ops->owner); return ret; } -static int use_all_metadata(struct tcf_ife_info *ife, bool exists) +static int use_all_metadata(struct tcf_ife_params *p) { struct tcf_meta_ops *o; int rc = 0; @@ -359,7 +354,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists) read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { - rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists); + rc = add_metainfo_and_get_ops(o, p, o->metaid); if (rc == 0) installed += 1; } @@ -371,7 +366,7 @@ static int use_all_metadata(struct tcf_ife_info *ife, bool exists) return -EINVAL; } -static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) +static int dump_metalist(struct sk_buff *skb, struct tcf_ife_params *p) { struct tcf_meta_info *e; struct nlattr *nest; @@ -379,14 +374,14 @@ static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) int total_encoded = 0; /*can only happen on decode */ - if (list_empty(&ife->metalist)) + if (list_empty(&p->metalist)) return 0; nest = nla_nest_start_noflag(skb, TCA_IFE_METALST); if (!nest) goto out_nlmsg_trim; - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry(e, &p->metalist, metalist) { if (!e->ops->get(skb, e)) total_encoded += 1; } @@ -403,13 +398,11 @@ out_nlmsg_trim: return -1; } -/* under ife->tcf_lock */ -static void _tcf_ife_cleanup(struct tc_action *a) +static void __tcf_ife_cleanup(struct tcf_ife_params *p) { - struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; - list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + list_for_each_entry_safe(e, n, &p->metalist, metalist) { list_del(&e->metalist); if (e->metaval) { if (e->ops->release) @@ -422,18 +415,23 @@ static void _tcf_ife_cleanup(struct tc_action *a) } } +static void tcf_ife_cleanup_params(struct rcu_head *head) +{ + struct tcf_ife_params *p = container_of(head, struct tcf_ife_params, + rcu); + + __tcf_ife_cleanup(p); + kfree(p); +} + static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; - spin_lock_bh(&ife->tcf_lock); - _tcf_ife_cleanup(a); - spin_unlock_bh(&ife->tcf_lock); - p = rcu_dereference_protected(ife->params, 1); if (p) - kfree_rcu(p, rcu); + call_rcu(&p->rcu, tcf_ife_cleanup_params); } static int load_metalist(struct nlattr **tb, bool rtnl_held) @@ -455,8 +453,7 @@ static int load_metalist(struct nlattr **tb, bool rtnl_held) return 0; } -static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, - bool exists, bool rtnl_held) +static int populate_metalist(struct tcf_ife_params *p, struct nlattr **tb) { int len = 0; int rc = 0; @@ -468,7 +465,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, val = nla_data(tb[i]); len = nla_len(tb[i]); - rc = add_metainfo(ife, i, val, len, exists); + rc = add_metainfo(p, i, val, len); if (rc) return rc; } @@ -520,9 +517,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (parm->flags & ~IFE_ENCODE) return -EINVAL; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc_obj(*p); if (!p) return -ENOMEM; + INIT_LIST_HEAD(&p->metalist); if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, @@ -567,8 +565,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } ife = to_ife(*a); - if (ret == ACT_P_CREATED) - INIT_LIST_HEAD(&ife->metalist); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) @@ -600,8 +596,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } if (tb[TCA_IFE_METALST]) { - err = populate_metalist(ife, tb2, exists, - !(flags & TCA_ACT_FLAGS_NO_RTNL)); + err = populate_metalist(p, tb2); if (err) goto metadata_parse_err; } else { @@ -610,7 +605,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, * as we can. You better have at least one else we are * going to bail out */ - err = use_all_metadata(ife, exists); + err = use_all_metadata(p); if (err) goto metadata_parse_err; } @@ -626,13 +621,14 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) - kfree_rcu(p, rcu); + call_rcu(&p->rcu, tcf_ife_cleanup_params); return ret; metadata_parse_err: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: + __tcf_ife_cleanup(p); kfree(p); tcf_idr_release(*a, bind); return err; @@ -649,9 +645,9 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, memset(&opt, 0, sizeof(opt)); - opt.index = ife->tcf_index, - opt.refcnt = refcount_read(&ife->tcf_refcnt) - ref, - opt.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, + opt.index = ife->tcf_index; + opt.refcnt = refcount_read(&ife->tcf_refcnt) - ref; + opt.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind; spin_lock_bh(&ife->tcf_lock); opt.action = ife->tcf_action; @@ -679,7 +675,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; - if (dump_metalist(skb, ife)) { + if (dump_metalist(skb, p)) { /*ignore failure to dump metalist */ pr_info("Failed to dump metalist\n"); } @@ -693,13 +689,13 @@ nla_put_failure: return -1; } -static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, +static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_params *p, u16 metaid, u16 mlen, void *mdata) { struct tcf_meta_info *e; /* XXX: use hash to speed up */ - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (metaid == e->metaid) { if (e->ops) { /* We check for decode presence already */ @@ -716,10 +712,13 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; + struct tcf_ife_params *p; u8 *ifehdr_end; u8 *tlv_data; u16 metalen; + p = rcu_dereference_bh(ife->params); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); @@ -745,7 +744,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { + if (find_decode_metaid(skb, p, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ @@ -769,12 +768,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, /*XXX: check if we can do this at install time instead of current * send data path **/ -static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) +static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_params *p) { - struct tcf_meta_info *e, *n; + struct tcf_meta_info *e; int tot_run_sz = 0, run_sz = 0; - list_for_each_entry_safe(e, n, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (e->ops->check_presence) { run_sz = e->ops->check_presence(skb, e); tot_run_sz += run_sz; @@ -795,7 +794,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA where ORIGDATA = original ethernet header ... */ - u16 metalen = ife_get_sz(skb, ife); + u16 metalen = ife_get_sz(skb, p); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = 0; int new_len = skb->len + hdrm; @@ -821,6 +820,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, /* could be stupid policy setup or mtu config * so lets be conservative.. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { +drop: qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -829,27 +829,24 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, skb_push(skb, skb->dev->hard_header_len); ife_meta = ife_encode(skb, metalen); - - spin_lock(&ife->tcf_lock); + if (!ife_meta) + goto drop; /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... */ - list_for_each_entry(e, &ife->metalist, metalist) { + list_for_each_entry_rcu(e, &p->metalist, metalist) { if (e->ops->encode) { err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { /* too corrupt to keep around if overwritten */ - spin_unlock(&ife->tcf_lock); - qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); - return TC_ACT_SHOT; + goto drop; } skboff += err; } - spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(p->eth_src)) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5f01f567c934..05e0b14b5773 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,31 +29,6 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); -#define MIRRED_NEST_LIMIT 4 - -#ifndef CONFIG_PREEMPT_RT -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); -} - -static void tcf_mirred_nest_level_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); -} - -#else -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return current->net_xmit.sched_mirred_nest++; -} - -static void tcf_mirred_nest_level_dec(void) -{ - current->net_xmit.sched_mirred_nest--; -} -#endif - static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; @@ -291,11 +266,22 @@ static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, goto err_cant_do; } + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + + at_ingress = skb_at_tc_ingress(skb); + if (dev == skb->dev && want_ingress == at_ingress) { + pr_notice_once("tc mirred: Loop (%s:%s --> %s:%s)\n", + netdev_name(skb->dev), + at_ingress ? "ingress" : "egress", + netdev_name(dev), + want_ingress ? "ingress" : "egress"); + goto err_cant_do; + } + /* we could easily avoid the clone only if called by ingress and clsact; * since we can't easily detect the clsact caller, skip clone only for * ingress - that covers the TC S/W datapath. */ - at_ingress = skb_at_tc_ingress(skb); dont_clone = skb_at_tc_ingress(skb) && is_redirect && tcf_mirred_can_reinsert(retval); if (!dont_clone) { @@ -304,8 +290,6 @@ static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, goto err_cant_do; } - want_ingress = tcf_mirred_act_wants_ingress(m_eaction); - /* All mirred/redirected skbs should clear previous ct info */ nf_reset_ct(skb_to_send); if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */ @@ -439,44 +423,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); - unsigned int nest_level; + struct netdev_xmit *xmit; bool m_mac_header_xmit; struct net_device *dev; - int m_eaction; + int i, m_eaction; u32 blockid; - nest_level = tcf_mirred_nest_level_inc_return(); - if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { +#ifdef CONFIG_PREEMPT_RT + xmit = ¤t->net_xmit; +#else + xmit = this_cpu_ptr(&softnet_data.xmit); +#endif + if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); - retval = TC_ACT_SHOT; - goto dec_nest_level; + return TC_ACT_SHOT; } tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); - if (blockid) { - retval = tcf_blockcast(skb, m, blockid, res, retval); - goto dec_nest_level; - } + if (blockid) + return tcf_blockcast(skb, m, blockid, res, retval); dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); - goto dec_nest_level; + return retval; + } + for (i = 0; i < xmit->sched_mirred_nest; i++) { + if (xmit->sched_mirred_dev[i] != dev) + continue; + pr_notice_once("tc mirred: loop on device %s\n", + netdev_name(dev)); + tcf_action_inc_overlimit_qstats(&m->common); + return retval; } + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; + m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); - -dec_nest_level: - tcf_mirred_nest_level_dec(); + xmit->sched_mirred_nest--; return retval; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 6654011dcd2b..1abfaf9d99f1 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -279,7 +279,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, m = to_mpls(*a); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc_obj(*p); if (!p) { err = -ENOMEM; goto put_chain; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 26241d80ebe0..abb332dee836 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -81,7 +81,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (err < 0) goto release_idr; - nparm = kzalloc(sizeof(*nparm), GFP_KERNEL); + nparm = kzalloc_obj(*nparm); if (!nparm) { err = -ENOMEM; goto release_idr; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 4b65901397a8..bc20f08a2789 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -51,7 +51,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, if (!nla) return NULL; - keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); + keys_ex = kzalloc_objs(*k, n); if (!keys_ex) return ERR_PTR(-ENOMEM); @@ -223,7 +223,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_release; } - nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + nparms = kzalloc_obj(*nparms); if (!nparms) { ret = -ENOMEM; goto out_release; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0e1c61183379..12ea9e5a6005 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -151,7 +151,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, goto failure; } - new = kzalloc(sizeof(*new), GFP_KERNEL); + new = kzalloc_obj(*new); if (unlikely(!new)) { err = -ENOMEM; goto failure; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 8c1d1554f657..a778cdba9258 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -126,7 +126,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; u16 *queue_mapping = NULL, *ptype = NULL; - u16 mapping_mod = 1; + u32 mapping_mod = 1; bool exists = false; int ret = 0, err; u32 index; @@ -194,6 +194,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } mapping_mod = *queue_mapping_max - *queue_mapping + 1; + if (mapping_mod > U16_MAX) { + NL_SET_ERR_MSG_MOD(extack, "The range of queue_mapping is invalid."); + return -EINVAL; + } flags |= SKBEDIT_F_TXQ_SKBHASH; } if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) @@ -238,7 +242,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (err < 0) goto release_idr; - params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + params_new = kzalloc_obj(*params_new); if (unlikely(!params_new)) { err = -ENOMEM; goto put_chain; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index a9e0c1326e2a..23ca46138f04 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -185,7 +185,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, d = to_skbmod(*a); - p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); + p = kzalloc_obj(struct tcf_skbmod_params); if (unlikely(!p)) { err = -ENOMEM; goto put_chain; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index a74621797d69..6a375fdc63dd 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -233,7 +233,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc_obj(*p); if (!p) { err = -ENOMEM; goto put_chain; diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfc..098ca02aed89 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -202,6 +202,12 @@ __bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) kfree_skb(skb); } +__bpf_kfunc void bpf_kfree_skb_dtor(void *skb) +{ + bpf_kfree_skb(skb); +} +CFI_NOSEAL(bpf_kfree_skb_dtor); + /* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. * @skb: The skb whose reference to be released and dropped. * @to_free_list: The list of skbs to be dropped. @@ -271,14 +277,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff __bpf_kfunc_end_defs(); BTF_KFUNCS_START(qdisc_kfunc_ids) -BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_skb_get_hash) BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update) BTF_KFUNCS_END(qdisc_kfunc_ids) BTF_SET_START(qdisc_common_kfunc_set) @@ -449,7 +455,7 @@ static struct bpf_struct_ops bpf_Qdisc_ops = { .owner = THIS_MODULE, }; -BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb) +BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb_dtor) static int __init bpf_qdisc_kfunc_init(void) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ecec0a1e1c1a..4829c27446e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -84,7 +84,7 @@ tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, if (WARN_ON(!handle || !tp->ops->get_exts)) return -EINVAL; - n = kzalloc(sizeof(*n), GFP_KERNEL); + n = kzalloc_obj(*n); if (!n) return -ENOMEM; @@ -377,7 +377,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, struct tcf_proto *tp; int err; - tp = kzalloc(sizeof(*tp), GFP_KERNEL); + tp = kzalloc_obj(*tp); if (!tp) return ERR_PTR(-ENOBUFS); @@ -502,7 +502,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, ASSERT_BLOCK_LOCKED(block); - chain = kzalloc(sizeof(*chain), GFP_KERNEL); + chain = kzalloc_obj(*chain); if (!chain) return NULL; list_add_tail_rcu(&chain->list, &block->chain_list); @@ -918,7 +918,7 @@ tcf_chain0_head_change_cb_add(struct tcf_block *block, struct tcf_filter_chain_list_item *item; struct tcf_chain *chain0; - item = kmalloc(sizeof(*item), GFP_KERNEL); + item = kmalloc_obj(*item); if (!item) { NL_SET_ERR_MSG(extack, "Memory allocation for head change callback item failed"); return -ENOMEM; @@ -1016,7 +1016,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, { struct tcf_block *block; - block = kzalloc(sizeof(*block), GFP_KERNEL); + block = kzalloc_obj(*block); if (!block) { NL_SET_ERR_MSG(extack, "Memory allocation for block failed"); return ERR_PTR(-ENOMEM); @@ -1428,7 +1428,7 @@ static int tcf_block_owner_add(struct tcf_block *block, { struct tcf_block_owner_item *item; - item = kmalloc(sizeof(*item), GFP_KERNEL); + item = kmalloc_obj(*item); if (!item) return -ENOMEM; item->q = q; @@ -1866,15 +1866,15 @@ int tcf_classify(struct sk_buff *skb, struct tc_skb_cb *cb = tc_skb_cb(skb); ext = tc_skb_ext_alloc(skb); - if (WARN_ON_ONCE(!ext)) { + if (!ext) { tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM); return TC_ACT_SHOT; } ext->chain = last_executed_chain; ext->mru = cb->mru; - ext->post_ct = cb->post_ct; - ext->post_ct_snat = cb->post_ct_snat; - ext->post_ct_dnat = cb->post_ct_dnat; + ext->post_ct = qdisc_skb_cb(skb)->post_ct; + ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat; + ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat; ext->zone = cb->zone; } } @@ -2228,6 +2228,11 @@ static bool is_qdisc_ingress(__u32 classid) return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); } +static bool is_ingress_or_clsact(struct tcf_block *block, struct Qdisc *q) +{ + return tcf_block_shared(block) || (q && !!(q->flags & TCQ_F_INGRESS)); +} + static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -2420,6 +2425,8 @@ replay: flags |= TCA_ACT_FLAGS_NO_RTNL; if (is_qdisc_ingress(parent)) flags |= TCA_ACT_FLAGS_AT_INGRESS; + if (is_ingress_or_clsact(block, q)) + flags |= TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { @@ -3341,8 +3348,7 @@ int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, * This reference might be taken later from tcf_exts_get_net(). */ exts->net = net; - exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), - GFP_KERNEL); + exts->actions = kzalloc_objs(struct tc_action *, TCA_ACT_MAX_PRIO); if (!exts->actions) return -ENOMEM; #endif diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index ecfaa4f9a04e..492cd9ce8d46 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -77,7 +77,7 @@ static int basic_init(struct tcf_proto *tp) { struct basic_head *head; - head = kzalloc(sizeof(*head), GFP_KERNEL); + head = kzalloc_obj(*head); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); @@ -193,7 +193,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + fnew = kzalloc_obj(*fnew); if (!fnew) return -ENOBUFS; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index a32754a2658b..9a346b6221b3 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -242,7 +242,7 @@ static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; - head = kzalloc(sizeof(*head), GFP_KERNEL); + head = kzalloc_obj(*head); if (head == NULL) return -ENOBUFS; @@ -427,7 +427,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) return ret; - prog = kzalloc(sizeof(*prog), GFP_KERNEL); + prog = kzalloc_obj(*prog); if (!prog) return -ENOBUFS; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 424252982d6a..680a5c308094 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -95,7 +95,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (head && handle != head->handle) return -ENOENT; - new = kzalloc(sizeof(*head), GFP_KERNEL); + new = kzalloc_obj(*head); if (!new) return -ENOBUFS; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5693b41b093f..339c664beff6 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -433,7 +433,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return -EOPNOTSUPP; } - fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + fnew = kzalloc_obj(*fnew); if (!fnew) return -ENOBUFS; @@ -583,7 +583,7 @@ static int flow_init(struct tcf_proto *tp) { struct flow_head *head; - head = kzalloc(sizeof(*head), GFP_KERNEL); + head = kzalloc_obj(*head); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->filters); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 099ff6a3e1f5..26070c892305 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - bool post_ct = tc_skb_cb(skb)->post_ct; + bool post_ct = qdisc_skb_cb(skb)->post_ct; u16 zone = tc_skb_cb(skb)->zone; struct fl_flow_key skb_key; struct fl_flow_mask *mask; @@ -363,7 +363,7 @@ static int fl_init(struct tcf_proto *tp) { struct cls_fl_head *head; - head = kzalloc(sizeof(*head), GFP_KERNEL); + head = kzalloc_obj(*head); if (!head) return -ENOBUFS; @@ -2237,7 +2237,7 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, struct fl_flow_mask *newmask; int err; - newmask = kzalloc(sizeof(*newmask), GFP_KERNEL); + newmask = kzalloc_obj(*newmask); if (!newmask) return ERR_PTR(-ENOMEM); @@ -2376,13 +2376,13 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout_fold; } - mask = kzalloc(sizeof(struct fl_flow_mask), GFP_KERNEL); + mask = kzalloc_obj(struct fl_flow_mask); if (!mask) { err = -ENOBUFS; goto errout_fold; } - tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); + tb = kzalloc_objs(struct nlattr *, TCA_FLOWER_MAX + 1); if (!tb) { err = -ENOBUFS; goto errout_mask_alloc; @@ -2398,7 +2398,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout_tb; } - fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + fnew = kzalloc_obj(*fnew); if (!fnew) { err = -ENOBUFS; goto errout_tb; @@ -2815,7 +2815,7 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, if (!tca_opts) return ERR_PTR(-EINVAL); - tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); + tb = kzalloc_objs(struct nlattr *, TCA_FLOWER_MAX + 1); if (!tb) return ERR_PTR(-ENOBUFS); err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX, @@ -2823,7 +2823,7 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, if (err) goto errout_tb; - tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL); + tmplt = kzalloc_obj(*tmplt); if (!tmplt) { err = -ENOMEM; goto errout_tb; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index cdddc8695228..be81c108179d 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -262,7 +262,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f->id != handle && handle) return -EINVAL; - fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); + fnew = kzalloc_obj(struct fw_filter); if (!fnew) return -ENOBUFS; @@ -308,7 +308,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_FW_MASK]) mask = nla_get_u32(tb[TCA_FW_MASK]); - head = kzalloc(sizeof(*head), GFP_KERNEL); + head = kzalloc_obj(*head); if (!head) return -ENOBUFS; head->mask = mask; @@ -316,7 +316,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(tp->root, head); } - f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); + f = kzalloc_obj(struct fw_filter); if (f == NULL) return -ENOBUFS; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index f03bf5da39ee..6f126872c14a 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -189,7 +189,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - new = kzalloc(sizeof(*new), GFP_KERNEL); + new = kzalloc_obj(*new); if (!new) return -ENOBUFS; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index b9c58c040c30..bd6f945bd388 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -244,7 +244,7 @@ static int route4_init(struct tcf_proto *tp) { struct route4_head *head; - head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); + head = kzalloc_obj(struct route4_head); if (head == NULL) return -ENOBUFS; @@ -438,7 +438,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, h1 = to_hash(nhandle); b = rtnl_dereference(head->table[h1]); if (!b) { - b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); + b = kzalloc_obj(struct route4_bucket); if (b == NULL) return -ENOBUFS; @@ -507,7 +507,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; err = -ENOBUFS; - f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); + f = kzalloc_obj(struct route4_filter); if (!f) goto errout; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 2a1c00048fd6..9241c025aa74 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -161,10 +161,8 @@ next_knode: int toff = off + key->off + (off2 & key->offmask); __be32 *data, hdata; - if (skb_headroom(skb) + toff > INT_MAX) - goto out; - - data = skb_header_pointer(skb, toff, 4, &hdata); + data = skb_header_pointer_careful(skb, toff, 4, + &hdata); if (!data) goto out; if ((*data ^ key->val) & key->mask) { @@ -214,8 +212,9 @@ check_terminal: if (ht->divisor) { __be32 *data, hdata; - data = skb_header_pointer(skb, off + n->sel.hoff, 4, - &hdata); + data = skb_header_pointer_careful(skb, + off + n->sel.hoff, + 4, &hdata); if (!data) goto out; sel = ht->divisor & u32_hash_fold(*data, &n->sel, @@ -229,7 +228,7 @@ check_terminal: if (n->sel.flags & TC_U32_VAROFFSET) { __be16 *data, hdata; - data = skb_header_pointer(skb, + data = skb_header_pointer_careful(skb, off + n->sel.offoff, 2, &hdata); if (!data) @@ -365,7 +364,7 @@ static int u32_init(struct tcf_proto *tp) void *key = tc_u_common_ptr(tp); struct tc_u_common *tp_c = tc_u_common_find(key); - root_ht = kzalloc(struct_size(root_ht, ht, 1), GFP_KERNEL); + root_ht = kzalloc_flex(*root_ht, ht, 1); if (root_ht == NULL) return -ENOBUFS; @@ -376,7 +375,7 @@ static int u32_init(struct tcf_proto *tp) idr_init(&root_ht->handle_idr); if (tp_c == NULL) { - tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); + tp_c = kzalloc_obj(*tp_c); if (tp_c == NULL) { kfree(root_ht); return -ENOBUFS; @@ -826,7 +825,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, struct tc_u32_sel *s = &n->sel; struct tc_u_knode *new; - new = kzalloc(struct_size(new, sel.keys, s->nkeys), GFP_KERNEL); + new = kzalloc_flex(*new, sel.keys, s->nkeys); if (!new) return NULL; @@ -975,7 +974,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table"); return -EINVAL; } - ht = kzalloc(struct_size(ht, ht, divisor + 1), GFP_KERNEL); + ht = kzalloc_flex(*ht, ht, divisor + 1); if (ht == NULL) return -ENOBUFS; if (handle == 0) { @@ -1105,7 +1104,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, goto erridr; } - n = kzalloc(struct_size(n, sel.keys, s->nkeys), GFP_KERNEL); + n = kzalloc_flex(*n, sel.keys, s->nkeys); if (n == NULL) { err = -ENOBUFS; goto erridr; @@ -1418,7 +1417,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, goto nla_put_failure; } #ifdef CONFIG_CLS_U32_PERF - gpf = kzalloc(struct_size(gpf, kcnts, n->sel.nkeys), GFP_KERNEL); + gpf = kzalloc_flex(*gpf, kcnts, n->sel.nkeys); if (!gpf) goto nla_put_failure; @@ -1481,9 +1480,7 @@ static int __init init_u32(void) #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif - tc_u_common_hash = kvmalloc_array(U32_HASH_SIZE, - sizeof(struct hlist_head), - GFP_KERNEL); + tc_u_common_hash = kvmalloc_objs(struct hlist_head, U32_HASH_SIZE); if (!tc_u_common_hash) return -ENOMEM; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 3f2e707a11d1..48bdd77df0a3 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -927,7 +927,7 @@ static int em_meta_change(struct net *net, void *data, int len, TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) goto errout; - meta = kzalloc(sizeof(*meta), GFP_KERNEL); + meta = kzalloc_obj(*meta); if (meta == NULL) { err = -ENOMEM; goto errout; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 692e2be1793e..343f1aebeec2 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -84,7 +84,7 @@ retry: return -EAGAIN; } - tm = kmalloc(sizeof(*tm), GFP_KERNEL); + tm = kmalloc_obj(*tm); if (tm == NULL) { textsearch_destroy(ts_conf); return -ENOBUFS; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f56b18c8aebf..cc43e3f7574f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -437,7 +437,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, } } - rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); + rtab = kmalloc_obj(*rtab); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; @@ -530,7 +530,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, return ERR_PTR(-EINVAL); } - stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL); + stab = kmalloc_flex(*stab, data, tsize); if (!stab) return ERR_PTR(-ENOMEM); @@ -668,7 +668,7 @@ static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) struct hlist_head *h; unsigned int i; - h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL); + h = kvmalloc_objs(struct hlist_head, n); if (h != NULL) { for (i = 0; i < n; i++) @@ -1353,7 +1353,7 @@ err_out4: ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: - lockdep_unregister_key(&sch->root_lock_key); + qdisc_lock_uninit(sch, ops); netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 32bacfc314c2..9efe23f8371b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -67,6 +67,7 @@ #include <linux/if_vlan.h> #include <net/gso.h> #include <net/pkt_sched.h> +#include <net/sch_priv.h> #include <net/pkt_cls.h> #include <net/tcp.h> #include <net/flow_dissector.h> @@ -197,40 +198,45 @@ struct cake_tin_data { u32 way_collisions; }; /* number of tins is small, so size of this struct doesn't matter much */ +struct cake_sched_config { + u64 rate_bps; + u64 interval; + u64 target; + u64 sync_time; + u32 buffer_config_limit; + u32 fwmark_mask; + u16 fwmark_shft; + s16 rate_overhead; + u16 rate_mpu; + u16 rate_flags; + u8 tin_mode; + u8 flow_mode; + u8 atm_mode; + u8 ack_filter; + u8 is_shared; +}; + struct cake_sched_data { struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct tcf_block *block; struct cake_tin_data *tins; + struct cake_sched_config *config; + struct cake_sched_config initial_config; struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; - u16 overflow_timeout; - - u16 tin_cnt; - u8 tin_mode; - u8 flow_mode; - u8 ack_filter; - u8 atm_mode; - - u32 fwmark_mask; - u16 fwmark_shft; /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ - u16 rate_shft; ktime_t time_next_packet; ktime_t failsafe_next_packet; u64 rate_ns; - u64 rate_bps; - u16 rate_flags; - s16 rate_overhead; - u16 rate_mpu; - u64 interval; - u64 target; + u16 rate_shft; + u16 overflow_timeout; + u16 tin_cnt; /* resource tracking */ u32 buffer_used; u32 buffer_max_used; u32 buffer_limit; - u32 buffer_config_limit; /* indices for dequeue */ u16 cur_tin; @@ -254,6 +260,11 @@ struct cake_sched_data { u16 max_adjlen; u16 min_netlen; u16 min_adjlen; + + /* mq sync state */ + u64 last_checked_active; + u64 last_active; + u32 active_queues; }; enum { @@ -380,6 +391,8 @@ static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 1239850263, 1191209601, 1147878294, 1108955788 }; +static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust); + /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * @@ -1198,7 +1211,7 @@ static bool cake_tcph_may_drop(const struct tcphdr *tcph, static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, struct cake_flow *flow) { - bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; + bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE; struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; struct sk_buff *skb_check, *skb_prev = NULL; const struct ipv6hdr *ipv6h, *ipv6h_check; @@ -1266,7 +1279,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) continue; - seglen = ntohs(ipv6h_check->payload_len); + seglen = ipv6_payload_len(skb, ipv6h_check); } else { WARN_ON(1); /* shouldn't happen */ continue; @@ -1358,15 +1371,17 @@ static u64 cake_ewma(u64 avg, u64 sample, u32 shift) return avg; } -static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) +static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off) { + struct cake_sched_config *q = qd->config; + if (q->rate_flags & CAKE_FLAG_OVERHEAD) len -= off; - if (q->max_netlen < len) - q->max_netlen = len; - if (q->min_netlen > len) - q->min_netlen = len; + if (qd->max_netlen < len) + qd->max_netlen = len; + if (qd->min_netlen > len) + qd->min_netlen = len; len += q->rate_overhead; @@ -1385,10 +1400,10 @@ static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) len += (len + 63) / 64; } - if (q->max_adjlen < len) - q->max_adjlen = len; - if (q->min_adjlen > len) - q->min_adjlen = len; + if (qd->max_adjlen < len) + qd->max_adjlen = len; + if (qd->min_adjlen > len) + qd->min_adjlen = len; return len; } @@ -1398,15 +1413,15 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); + u16 segs = qdisc_pkt_segs(skb); u32 len = qdisc_pkt_len(skb); - u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); - if (!shinfo->gso_size) + if (segs == 1) return cake_calc_overhead(q, len, off); - /* borrowed from qdisc_pkt_len_init() */ + /* borrowed from qdisc_pkt_len_segs_init() */ if (!skb->encapsulation) hdr_len = skb_transport_offset(skb); else @@ -1430,12 +1445,6 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) hdr_len += sizeof(struct udphdr); } - if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) - segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); - else - segs = shinfo->gso_segs; - len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); @@ -1592,12 +1601,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) flow->dropped++; b->tin_dropped++; - if (q->rate_flags & CAKE_FLAG_INGRESS) + if (q->config->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; - qdisc_tree_reduce_backlog(sch, 1, len); cake_heapify(q, 0); @@ -1663,7 +1671,8 @@ static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, struct sk_buff *skb) { - struct cake_sched_data *q = qdisc_priv(sch); + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; u32 tin, mark; bool wash; u8 dscp; @@ -1680,24 +1689,24 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) tin = 0; - else if (mark && mark <= q->tin_cnt) - tin = q->tin_order[mark - 1]; + else if (mark && mark <= qd->tin_cnt) + tin = qd->tin_order[mark - 1]; else if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && - TC_H_MIN(skb->priority) <= q->tin_cnt) - tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; + TC_H_MIN(skb->priority) <= qd->tin_cnt) + tin = qd->tin_order[TC_H_MIN(skb->priority) - 1]; else { if (!wash) dscp = cake_handle_diffserv(skb, wash); - tin = q->tin_index[dscp]; + tin = qd->tin_index[dscp]; - if (unlikely(tin >= q->tin_cnt)) + if (unlikely(tin >= qd->tin_cnt)) tin = 0; } - return &q->tins[tin]; + return &qd->tins[tin]; } static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, @@ -1743,17 +1752,17 @@ static void cake_reconfigure(struct Qdisc *sch); static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + u32 idx, tin, prev_qlen, prev_backlog, drop_id; struct cake_sched_data *q = qdisc_priv(sch); - int len = qdisc_pkt_len(skb); - int ret; + int len = qdisc_pkt_len(skb), ret; struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx, tin; + bool same_flow = false; /* choose flow to insert into */ - idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); + idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); @@ -1788,7 +1797,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; @@ -1800,6 +1809,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); @@ -1823,23 +1833,25 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, consume_skb(skb); } else { /* not splitting */ + int ack_pkt_len = 0; + cobalt_set_enqueue_time(skb, now); get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); - if (q->ack_filter) + if (q->config->ack_filter) ack = cake_ack_filter(q, flow); if (ack) { b->ack_drops++; sch->qstats.drops++; - b->bytes += qdisc_pkt_len(ack); - len -= qdisc_pkt_len(ack); + ack_pkt_len = qdisc_pkt_len(ack); + b->bytes += ack_pkt_len; q->buffer_used += skb->truesize - ack->truesize; - if (q->rate_flags & CAKE_FLAG_INGRESS) + if (q->config->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, ack, now, true); - qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len); consume_skb(ack); } else { sch->q.qlen++; @@ -1848,18 +1860,18 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* stats */ b->packets++; - b->bytes += len; - b->backlogs[idx] += len; - b->tin_backlog += len; - sch->qstats.backlog += len; - q->avg_window_bytes += len; + b->bytes += len - ack_pkt_len; + b->backlogs[idx] += len - ack_pkt_len; + b->tin_backlog += len - ack_pkt_len; + sch->qstats.backlog += len - ack_pkt_len; + q->avg_window_bytes += len - ack_pkt_len; } if (q->overflow_timeout) cake_heapify_up(q, b->overflow_idx[idx]); /* incoming bandwidth capacity estimate */ - if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { + if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { u64 packet_interval = \ ktime_to_ns(ktime_sub(now, q->last_packet_time)); @@ -1891,7 +1903,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (ktime_after(now, ktime_add_ms(q->last_reconfig_time, 250))) { - q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; + q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; cake_reconfigure(sch); } } @@ -1911,7 +1923,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); + flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. @@ -1920,31 +1932,36 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); } if (q->buffer_used > q->buffer_max_used) q->buffer_max_used = q->buffer_used; - if (q->buffer_used > q->buffer_limit) { - bool same_flow = false; - u32 dropped = 0; - u32 drop_id; + if (q->buffer_used <= q->buffer_limit) + return NET_XMIT_SUCCESS; - while (q->buffer_used > q->buffer_limit) { - dropped++; - drop_id = cake_drop(sch, to_free); + prev_qlen = sch->q.qlen; + prev_backlog = sch->qstats.backlog; - if ((drop_id >> 16) == tin && - (drop_id & 0xFFFF) == idx) - same_flow = true; - } - b->drop_overlimit += dropped; + while (q->buffer_used > q->buffer_limit) { + drop_id = cake_drop(sch, to_free); + if ((drop_id >> 16) == tin && + (drop_id & 0xFFFF) == idx) + same_flow = true; + } + + prev_qlen -= sch->q.qlen; + prev_backlog -= sch->qstats.backlog; + b->drop_overlimit += prev_qlen; - if (same_flow) - return NET_XMIT_CN; + if (same_flow) { + qdisc_tree_reduce_backlog(sch, prev_qlen - 1, + prev_backlog - len); + return NET_XMIT_CN; } + qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog); return NET_XMIT_SUCCESS; } @@ -1996,6 +2013,38 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) u64 delay; u32 len; + if (q->config->is_shared && q->rate_ns && + now - q->last_checked_active >= q->config->sync_time) { + struct net_device *dev = qdisc_dev(sch); + struct cake_sched_data *other_priv; + u64 new_rate = q->config->rate_bps; + u64 other_qlen, other_last_active; + struct Qdisc *other_sch; + u32 num_active_qs = 1; + unsigned int ntx; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); + other_priv = qdisc_priv(other_sch); + + if (other_priv == q) + continue; + + other_qlen = READ_ONCE(other_sch->q.qlen); + other_last_active = READ_ONCE(other_priv->last_active); + + if (other_qlen || other_last_active > q->last_checked_active) + num_active_qs++; + } + + if (num_active_qs > 1) + new_rate = div64_u64(q->config->rate_bps, num_active_qs); + + cake_configure_rates(sch, new_rate, true); + q->last_checked_active = now; + q->active_queues = num_active_qs; + } + begin: if (!sch->q.qlen) return NULL; @@ -2103,8 +2152,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2116,7 +2165,7 @@ retry: } } - flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); + flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2140,8 +2189,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2159,8 +2208,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); } else b->decaying_flow_count--; @@ -2171,14 +2220,14 @@ retry: reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, (b->bulk_flow_count * - !!(q->rate_flags & + !!(q->config->rate_flags & CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ - if (q->rate_flags & CAKE_FLAG_INGRESS) { + if (q->config->rate_flags & CAKE_FLAG_INGRESS) { len = cake_advance_shaper(q, b, skb, now, true); flow->deficit -= len; @@ -2188,13 +2237,14 @@ retry: b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb_reason(skb, reason); - if (q->rate_flags & CAKE_FLAG_INGRESS) + qdisc_dequeue_drop(sch, skb, reason); + if (q->config->rate_flags & CAKE_FLAG_INGRESS) goto retry; } b->tin_ecn_mark += !!flow->cvars.ecn_marked; qdisc_bstats_update(sch, skb); + WRITE_ONCE(q->last_active, now); /* collect delay stats */ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); @@ -2295,6 +2345,9 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->tin_rate_ns = rate_ns; b->tin_rate_shft = rate_shft; + if (mtu == 0) + return; + byte_target_ns = (byte_target * rate_ns) >> rate_shft; b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); @@ -2306,12 +2359,10 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->cparams.p_dec = 1 << 20; /* 1/4096 */ } -static int cake_config_besteffort(struct Qdisc *sch) +static int cake_config_besteffort(struct Qdisc *sch, u64 rate, u32 mtu) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; q->tin_cnt = 1; @@ -2319,18 +2370,16 @@ static int cake_config_besteffort(struct Qdisc *sch) q->tin_order = normal_order; cake_set_rate(b, rate, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); b->tin_quantum = 65535; return 0; } -static int cake_config_precedence(struct Qdisc *sch) +static int cake_config_precedence(struct Qdisc *sch, u64 rate, u32 mtu) { /* convert high-level (user visible) parameters into internal format */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; u32 quantum = 256; u32 i; @@ -2341,8 +2390,8 @@ static int cake_config_precedence(struct Qdisc *sch) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; - cake_set_rate(b, rate, mtu, us_to_ns(q->target), - us_to_ns(q->interval)); + cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), + us_to_ns(q->config->interval)); b->tin_quantum = max_t(u16, 1U, quantum); @@ -2401,7 +2450,7 @@ static int cake_config_precedence(struct Qdisc *sch) * Total 12 traffic classes. */ -static int cake_config_diffserv8(struct Qdisc *sch) +static int cake_config_diffserv8(struct Qdisc *sch, u64 rate, u32 mtu) { /* Pruned list of traffic classes for typical applications: * @@ -2418,8 +2467,6 @@ static int cake_config_diffserv8(struct Qdisc *sch) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; u32 quantum = 256; u32 i; @@ -2433,8 +2480,8 @@ static int cake_config_diffserv8(struct Qdisc *sch) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; - cake_set_rate(b, rate, mtu, us_to_ns(q->target), - us_to_ns(q->interval)); + cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), + us_to_ns(q->config->interval)); b->tin_quantum = max_t(u16, 1U, quantum); @@ -2449,7 +2496,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) return 0; } -static int cake_config_diffserv4(struct Qdisc *sch) +static int cake_config_diffserv4(struct Qdisc *sch, u64 rate, u32 mtu) { /* Further pruned list of traffic classes for four-class system: * @@ -2462,8 +2509,6 @@ static int cake_config_diffserv4(struct Qdisc *sch) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; u32 quantum = 1024; q->tin_cnt = 4; @@ -2474,13 +2519,13 @@ static int cake_config_diffserv4(struct Qdisc *sch) /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[2], rate >> 1, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[3], rate >> 2, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; @@ -2491,7 +2536,7 @@ static int cake_config_diffserv4(struct Qdisc *sch) return 0; } -static int cake_config_diffserv3(struct Qdisc *sch) +static int cake_config_diffserv3(struct Qdisc *sch, u64 rate, u32 mtu) { /* Simplified Diffserv structure with 3 tins. * Latency Sensitive (CS7, CS6, EF, VA, TOS4) @@ -2499,8 +2544,6 @@ static int cake_config_diffserv3(struct Qdisc *sch) * Low Priority (LE, CS1) */ struct cake_sched_data *q = qdisc_priv(sch); - u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; u32 quantum = 1024; q->tin_cnt = 3; @@ -2511,11 +2554,11 @@ static int cake_config_diffserv3(struct Qdisc *sch) /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[2], rate >> 2, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; @@ -2525,67 +2568,76 @@ static int cake_config_diffserv3(struct Qdisc *sch) return 0; } -static void cake_reconfigure(struct Qdisc *sch) +static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust) { - struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = likely(rate_adjust) ? 0 : psched_mtu(qdisc_dev(sch)); + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; int c, ft; switch (q->tin_mode) { case CAKE_DIFFSERV_BESTEFFORT: - ft = cake_config_besteffort(sch); + ft = cake_config_besteffort(sch, rate, mtu); break; case CAKE_DIFFSERV_PRECEDENCE: - ft = cake_config_precedence(sch); + ft = cake_config_precedence(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV8: - ft = cake_config_diffserv8(sch); + ft = cake_config_diffserv8(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV4: - ft = cake_config_diffserv4(sch); + ft = cake_config_diffserv4(sch, rate, mtu); break; case CAKE_DIFFSERV_DIFFSERV3: default: - ft = cake_config_diffserv3(sch); + ft = cake_config_diffserv3(sch, rate, mtu); break; } - for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { + for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) { cake_clear_tin(sch, c); - q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; + qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time; } - q->rate_ns = q->tins[ft].tin_rate_ns; - q->rate_shft = q->tins[ft].tin_rate_shft; + qd->rate_ns = qd->tins[ft].tin_rate_ns; + qd->rate_shft = qd->tins[ft].tin_rate_shft; +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; + + cake_configure_rates(sch, qd->config->rate_bps, false); if (q->buffer_config_limit) { - q->buffer_limit = q->buffer_config_limit; + qd->buffer_limit = q->buffer_config_limit; } else if (q->rate_bps) { u64 t = q->rate_bps * q->interval; do_div(t, USEC_PER_SEC / 4); - q->buffer_limit = max_t(u32, t, 4U << 20); + qd->buffer_limit = max_t(u32, t, 4U << 20); } else { - q->buffer_limit = ~0; + qd->buffer_limit = ~0; } sch->flags &= ~TCQ_F_CAN_BYPASS; - q->buffer_limit = min(q->buffer_limit, - max(sch->limit * psched_mtu(qdisc_dev(sch)), - q->buffer_config_limit)); + qd->buffer_limit = min(qd->buffer_limit, + max(sch->limit * psched_mtu(qdisc_dev(sch)), + q->buffer_config_limit)); } -static int cake_change(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt, + struct netlink_ext_ack *extack, bool *overhead_changed) { - struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; - u16 rate_flags; - u8 flow_mode; + u16 rate_flags = q->rate_flags; + u8 flow_mode = q->flow_mode; int err; err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, @@ -2593,7 +2645,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; - flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) flow_mode &= ~CAKE_FLOW_NAT_FLAG; @@ -2606,6 +2657,19 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, #endif } + if (tb[TCA_CAKE_AUTORATE]) { + if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) { + if (q->is_shared) { + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE], + "Can't use autorate-ingress with cake_mq"); + return -EOPNOTSUPP; + } + rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + } else { + rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + } + } + if (tb[TCA_CAKE_BASE_RATE64]) WRITE_ONCE(q->rate_bps, nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); @@ -2614,7 +2678,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->tin_mode, nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); - rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) rate_flags |= CAKE_FLAG_WASH; @@ -2635,20 +2698,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->rate_overhead, nla_get_s32(tb[TCA_CAKE_OVERHEAD])); rate_flags |= CAKE_FLAG_OVERHEAD; - - q->max_netlen = 0; - q->max_adjlen = 0; - q->min_netlen = ~0; - q->min_adjlen = ~0; + *overhead_changed = true; } if (tb[TCA_CAKE_RAW]) { rate_flags &= ~CAKE_FLAG_OVERHEAD; - - q->max_netlen = 0; - q->max_adjlen = 0; - q->min_netlen = ~0; - q->min_adjlen = ~0; + *overhead_changed = true; } if (tb[TCA_CAKE_MPU]) @@ -2667,13 +2722,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->target, max(target, 1U)); } - if (tb[TCA_CAKE_AUTORATE]) { - if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) - rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; - else - rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; - } - if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) rate_flags |= CAKE_FLAG_INGRESS; @@ -2704,7 +2752,35 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->rate_flags, rate_flags); WRITE_ONCE(q->flow_mode, flow_mode); - if (q->tins) { + + return 0; +} + +static int cake_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; + bool overhead_changed = false; + int ret; + + if (q->is_shared) { + NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs"); + return -EOPNOTSUPP; + } + + ret = cake_config_change(q, opt, extack, &overhead_changed); + if (ret) + return ret; + + if (overhead_changed) { + qd->max_netlen = 0; + qd->max_adjlen = 0; + qd->min_netlen = ~0; + qd->min_adjlen = ~0; + } + + if (qd->tins) { sch_tree_lock(sch); cake_reconfigure(sch); sch_tree_unlock(sch); @@ -2722,13 +2798,8 @@ static void cake_destroy(struct Qdisc *sch) kvfree(q->tins); } -static int cake_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static void cake_config_init(struct cake_sched_config *q, bool is_shared) { - struct cake_sched_data *q = qdisc_priv(sch); - int i, j, err; - - sch->limit = 10240; q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; @@ -2739,19 +2810,35 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, * for 5 to 10% of interval */ q->rate_flags |= CAKE_FLAG_SPLIT_GSO; - q->cur_tin = 0; - q->cur_flow = 0; + q->is_shared = is_shared; + q->sync_time = 200 * NSEC_PER_USEC; +} + +static int cake_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = &qd->initial_config; + int i, j, err; - qdisc_watchdog_init(&q->watchdog, sch); + cake_config_init(q, false); + + sch->limit = 10240; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + + qd->cur_tin = 0; + qd->cur_flow = 0; + qd->config = q; + + qdisc_watchdog_init(&qd->watchdog, sch); if (opt) { err = cake_change(sch, opt, extack); - if (err) return err; } - err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack); if (err) return err; @@ -2759,13 +2846,12 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, for (i = 1; i <= CAKE_QUEUES; i++) quantum_div[i] = 65535 / i; - q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), - GFP_KERNEL); - if (!q->tins) + qd->tins = kvzalloc_objs(struct cake_tin_data, CAKE_MAX_TINS); + if (!qd->tins) return -ENOMEM; for (i = 0; i < CAKE_MAX_TINS; i++) { - struct cake_tin_data *b = q->tins + i; + struct cake_tin_data *b = qd->tins + i; INIT_LIST_HEAD(&b->new_flows); INIT_LIST_HEAD(&b->old_flows); @@ -2781,22 +2867,32 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, INIT_LIST_HEAD(&flow->flowchain); cobalt_vars_init(&flow->cvars); - q->overflow_heap[k].t = i; - q->overflow_heap[k].b = j; + qd->overflow_heap[k].t = i; + qd->overflow_heap[k].b = j; b->overflow_idx[j] = k; } } cake_reconfigure(sch); - q->avg_peak_bandwidth = q->rate_bps; - q->min_netlen = ~0; - q->min_adjlen = ~0; + qd->avg_peak_bandwidth = q->rate_bps; + qd->min_netlen = ~0; + qd->min_adjlen = ~0; + qd->active_queues = 0; + qd->last_checked_active = 0; + return 0; } -static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + + qd->config = cfg; + cake_reconfigure(sch); +} + +static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb) { - struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; u16 rate_flags; u8 flow_mode; @@ -2872,6 +2968,13 @@ nla_put_failure: return -1; } +static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + + return cake_config_dump(qd->config, skb); +} + static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); @@ -2900,6 +3003,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); PUT_STAT_U32(MIN_NETLEN, q->min_netlen); PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); + PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues); #undef PUT_STAT_U32 #undef PUT_STAT_U64 @@ -3133,14 +3237,133 @@ static struct Qdisc_ops cake_qdisc_ops __read_mostly = { }; MODULE_ALIAS_NET_SCH("cake"); +struct cake_mq_sched { + struct mq_sched mq_priv; /* must be first */ + struct cake_sched_config cake_config; +}; + +static void cake_mq_destroy(struct Qdisc *sch) +{ + mq_destroy_common(sch); +} + +static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_mq_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int ret, ntx; + bool _unused; + + cake_config_init(&priv->cake_config, true); + if (opt) { + ret = cake_config_change(&priv->cake_config, opt, extack, &_unused); + if (ret) + return ret; + } + + ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops); + if (ret) + return ret; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) + cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config); + + return 0; +} + +static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_mq_sched *priv = qdisc_priv(sch); + + mq_dump_common(sch, skb); + return cake_config_dump(&priv->cake_config, skb); +} + +static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_mq_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + bool overhead_changed = false; + unsigned int ntx; + int ret; + + ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed); + if (ret) + return ret; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); + struct cake_sched_data *qd = qdisc_priv(chld); + + if (overhead_changed) { + qd->max_netlen = 0; + qd->max_adjlen = 0; + qd->min_netlen = ~0; + qd->min_adjlen = ~0; + } + + if (qd->tins) { + sch_tree_lock(chld); + cake_reconfigure(chld); + sch_tree_unlock(chld); + } + } + + return 0; +} + +static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old, struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs"); + return -EOPNOTSUPP; +} + +static const struct Qdisc_class_ops cake_mq_class_ops = { + .select_queue = mq_select_queue, + .graft = cake_mq_graft, + .leaf = mq_leaf, + .find = mq_find, + .walk = mq_walk, + .dump = mq_dump_class, + .dump_stats = mq_dump_class_stats, +}; + +static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = { + .cl_ops = &cake_mq_class_ops, + .id = "cake_mq", + .priv_size = sizeof(struct cake_mq_sched), + .init = cake_mq_init, + .destroy = cake_mq_destroy, + .attach = mq_attach, + .change = cake_mq_change, + .change_real_num_tx = mq_change_real_num_tx, + .dump = cake_mq_dump, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_NET_SCH("cake_mq"); + static int __init cake_module_init(void) { - return register_qdisc(&cake_qdisc_ops); + int ret; + + ret = register_qdisc(&cake_qdisc_ops); + if (ret) + return ret; + + ret = register_qdisc(&cake_mq_qdisc_ops); + if (ret) + unregister_qdisc(&cake_qdisc_ops); + + return ret; } static void __exit cake_module_exit(void) { unregister_qdisc(&cake_qdisc_ops); + unregister_qdisc(&cake_mq_qdisc_ops); } module_init(cake_module_init) @@ -3148,3 +3371,4 @@ module_exit(cake_module_exit) MODULE_AUTHOR("Jonathan Morton"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("The CAKE shaper."); +MODULE_IMPORT_NS("NET_SCHED_INTERNAL"); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 59e7bdf5063e..94df8e741a97 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -370,7 +370,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL); + ntab = kvzalloc_objs(struct sk_buff *, mask + 1); if (!ntab) return -ENOMEM; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index fa0314679e43..c6551578f1cf 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -182,6 +182,8 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; } diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 9b6d79bd8737..01335a49e091 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -105,7 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return 0; } - cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL); + cl = kzalloc_obj(struct drr_class); if (cl == NULL) return -ENOBUFS; diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 4b975feb52b1..6d7e6389758d 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb */ qdisc_skb_cb(nskb)->pkt_len = nskb->len; + qdisc_skb_cb(nskb)->pkt_segs = 1; dualpi2_skb_cb(nskb)->classified = dualpi2_skb_cb(skb)->classified; dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 82635dd2cfa5..a4b07b661b77 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -115,12 +115,12 @@ static void ets_offload_change(struct Qdisc *sch) struct ets_sched *q = qdisc_priv(sch); struct tc_ets_qopt_offload qopt; unsigned int w_psum_prev = 0; - unsigned int q_psum = 0; - unsigned int q_sum = 0; unsigned int quantum; unsigned int w_psum; unsigned int weight; unsigned int i; + u64 q_psum = 0; + u64 q_sum = 0; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; @@ -138,8 +138,12 @@ static void ets_offload_change(struct Qdisc *sch) for (i = 0; i < q->nbands; i++) { quantum = q->classes[i].quantum; - q_psum += quantum; - w_psum = quantum ? q_psum * 100 / q_sum : 0; + if (quantum) { + q_psum += quantum; + w_psum = div64_u64(q_psum * 100, q_sum); + } else { + w_psum = 0; + } weight = w_psum - w_psum_prev; w_psum_prev = w_psum; @@ -652,7 +656,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); for (i = nbands; i < oldbands; i++) { - if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) + if (cl_is_active(&q->classes[i])) list_del_init(&q->classes[i].alist); qdisc_purge_queue(q->classes[i].qdisc); } @@ -664,6 +668,10 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, q->classes[i].deficit = quanta[i]; } } + for (i = q->nstrict; i < nstrict; i++) { + if (cl_is_active(&q->classes[i])) + list_del_init(&q->classes[i].alist); + } WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index fee922da2f99..05084c9af48e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -245,8 +245,6 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) static struct kmem_cache *fq_flow_cachep __read_mostly; -/* limit number of collected flows per round */ -#define FQ_GC_MAX 8 #define FQ_GC_AGE (3*HZ) static bool fq_gc_candidate(const struct fq_flow *f) @@ -259,10 +257,9 @@ static void fq_gc(struct fq_sched_data *q, struct rb_root *root, struct sock *sk) { + struct fq_flow *f, *tofree = NULL; struct rb_node **p, *parent; - void *tofree[FQ_GC_MAX]; - struct fq_flow *f; - int i, fcnt = 0; + int fcnt; p = &root->rb_node; parent = NULL; @@ -274,9 +271,8 @@ static void fq_gc(struct fq_sched_data *q, break; if (fq_gc_candidate(f)) { - tofree[fcnt++] = f; - if (fcnt == FQ_GC_MAX) - break; + f->next = tofree; + tofree = f; } if (f->sk > sk) @@ -285,18 +281,20 @@ static void fq_gc(struct fq_sched_data *q, p = &parent->rb_left; } - if (!fcnt) + if (!tofree) return; - for (i = fcnt; i > 0; ) { - f = tofree[--i]; + fcnt = 0; + while (tofree) { + f = tofree; + tofree = f->next; rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + fcnt++; } q->flows -= fcnt; q->inactive_flows -= fcnt; q->stat_gc_flows += fcnt; - - kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } /* Fast path can be used if : @@ -480,7 +478,10 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, struct sk_buff *skb) { if (skb == flow->head) { - flow->head = skb->next; + struct sk_buff *next = skb->next; + + prefetch(next); + flow->head = next; } else { rb_erase(&skb->rbnode, &flow->t_root); skb->dev = qdisc_dev(sch); @@ -497,6 +498,7 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, skb_mark_not_on_list(skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; + qdisc_bstats_update(sch, skb); } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -661,7 +663,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) return NULL; skb = fq_peek(&q->internal); - if (unlikely(skb)) { + if (skb) { q->internal.qlen--; fq_dequeue_skb(sch, &q->internal, skb); goto out; @@ -711,14 +713,14 @@ begin: goto begin; } prefetch(&skb->end); - if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { + fq_dequeue_skb(sch, f, skb); + if (unlikely((s64)(now - time_next_packet - q->ce_threshold) > 0)) { INET_ECN_set_ce(skb); q->stat_ce_mark++; } if (--f->qlen == 0) q->inactive_flows++; q->band_pkt_count[fq_skb_cb(skb)->band]--; - fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ @@ -776,7 +778,6 @@ begin: f->time_next_packet = now + len; } out: - qdisc_bstats_update(sch, skb); return skb; } @@ -826,6 +827,7 @@ static void fq_reset(struct Qdisc *sch) for (idx = 0; idx < FQ_BANDS; idx++) { q->band_flows[idx].new_flows.first = NULL; q->band_flows[idx].old_flows.first = NULL; + q->band_pkt_count[idx] = 0; } q->delayed = RB_ROOT; q->flows = 0; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a14142392939..8181b52dd9a8 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -275,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -496,9 +496,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, goto init_failure; if (!q->flows) { - q->flows = kvcalloc(q->flows_cnt, - sizeof(struct fq_codel_flow), - GFP_KERNEL); + q->flows = kvzalloc_objs(struct fq_codel_flow, q->flows_cnt); if (!q->flows) { err = -ENOMEM; goto init_failure; @@ -519,6 +517,9 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; + + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; alloc_failure: diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 7b96bc3ff891..d8ac3519e937 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -448,8 +448,7 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, if (err) goto init_failure; - q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow), - GFP_KERNEL); + q->flows = kvzalloc_objs(struct fq_pie_flow, q->flows_cnt); if (!q->flows) { err = -ENOMEM; goto init_failure; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7dee9748a56b..9e726c3bd86b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -669,7 +669,6 @@ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, - .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, @@ -682,7 +681,6 @@ struct Qdisc noop_qdisc = { .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, - .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); @@ -957,9 +955,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); - lockdep_register_key(&sch->root_lock_key); - spin_lock_init(&sch->q.lock); - lockdep_set_class(&sch->q.lock, &sch->root_lock_key); + qdisc_lock_init(sch, ops); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = @@ -974,10 +970,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } } - spin_lock_init(&sch->busylock); - lockdep_set_class(&sch->busylock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, @@ -988,13 +980,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; - sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); return sch; errout1: - lockdep_unregister_key(&sch->root_lock_key); + qdisc_lock_uninit(sch, ops); kfree(sch); errout: return ERR_PTR(err); @@ -1083,7 +1074,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) if (ops->destroy) ops->destroy(qdisc); - lockdep_unregister_key(&qdisc->root_lock_key); + qdisc_lock_uninit(qdisc, ops); bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); @@ -1297,33 +1288,6 @@ static void dev_deactivate_queue(struct net_device *dev, } } -static void dev_reset_queue(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_unused) -{ - struct Qdisc *qdisc; - bool nolock; - - qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); - if (!qdisc) - return; - - nolock = qdisc->flags & TCQ_F_NOLOCK; - - if (nolock) - spin_lock_bh(&qdisc->seqlock); - spin_lock_bh(qdisc_lock(qdisc)); - - qdisc_reset(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - if (nolock) { - clear_bit(__QDISC_STATE_MISSED, &qdisc->state); - clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); - spin_unlock_bh(&qdisc->seqlock); - } -} - static bool some_qdisc_is_busy(struct net_device *dev) { unsigned int i; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 532fde548b88..6706faba95b9 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -359,7 +359,7 @@ static int gred_offload_dump_stats(struct Qdisc *sch) unsigned int i; int ret; - hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL); + hw_stats = kzalloc_obj(*hw_stats); if (!hw_stats) return -ENOMEM; @@ -700,7 +700,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, prio = ctl->prio; } - prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + prealloc = kzalloc_obj(*prealloc); sch_tree_lock(sch); err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc, @@ -757,7 +757,7 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, * psched_mtu(qdisc_dev(sch)); if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { - table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); + table->opt = kzalloc_obj(*table->opt); if (!table->opt) return -ENOMEM; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d8fd35da32a7..b5657ffbbf84 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1025,7 +1025,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (rsc == NULL && fsc == NULL) return -EINVAL; - cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL); + cl = kzalloc_obj(struct hfsc_class); if (cl == NULL) return -ENOBUFS; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 2d4855e28a28..95e5d9bfd9c8 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -230,7 +230,7 @@ static struct hh_flow_state *alloc_new_hh(struct list_head *head, return NULL; } /* Create new entry. */ - flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); + flow = kzalloc_obj(struct hh_flow_state, GFP_ATOMIC); if (!flow) return NULL; @@ -604,8 +604,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt, if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ - q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head), - GFP_KERNEL); + q->hh_flows = kvzalloc_objs(struct list_head, HH_FLOWS_CNT); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index b5e40c51655a..cf6cd4ccfa20 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1095,9 +1095,8 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, } q->num_direct_qdiscs = dev->real_num_tx_queues; - q->direct_qdiscs = kcalloc(q->num_direct_qdiscs, - sizeof(*q->direct_qdiscs), - GFP_KERNEL); + q->direct_qdiscs = kzalloc_objs(*q->direct_qdiscs, + q->num_direct_qdiscs); if (!q->direct_qdiscs) return -ENOMEM; } @@ -1845,7 +1844,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, goto failure; } err = -ENOBUFS; - cl = kzalloc(sizeof(*cl), GFP_KERNEL); + cl = kzalloc_obj(*cl); if (!cl) goto failure; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index cc6051d4f2ef..c3e18bae8fbf 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -113,14 +113,15 @@ static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress); + struct bpf_mprog_entry *entry; if (sch->parent != TC_H_INGRESS) return; tcf_block_put_ext(q->block, sch, &q->block_info); - if (entry) { + if (mini_qdisc_pair_inited(&q->miniqp)) { + entry = rtnl_dereference(dev->tcx_ingress); tcx_miniq_dec(entry); if (!tcx_entry_is_active(entry)) { tcx_entry_update(dev, NULL, true); @@ -290,10 +291,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, static void clsact_destroy(struct Qdisc *sch) { + struct bpf_mprog_entry *ingress_entry, *egress_entry; struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress); - struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress); if (sch->parent != TC_H_CLSACT) return; @@ -301,7 +301,8 @@ static void clsact_destroy(struct Qdisc *sch) tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); - if (ingress_entry) { + if (mini_qdisc_pair_inited(&q->miniqp_ingress)) { + ingress_entry = rtnl_dereference(dev->tcx_ingress); tcx_miniq_dec(ingress_entry); if (!tcx_entry_is_active(ingress_entry)) { tcx_entry_update(dev, NULL, true); @@ -309,7 +310,8 @@ static void clsact_destroy(struct Qdisc *sch) } } - if (egress_entry) { + if (mini_qdisc_pair_inited(&q->miniqp_egress)) { + egress_entry = rtnl_dereference(dev->tcx_egress); tcx_miniq_dec(egress_entry); if (!tcx_entry_is_active(egress_entry)) { tcx_entry_update(dev, NULL, false); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index c860119a8f09..0ed199fa18f0 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -15,11 +15,7 @@ #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> -#include <net/sch_generic.h> - -struct mq_sched { - struct Qdisc **qdiscs; -}; +#include <net/sch_priv.h> static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd) { @@ -49,23 +45,29 @@ static int mq_offload_stats(struct Qdisc *sch) return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt); } -static void mq_destroy(struct Qdisc *sch) +void mq_destroy_common(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); unsigned int ntx; - mq_offload(sch, TC_MQ_DESTROY); - if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } +EXPORT_SYMBOL_NS_GPL(mq_destroy_common, "NET_SCHED_INTERNAL"); -static int mq_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static void mq_destroy(struct Qdisc *sch) +{ + mq_offload(sch, TC_MQ_DESTROY); + mq_destroy_common(sch); +} + +int mq_init_common(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack, + const struct Qdisc_ops *qdisc_ops) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); @@ -80,14 +82,14 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt, return -EOPNOTSUPP; /* pre-allocate qdiscs, attachment can't fail */ - priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), - GFP_KERNEL); + priv->qdiscs = kzalloc_objs(priv->qdiscs[0], dev->num_tx_queues); if (!priv->qdiscs) return -ENOMEM; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), + qdisc = qdisc_create_dflt(dev_queue, + qdisc_ops ?: get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1)), extack); @@ -98,12 +100,24 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt, } sch->flags |= TCQ_F_MQROOT; + return 0; +} +EXPORT_SYMBOL_NS_GPL(mq_init_common, "NET_SCHED_INTERNAL"); + +static int mq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int ret; + + ret = mq_init_common(sch, opt, extack, NULL); + if (ret) + return ret; mq_offload(sch, TC_MQ_CREATE); return 0; } -static void mq_attach(struct Qdisc *sch) +void mq_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); @@ -124,8 +138,9 @@ static void mq_attach(struct Qdisc *sch) kfree(priv->qdiscs); priv->qdiscs = NULL; } +EXPORT_SYMBOL_NS_GPL(mq_attach, "NET_SCHED_INTERNAL"); -static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) +void mq_dump_common(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; @@ -152,7 +167,12 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } +} +EXPORT_SYMBOL_NS_GPL(mq_dump_common, "NET_SCHED_INTERNAL"); +static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + mq_dump_common(sch, skb); return mq_offload_stats(sch); } @@ -166,11 +186,12 @@ static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) return netdev_get_tx_queue(dev, ntx); } -static struct netdev_queue *mq_select_queue(struct Qdisc *sch, - struct tcmsg *tcm) +struct netdev_queue *mq_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) { return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } +EXPORT_SYMBOL_NS_GPL(mq_select_queue, "NET_SCHED_INTERNAL"); static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) @@ -198,14 +219,15 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return 0; } -static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) +struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); return rtnl_dereference(dev_queue->qdisc_sleeping); } +EXPORT_SYMBOL_NS_GPL(mq_leaf, "NET_SCHED_INTERNAL"); -static unsigned long mq_find(struct Qdisc *sch, u32 classid) +unsigned long mq_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); @@ -213,9 +235,10 @@ static unsigned long mq_find(struct Qdisc *sch, u32 classid) return 0; return ntx; } +EXPORT_SYMBOL_NS_GPL(mq_find, "NET_SCHED_INTERNAL"); -static int mq_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) +int mq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); @@ -224,9 +247,10 @@ static int mq_dump_class(struct Qdisc *sch, unsigned long cl, tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; return 0; } +EXPORT_SYMBOL_NS_GPL(mq_dump_class, "NET_SCHED_INTERNAL"); -static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, - struct gnet_dump *d) +int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); @@ -236,8 +260,9 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, return -1; return 0; } +EXPORT_SYMBOL_NS_GPL(mq_dump_class_stats, "NET_SCHED_INTERNAL"); -static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx; @@ -251,6 +276,7 @@ static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) break; } } +EXPORT_SYMBOL_NS_GPL(mq_walk, "NET_SCHED_INTERNAL"); static const struct Qdisc_class_ops mq_class_ops = { .select_queue = mq_select_queue, diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index f3e5ef9a9592..b83276409416 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -388,8 +388,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, } /* pre-allocate qdisc, attachment can't fail */ - priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), - GFP_KERNEL); + priv->qdiscs = kzalloc_objs(priv->qdiscs[0], dev->num_tx_queues); if (!priv->qdiscs) return -ENOMEM; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 06e03f5cd7ce..9f822fee113d 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -249,7 +249,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt, q->max_bands = qdisc_dev(sch)->num_tx_queues; - q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); + q->queues = kzalloc_objs(struct Qdisc *, q->max_bands); if (!q->queues) return -ENOBUFS; for (i = 0; i < q->max_bands; i++) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eafc316ae319..5de1c932944a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); + qdisc_skb_cb(skb)->pkt_segs = 1; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { @@ -813,7 +814,7 @@ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) if (!n || n > NETEM_DIST_MAX) return -EINVAL; - d = kvmalloc(struct_size(d, table, n), GFP_KERNEL); + d = kvmalloc_flex(*d, table, n); if (!d) return -ENOMEM; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 2255355e51d3..699e45873f86 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -373,7 +373,7 @@ static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) /* Deschedule class and remove it from its parent aggregate. */ static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) { - if (cl->qdisc->q.qlen > 0) /* class is active */ + if (cl_is_active(cl)) /* class is active */ qfq_deactivate_class(q, cl); qfq_rm_from_agg(q, cl); @@ -392,7 +392,7 @@ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ - new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); + new_agg = kzalloc_obj(*new_agg, GFP_ATOMIC); if (new_agg == NULL) return -ENOBUFS; qfq_init_agg(q, new_agg, lmax, weight); @@ -476,7 +476,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } /* create and init new class */ - cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); + cl = kzalloc_obj(struct qfq_class); if (cl == NULL) return -ENOBUFS; @@ -508,7 +508,7 @@ set_change_agg: new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ sch_tree_unlock(sch); - new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); + new_agg = kzalloc_obj(*new_agg); if (new_agg == NULL) { err = -ENOBUFS; gen_kill_estimator(&cl->rate_est); @@ -529,8 +529,10 @@ set_change_agg: return 0; destroy_class: - qdisc_put(cl->qdisc); - kfree(cl); + if (!existing) { + qdisc_put(cl->qdisc); + kfree(cl); + } return err; } @@ -1250,7 +1252,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } } - gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; + gso_segs = qdisc_pkt_segs(skb); err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -1481,7 +1483,7 @@ static void qfq_reset_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { - if (cl->qdisc->q.qlen > 0) + if (cl_is_active(cl)) qfq_deactivate_class(q, cl); qdisc_reset(cl->qdisc); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 96eb2f122973..503d7d3ca081 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -668,7 +668,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kmalloc_obj(*p); if (!p) return -ENOMEM; } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 39b735386996..f721c03514f6 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; slen += segs->len; /* FIXME: we should be segmenting to a smaller size @@ -1102,7 +1103,7 @@ static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, continue; } - entry = kzalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc_obj(*entry); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; @@ -1375,8 +1376,7 @@ static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; - __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), - GFP_KERNEL); + __offload = kzalloc_flex(*__offload, offload.entries, num_entries); if (!__offload) return NULL; @@ -1869,7 +1869,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err) return err; - new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); + new_admin = kzalloc_obj(*new_admin); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; @@ -2090,8 +2090,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, return -EOPNOTSUPP; } - q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), - GFP_KERNEL); + q->qdiscs = kzalloc_objs(q->qdiscs[0], dev->num_tx_queues); if (!q->qdiscs) return -ENOMEM; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 4c977f049670..f2340164f579 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, skb_mark_not_on_list(segs); seg_len = segs->len; qdisc_skb_cb(segs)->pkt_len = seg_len; + qdisc_skb_cb(segs)->pkt_segs = 1; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 8badec6d82a2..ec4039a201a2 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -146,15 +146,12 @@ teql_destroy(struct Qdisc *sch) master->slaves = NEXT_SLAVE(q); if (q == master->slaves) { struct netdev_queue *txq; - spinlock_t *root_lock; txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc)); - spin_lock_bh(root_lock); - qdisc_reset(rtnl_dereference(txq->qdisc)); - spin_unlock_bh(root_lock); + dev_reset_queue(master->dev, + txq, NULL); } } skb_queue_purge(&dat->q); @@ -178,6 +175,11 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt, if (m->dev == dev) return -ELOOP; + if (sch->parent != TC_H_ROOT) { + NL_SET_ERR_MSG_MOD(extack, "teql can only be used as root"); + return -EOPNOTSUPP; + } + q->m = m; skb_queue_head_init(&q->q); @@ -310,6 +312,7 @@ restart: if (__netif_tx_trylock(slave_txq)) { unsigned int length = qdisc_pkt_len(skb); + skb->dev = slave; if (!netif_xmit_frozen_or_stopped(slave_txq) && netdev_start_xmit(skb, slave, slave_txq, false) == NETDEV_TX_OK) { diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5793d71852b8..62d3cc155809 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -289,7 +289,7 @@ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, { struct sctp_association *asoc; - asoc = kzalloc(sizeof(*asoc), gfp); + asoc = kzalloc_obj(*asoc, gfp); if (!asoc) goto fail; diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 82aad477590e..be9782760f50 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -82,7 +82,7 @@ struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp) struct sctp_shared_key *new; /* Allocate the shared key container */ - new = kzalloc(sizeof(struct sctp_shared_key), gfp); + new = kzalloc_obj(struct sctp_shared_key, gfp); if (!new) return NULL; @@ -931,8 +931,8 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) if (!ep->auth_hmacs_list) { struct sctp_hmac_algo_param *auth_hmacs; - auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids, - SCTP_AUTH_NUM_HMACS), gfp); + auth_hmacs = kzalloc_flex(*auth_hmacs, hmac_ids, + SCTP_AUTH_NUM_HMACS, gfp); if (!auth_hmacs) goto nomem; /* Initialize the HMACS parameter. diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 6b95d3ba8fe1..75e3e61d494e 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -147,7 +147,7 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, struct sctp_sockaddr_entry *addr; /* Add the address to the bind address list. */ - addr = kzalloc(sizeof(*addr), gfp); + addr = kzalloc_obj(*addr, gfp); if (!addr) return -ENOMEM; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index c655b571ca01..5b889e89e9f2 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -47,7 +47,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg) static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) { struct sctp_datamsg *msg; - msg = kmalloc(sizeof(struct sctp_datamsg), gfp); + msg = kmalloc_obj(struct sctp_datamsg, gfp); if (msg) { sctp_datamsg_init(msg); SCTP_DBG_OBJCNT_INC(datamsg); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 31e989dfe846..8d342b514142 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -135,7 +135,7 @@ struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp) struct sctp_endpoint *ep; /* Build a local endpoint. */ - ep = kzalloc(sizeof(*ep), gfp); + ep = kzalloc_obj(*ep, gfp); if (!ep) goto fail; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 568ff8797c39..53a5c027f8e3 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -83,7 +83,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, switch (ev) { case NETDEV_UP: - addr = kzalloc(sizeof(*addr), GFP_ATOMIC); + addr = kzalloc_obj(*addr, GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifa->addr; @@ -471,7 +471,7 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist, read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { /* Add the address to the local list. */ - addr = kzalloc(sizeof(*addr), GFP_ATOMIC); + addr = kzalloc_obj(*addr, GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifp->addr; @@ -492,6 +492,8 @@ static void sctp_v6_copy_ip_options(struct sock *sk, struct sock *newsk) struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct ipv6_txoptions *opt; + inet_sk(newsk)->inet_opt = NULL; + newnp = inet6_sk(newsk); rcu_read_lock(); @@ -777,54 +779,6 @@ static enum sctp_scope sctp_v6_scope(union sctp_addr *addr) return retval; } -/* Create and initialize a new sk for the socket to be returned by accept(). */ -static struct sock *sctp_v6_create_accept_sk(struct sock *sk, - struct sctp_association *asoc, - bool kern) -{ - struct sock *newsk; - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); - struct sctp6_sock *newsctp6sk; - - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); - if (!newsk) - goto out; - - sock_init_data(NULL, newsk); - - sctp_copy_sock(newsk, sk, asoc); - sock_reset_flag(sk, SOCK_ZAPPED); - - newsctp6sk = (struct sctp6_sock *)newsk; - inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; - - sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; - - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - - sctp_v6_copy_ip_options(sk, newsk); - - /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() - * and getpeername(). - */ - sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); - - newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - - if (newsk->sk_prot->init(newsk)) { - sk_common_release(newsk); - newsk = NULL; - } - -out: - return newsk; -} - /* Format a sockaddr for return to user space. This makes sure the return is * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. */ @@ -1171,7 +1125,6 @@ static struct sctp_pf sctp_pf_inet6 = { .bind_verify = sctp_inet6_bind_verify, .send_verify = sctp_inet6_send_verify, .supported_addrs = sctp_inet6_supported_addrs, - .create_accept_sk = sctp_v6_create_accept_sk, .addr_to_user = sctp_v6_addr_to_user, .to_sk_saddr = sctp_v6_to_sk_saddr, .to_sk_daddr = sctp_v6_to_sk_daddr, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9dbc24af749b..828a59b8e7bf 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -86,7 +86,7 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist, in_dev_for_each_ifa_rcu(ifa, in_dev) { /* Add the address to the local list. */ - addr = kzalloc(sizeof(*addr), GFP_ATOMIC); + addr = kzalloc_obj(*addr, GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; @@ -580,38 +580,6 @@ static int sctp_v4_is_ce(const struct sk_buff *skb) return INET_ECN_is_ce(ip_hdr(skb)->tos); } -/* Create and initialize a new sk for the socket returned by accept(). */ -static struct sock *sctp_v4_create_accept_sk(struct sock *sk, - struct sctp_association *asoc, - bool kern) -{ - struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot, kern); - struct inet_sock *newinet; - - if (!newsk) - goto out; - - sock_init_data(NULL, newsk); - - sctp_copy_sock(newsk, sk, asoc); - sock_reset_flag(newsk, SOCK_ZAPPED); - - sctp_v4_copy_ip_options(sk, newsk); - - newinet = inet_sk(newsk); - - newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; - - if (newsk->sk_prot->init(newsk)) { - sk_common_release(newsk); - newsk = NULL; - } - -out: - return newsk; -} - static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { /* No address mapping for V4 sockets */ @@ -806,7 +774,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, switch (ev) { case NETDEV_UP: - addr = kzalloc(sizeof(*addr), GFP_ATOMIC); + addr = kzalloc_obj(*addr, GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; @@ -1119,7 +1087,6 @@ static struct sctp_pf sctp_pf_inet = { .bind_verify = sctp_inet_bind_verify, .send_verify = sctp_inet_send_verify, .supported_addrs = sctp_inet_supported_addrs, - .create_accept_sk = sctp_v4_create_accept_sk, .addr_to_user = sctp_v4_addr_to_user, .to_sk_saddr = sctp_v4_to_sk_saddr, .to_sk_daddr = sctp_v4_to_sk_daddr, @@ -1571,7 +1538,7 @@ static __init int sctp_init(void) /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; sctp_ep_hashtable = - kmalloc_array(64, sizeof(struct sctp_hashbucket), GFP_KERNEL); + kmalloc_objs(struct sctp_hashbucket, 64); if (!sctp_ep_hashtable) { pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3755ba079d07..7b823d759141 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -603,6 +603,11 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT, SCTP_PEER_INIT(initchunk)); + /* SCTP-AUTH: generate the association shared keys so that + * we can potentially sign the COOKIE-ECHO. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL()); + /* Reset init error count upon receipt of INIT-ACK. */ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); @@ -617,11 +622,6 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_COOKIE_ECHOED)); - /* SCTP-AUTH: generate the association shared keys so that - * we can potentially sign the COOKIE-ECHO. - */ - sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL()); - /* 5.1 C) "A" shall then send the State Cookie received in the * INIT ACK chunk in a COOKIE ECHO chunk, ... */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ed8293a34240..05fb00c9c335 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -306,7 +306,8 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, * sockaddr_in6 [RFC 2553]), * addr_len - the size of the address structure. */ -static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) +static int sctp_bind(struct sock *sk, struct sockaddr_unsized *addr, + int addr_len) { int retval = 0; @@ -827,7 +828,7 @@ static int sctp_send_asconf_del_ip(struct sock *sk, if (asoc->asconf_addr_del_pending) continue; asoc->asconf_addr_del_pending = - kzalloc(sizeof(union sctp_addr), GFP_ATOMIC); + kzalloc_obj(union sctp_addr, GFP_ATOMIC); if (asoc->asconf_addr_del_pending == NULL) { retval = -ENOMEM; goto out; @@ -1053,13 +1054,13 @@ static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs, } } -static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, - int addrlen) +static int sctp_bind_add(struct sock *sk, struct sockaddr_unsized *addrs, + int addrlen) { int err; lock_sock(sk); - err = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR); + err = sctp_setsockopt_bindx(sk, (struct sockaddr *)addrs, addrlen, SCTP_BINDX_ADD_ADDR); release_sock(sk); return err; } @@ -1553,8 +1554,6 @@ static void sctp_close(struct sock *sk, long timeout) spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); - - SCTP_DBG_OBJCNT_DEC(sock); } /* Handle EPIPE error. */ @@ -4822,7 +4821,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, return err; } -int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, +int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { if (addr_len < sizeof(uaddr->sa_family)) @@ -4831,7 +4830,7 @@ int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, if (uaddr->sa_family == AF_UNSPEC) return -EOPNOTSUPP; - return sctp_connect(sock->sk, uaddr, addr_len, flags); + return sctp_connect(sock->sk, (struct sockaddr *)uaddr, addr_len, flags); } /* Only called when shutdown a listening SCTP socket. */ @@ -4844,6 +4843,76 @@ static int sctp_disconnect(struct sock *sk, int flags) return 0; } +static struct sock *sctp_clone_sock(struct sock *sk, + struct sctp_association *asoc, + enum sctp_socket_type type) +{ + struct sock *newsk = sk_clone(sk, GFP_KERNEL, false); + struct inet_sock *newinet; + struct sctp_sock *newsp; + int err = -ENOMEM; + + if (!newsk) + return ERR_PTR(err); + + /* sk_clone() sets refcnt to 2 */ + sock_put(newsk); + + newinet = inet_sk(newsk); + newsp = sctp_sk(newsk); + + newsp->pf->to_sk_daddr(&asoc->peer.primary_addr, newsk); + newinet->inet_dport = htons(asoc->peer.port); + atomic_set(&newinet->inet_id, get_random_u16()); + + inet_set_bit(MC_LOOP, newsk); + newinet->mc_ttl = 1; + newinet->mc_index = 0; + newinet->mc_list = NULL; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *newnp; + + newinet->pinet6 = &((struct sctp6_sock *)newsk)->inet6; + newinet->ipv6_fl_list = NULL; + + newnp = inet6_sk(newsk); + memcpy(newnp, inet6_sk(sk), sizeof(struct ipv6_pinfo)); + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + } +#endif + + newsp->pf->copy_ip_options(sk, newsk); + + newsp->do_auto_asconf = 0; + skb_queue_head_init(&newsp->pd_lobby); + + newsp->ep = sctp_endpoint_new(newsk, GFP_KERNEL); + if (!newsp->ep) + goto out_release; + + SCTP_DBG_OBJCNT_INC(sock); + sk_sockets_allocated_inc(newsk); + sock_prot_inuse_add(sock_net(sk), newsk->sk_prot, 1); + + err = sctp_sock_migrate(sk, newsk, asoc, type); + if (err) + goto out_release; + + /* Set newsk security attributes from original sk and connection + * security attribute from asoc. + */ + security_sctp_sk_clone(asoc, sk, newsk); + + return newsk; + +out_release: + sk_common_release(newsk); + return ERR_PTR(err); +} + /* 4.1.4 accept() - TCP Style Syntax * * Applications use accept() call to remove an established SCTP @@ -4853,18 +4922,13 @@ static int sctp_disconnect(struct sock *sk, int flags) */ static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg) { - struct sctp_sock *sp; - struct sctp_endpoint *ep; - struct sock *newsk = NULL; struct sctp_association *asoc; - long timeo; + struct sock *newsk = NULL; int error = 0; + long timeo; lock_sock(sk); - sp = sctp_sk(sk); - ep = sp->ep; - if (!sctp_style(sk, TCP)) { error = -EOPNOTSUPP; goto out; @@ -4885,20 +4949,12 @@ static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg) /* We treat the list of associations on the endpoint as the accept * queue and pick the first association on the list. */ - asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); + asoc = list_entry(sctp_sk(sk)->ep->asocs.next, + struct sctp_association, asocs); - newsk = sp->pf->create_accept_sk(sk, asoc, arg->kern); - if (!newsk) { - error = -ENOMEM; - goto out; - } - - /* Populate the fields of the newsk from the oldsk and migrate the - * asoc to the newsk. - */ - error = sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); - if (error) { - sk_common_release(newsk); + newsk = sctp_clone_sock(sk, asoc, SCTP_SOCKET_TCP); + if (IS_ERR(newsk)) { + error = PTR_ERR(newsk); newsk = NULL; } @@ -5109,9 +5165,12 @@ static void sctp_destroy_sock(struct sock *sk) sp->do_auto_asconf = 0; list_del(&sp->auto_asconf_list); } + sctp_endpoint_free(sp->ep); + sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + SCTP_DBG_OBJCNT_DEC(sock); } static void sctp_destruct_sock(struct sock *sk) @@ -5615,11 +5674,11 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv /* Helper routine to branch off an association to a new socket. */ static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, - struct socket **sockp) + struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); - struct sctp_sock *sp = sctp_sk(sk); struct socket *sock; + struct sock *newsk; int err = 0; /* Do not peel off from one netns to another one. */ @@ -5635,30 +5694,24 @@ static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, if (!sctp_style(sk, UDP)) return -EINVAL; - /* Create a new socket. */ - err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); - if (err < 0) + err = sock_create_lite(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); + if (err) return err; - sctp_copy_sock(sock->sk, sk, asoc); - - /* Make peeled-off sockets more like 1-1 accepted sockets. - * Set the daddr and initialize id to something more random and also - * copy over any ip options. - */ - sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sock->sk); - sp->pf->copy_ip_options(sk, sock->sk); - - /* Populate the fields of the newsk from the oldsk and migrate the - * asoc to the newsk. - */ - err = sctp_sock_migrate(sk, sock->sk, asoc, - SCTP_SOCKET_UDP_HIGH_BANDWIDTH); - if (err) { + newsk = sctp_clone_sock(sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); + if (IS_ERR(newsk)) { sock_release(sock); - sock = NULL; + *sockp = NULL; + return PTR_ERR(newsk); } + lock_sock_nested(newsk, SINGLE_DEPTH_NESTING); + __inet_accept(sk->sk_socket, sock, newsk); + release_sock(newsk); + + sock->ops = sk->sk_socket->ops; + __module_get(sock->ops->owner); + *sockp = sock; return err; @@ -9441,71 +9494,6 @@ done: sctp_skb_set_owner_r(skb, sk); } -void sctp_copy_sock(struct sock *newsk, struct sock *sk, - struct sctp_association *asoc) -{ - struct inet_sock *inet = inet_sk(sk); - struct inet_sock *newinet; - struct sctp_sock *sp = sctp_sk(sk); - - newsk->sk_type = sk->sk_type; - newsk->sk_bound_dev_if = sk->sk_bound_dev_if; - newsk->sk_flags = sk->sk_flags; - newsk->sk_tsflags = sk->sk_tsflags; - newsk->sk_no_check_tx = sk->sk_no_check_tx; - newsk->sk_no_check_rx = sk->sk_no_check_rx; - newsk->sk_reuse = sk->sk_reuse; - sctp_sk(newsk)->reuse = sp->reuse; - - newsk->sk_shutdown = sk->sk_shutdown; - newsk->sk_destruct = sk->sk_destruct; - newsk->sk_family = sk->sk_family; - newsk->sk_protocol = IPPROTO_SCTP; - newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; - newsk->sk_sndbuf = sk->sk_sndbuf; - newsk->sk_rcvbuf = sk->sk_rcvbuf; - newsk->sk_lingertime = sk->sk_lingertime; - newsk->sk_rcvtimeo = READ_ONCE(sk->sk_rcvtimeo); - newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo); - newsk->sk_rxhash = sk->sk_rxhash; - - newinet = inet_sk(newsk); - - /* Initialize sk's sport, dport, rcv_saddr and daddr for - * getsockname() and getpeername() - */ - newinet->inet_sport = inet->inet_sport; - newinet->inet_saddr = inet->inet_saddr; - newinet->inet_rcv_saddr = inet->inet_rcv_saddr; - newinet->inet_dport = htons(asoc->peer.port); - newinet->pmtudisc = inet->pmtudisc; - atomic_set(&newinet->inet_id, get_random_u16()); - - newinet->uc_ttl = inet->uc_ttl; - inet_set_bit(MC_LOOP, newsk); - newinet->mc_ttl = 1; - newinet->mc_index = 0; - newinet->mc_list = NULL; - - if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) - net_enable_timestamp(); - - /* Set newsk security attributes from original sk and connection - * security attribute from asoc. - */ - security_sctp_sk_clone(asoc, sk, newsk); -} - -static inline void sctp_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - size_t ancestor_size = sizeof(struct inet_sock); - - ancestor_size += sk_from->sk_prot->obj_size; - ancestor_size -= offsetof(struct sctp_sock, pd_lobby); - __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); -} - /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. */ @@ -9522,14 +9510,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sctp_bind_hashbucket *head; int err; - /* Migrate socket buffer sizes and all the socket level options to the - * new socket. - */ - newsk->sk_sndbuf = oldsk->sk_sndbuf; - newsk->sk_rcvbuf = oldsk->sk_rcvbuf; - /* Brute force copy old sctp opt. */ - sctp_copy_descendant(newsk, oldsk); - /* Restore the ep value that was overwritten with the above structure * copy. */ diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f205556c5b24..c2247793c88b 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -54,7 +54,7 @@ static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) { - struct sctp_sched_ops *sched; + const struct sctp_sched_ops *sched; if (!SCTP_SO(stream, sid)->ext) return; @@ -130,7 +130,7 @@ out: int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i, ret = 0; gfp |= __GFP_NOWARN; @@ -166,7 +166,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) struct sctp_stream_out_ext *soute; int ret; - soute = kzalloc(sizeof(*soute), GFP_KERNEL); + soute = kzalloc_obj(*soute); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; @@ -182,7 +182,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) void sctp_stream_free(struct sctp_stream *stream) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; sched->unsched_all(stream); @@ -207,7 +207,7 @@ void sctp_stream_clear(struct sctp_stream *stream) void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); sched->unsched_all(stream); sctp_stream_outq_migrate(stream, new, new->outcnt); diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 54afbe4fb087..50f8b5240359 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -91,7 +91,7 @@ static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } -static struct sctp_sched_ops sctp_sched_fcfs = { +static const struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, @@ -111,10 +111,10 @@ static void sctp_sched_ops_fcfs_init(void) /* API to other parts of the stack */ -static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; +static const struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, - struct sctp_sched_ops *sched_ops) + const struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } @@ -130,7 +130,7 @@ void sctp_sched_ops_init(void) static void sctp_sched_free_sched(struct sctp_stream *stream) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; @@ -148,9 +148,9 @@ static void sctp_sched_free_sched(struct sctp_stream *stream) int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { - struct sctp_sched_ops *old = asoc->outqueue.sched; + const struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; - struct sctp_sched_ops *n; + const struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; @@ -263,14 +263,14 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } -struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) +const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; diff --git a/net/sctp/stream_sched_fc.c b/net/sctp/stream_sched_fc.c index 4bd18a497a6d..776c6de46c22 100644 --- a/net/sctp/stream_sched_fc.c +++ b/net/sctp/stream_sched_fc.c @@ -188,7 +188,7 @@ static void sctp_sched_fc_unsched_all(struct sctp_stream *stream) list_del_init(&soute->fc_list); } -static struct sctp_sched_ops sctp_sched_fc = { +static const struct sctp_sched_ops sctp_sched_fc = { .set = sctp_sched_fc_set, .get = sctp_sched_fc_get, .init = sctp_sched_fc_init, @@ -206,7 +206,7 @@ void sctp_sched_ops_fc_init(void) sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc); } -static struct sctp_sched_ops sctp_sched_wfq = { +static const struct sctp_sched_ops sctp_sched_wfq = { .set = sctp_sched_wfq_set, .get = sctp_sched_wfq_get, .init = sctp_sched_fc_init, diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 4d4d9da331f4..bd4f98db2822 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -42,7 +42,7 @@ static struct sctp_stream_priorities *sctp_sched_prio_new_head( { struct sctp_stream_priorities *p; - p = kmalloc(sizeof(*p), gfp); + p = kmalloc_obj(*p, gfp); if (!p) return NULL; @@ -300,7 +300,7 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) sctp_sched_prio_unsched(soute); } -static struct sctp_sched_ops sctp_sched_prio = { +static const struct sctp_sched_ops sctp_sched_prio = { .set = sctp_sched_prio_set, .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index 1f235e7f643a..9157b653f196 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -171,7 +171,7 @@ static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) sctp_sched_rr_unsched(stream, soute); } -static struct sctp_sched_ops sctp_sched_rr = { +static const struct sctp_sched_ops sctp_sched_rr = { .set = sctp_sched_rr_set, .get = sctp_sched_rr_get, .init = sctp_sched_rr_init, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 0c56d9673cc1..6ea55b9fbde4 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -92,7 +92,7 @@ struct sctp_transport *sctp_transport_new(struct net *net, { struct sctp_transport *transport; - transport = kzalloc(sizeof(*transport), gfp); + transport = kzalloc_obj(*transport, gfp); if (!transport) return NULL; diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 7101a48bce54..94bc9c7382ea 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -36,29 +36,26 @@ static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx) return &((struct net_shaper_nl_ctx *)ctx)->binding; } -static void net_shaper_lock(struct net_shaper_binding *binding) +static struct net_shaper_hierarchy * +net_shaper_hierarchy(struct net_shaper_binding *binding) { - switch (binding->type) { - case NET_SHAPER_BINDING_TYPE_NETDEV: - netdev_lock(binding->netdev); - break; - } -} + /* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */ + if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) + return READ_ONCE(binding->netdev->net_shaper_hierarchy); -static void net_shaper_unlock(struct net_shaper_binding *binding) -{ - switch (binding->type) { - case NET_SHAPER_BINDING_TYPE_NETDEV: - netdev_unlock(binding->netdev); - break; - } + /* No other type supported yet. */ + return NULL; } static struct net_shaper_hierarchy * -net_shaper_hierarchy(struct net_shaper_binding *binding) +net_shaper_hierarchy_rcu(struct net_shaper_binding *binding) { - /* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */ - if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) + /* Readers look up the device and take a ref, then take RCU lock + * later at which point netdev may have been unregistered and flushed. + * READ_ONCE() pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. + */ + if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV && + READ_ONCE(binding->netdev->reg_state) <= NETREG_REGISTERED) return READ_ONCE(binding->netdev->net_shaper_hierarchy); /* No other type supported yet. */ @@ -204,12 +201,49 @@ static int net_shaper_ctx_setup(const struct genl_info *info, int type, return 0; } +/* Like net_shaper_ctx_setup(), but for "write" handlers (never for dumps!) + * Acquires the lock protecting the hierarchy (instance lock for netdev). + */ +static int net_shaper_ctx_setup_lock(const struct genl_info *info, int type, + struct net_shaper_nl_ctx *ctx) +{ + struct net *ns = genl_info_net(info); + struct net_device *dev; + int ifindex; + + if (GENL_REQ_ATTR_CHECK(info, type)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[type]); + dev = netdev_get_by_index_lock(ns, ifindex); + if (!dev) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + return -ENOENT; + } + + if (!dev->netdev_ops->net_shaper_ops) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + netdev_unlock(dev); + return -EOPNOTSUPP; + } + + ctx->binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; + ctx->binding.netdev = dev; + return 0; +} + static void net_shaper_ctx_cleanup(struct net_shaper_nl_ctx *ctx) { if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV) netdev_put(ctx->binding.netdev, &ctx->dev_tracker); } +static void net_shaper_ctx_cleanup_unlock(struct net_shaper_nl_ctx *ctx) +{ + if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV) + netdev_unlock(ctx->binding.netdev); +} + static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle) { return FIELD_PREP(NET_SHAPER_SCOPE_MASK, handle->scope) | @@ -251,9 +285,10 @@ static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) { - struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); u32 index = net_shaper_handle_to_index(handle); + struct net_shaper_hierarchy *hierarchy; + hierarchy = net_shaper_hierarchy_rcu(binding); if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID)) return NULL; @@ -262,7 +297,7 @@ net_shaper_lookup(struct net_shaper_binding *binding, } /* Allocate on demand the per device shaper's hierarchy container. - * Called under the net shaper lock + * Called under the lock protecting the hierarchy (instance lock for netdev) */ static struct net_shaper_hierarchy * net_shaper_hierarchy_setup(struct net_shaper_binding *binding) @@ -272,7 +307,7 @@ net_shaper_hierarchy_setup(struct net_shaper_binding *binding) if (hierarchy) return hierarchy; - hierarchy = kmalloc(sizeof(*hierarchy), GFP_KERNEL); + hierarchy = kmalloc_obj(*hierarchy); if (!hierarchy) return NULL; @@ -329,7 +364,7 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, id_allocated = true; } - cur = kzalloc(sizeof(*cur), GFP_KERNEL); + cur = kzalloc_obj(*cur); if (!cur) { ret = -ENOMEM; goto free_id; @@ -681,6 +716,22 @@ void net_shaper_nl_post_doit(const struct genl_split_ops *ops, net_shaper_generic_post(info); } +int net_shaper_nl_pre_doit_write(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(info->ctx)); + + return net_shaper_ctx_setup_lock(info, NET_SHAPER_A_IFINDEX, ctx); +} + +void net_shaper_nl_post_doit_write(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + net_shaper_ctx_cleanup_unlock((struct net_shaper_nl_ctx *)info->ctx); +} + int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; @@ -759,11 +810,7 @@ int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_msg; - ret = genlmsg_reply(msg, info); - if (ret) - goto free_msg; - - return 0; + return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); @@ -782,17 +829,19 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, /* Don't error out dumps performed before any set operation. */ binding = net_shaper_binding_from_ctx(ctx); - hierarchy = net_shaper_hierarchy(binding); - if (!hierarchy) - return 0; rcu_read_lock(); + hierarchy = net_shaper_hierarchy_rcu(binding); + if (!hierarchy) + goto out_unlock; + for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, U32_MAX, XA_PRESENT)); ctx->start_index++) { ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; } +out_unlock: rcu_read_unlock(); return ret; @@ -810,45 +859,38 @@ int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) binding = net_shaper_binding_from_ctx(info->ctx); - net_shaper_lock(binding); ret = net_shaper_parse_info(binding, info->attrs, info, &shaper, &exists); if (ret) - goto unlock; + return ret; if (!exists) net_shaper_default_parent(&shaper.handle, &shaper.parent); hierarchy = net_shaper_hierarchy_setup(binding); - if (!hierarchy) { - ret = -ENOMEM; - goto unlock; - } + if (!hierarchy) + return -ENOMEM; /* The 'set' operation can't create node-scope shapers. */ handle = shaper.handle; if (handle.scope == NET_SHAPER_SCOPE_NODE && - !net_shaper_lookup(binding, &handle)) { - ret = -ENOENT; - goto unlock; - } + !net_shaper_lookup(binding, &handle)) + return -ENOENT; ret = net_shaper_pre_insert(binding, &handle, info->extack); if (ret) - goto unlock; + return ret; ops = net_shaper_ops(binding); ret = ops->set(binding, &shaper, info->extack); if (ret) { net_shaper_rollback(binding); - goto unlock; + return ret; } net_shaper_commit(binding, 1, &shaper); -unlock: - net_shaper_unlock(binding); - return ret; + return 0; } static int __net_shaper_delete(struct net_shaper_binding *binding, @@ -1033,8 +1075,7 @@ static int net_shaper_pre_del_node(struct net_shaper_binding *binding, return -EINVAL; } - leaves = kcalloc(shaper->leaves, sizeof(struct net_shaper), - GFP_KERNEL); + leaves = kzalloc_objs(struct net_shaper, shaper->leaves); if (!leaves) return -ENOMEM; @@ -1077,35 +1118,26 @@ int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) binding = net_shaper_binding_from_ctx(info->ctx); - net_shaper_lock(binding); ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, &handle); if (ret) - goto unlock; + return ret; hierarchy = net_shaper_hierarchy(binding); - if (!hierarchy) { - ret = -ENOENT; - goto unlock; - } + if (!hierarchy) + return -ENOENT; shaper = net_shaper_lookup(binding, &handle); - if (!shaper) { - ret = -ENOENT; - goto unlock; - } + if (!shaper) + return -ENOENT; if (handle.scope == NET_SHAPER_SCOPE_NODE) { ret = net_shaper_pre_del_node(binding, shaper, info->extack); if (ret) - goto unlock; + return ret; } - ret = __net_shaper_delete(binding, shaper, info->extack); - -unlock: - net_shaper_unlock(binding); - return ret; + return __net_shaper_delete(binding, shaper, info->extack); } static int net_shaper_group_send_reply(struct net_shaper_binding *binding, @@ -1154,21 +1186,17 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) if (!net_shaper_ops(binding)->group) return -EOPNOTSUPP; - net_shaper_lock(binding); leaves_count = net_shaper_list_len(info, NET_SHAPER_A_LEAVES); if (!leaves_count) { NL_SET_BAD_ATTR(info->extack, info->attrs[NET_SHAPER_A_LEAVES]); - ret = -EINVAL; - goto unlock; + return -EINVAL; } leaves = kcalloc(leaves_count, sizeof(struct net_shaper) + sizeof(struct net_shaper *), GFP_KERNEL); - if (!leaves) { - ret = -ENOMEM; - goto unlock; - } + if (!leaves) + return -ENOMEM; old_nodes = (void *)&leaves[leaves_count]; ret = net_shaper_parse_node(binding, info->attrs, info, &node); @@ -1245,9 +1273,6 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) free_leaves: kfree(leaves); - -unlock: - net_shaper_unlock(binding); return ret; free_msg: @@ -1314,10 +1339,7 @@ int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_msg; - ret = genlmsg_reply(msg, info); - if (ret) - goto free_msg; - return 0; + return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); @@ -1360,14 +1382,12 @@ static void net_shaper_flush(struct net_shaper_binding *binding) if (!hierarchy) return; - net_shaper_lock(binding); xa_lock(&hierarchy->shapers); xa_for_each(&hierarchy->shapers, index, cur) { __xa_erase(&hierarchy->shapers, index); kfree(cur); } xa_unlock(&hierarchy->shapers); - net_shaper_unlock(binding); kfree(hierarchy); } diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 204c8ae8c7b1..9b29be3ef19a 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -98,27 +99,27 @@ static const struct genl_split_ops net_shaper_nl_ops[] = { }, { .cmd = NET_SHAPER_CMD_SET, - .pre_doit = net_shaper_nl_pre_doit, + .pre_doit = net_shaper_nl_pre_doit_write, .doit = net_shaper_nl_set_doit, - .post_doit = net_shaper_nl_post_doit, + .post_doit = net_shaper_nl_post_doit_write, .policy = net_shaper_set_nl_policy, .maxattr = NET_SHAPER_A_IFINDEX, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { .cmd = NET_SHAPER_CMD_DELETE, - .pre_doit = net_shaper_nl_pre_doit, + .pre_doit = net_shaper_nl_pre_doit_write, .doit = net_shaper_nl_delete_doit, - .post_doit = net_shaper_nl_post_doit, + .post_doit = net_shaper_nl_post_doit_write, .policy = net_shaper_delete_nl_policy, .maxattr = NET_SHAPER_A_IFINDEX, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { .cmd = NET_SHAPER_CMD_GROUP, - .pre_doit = net_shaper_nl_pre_doit, + .pre_doit = net_shaper_nl_pre_doit_write, .doit = net_shaper_nl_group_doit, - .post_doit = net_shaper_nl_post_doit, + .post_doit = net_shaper_nl_post_doit_write, .policy = net_shaper_group_nl_policy, .maxattr = NET_SHAPER_A_LEAVES, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index cb7f9026fc23..42c46c52c775 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NET_SHAPER_GEN_H #define _LINUX_NET_SHAPER_GEN_H @@ -17,12 +18,17 @@ extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGH int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_pre_doit_write(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); void +net_shaper_nl_post_doit_write(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_pre_dumpit(struct netlink_callback *cb); diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 99ecd59d1f4b..277ef504bc26 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -19,3 +19,13 @@ config SMC_DIAG smcss. if unsure, say Y. + +config SMC_HS_CTRL_BPF + bool "Generic eBPF hook for SMC handshake flow" + depends on SMC && BPF_JIT && BPF_SYSCALL + default y + help + SMC_HS_CTRL_BPF enables support to register generic eBPF hook for SMC + handshake flow, which offer much greater flexibility in modifying the behavior + of the SMC protocol stack compared to a complete kernel-based approach. Select + this option if you want filtring the handshake process via eBPF programs. diff --git a/net/smc/Makefile b/net/smc/Makefile index 0e754cbc38f9..5368634c5dd6 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o +smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 77b99e8ef35a..1a565095376a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -16,8 +16,7 @@ * based on prototype from Frank Blaschka */ -#define KMSG_COMPONENT "smc" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "smc: " fmt #include <linux/module.h> #include <linux/socket.h> @@ -58,6 +57,7 @@ #include "smc_tracepoint.h" #include "smc_sysctl.h" #include "smc_inet.h" +#include "smc_hs_bpf.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -124,12 +124,21 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, - bool *own_req) + bool *own_req, + void (*opt_child_init)(struct sock *newsk, + const struct sock *sk)) { struct smc_sock *smc; struct sock *child; - smc = smc_clcsock_user_data(sk); + rcu_read_lock(); + smc = smc_clcsock_user_data_rcu(sk); + if (!smc || !refcount_inc_not_zero(&smc->sk.sk_refcnt)) { + rcu_read_unlock(); + smc = NULL; + goto drop; + } + rcu_read_unlock(); if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > sk->sk_max_ack_backlog) @@ -142,7 +151,7 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, /* passthrough to original syn recv sock fct */ child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, - own_req); + own_req, opt_child_init); /* child must not inherit smc or its ops */ if (child) { rcu_assign_sk_user_data(child, NULL); @@ -151,11 +160,14 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops) inet_csk(child)->icsk_af_ops = smc->ori_af_ops; } + sock_put(&smc->sk); return child; drop: dst_release(dst); tcp_listendrop(sk); + if (smc) + sock_put(&smc->sk); return NULL; } @@ -252,7 +264,7 @@ static void smc_fback_restore_callbacks(struct smc_sock *smc) struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); - clcsk->sk_user_data = NULL; + rcu_assign_sk_user_data(clcsk, NULL); smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); @@ -421,7 +433,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, return sk; } -int smc_bind(struct socket *sock, struct sockaddr *uaddr, +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -900,7 +912,7 @@ static void smc_fback_replace_callbacks(struct smc_sock *smc) struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); - clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + __rcu_assign_sk_user_data_with_flags(clcsk, smc, SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, &smc->clcsk_state_change); @@ -1195,7 +1207,7 @@ void smc_fill_gid_list(struct smc_link_group *lgr, memset(gidlist, 0, sizeof(*gidlist)); memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); - alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); + alt_ini = kzalloc_obj(*alt_ini); if (!alt_ini) goto out; @@ -1522,7 +1534,7 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC, version); - ini = kzalloc(sizeof(*ini), GFP_KERNEL); + ini = kzalloc_obj(*ini); if (!ini) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, version); @@ -1642,7 +1654,7 @@ out: release_sock(&smc->sk); } -int smc_connect(struct socket *sock, struct sockaddr *addr, +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sock *sk = sock->sk; @@ -1694,7 +1706,7 @@ int smc_connect(struct socket *sock, struct sockaddr *addr, rc = -EALREADY; goto out; } - rc = kernel_connect(smc->clcsock, addr, alen, flags); + rc = kernel_connect(smc->clcsock, (struct sockaddr_unsized *)addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; @@ -2140,7 +2152,7 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, } } -static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) +static void smc_init_info_store_rc(u32 rc, struct smc_init_info *ini) { if (!ini->rc) ini->rc = rc; @@ -2203,7 +2215,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, mutex_unlock(&smcd_dev_list.mutex); if (!ini->ism_dev[0]) { - smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); + smc_init_info_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; } @@ -2220,7 +2232,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, ini->ism_selected = i; rc = smc_listen_ism_init(new_smc, ini); if (rc) { - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); /* try next active ISM device */ continue; } @@ -2260,7 +2272,7 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, return; /* V1 ISM device found */ not_found: - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; @@ -2311,7 +2323,7 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); if (rc) { - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); goto not_found; } if (!ini->smcrv2.uses_gateway) @@ -2328,7 +2340,7 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, if (!rc) return; ini->smcr_version = smcr_version; - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); not_found: ini->smcr_version &= ~SMC_V2; @@ -2375,7 +2387,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, /* check for matching IP prefix and subnet length (V1) */ prfx_rc = smc_listen_prfx_check(new_smc, pclc); if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); + smc_init_info_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2402,7 +2414,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, int rc; rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); return (!rc) ? 0 : ini->rc; } return prfx_rc; @@ -2470,7 +2482,7 @@ static void smc_listen_work(struct work_struct *work) /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ - buf = kzalloc(sizeof(*buf), GFP_KERNEL); + buf = kzalloc_obj(*buf); if (!buf) { rc = SMC_CLC_DECL_MEM; goto out_decl; @@ -2490,7 +2502,7 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } - ini = kzalloc(sizeof(*ini), GFP_KERNEL); + ini = kzalloc_obj(*ini); if (!ini) { rc = SMC_CLC_DECL_MEM; goto out_decl; @@ -2663,8 +2675,8 @@ int smc_listen(struct socket *sock, int backlog) * smc-specific sk_data_ready function */ write_lock_bh(&smc->clcsock->sk->sk_callback_lock); - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + __rcu_assign_sk_user_data_with_flags(smc->clcsock->sk, smc, + SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); @@ -2685,10 +2697,11 @@ int smc_listen(struct socket *sock, int backlog) write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); - smc->clcsock->sk->sk_user_data = NULL; + rcu_assign_sk_user_data(smc->clcsock->sk, NULL); write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } + sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; @@ -3357,11 +3370,10 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) return 0; } -static int __smc_create(struct net *net, struct socket *sock, int protocol, - int kern, struct socket *clcsock) +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) { int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; - struct smc_sock *smc; struct sock *sk; int rc; @@ -3380,15 +3392,7 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, if (!sk) goto out; - /* create internal TCP socket for CLC handshake and fallback */ - smc = smc_sk(sk); - - rc = 0; - if (clcsock) - smc->clcsock = clcsock; - else - rc = smc_create_clcsk(net, sk, family); - + rc = smc_create_clcsk(net, sk, family); if (rc) { sk_common_release(sk); sock->sk = NULL; @@ -3397,76 +3401,12 @@ out: return rc; } -static int smc_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - return __smc_create(net, sock, protocol, kern, NULL); -} - static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; -static int smc_ulp_init(struct sock *sk) -{ - struct socket *tcp = sk->sk_socket; - struct net *net = sock_net(sk); - struct socket *smcsock; - int protocol, ret; - - /* only TCP can be replaced */ - if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || - (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) - return -ESOCKTNOSUPPORT; - /* don't handle wq now */ - if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) - return -ENOTCONN; - - if (sk->sk_family == AF_INET) - protocol = SMCPROTO_SMC; - else - protocol = SMCPROTO_SMC6; - - smcsock = sock_alloc(); - if (!smcsock) - return -ENFILE; - - smcsock->type = SOCK_STREAM; - __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ - ret = __smc_create(net, smcsock, protocol, 1, tcp); - if (ret) { - sock_release(smcsock); /* module_put() which ops won't be NULL */ - return ret; - } - - /* replace tcp socket to smc */ - smcsock->file = tcp->file; - smcsock->file->private_data = smcsock; - smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ - smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ - tcp->file = NULL; - - return ret; -} - -static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, - const gfp_t priority) -{ - struct inet_connection_sock *icsk = inet_csk(newsk); - - /* don't inherit ulp ops to child when listen */ - icsk->icsk_ulp_ops = NULL; -} - -static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { - .name = "smc", - .owner = THIS_MODULE, - .init = smc_ulp_init, - .clone = smc_ulp_clone, -}; - unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) @@ -3589,21 +3529,21 @@ static int __init smc_init(void) pr_err("%s: ib_register fails with %d\n", __func__, rc); goto out_sock; } - - rc = tcp_register_ulp(&smc_ulp_ops); + rc = smc_inet_init(); if (rc) { - pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); + pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); goto out_ib; } - rc = smc_inet_init(); + rc = bpf_smc_hs_ctrl_init(); if (rc) { - pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); - goto out_ulp; + pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__, + rc); + goto out_inet; } static_branch_enable(&tcp_have_smc); return 0; -out_ulp: - tcp_unregister_ulp(&smc_ulp_ops); +out_inet: + smc_inet_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3639,7 +3579,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); smc_inet_exit(); - tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); @@ -3664,7 +3603,6 @@ MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); -MODULE_ALIAS_TCP_ULP("smc"); /* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1); #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/smc/smc.h b/net/smc/smc.h index 2c9084963739..52145df83f6e 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -42,9 +42,9 @@ void smc_unhash_sk(struct sock *sk); void smc_release_cb(struct sock *sk); int smc_release(struct socket *sock); -int smc_bind(struct socket *sock, struct sockaddr *uaddr, +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); -int smc_connect(struct socket *sock, struct sockaddr *addr, +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags); int smc_accept(struct socket *sock, struct socket *new_sock, struct proto_accept_arg *arg); @@ -346,6 +346,11 @@ static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); } +static inline struct smc_sock *smc_clcsock_user_data_rcu(const struct sock *clcsk) +{ + return (struct smc_sock *)rcu_dereference_sk_user_data(clcsk); +} + /* save target_cb in saved_cb, and replace target_cb with new_cb */ static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), void (*new_cb)(struct sock *), diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 87c87edadde7..c38fc7bf0a7e 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -88,7 +88,7 @@ static int smc_clc_ueid_add(char *ueid) return -EINVAL; /* add a new ueid entry to the ueid table if there isn't one */ - new_ueid = kzalloc(sizeof(*new_ueid), GFP_KERNEL); + new_ueid = kzalloc_obj(*new_ueid); if (!new_ueid) return -ENOMEM; memcpy(new_ueid->eid, ueid, SMC_MAX_EID_LEN); @@ -861,7 +861,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) struct kvec vec[8]; struct msghdr msg; - pclc = kzalloc(sizeof(*pclc), GFP_KERNEL); + pclc = kzalloc_obj(*pclc); if (!pclc) return -ENOMEM; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 10219f55aad1..bb0313ef5f7c 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -218,7 +218,7 @@ again: write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); - smc->clcsock->sk->sk_user_data = NULL; + rcu_assign_sk_user_data(smc->clcsock->sk, NULL); write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index be0c2da83d2b..e2d083daeb7e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -810,6 +810,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); + lnk->max_send_wr = lgr->max_send_wr; + lnk->max_recv_wr = lgr->max_recv_wr; lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; @@ -836,27 +838,39 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, rc = smc_llc_link_init(lnk); if (rc) goto out; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; + goto clear_llc_lnk; + do { + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_alloc_link_mem(lnk); + if (!rc) + break; + else if (rc != -ENOMEM) /* give up */ + goto destroy_qp; + /* retry with smaller ... */ + lnk->max_send_wr /= 2; + lnk->max_recv_wr /= 2; + /* ... unless droping below old SMC_WR_BUF_SIZE */ + if (lnk->max_send_wr < 16 || lnk->max_recv_wr < 48) + goto destroy_qp; + smc_ib_destroy_queue_pair(lnk); + } while (1); + rc = smc_wr_create_link(lnk); if (rc) - goto destroy_qp; + goto free_link_mem; lnk->state = SMC_LNK_ACTIVATING; return 0; +free_link_mem: + smc_wr_free_link_mem(lnk); destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); clear_llc_lnk: smc_llc_link_clear(lnk, false); out: @@ -891,7 +905,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) } } - lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); + lgr = kzalloc_obj(*lgr); if (!lgr) { rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; @@ -2306,7 +2320,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc; /* try to alloc a new buffer */ - buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + buf_desc = kzalloc_obj(*buf_desc); if (!buf_desc) return ERR_PTR(-ENOMEM); @@ -2380,7 +2394,7 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, int rc; /* try to alloc a new DMB */ - buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + buf_desc = kzalloc_obj(*buf_desc); if (!buf_desc) return ERR_PTR(-ENOMEM); if (is_dmb) { @@ -2564,7 +2578,7 @@ int smcd_buf_attach(struct smc_sock *smc) struct smc_buf_desc *buf_desc; int rc; - buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + buf_desc = kzalloc_obj(*buf_desc); if (!buf_desc) return -ENOMEM; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index a5a78cbff341..5c18f08a4c8a 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -34,6 +34,8 @@ * distributions may modify it to a value between * 16-255 as needed. */ +#define SMCR_MAX_SEND_WR_DEF 16 /* Default number of work requests per send queue */ +#define SMCR_MAX_RECV_WR_DEF 48 /* Default number of work requests per recv queue */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -173,6 +175,8 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ + u16 max_send_wr; + u16 max_recv_wr; }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -366,6 +370,10 @@ struct smc_link_group { /* max conn can be assigned to lgr */ u8 max_links; /* max links can be added in lgr */ + u16 max_send_wr; + /* number of WR buffers on send */ + u16 max_recv_wr; + /* number of WR buffers on recv */ }; struct { /* SMC-D */ struct smcd_gid peer_gid; diff --git a/net/smc/smc_hs_bpf.c b/net/smc/smc_hs_bpf.c new file mode 100644 index 000000000000..063d23d85850 --- /dev/null +++ b/net/smc/smc_hs_bpf.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic hook for SMC handshake flow. + * + * Copyright IBM Corp. 2016 + * Copyright (c) 2025, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/rculist.h> + +#include "smc_hs_bpf.h" + +static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock); +static LIST_HEAD(smc_hs_ctrl_list); + +static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl) +{ + int ret = 0; + + spin_lock(&smc_hs_ctrl_list_lock); + /* already exist or duplicate name */ + if (smc_hs_ctrl_find_by_name(ctrl->name)) + ret = -EEXIST; + else + list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list); + spin_unlock(&smc_hs_ctrl_list_lock); + return ret; +} + +static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl) +{ + spin_lock(&smc_hs_ctrl_list_lock); + list_del_rcu(&ctrl->list); + spin_unlock(&smc_hs_ctrl_list_lock); + + /* Ensure that all readers to complete */ + synchronize_rcu(); +} + +struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name) +{ + struct smc_hs_ctrl *ctrl; + + list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) { + if (strcmp(ctrl->name, name) == 0) + return ctrl; + } + return NULL; +} + +static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; } +static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp, + struct inet_request_sock *ireq) +{ + return 1; +} + +static struct smc_hs_ctrl __smc_bpf_hs_ctrl = { + .syn_option = __smc_bpf_stub_set_tcp_option, + .synack_option = __smc_bpf_stub_set_tcp_option_cond, +}; + +static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; } + +static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link) +{ + if (link) + return -EOPNOTSUPP; + + return smc_hs_ctrl_reg(kdata); +} + +static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link) +{ + smc_hs_ctrl_unreg(kdata); +} + +static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct smc_hs_ctrl *u_ctrl; + struct smc_hs_ctrl *k_ctrl; + u32 moff; + + u_ctrl = (const struct smc_hs_ctrl *)udata; + k_ctrl = (struct smc_hs_ctrl *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct smc_hs_ctrl, name): + if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name, + sizeof(u_ctrl->name)) <= 0) + return -EINVAL; + return 1; + case offsetof(struct smc_hs_ctrl, flags): + if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS) + return -EINVAL; + k_ctrl->flags = u_ctrl->flags; + return 1; + default: + break; + } + + return 0; +} + +static const struct bpf_func_proto * +bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id, prog); +} + +static const struct bpf_verifier_ops smc_bpf_verifier_ops = { + .get_func_proto = bpf_smc_hs_func_proto, + .is_valid_access = bpf_tracing_btf_ctx_access, +}; + +static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = { + .name = "smc_hs_ctrl", + .init = smc_bpf_hs_ctrl_init, + .reg = smc_bpf_hs_ctrl_reg, + .unreg = smc_bpf_hs_ctrl_unreg, + .cfi_stubs = &__smc_bpf_hs_ctrl, + .verifier_ops = &smc_bpf_verifier_ops, + .init_member = smc_bpf_hs_ctrl_init_member, + .owner = THIS_MODULE, +}; + +int bpf_smc_hs_ctrl_init(void) +{ + return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl); +} diff --git a/net/smc/smc_hs_bpf.h b/net/smc/smc_hs_bpf.h new file mode 100644 index 000000000000..f5f1807c079e --- /dev/null +++ b/net/smc/smc_hs_bpf.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic hook for SMC handshake flow. + * + * Copyright IBM Corp. 2016 + * Copyright (c) 2025, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#ifndef __SMC_HS_CTRL +#define __SMC_HS_CTRL + +#include <net/smc.h> + +/* Find hs_ctrl by the target name, which required to be a c-string. + * Return NULL if no such ctrl was found,otherwise, return a valid ctrl. + * + * Note: Caller MUST ensure it's was invoked under rcu_read_lock. + */ +struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name); + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +int bpf_smc_hs_ctrl_init(void); +#else +static inline int bpf_smc_hs_ctrl_init(void) { return 0; } +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + +#endif /* __SMC_HS_CTRL */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 0052f02756eb..9bb495707445 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -669,11 +669,6 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { - /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND - */ - .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = lnk->wr_rx_sge_cnt, .max_inline_data = 0, @@ -683,6 +678,11 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) }; int rc; + /* include unsolicited rdma_writes as well, + * there are max. 2 RDMA_WRITE per 1 WR_SEND + */ + qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr; + qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr; lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); if (IS_ERR(lnk->roce_qp)) @@ -944,7 +944,7 @@ static int smc_ib_add_dev(struct ib_device *ibdev) if (ibdev->node_type != RDMA_NODE_IB_CA) return -EOPNOTSUPP; - smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); + smcibdev = kzalloc_obj(*smcibdev); if (!smcibdev) return -ENOMEM; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 7b228ca2f96a..e0dba2c7b6e3 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -142,7 +142,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ - new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); + new_vlan = kzalloc_obj(*new_vlan); if (!new_vlan) return -ENOMEM; new_vlan->vlanid = vlanid; @@ -467,11 +467,10 @@ static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + smcd = kzalloc_obj(*smcd); if (!smcd) return NULL; - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); + smcd->conn = kzalloc_objs(struct smc_connection *, max_dmbs); if (!smcd->conn) goto free_smcd; @@ -582,7 +581,7 @@ static void smcd_handle_event(struct dibs_dev *dibs, if (smcd->going_away) return; /* copy event to event work queue, and let it be handled there */ - wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + wrk = kmalloc_obj(*wrk, GFP_ATOMIC); if (!wrk) return; INIT_WORK(&wrk->work, smc_ism_event_work); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f865c58c3aa7..954b2ff1815c 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1040,7 +1040,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (!llc->qp_mtu) goto out_reject; - ini = kzalloc(sizeof(*ini), GFP_KERNEL); + ini = kzalloc_obj(*ini); if (!ini) { rc = -ENOMEM; goto out_reject; @@ -1180,7 +1180,7 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) goto out; - ini = kzalloc(sizeof(*ini), GFP_KERNEL); + ini = kzalloc_obj(*ini); if (!ini) goto out; @@ -1419,7 +1419,7 @@ int smc_llc_srv_add_link(struct smc_link *link, req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK) send_req_add_link_resp = true; - ini = kzalloc(sizeof(*ini), GFP_KERNEL); + ini = kzalloc_obj(*ini); if (!ini) { rc = -ENOMEM; goto out; @@ -2069,7 +2069,7 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) struct smc_llc_qentry *qentry; unsigned long flags; - qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + qentry = kmalloc_obj(*qentry, GFP_ATOMIC); if (!qentry) return; qentry->link = link; @@ -2157,6 +2157,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) init_waitqueue_head(&lgr->llc_msg_waiter); init_rwsem(&lgr->llc_conf_mutex); lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); + lgr->max_send_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_send_wr)); + lgr->max_recv_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_recv_wr)); } /* called after lgr was removed from lgr_list */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index a3a1e1fde8eb..63e286e2dfaa 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -368,7 +368,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, /* add a new netdev entry to the pnet table if there isn't one */ rc = -ENOMEM; - new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + new_pe = kzalloc_obj(*new_pe); if (!new_pe) goto out_put; new_pe->type = SMC_PNET_ETH; @@ -445,7 +445,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, return -EEXIST; /* add a new ib entry to the pnet table if there isn't one */ - new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + new_pe = kzalloc_obj(*new_pe); if (!new_pe) return -ENOMEM; new_pe->type = SMC_PNET_IB; @@ -747,7 +747,7 @@ static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnetids_ndev_entry *pe, *pi; - pe = kzalloc(sizeof(*pe), GFP_KERNEL); + pe = kzalloc_obj(*pe); if (!pe) return -ENOMEM; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index e7f1134453ef..d833e36f7fd4 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -161,17 +161,17 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; - pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + pages = kzalloc_objs(*pages, nr_pages); if (!pages) goto out; - partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + partial = kzalloc_objs(*partial, nr_pages); if (!partial) goto out_page; - priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); + priv = kzalloc_objs(*priv, nr_pages); if (!priv) goto out_part; for (i = 0; i < nr_pages; i++) { - priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + priv[i] = kzalloc_obj(**priv); if (!priv[i]) goto out_priv; } diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index e71b17d1e21c..59db611821f6 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -20,7 +20,7 @@ int smc_stats_init(struct net *net) { - net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); + net->smc.fback_rsn = kzalloc_obj(*net->smc.fback_rsn); if (!net->smc.fback_rsn) goto err_fback; net->smc.smc_stats = alloc_percpu(struct smc_stats); @@ -285,7 +285,7 @@ int smc_nl_get_stats(struct sk_buff *skb, attrs = nla_nest_start(skb, SMC_GEN_STATS); if (!attrs) goto errnest; - stats = kzalloc(sizeof(*stats), GFP_KERNEL); + stats = kzalloc_obj(*stats); if (!stats) goto erralloc; size = sizeof(*stats) / sizeof(u64); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 2fab6456f765..b1efed546243 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -12,12 +12,14 @@ #include <linux/init.h> #include <linux/sysctl.h> +#include <linux/bpf.h> #include <net/net_namespace.h> #include "smc.h" #include "smc_core.h" #include "smc_llc.h" #include "smc_sysctl.h" +#include "smc_hs_bpf.h" static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; @@ -29,6 +31,71 @@ static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN; static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX; static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN; static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; +static unsigned int smcr_max_wr_min = 2; +static unsigned int smcr_max_wr_max = 2048; + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name) +{ + struct smc_hs_ctrl *ctrl = NULL; + + rcu_read_lock(); + /* null or empty name ask to clear current ctrl */ + if (name && name[0]) { + ctrl = smc_hs_ctrl_find_by_name(name); + if (!ctrl) { + rcu_read_unlock(); + return -EINVAL; + } + /* no change, just return */ + if (ctrl == rcu_dereference(net->smc.hs_ctrl)) { + rcu_read_unlock(); + return 0; + } + if (!bpf_try_module_get(ctrl, ctrl->owner)) { + rcu_read_unlock(); + return -EBUSY; + } + } + /* xhcg old ctrl with the new one atomically */ + ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl))); + /* release old ctrl */ + if (ctrl) + bpf_module_put(ctrl, ctrl->owner); + + rcu_read_unlock(); + return 0; +} + +static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl); + char val[SMC_HS_CTRL_NAME_MAX]; + const struct ctl_table tbl = { + .data = val, + .maxlen = SMC_HS_CTRL_NAME_MAX, + }; + struct smc_hs_ctrl *ctrl; + int ret; + + rcu_read_lock(); + ctrl = rcu_dereference(net->smc.hs_ctrl); + if (ctrl) + memcpy(val, ctrl->name, sizeof(ctrl->name)); + else + val[0] = '\0'; + rcu_read_unlock(); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) + ret = smc_net_replace_smc_hs_ctrl(net, val); + return ret; +} +#endif /* CONFIG_SMC_HS_CTRL_BPF */ static struct ctl_table smc_table[] = { { @@ -99,6 +166,33 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "smcr_max_send_wr", + .data = &init_net.smc.sysctl_smcr_max_send_wr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &smcr_max_wr_min, + .extra2 = &smcr_max_wr_max, + }, + { + .procname = "smcr_max_recv_wr", + .data = &init_net.smc.sysctl_smcr_max_recv_wr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &smcr_max_wr_min, + .extra2 = &smcr_max_wr_max, + }, +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + { + .procname = "hs_ctrl", + .data = &init_net.smc.hs_ctrl, + .mode = 0644, + .maxlen = SMC_HS_CTRL_NAME_MAX, + .proc_handler = proc_smc_hs_ctrl, + }, +#endif /* CONFIG_SMC_HS_CTRL_BPF */ }; int __net_init smc_sysctl_net_init(struct net *net) @@ -109,6 +203,16 @@ int __net_init smc_sysctl_net_init(struct net *net) table = smc_table; if (!net_eq(net, &init_net)) { int i; +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + struct smc_hs_ctrl *ctrl; + + rcu_read_lock(); + ctrl = rcu_dereference(init_net.smc.hs_ctrl); + if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE && + bpf_try_module_get(ctrl, ctrl->owner)) + rcu_assign_pointer(net->smc.hs_ctrl, ctrl); + rcu_read_unlock(); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); if (!table) @@ -130,6 +234,8 @@ int __net_init smc_sysctl_net_init(struct net *net) WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; /* disable handshake limitation by default */ net->smc.limit_smc_hs = 0; @@ -139,6 +245,9 @@ err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + smc_net_replace_smc_hs_ctrl(net, NULL); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ return -ENOMEM; } @@ -148,6 +257,10 @@ void __net_exit smc_sysctl_net_exit(struct net *net) table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + smc_net_replace_smc_hs_ctrl(net, NULL); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + if (!net_eq(net, &init_net)) kfree(table); } diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index eb2465ae1e15..8538915af7af 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -25,6 +25,8 @@ static inline int smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; return 0; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index b04a21b8c511..59c92b46945c 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -547,9 +547,9 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) IB_QP_DEST_QPN, &init_attr); - lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr, lnk->qp_attr.cap.max_recv_wr); } @@ -741,66 +741,56 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) int smc_wr_alloc_link_mem(struct smc_link *link) { /* allocate link related memory */ - link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); + link->wr_tx_bufs = kcalloc(link->max_send_wr, + SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen, + link->wr_rx_bufs = kcalloc(link->max_recv_wr, link->wr_rx_buflen, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; - link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), - GFP_KERNEL); + link->wr_tx_ibs = kzalloc_objs(link->wr_tx_ibs[0], link->max_send_wr); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_ibs[0]), - GFP_KERNEL); + link->wr_rx_ibs = kzalloc_objs(link->wr_rx_ibs[0], link->max_recv_wr); if (!link->wr_rx_ibs) goto no_mem_wr_tx_ibs; - link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, - sizeof(link->wr_tx_rdmas[0]), - GFP_KERNEL); + link->wr_tx_rdmas = kzalloc_objs(link->wr_tx_rdmas[0], + link->max_send_wr); if (!link->wr_tx_rdmas) goto no_mem_wr_rx_ibs; - link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, - sizeof(link->wr_tx_rdma_sges[0]), - GFP_KERNEL); + link->wr_tx_rdma_sges = kzalloc_objs(link->wr_tx_rdma_sges[0], + link->max_send_wr); if (!link->wr_tx_rdma_sges) goto no_mem_wr_tx_rdmas; - link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), - GFP_KERNEL); + link->wr_tx_sges = kzalloc_objs(link->wr_tx_sges[0], link->max_send_wr); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(link->max_recv_wr, sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; - link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); + link->wr_tx_mask = bitmap_zalloc(link->max_send_wr, GFP_KERNEL); if (!link->wr_tx_mask) goto no_mem_wr_rx_sges; - link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, - sizeof(link->wr_tx_pends[0]), - GFP_KERNEL); + link->wr_tx_pends = kzalloc_objs(link->wr_tx_pends[0], + link->max_send_wr); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; - link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, - sizeof(link->wr_tx_compl[0]), - GFP_KERNEL); + link->wr_tx_compl = kzalloc_objs(link->wr_tx_compl[0], + link->max_send_wr); if (!link->wr_tx_compl) goto no_mem_wr_tx_pends; if (link->lgr->smc_version == SMC_V2) { - link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib), - GFP_KERNEL); + link->wr_tx_v2_ib = kzalloc_obj(*link->wr_tx_v2_ib); if (!link->wr_tx_v2_ib) goto no_mem_tx_compl; - link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge), - GFP_KERNEL); + link->wr_tx_v2_sge = kzalloc_obj(*link->wr_tx_v2_sge); if (!link->wr_tx_v2_sge) goto no_mem_v2_ib; - link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend), - GFP_KERNEL); + link->wr_tx_v2_pend = kzalloc_obj(*link->wr_tx_v2_pend); if (!link->wr_tx_v2_pend) goto no_mem_v2_sge; } @@ -905,7 +895,7 @@ int smc_wr_create_link(struct smc_link *lnk) goto dma_unmap; } smc_wr_init_sge(lnk); - bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); + bitmap_zero(lnk->wr_tx_mask, lnk->max_send_wr); init_waitqueue_head(&lnk->wr_tx_wait); rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL); if (rc) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index f3008dda222a..aa4533af9122 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,8 +19,6 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ - #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ diff --git a/net/socket.c b/net/socket.c index e8892b218708..05952188127f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -674,7 +674,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) iput(SOCK_INODE(sock)); return; } - sock->file = NULL; + WRITE_ONCE(sock->file, NULL); } /** @@ -1872,7 +1872,7 @@ int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, addrlen); if (!err) err = READ_ONCE(sock->ops)->bind(sock, - (struct sockaddr *)address, + (struct sockaddr_unsized *)address, addrlen); return err; } @@ -2012,8 +2012,6 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s int __user *upeer_addrlen, int flags) { struct proto_accept_arg arg = { }; - struct file *newfile; - int newfd; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; @@ -2021,18 +2019,7 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - newfd = get_unused_fd_flags(flags); - if (unlikely(newfd < 0)) - return newfd; - - newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, - flags); - if (IS_ERR(newfile)) { - put_unused_fd(newfd); - return PTR_ERR(newfile); - } - fd_install(newfd, newfile); - return newfd; + return FD_ADD(flags, do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags)); } /* @@ -2099,8 +2086,8 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, if (err) goto out; - err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, - addrlen, sock->file->f_flags | file_flags); + err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)address, + addrlen, sock->file->f_flags | file_flags); out: return err; } @@ -2127,78 +2114,53 @@ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, return __sys_connect(fd, uservaddr, addrlen); } -/* - * Get the local address ('name') of a socket object. Move the obtained - * name to user space. - */ - -int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len) +int do_getsockname(struct socket *sock, int peer, + struct sockaddr __user *usockaddr, int __user *usockaddr_len) { - struct socket *sock; struct sockaddr_storage address; - CLASS(fd, f)(fd); int err; - if (fd_empty(f)) - return -EBADF; - sock = sock_from_file(fd_file(f)); - if (unlikely(!sock)) - return -ENOTSOCK; - - err = security_socket_getsockname(sock); + if (peer) + err = security_socket_getpeername(sock); + else + err = security_socket_getsockname(sock); if (err) return err; - - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); if (err < 0) return err; - /* "err" is actually length in this case */ return move_addr_to_user(&address, err, usockaddr, usockaddr_len); } -SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, - int __user *, usockaddr_len) -{ - return __sys_getsockname(fd, usockaddr, usockaddr_len); -} - /* - * Get the remote address ('name') of a socket object. Move the obtained - * name to user space. + * Get the remote or local address ('name') of a socket object. Move the + * obtained name to user space. */ - -int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len) +int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, + int __user *usockaddr_len, int peer) { struct socket *sock; - struct sockaddr_storage address; CLASS(fd, f)(fd); - int err; if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; + return do_getsockname(sock, peer, usockaddr, usockaddr_len); +} - err = security_socket_getpeername(sock); - if (err) - return err; - - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1); - if (err < 0) - return err; - - /* "err" is actually length in this case */ - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); +SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, + int __user *, usockaddr_len) +{ + return __sys_getsockname(fd, usockaddr, usockaddr_len, 0); } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { - return __sys_getpeername(fd, usockaddr, usockaddr_len); + return __sys_getsockname(fd, usockaddr, usockaddr_len, 1); } /* @@ -3162,12 +3124,12 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + (int __user *)a[2], 0); break; case SYS_GETPEERNAME: err = - __sys_getpeername(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + __sys_getsockname(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], 1); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); @@ -3583,13 +3545,13 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, * Returns 0 or an error. */ -int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); - return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address, + return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)&address, addrlen); } EXPORT_SYMBOL(kernel_bind); @@ -3662,14 +3624,14 @@ EXPORT_SYMBOL(kernel_accept); * Returns 0 or an error code. */ -int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, +int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); - return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, + return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index e659fea2da70..fe0e76fdd1f1 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -127,7 +127,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, } if (!strp->skb_nextp) { - /* We are going to append to the frags_list of head. + /* We are going to append to the frag_list of head. * Need to unshare the frag_list. */ err = skb_unclone(head, GFP_ATOMIC); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 5a827afd8e3b..68c0595ea2fd 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -290,12 +290,12 @@ rpcauth_init_credcache(struct rpc_auth *auth) struct rpc_cred_cache *new; unsigned int hashsize; - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc_obj(*new); if (!new) goto out_nocache; new->hashbits = auth_hashbits; hashsize = 1U << new->hashbits; - new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); + new->hashtable = kzalloc_objs(new->hashtable[0], hashsize); if (!new->hashtable) goto out_nohashtbl; spin_lock_init(&new->lock); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 5c095cb8cb20..9d3fb6848f40 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -39,6 +39,8 @@ static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; static const struct rpc_credops gss_nullops; +static void gss_free_callback(struct kref *kref); + #define GSS_RETRY_EXPIRED 5 static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; @@ -162,7 +164,7 @@ gss_alloc_context(void) { struct gss_cl_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc_obj(*ctx); if (ctx != NULL) { ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ @@ -527,7 +529,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, int vers; int err = -ENOMEM; - gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL); + gss_msg = kzalloc_obj(*gss_msg); if (gss_msg == NULL) goto err; vers = get_pipe_version(gss_auth->net); @@ -551,6 +553,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, } return gss_msg; err_put_pipe_version: + kref_put(&gss_auth->kref, gss_free_callback); put_pipe_version(gss_auth->net); err_free_msg: kfree(gss_msg); @@ -911,7 +914,7 @@ static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt, struct gss_pipe *p; int err = -ENOMEM; - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kmalloc_obj(*p); if (p == NULL) goto err; p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); @@ -1026,7 +1029,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) if (!try_module_get(THIS_MODULE)) return ERR_PTR(err); - if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) + if (!(gss_auth = kmalloc_obj(*gss_auth))) goto out_dec; INIT_HLIST_NODE(&gss_auth->hash); gss_auth->target_name = NULL; @@ -1243,7 +1246,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) struct gss_cred *new; /* Make a copy of the cred so that we can reference count it */ - new = kzalloc(sizeof(*gss_cred), GFP_KERNEL); + new = kzalloc_obj(*gss_cred); if (new) { struct auth_cred acred = { .cred = gss_cred->gc_base.cr_cred, @@ -1377,7 +1380,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t struct gss_cred *cred = NULL; int err = -ENOMEM; - if (!(cred = kzalloc(sizeof(*cred), gfp))) + if (!(cred = kzalloc_obj(*cred, gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); @@ -1814,9 +1817,7 @@ alloc_enc_pages(struct rpc_rqst *rqstp) last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages - = kmalloc_array(rqstp->rq_enc_pages_num, - sizeof(struct page *), - GFP_KERNEL); + = kmalloc_objs(struct page *, rqstp->rq_enc_pages_num); if (!rqstp->rq_enc_pages) goto out; for (i=0; i < rqstp->rq_enc_pages_num; i++) { diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 3366505bc669..6db64a9111a9 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -473,7 +473,7 @@ gss_krb5_import_sec_context(const void *p, size_t len, struct gss_ctx *ctx_id, struct krb5_ctx *ctx; int ret; - ctx = kzalloc(sizeof(*ctx), gfp_mask); + ctx = kzalloc_obj(*ctx, gfp_mask); if (ctx == NULL) return -ENOMEM; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index c84d0cf61980..78eab245f94a 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -355,7 +355,7 @@ gss_import_sec_context(const void *input_token, size_t bufsize, time64_t *endtime, gfp_t gfp_mask) { - if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask))) + if (!(*ctx_id = kzalloc_obj(**ctx_id, gfp_mask))) return -ENOMEM; (*ctx_id)->mech_type = gss_mech_get(mech); diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index f549e4c05def..0fa4778620d9 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -214,7 +214,7 @@ static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) unsigned int i; arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); - arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); + arg->pages = kzalloc_objs(struct page *, arg->npages); if (!arg->pages) return -ENOMEM; for (i = 0; i < arg->npages; i++) { diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 7d2cdc2bd374..fceee648d545 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -244,11 +244,11 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, /* we recognize only 1 currently: CREDS_VALUE */ oa->count = 1; - oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL); + oa->data = kmalloc_obj(struct gssx_option); if (!oa->data) return -ENOMEM; - creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); + creds = kzalloc_obj(struct svc_cred); if (!creds) { err = -ENOMEM; goto free_oa; @@ -320,29 +320,47 @@ static int gssx_dec_status(struct xdr_stream *xdr, /* status->minor_status */ p = xdr_inline_decode(xdr, 8); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto out_free_mech; + } p = xdr_decode_hyper(p, &status->minor_status); /* status->major_status_string */ err = gssx_dec_buffer(xdr, &status->major_status_string); if (err) - return err; + goto out_free_mech; /* status->minor_status_string */ err = gssx_dec_buffer(xdr, &status->minor_status_string); if (err) - return err; + goto out_free_major_status_string; /* status->server_ctx */ err = gssx_dec_buffer(xdr, &status->server_ctx); if (err) - return err; + goto out_free_minor_status_string; /* we assume we have no options for now, so simply consume them */ /* status->options */ err = dummy_dec_opt_array(xdr, &status->options); + if (err) + goto out_free_server_ctx; + return 0; + +out_free_server_ctx: + kfree(status->server_ctx.data); + status->server_ctx.data = NULL; +out_free_minor_status_string: + kfree(status->minor_status_string.data); + status->minor_status_string.data = NULL; +out_free_major_status_string: + kfree(status->major_status_string.data); + status->major_status_string.data = NULL; +out_free_mech: + kfree(status->mech.data); + status->mech.data = NULL; return err; } @@ -505,28 +523,35 @@ static int gssx_dec_name(struct xdr_stream *xdr, /* name->name_type */ err = gssx_dec_buffer(xdr, &dummy_netobj); if (err) - return err; + goto out_free_display_name; /* name->exported_name */ err = gssx_dec_buffer(xdr, &dummy_netobj); if (err) - return err; + goto out_free_display_name; /* name->exported_composite_name */ err = gssx_dec_buffer(xdr, &dummy_netobj); if (err) - return err; + goto out_free_display_name; /* we assume we have no attributes for now, so simply consume them */ /* name->name_attributes */ err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array); if (err) - return err; + goto out_free_display_name; /* we assume we have no options for now, so simply consume them */ /* name->extensions */ err = dummy_dec_opt_array(xdr, &dummy_option_array); + if (err) + goto out_free_display_name; + return 0; + +out_free_display_name: + kfree(name->display_name.data); + name->display_name.data = NULL; return err; } @@ -649,32 +674,34 @@ static int gssx_dec_ctx(struct xdr_stream *xdr, /* ctx->state */ err = gssx_dec_buffer(xdr, &ctx->state); if (err) - return err; + goto out_free_exported_context_token; /* ctx->need_release */ err = gssx_dec_bool(xdr, &ctx->need_release); if (err) - return err; + goto out_free_state; /* ctx->mech */ err = gssx_dec_buffer(xdr, &ctx->mech); if (err) - return err; + goto out_free_state; /* ctx->src_name */ err = gssx_dec_name(xdr, &ctx->src_name); if (err) - return err; + goto out_free_mech; /* ctx->targ_name */ err = gssx_dec_name(xdr, &ctx->targ_name); if (err) - return err; + goto out_free_src_name; /* ctx->lifetime */ p = xdr_inline_decode(xdr, 8+8); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto out_free_targ_name; + } p = xdr_decode_hyper(p, &ctx->lifetime); /* ctx->ctx_flags */ @@ -683,17 +710,36 @@ static int gssx_dec_ctx(struct xdr_stream *xdr, /* ctx->locally_initiated */ err = gssx_dec_bool(xdr, &ctx->locally_initiated); if (err) - return err; + goto out_free_targ_name; /* ctx->open */ err = gssx_dec_bool(xdr, &ctx->open); if (err) - return err; + goto out_free_targ_name; /* we assume we have no options for now, so simply consume them */ /* ctx->options */ err = dummy_dec_opt_array(xdr, &ctx->options); + if (err) + goto out_free_targ_name; + + return 0; +out_free_targ_name: + kfree(ctx->targ_name.display_name.data); + ctx->targ_name.display_name.data = NULL; +out_free_src_name: + kfree(ctx->src_name.display_name.data); + ctx->src_name.display_name.data = NULL; +out_free_mech: + kfree(ctx->mech.data); + ctx->mech.data = NULL; +out_free_state: + kfree(ctx->state.data); + ctx->state.data = NULL; +out_free_exported_context_token: + kfree(ctx->exported_context_token.data); + ctx->exported_context_token.data = NULL; return err; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index a8ec30759a18..161d02cc1c2c 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -197,7 +197,7 @@ static void update_rsi(struct cache_head *cnew, struct cache_head *citem) static struct cache_head *rsi_alloc(void) { - struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); + struct rsi *rsii = kmalloc_obj(*rsii); if (rsii) return &rsii->h; else @@ -449,7 +449,7 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp) static struct cache_head * rsc_alloc(void) { - struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); + struct rsc *rsci = kmalloc_obj(*rsci); if (rsci) return &rsci->h; else @@ -814,7 +814,7 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) struct auth_domain *test; int stat = -ENOMEM; - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc_obj(*new); if (!new) goto out; kref_init(&new->h.ref); @@ -1069,7 +1069,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); - in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + in_token->pages = kzalloc_objs(struct page *, pages + 1); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; @@ -1083,7 +1083,8 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, } length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p); - memcpy(page_address(in_token->pages[0]), xdr->p, length); + if (length) + memcpy(page_address(in_token->pages[0]), xdr->p, length); inlen -= length; to_offs = length; @@ -1630,7 +1631,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp) rqstp->rq_auth_stat = rpc_autherr_failed; if (!svcdata) - svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); + svcdata = kmalloc_obj(*svcdata); if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 1e091d3fa607..6c742a3400c4 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -45,7 +45,7 @@ static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth, { struct rpc_cred *ret; - ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask()); + ret = kmalloc_obj(*ret, rpc_task_gfp_mask()); if (!ret) { if (!(flags & RPCAUTH_LOOKUP_ASYNC)) return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index caa94cf57123..0ffa4d01a938 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -25,6 +25,22 @@ unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt) } /* + * Helper function to nullify backchannel server pointer in transport. + * We need to synchronize setting the pointer to NULL (done so after + * the backchannel server is shutdown) with the usage of that pointer + * by the backchannel request processing routines + * xprt_complete_bc_request() and rpcrdma_bc_receive_call(). + */ +void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv) +{ + spin_lock(&xprt->bc_pa_lock); + svc_destroy(serv); + xprt->bc_serv = NULL; + spin_unlock(&xprt->bc_pa_lock); +} +EXPORT_SYMBOL_GPL(xprt_svc_destroy_nullify_bc); + +/* * Helper routines that track the number of preallocation elements * on the transport. */ @@ -78,7 +94,7 @@ static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt) struct rpc_rqst *req; /* Pre-allocate one backchannel rpc_rqst */ - req = kzalloc(sizeof(*req), gfp_flags); + req = kzalloc_obj(*req, gfp_flags); if (req == NULL) return NULL; @@ -131,7 +147,7 @@ EXPORT_SYMBOL_GPL(xprt_setup_backchannel); int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) { struct rpc_rqst *req; - struct list_head tmp_list; + LIST_HEAD(tmp_list); int i; dprintk("RPC: setup backchannel transport\n"); @@ -147,7 +163,6 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) * lock is held on the rpc_xprt struct. It also makes cleanup * easier in case of memory allocation errors. */ - INIT_LIST_HEAD(&tmp_list); for (i = 0; i < min_reqs; i++) { /* Pre-allocate one backchannel rpc_rqst */ req = xprt_alloc_bc_req(xprt); @@ -354,7 +369,6 @@ found: void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) { struct rpc_xprt *xprt = req->rq_xprt; - struct svc_serv *bc_serv = xprt->bc_serv; spin_lock(&xprt->bc_pa_lock); list_del(&req->rq_bc_pa_list); @@ -365,7 +379,21 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); dprintk("RPC: add callback request to list\n"); + xprt_enqueue_bc_request(req); +} + +void xprt_enqueue_bc_request(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct svc_serv *bc_serv; + xprt_get(xprt); - lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list); - svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); + spin_lock(&xprt->bc_pa_lock); + bc_serv = xprt->bc_serv; + if (bc_serv) { + lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list); + svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); + } + spin_unlock(&xprt->bc_pa_lock); } +EXPORT_SYMBOL_GPL(xprt_enqueue_bc_request); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 131090f31e6a..ef8b7e8b1e9c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -11,6 +11,7 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/hex.h> #include <linux/slab.h> #include <linux/signal.h> #include <linux/sched.h> @@ -1037,7 +1038,7 @@ static int cache_open(struct inode *inode, struct file *filp, return -EACCES; nonseekable_open(inode, filp); if (filp->f_mode & FMODE_READ) { - rp = kmalloc(sizeof(*rp), GFP_KERNEL); + rp = kmalloc_obj(*rp); if (!rp) { module_put(cd->owner); return -ENOMEM; @@ -1061,14 +1062,25 @@ static int cache_release(struct inode *inode, struct file *filp, struct cache_reader *rp = filp->private_data; if (rp) { + struct cache_request *rq = NULL; + spin_lock(&queue_lock); if (rp->offset) { struct cache_queue *cq; - for (cq= &rp->q; &cq->list != &cd->queue; - cq = list_entry(cq->list.next, struct cache_queue, list)) + for (cq = &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, + struct cache_queue, list)) if (!cq->reader) { - container_of(cq, struct cache_request, q) - ->readers--; + struct cache_request *cr = + container_of(cq, + struct cache_request, q); + cr->readers--; + if (cr->readers == 0 && + !test_bit(CACHE_PENDING, + &cr->item->flags)) { + list_del(&cr->q.list); + rq = cr; + } break; } rp->offset = 0; @@ -1076,9 +1088,14 @@ static int cache_release(struct inode *inode, struct file *filp, list_del(&rp->q.list); spin_unlock(&queue_lock); + if (rq) { + cache_put(rq->item, cd); + kfree(rq->buf); + kfree(rq); + } + filp->private_data = NULL; kfree(rp); - } if (filp->f_mode & FMODE_WRITE) { atomic_dec(&cd->writers); @@ -1224,7 +1241,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) if (!buf) return -EAGAIN; - crq = kmalloc(sizeof (*crq), GFP_KERNEL); + crq = kmalloc_obj(*crq); if (!crq) { kfree(buf); return -EAGAIN; @@ -1744,8 +1761,7 @@ struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct ne if (cd == NULL) return ERR_PTR(-ENOMEM); - cd->hash_table = kcalloc(cd->hash_size, sizeof(struct hlist_head), - GFP_KERNEL); + cd->hash_table = kzalloc_objs(struct hlist_head, cd->hash_size); if (cd->hash_table == NULL) { kfree(cd); return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8ca354ecfd02..bc8ca470718b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -374,7 +374,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, goto out_err; err = -ENOMEM; - clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); + clnt = kzalloc_obj(*clnt); if (!clnt) goto out_err; clnt->cl_parent = parent ? : clnt; @@ -1457,12 +1457,12 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, switch (sap->sa_family) { case AF_INET: err = kernel_bind(sock, - (struct sockaddr *)&rpc_inaddr_loopback, + (struct sockaddr_unsized *)&rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: err = kernel_bind(sock, - (struct sockaddr *)&rpc_in6addr_loopback, + (struct sockaddr_unsized *)&rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: @@ -1474,7 +1474,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, goto out_release; } - err = kernel_connect(sock, sap, salen, 0); + err = kernel_connect(sock, (struct sockaddr_unsized *)sap, salen, 0); if (err < 0) { dprintk("RPC: can't connect UDP socket (%d)\n", err); goto out_release; @@ -2976,7 +2976,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, return -EINVAL; } - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kmalloc_obj(*data); if (!data) return -ENOMEM; data->xps = xprt_switch_get(xps); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 0bd1df2ebb47..9d349cfbc483 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -511,7 +511,7 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags) { struct rpc_pipe *pipe; - pipe = kzalloc(sizeof(struct rpc_pipe), GFP_KERNEL); + pipe = kzalloc_obj(struct rpc_pipe); if (!pipe) return ERR_PTR(-ENOMEM); init_pipe(pipe); @@ -536,17 +536,16 @@ static int rpc_new_file(struct dentry *parent, inode = rpc_get_inode(dir->i_sb, S_IFREG | mode); if (unlikely(!inode)) { - dput(dentry); - inode_unlock(dir); + simple_done_creating(dentry); return -ENOMEM; } inode->i_ino = iunique(dir->i_sb, 100); if (i_fop) inode->i_fop = i_fop; rpc_inode_setowner(inode, private); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); - inode_unlock(dir); + simple_done_creating(dentry); return 0; } @@ -563,18 +562,17 @@ static struct dentry *rpc_new_dir(struct dentry *parent, inode = rpc_get_inode(dir->i_sb, S_IFDIR | mode); if (unlikely(!inode)) { - dput(dentry); - inode_unlock(dir); + simple_done_creating(dentry); return ERR_PTR(-ENOMEM); } inode->i_ino = iunique(dir->i_sb, 100); inc_nlink(dir); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_mkdir(dir, dentry); - inode_unlock(dir); + simple_done_creating(dentry); - return dentry; + return dentry; // borrowed } static int rpc_populate(struct dentry *parent, @@ -657,8 +655,7 @@ int rpc_mkpipe_dentry(struct dentry *parent, const char *name, inode = rpc_get_inode(dir->i_sb, umode); if (unlikely(!inode)) { - dput(dentry); - inode_unlock(dir); + simple_done_creating(dentry); err = -ENOMEM; goto failed; } @@ -668,10 +665,10 @@ int rpc_mkpipe_dentry(struct dentry *parent, const char *name, rpci->private = private; rpci->pipe = pipe; rpc_inode_setowner(inode, private); - d_instantiate(dentry, inode); - pipe->dentry = dentry; + pipe->dentry = dentry; // borrowed + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); - inode_unlock(dir); + simple_done_creating(dentry); return 0; failed: @@ -1206,7 +1203,7 @@ static void rpc_kill_sb(struct super_block *sb) sb); mutex_unlock(&sn->pipefs_sb_lock); out: - kill_litter_super(sb); + kill_anon_super(sb); put_net(net); } diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 53bcca365fb1..6aa372188c86 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -737,7 +737,7 @@ void rpcb_getport_async(struct rpc_task *task) goto bailout_nofree; } - map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask()); + map = kzalloc_obj(struct rpcbind_args, rpc_task_gfp_mask()); if (!map) { status = -ENOMEM; goto bailout_release_client; diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 383860cb1d5b..7093e18ac26c 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -126,7 +126,7 @@ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) struct rpc_iostats *stats; int i; - stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL); + stats = kzalloc_objs(*stats, clnt->cl_maxproc); if (stats) { for (i = 0; i < clnt->cl_maxproc; i++) spin_lock_init(&stats[i].om_lock); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 4704dce7284e..d8ccb8e4b5c2 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -488,7 +488,7 @@ __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats, unsigned int xdrsize; unsigned int i; - if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) + if (!(serv = kzalloc_obj(*serv))) return NULL; serv->sv_name = prog->pg_name; serv->sv_programs = prog; @@ -523,8 +523,7 @@ __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats, serv->sv_nrpools = npools; serv->sv_pools = - kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), - GFP_KERNEL); + kzalloc_objs(struct svc_pool, serv->sv_nrpools); if (!serv->sv_pools) { kfree(serv); return NULL; @@ -763,108 +762,88 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool) } EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread); -static struct svc_pool * -svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) -{ - return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools]; -} - -static struct svc_pool * -svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool, - unsigned int *state) +/** + * svc_new_thread - spawn a new thread in the given pool + * @serv: the serv to which the pool belongs + * @pool: pool in which thread should be spawned + * + * Create a new thread inside @pool, which is a part of @serv. + * Caller must hold the service mutex. + * + * Returns 0 on success, or -errno on failure. + */ +int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool) { - struct svc_pool *pool; - unsigned int i; + struct svc_rqst *rqstp; + struct task_struct *task; + int node; + int err = 0; - pool = target_pool; + node = svc_pool_map_get_node(pool->sp_id); - if (!pool) { - for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; - if (pool->sp_nrthreads) - break; - } + rqstp = svc_prepare_thread(serv, pool, node); + if (!rqstp) + return -ENOMEM; + task = kthread_create_on_node(serv->sv_threadfn, rqstp, + node, "%s", serv->sv_name); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto out; } - if (pool && pool->sp_nrthreads) { - set_bit(SP_VICTIM_REMAINS, &pool->sp_flags); - set_bit(SP_NEED_VICTIM, &pool->sp_flags); - return pool; - } - return NULL; + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); + + /* Wait for the thread to signal initialization status */ + wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN); + err = rqstp->rq_err; +out: + if (err) + svc_exit_thread(rqstp); + return err; } +EXPORT_SYMBOL_GPL(svc_new_thread); static int svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct svc_rqst *rqstp; - struct task_struct *task; - struct svc_pool *chosen_pool; - unsigned int state = serv->sv_nrthreads-1; - int node; - int err; - - do { - nrservs--; - chosen_pool = svc_pool_next(serv, pool, &state); - node = svc_pool_map_get_node(chosen_pool->sp_id); - - rqstp = svc_prepare_thread(serv, chosen_pool, node); - if (!rqstp) - return -ENOMEM; - task = kthread_create_on_node(serv->sv_threadfn, rqstp, - node, "%s", serv->sv_name); - if (IS_ERR(task)) { - svc_exit_thread(rqstp); - return PTR_ERR(task); - } - - rqstp->rq_task = task; - if (serv->sv_nrpools > 1) - svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + int err = 0; - svc_sock_update_bufs(serv); - wake_up_process(task); + while (!err && nrservs--) + err = svc_new_thread(serv, pool); - wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN); - err = rqstp->rq_err; - if (err) { - svc_exit_thread(rqstp); - return err; - } - } while (nrservs > 0); - - return 0; + return err; } static int svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - unsigned int state = serv->sv_nrthreads-1; - struct svc_pool *victim; - do { - victim = svc_pool_victim(serv, pool, &state); - if (!victim) - break; - svc_pool_wake_idle_thread(victim); - wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS, - TASK_IDLE); + set_bit(SP_VICTIM_REMAINS, &pool->sp_flags); + set_bit(SP_NEED_VICTIM, &pool->sp_flags); + svc_pool_wake_idle_thread(pool); + wait_on_bit(&pool->sp_flags, SP_VICTIM_REMAINS, TASK_IDLE); nrservs++; } while (nrservs < 0); return 0; } /** - * svc_set_num_threads - adjust number of threads per RPC service + * svc_set_pool_threads - adjust number of threads per pool * @serv: RPC service to adjust - * @pool: Specific pool from which to choose threads, or NULL - * @nrservs: New number of threads for @serv (0 or less means kill all threads) + * @pool: Specific pool from which to choose threads + * @min_threads: min number of threads to run in @pool + * @max_threads: max number of threads in @pool (0 means kill all threads) + * + * Create or destroy threads in @pool to bring it into an acceptable range + * between @min_threads and @max_threads. * - * Create or destroy threads to make the number of threads for @serv the - * given number. If @pool is non-NULL, change only threads in that pool; - * otherwise, round-robin between all pools for @serv. @serv's - * sv_nrthreads is adjusted for each thread created or destroyed. + * If @min_threads is 0 or larger than @max_threads, then it is ignored and + * the pool will be set to run a static @max_threads number of threads. * * Caller must ensure mutual exclusion between this and server startup or * shutdown. @@ -873,19 +852,85 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) * starting a thread. */ int -svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +svc_set_pool_threads(struct svc_serv *serv, struct svc_pool *pool, + unsigned int min_threads, unsigned int max_threads) { + int delta; + if (!pool) - nrservs -= serv->sv_nrthreads; - else - nrservs -= pool->sp_nrthreads; + return -EINVAL; + + /* clamp min threads to the max */ + if (min_threads > max_threads) + min_threads = max_threads; - if (nrservs > 0) - return svc_start_kthreads(serv, pool, nrservs); - if (nrservs < 0) - return svc_stop_kthreads(serv, pool, nrservs); + pool->sp_nrthrmin = min_threads; + pool->sp_nrthrmax = max_threads; + + /* + * When min_threads is set, then only change the number of + * threads to bring it within an acceptable range. + */ + if (min_threads) { + if (pool->sp_nrthreads > max_threads) + delta = max_threads; + else if (pool->sp_nrthreads < min_threads) + delta = min_threads; + else + return 0; + } else { + delta = max_threads; + } + + delta -= pool->sp_nrthreads; + if (delta > 0) + return svc_start_kthreads(serv, pool, delta); + if (delta < 0) + return svc_stop_kthreads(serv, pool, delta); return 0; } +EXPORT_SYMBOL_GPL(svc_set_pool_threads); + +/** + * svc_set_num_threads - adjust number of threads in serv + * @serv: RPC service to adjust + * @min_threads: min number of threads to run per pool + * @nrservs: New number of threads for @serv (0 means kill all threads) + * + * Create or destroy threads in @serv to bring it to @nrservs. If there + * are multiple pools then the new threads or victims will be distributed + * evenly among them. + * + * Caller must ensure mutual exclusion between this and server startup or + * shutdown. + * + * Returns zero on success or a negative errno if an error occurred while + * starting a thread. On failure, some pools may have already been + * adjusted; the caller is responsible for recovery. + */ +int +svc_set_num_threads(struct svc_serv *serv, unsigned int min_threads, + unsigned int nrservs) +{ + unsigned int base = nrservs / serv->sv_nrpools; + unsigned int remain = nrservs % serv->sv_nrpools; + int i, err = 0; + + for (i = 0; i < serv->sv_nrpools; ++i) { + struct svc_pool *pool = &serv->sv_pools[i]; + int threads = base; + + if (remain) { + ++threads; + --remain; + } + + err = svc_set_pool_threads(serv, pool, min_threads, threads); + if (err) + break; + } + return err; +} EXPORT_SYMBOL_GPL(svc_set_num_threads); /** diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6973184ff667..56a663b8939f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -714,15 +714,21 @@ svc_thread_should_sleep(struct svc_rqst *rqstp) return true; } -static void svc_thread_wait_for_work(struct svc_rqst *rqstp) +static bool svc_schedule_timeout(long timeo) +{ + return schedule_timeout(timeo ? timeo : MAX_SCHEDULE_TIMEOUT) == 0; +} + +static bool svc_thread_wait_for_work(struct svc_rqst *rqstp, long timeo) { struct svc_pool *pool = rqstp->rq_pool; + bool did_timeout = false; if (svc_thread_should_sleep(rqstp)) { set_current_state(TASK_IDLE | TASK_FREEZABLE); llist_add(&rqstp->rq_idle, &pool->sp_idle_threads); if (likely(svc_thread_should_sleep(rqstp))) - schedule(); + did_timeout = svc_schedule_timeout(timeo); while (!llist_del_first_this(&pool->sp_idle_threads, &rqstp->rq_idle)) { @@ -734,7 +740,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp) * for this new work. This thread can safely sleep * until woken again. */ - schedule(); + did_timeout = svc_schedule_timeout(timeo); set_current_state(TASK_IDLE | TASK_FREEZABLE); } __set_current_state(TASK_RUNNING); @@ -742,6 +748,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp) cond_resched(); } try_to_freeze(); + return did_timeout; } static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) @@ -835,25 +842,38 @@ static void svc_thread_wake_next(struct svc_rqst *rqstp) /** * svc_recv - Receive and process the next request on any transport * @rqstp: an idle RPC service thread + * @timeo: timeout (in jiffies) (0 means infinite timeout) * * This code is carefully organised not to touch any cachelines in * the shared svc_serv structure, only cachelines in the local * svc_pool. + * + * If the timeout is 0, then the sleep will never time out. + * + * Returns -ETIMEDOUT if idle for an extended period + * -EBUSY if there is more work to do than available threads + * 0 otherwise. */ -void svc_recv(struct svc_rqst *rqstp) +int svc_recv(struct svc_rqst *rqstp, long timeo) { struct svc_pool *pool = rqstp->rq_pool; + bool did_timeout; + int ret = 0; if (!svc_alloc_arg(rqstp)) - return; + return ret; + + did_timeout = svc_thread_wait_for_work(rqstp, timeo); - svc_thread_wait_for_work(rqstp); + if (did_timeout && svc_thread_should_sleep(rqstp) && + pool->sp_nrthrmin && pool->sp_nrthreads > pool->sp_nrthrmin) + ret = -ETIMEDOUT; clear_bit(SP_TASK_PENDING, &pool->sp_flags); if (svc_thread_should_stop(rqstp)) { svc_thread_wake_next(rqstp); - return; + return ret; } rqstp->rq_xprt = svc_xprt_dequeue(pool); @@ -865,10 +885,22 @@ void svc_recv(struct svc_rqst *rqstp) * cache information to be provided. When there are no * idle threads, we reduce the wait time. */ - if (pool->sp_idle_threads.first) + if (pool->sp_idle_threads.first) { rqstp->rq_chandle.thread_wait = 5 * HZ; - else + } else { rqstp->rq_chandle.thread_wait = 1 * HZ; + /* + * No idle threads: signal -EBUSY so the caller + * can consider spawning another thread. Use + * SP_TASK_STARTING to limit this signal to one + * thread at a time; the caller clears this flag + * after starting a new thread. + */ + if (!did_timeout && timeo && + !test_and_set_bit(SP_TASK_STARTING, + &pool->sp_flags)) + ret = -EBUSY; + } trace_svc_xprt_dequeue(rqstp); svc_handle_xprt(rqstp, xprt); @@ -887,6 +919,7 @@ void svc_recv(struct svc_rqst *rqstp) } } #endif + return ret; } EXPORT_SYMBOL_GPL(svc_recv); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 8ca98b146ec8..3be69c145d2a 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -72,7 +72,7 @@ struct auth_domain *unix_domain_find(char *name) return rv; } - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc_obj(*new); if (new == NULL) return NULL; kref_init(&new->h.ref); @@ -143,7 +143,7 @@ static void update(struct cache_head *cnew, struct cache_head *citem) } static struct cache_head *ip_map_alloc(void) { - struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL); + struct ip_map *i = kmalloc_obj(*i); if (i) return &i->h; else @@ -458,7 +458,7 @@ static void unix_gid_update(struct cache_head *cnew, struct cache_head *citem) } static struct cache_head *unix_gid_alloc(void) { - struct unix_gid *g = kmalloc(sizeof(*g), GFP_KERNEL); + struct unix_gid *g = kmalloc_obj(*g); if (g) return &g->h; else diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7b90abc5cf0e..f28c6076f7e8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -68,6 +68,17 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + /* To-do: to avoid tying up an nfsd thread while waiting for a * handshake request, the request could instead be deferred. */ @@ -740,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr); + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); } @@ -1062,9 +1073,10 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, return svc_sock_reclen(svsk); err_too_large: - net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n", - __func__, svsk->sk_xprt.xpt_server->sv_name, - svc_sock_reclen(svsk)); + net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n", + svsk->sk_xprt.xpt_server->sv_name, + svc_sock_reclen(svsk), + (struct sockaddr *)&svsk->sk_xprt.xpt_remote); svc_xprt_deferred_close(&svsk->sk_xprt); err_short: return -EAGAIN; @@ -1235,19 +1247,19 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, int ret; /* The stream record marker is copied into a temporary page - * fragment buffer so that it can be included in rq_bvec. + * fragment buffer so that it can be included in sk_bvec. */ buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, &marker, sizeof(marker)); - bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages, + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, &rqstp->rq_res); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); @@ -1392,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} + /* * Initialize socket for RPC use and create svc_sock struct */ @@ -1402,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int sendpages; unsigned long pages; + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); + pages = svc_serv_maxpages(serv); - svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); + svsk = kzalloc_flex(*svsk, sk_pages, pages); if (!svsk) return ERR_PTR(-ENOMEM); + + if (sendpages) { + svsk->sk_bvec = kzalloc_objs(*svsk->sk_bvec, sendpages); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + svsk->sk_maxpages = pages; inet = sock->sk; @@ -1419,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); if (err < 0) { + kfree(svsk->sk_bvec); kfree(svsk); return ERR_PTR(err); } @@ -1557,7 +1598,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ - error = kernel_bind(sock, sin, len); + error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len); if (error < 0) goto bummer; @@ -1636,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(sock); page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 8b01b7ae2690..af8fac9cedd4 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -48,7 +48,7 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name, { struct kobject *kobj; - kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + kobj = kzalloc_obj(*kobj); if (kobj) { kobj->kset = kset; if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, @@ -397,7 +397,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); if (!dst_addr) goto out_err; - saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); + saved_addr = kzalloc_obj(*saved_addr); if (!saved_addr) goto out_err_free; saved_addr->addr = @@ -663,7 +663,7 @@ static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, { struct rpc_sysfs_client *p; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc_obj(*p); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; @@ -683,7 +683,7 @@ rpc_sysfs_xprt_switch_alloc(struct kobject *parent, { struct rpc_sysfs_xprt_switch *p; - p = kzalloc(sizeof(*p), gfp_flags); + p = kzalloc_obj(*p, gfp_flags); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; @@ -703,7 +703,7 @@ static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, { struct rpc_sysfs_xprt *p; - p = kzalloc(sizeof(*p), gfp_flags); + p = kzalloc_obj(*p, gfp_flags); if (!p) goto out; p->kobject.kset = rpc_sunrpc_kset; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 70efc727a9cd..e83d5d0be78b 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -118,7 +118,7 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) size_t i, n = xdr_buf_pagecount(buf); if (n != 0 && buf->bvec == NULL) { - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); + buf->bvec = kmalloc_objs(buf->bvec[0], n, gfp); if (!buf->bvec) return -ENOMEM; for (i = 0; i < n; i++) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1023361845f9..4fbb57a29704 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1709,7 +1709,7 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) goto out; ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); - req = kzalloc(sizeof(*req), rpc_task_gfp_mask()); + req = kzalloc_obj(*req, rpc_task_gfp_mask()); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; @@ -1829,7 +1829,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, xprt_init(xprt, net); for (i = 0; i < num_prealloc; i++) { - req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); + req = kzalloc_obj(struct rpc_rqst); if (!req) goto out_free; list_add(&req->rq_list, &xprt->free); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 4c5e08b0aa64..3ba818d637be 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -150,7 +150,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, { struct rpc_xprt_switch *xps; - xps = kmalloc(sizeof(*xps), gfp_flags); + xps = kmalloc_obj(*xps, gfp_flags); if (xps != NULL) { spin_lock_init(&xps->xps_lock); kref_init(&xps->xps_kref); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 8c817e755262..2f0f9618dd05 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -9,6 +9,7 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/bc_xprt.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> @@ -220,7 +221,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct svc_serv *bc_serv; struct rpcrdma_req *req; struct rpc_rqst *rqst; struct xdr_buf *buf; @@ -261,11 +261,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ - bc_serv = xprt->bc_serv; - xprt_get(xprt); - lwq_enqueue(&rqst->rq_bc_list, &bc_serv->sv_cb_list); - - svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]); + xprt_enqueue_bc_request(rqst); r_xprt->rx_stats.bcall_count++; return; diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c index 28c68b5f6823..de49ad02053d 100644 --- a/net/sunrpc/xprtrdma/ib_client.c +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -108,7 +108,7 @@ static int rpcrdma_add_one(struct ib_device *device) { struct rpcrdma_device *rd; - rd = kzalloc(sizeof(*rd), GFP_KERNEL); + rd = kzalloc_obj(*rd); if (!rd) return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c index b63cfeaa2923..1f8f7dad8b6f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_pcl.c +++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c @@ -29,7 +29,7 @@ static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position) { struct svc_rdma_chunk *chunk; - chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL); + chunk = kmalloc_flex(*chunk, ch_segments, segcount); if (!chunk) return NULL; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 661b3fe2779f..4ec2f9ae06aa 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -5,6 +5,8 @@ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. */ +#include <linux/bvec.h> +#include <linux/overflow.h> #include <rdma/rw.h> #include <linux/sunrpc/xdr.h> @@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); /* Each R/W context contains state for one chain of RDMA Read or * Write Work Requests. * - * Each WR chain handles a single contiguous server-side buffer, - * because scatterlist entries after the first have to start on - * page alignment. xdr_buf iovecs cannot guarantee alignment. + * Each WR chain handles a single contiguous server-side buffer. + * - each xdr_buf iovec is a single contiguous buffer + * - the xdr_buf pages array is a single contiguous buffer because the + * second through the last element always start on a page boundary * * Each WR chain handles only one R_key. Each RPC-over-RDMA segment * from a client may contain a unique R_key, so each WR chain moves * up to one segment at a time. * - * The scatterlist makes this data structure over 4KB in size. To - * make it less likely to fail, and to handle the allocation for - * smaller I/O requests without disabling bottom-halves, these - * contexts are created on demand, but cached and reused until the - * controlling svcxprt_rdma is destroyed. + * The inline bvec array is sized to handle most I/O requests without + * additional allocation. Larger requests fall back to dynamic allocation. + * These contexts are created on demand, but cached and reused until + * the controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; - unsigned int rw_first_sgl_nents; - struct sg_table rw_sg_table; - struct scatterlist rw_first_sgl[]; + unsigned int rw_first_bvec_nents; + struct bio_vec *rw_bvec; + struct bio_vec rw_first_bvec[]; }; +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt); + static inline struct svc_rdma_rw_ctxt * svc_rdma_next_ctxt(struct list_head *list) { @@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list) } static struct svc_rdma_rw_ctxt * -svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec) { struct ib_device *dev = rdma->sc_cm_id->device; - unsigned int first_sgl_nents = dev->attrs.max_send_sge; + unsigned int first_bvec_nents = dev->attrs.max_send_sge; struct svc_rdma_rw_ctxt *ctxt; struct llist_node *node; @@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), + ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec, + first_bvec_nents), GFP_KERNEL, ibdev_to_node(dev)); if (!ctxt) goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); - ctxt->rw_first_sgl_nents = first_sgl_nents; + ctxt->rw_first_bvec_nents = first_bvec_nents; } - ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; - if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, - ctxt->rw_sg_table.sgl, - first_sgl_nents)) - goto out_free; + if (nr_bvec <= ctxt->rw_first_bvec_nents) { + ctxt->rw_bvec = ctxt->rw_first_bvec; + } else { + ctxt->rw_bvec = kmalloc_array_node(nr_bvec, + sizeof(*ctxt->rw_bvec), + GFP_KERNEL, + ibdev_to_node(dev)); + if (!ctxt->rw_bvec) + goto out_free; + } return ctxt; out_free: - kfree(ctxt); + /* Return cached contexts to cache; free freshly allocated ones */ + if (node) + svc_rdma_put_rw_ctxt(rdma, ctxt); + else + kfree(ctxt); out_noctx: - trace_svcrdma_rwctx_empty(rdma, sges); + trace_svcrdma_rwctx_empty(rdma, nr_bvec); return NULL; } static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { - sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); llist_add(&ctxt->rw_node, list); } @@ -123,6 +139,7 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) * @ctxt: R/W context to prepare * @offset: RDMA offset * @handle: RDMA tag/handle + * @length: total number of bytes in the bvec array * @direction: I/O direction * * Returns on success, the number of WQEs that will be needed @@ -130,14 +147,18 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) */ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt, - u64 offset, u32 handle, + u64 offset, u32 handle, unsigned int length, enum dma_data_direction direction) { + struct bvec_iter iter = { + .bi_size = length, + }; int ret; - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, - ctxt->rw_sg_table.sgl, ctxt->rw_nents, - 0, offset, handle, direction); + ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, + iter, offset, handle, direction); if (unlikely(ret < 0)) { trace_svcrdma_dma_map_rw_err(rdma, offset, handle, ctxt->rw_nents, ret); @@ -175,7 +196,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, { struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; - LLIST_HEAD(free); trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); @@ -183,10 +203,11 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); - rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, dir); - __svc_rdma_put_rw_ctxt(ctxt, &free); + rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, + ctxt->rw_bvec, ctxt->rw_nents, dir); + if (ctxt->rw_bvec != ctxt->rw_first_bvec) + kfree(ctxt->rw_bvec); ctxt->rw_node.next = first; first = &ctxt->rw_node; @@ -414,29 +435,26 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, return -ENOTCONN; } -/* Build and DMA-map an SGL that covers one kvec in an xdr_buf +/* Build a bvec that covers one kvec in an xdr_buf. */ -static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, - unsigned int len, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) { - struct scatterlist *sg = ctxt->rw_sg_table.sgl; - - sg_set_buf(&sg[0], info->wi_base, len); + bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len); info->wi_base += len; ctxt->rw_nents = 1; } -/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. +/* Build a bvec array that covers part of an xdr_buf's pagelist. */ -static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, - unsigned int remaining, - struct svc_rdma_rw_ctxt *ctxt) +static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) { - unsigned int sge_no, sge_bytes, page_off, page_no; + unsigned int bvec_idx, bvec_len, page_off, page_no; const struct xdr_buf *xdr = info->wi_xdr; - struct scatterlist *sg; struct page **page; page_off = info->wi_next_off + xdr->page_base; @@ -444,21 +462,19 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, page_off = offset_in_page(page_off); page = xdr->pages + page_no; info->wi_next_off += remaining; - sg = ctxt->rw_sg_table.sgl; - sge_no = 0; + bvec_idx = 0; do { - sge_bytes = min_t(unsigned int, remaining, - PAGE_SIZE - page_off); - sg_set_page(sg, *page, sge_bytes, page_off); - - remaining -= sge_bytes; - sg = sg_next(sg); + bvec_len = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len, + page_off); + remaining -= bvec_len; page_off = 0; - sge_no++; + bvec_idx++; page++; } while (remaining); - ctxt->rw_nents = sge_no; + ctxt->rw_nents = bvec_idx; } /* Construct RDMA Write WRs to send a portion of an xdr_buf containing @@ -496,7 +512,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, constructor(info, write_len, ctxt); offset = seg->rs_offset + info->wi_seg_off; ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, - DMA_TO_DEVICE); + write_len, DMA_TO_DEVICE); if (ret < 0) return -EIO; percpu_counter_inc(&svcrdma_stat_write); @@ -535,7 +551,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info, const struct kvec *iov) { info->wi_base = iov->iov_base; - return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec, iov->iov_len); } @@ -559,7 +575,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info, { info->wi_xdr = xdr; info->wi_next_off = offset - xdr->head[0].iov_len; - return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec, length); } @@ -734,29 +750,29 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, { struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; - unsigned int sge_no, seg_len, len; + unsigned int bvec_idx, nr_bvec, seg_len, len, total; struct svc_rdma_rw_ctxt *ctxt; - struct scatterlist *sg; int ret; len = segment->rs_length; - sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; - ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); + if (check_add_overflow(head->rc_pageoff, len, &total)) + return -EINVAL; + nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); if (!ctxt) return -ENOMEM; - ctxt->rw_nents = sge_no; + ctxt->rw_nents = nr_bvec; - sg = ctxt->rw_sg_table.sgl; - for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) { seg_len = min_t(unsigned int, len, PAGE_SIZE - head->rc_pageoff); if (!head->rc_pageoff) head->rc_page_count++; - sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], - seg_len, head->rc_pageoff); - sg = sg_next(sg); + bvec_set_page(&ctxt->rw_bvec[bvec_idx], + rqstp->rq_pages[head->rc_curpage], + seg_len, head->rc_pageoff); head->rc_pageoff += seg_len; if (head->rc_pageoff == PAGE_SIZE) { @@ -770,7 +786,8 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, } ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, - segment->rs_handle, DMA_FROM_DEVICE); + segment->rs_handle, segment->rs_length, + DMA_FROM_DEVICE); if (ret < 0) return -EIO; percpu_counter_inc(&svcrdma_stat_read); @@ -841,6 +858,9 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, for (page_no = 0; page_no < numpages; page_no++) { unsigned int page_len; + if (head->rc_curpage >= rqstp->rq_maxpages) + return -EINVAL; + page_len = min_t(unsigned int, remaining, PAGE_SIZE - head->rc_pageoff); @@ -848,7 +868,7 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, head->rc_page_count++; dst = page_address(rqstp->rq_pages[head->rc_curpage]); - memcpy(dst + head->rc_curpage, src + offset, page_len); + memcpy((unsigned char *)dst + head->rc_pageoff, src + offset, page_len); head->rc_readbytes += page_len; head->rc_pageoff += page_len; @@ -860,7 +880,7 @@ static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, offset += page_len; } - return -EINVAL; + return 0; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3d7f1413df02..9b623849723e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -462,7 +462,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_bc_requests = 2; } - /* Arbitrary estimate of the needed number of rdma_rw contexts. + /* Estimate the needed number of rdma_rw contexts. The maximum + * Read and Write chunks have one segment each. Each request + * can involve one Read chunk and either a Write chunk or Reply + * chunk; thus a factor of three. */ maxpayload = min(xprt->xpt_server->sv_max_payload, RPCSVC_MAXPAYLOAD_RDMA); @@ -470,7 +473,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) rdma_rw_mr_factor(dev, newxprt->sc_port_num, maxpayload >> PAGE_SHIFT); - newxprt->sc_sq_depth = rq_depth + ctxts; + newxprt->sc_sq_depth = rq_depth + + rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0); if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); @@ -591,12 +595,18 @@ static void svc_rdma_detach(struct svc_xprt *xprt) rdma_disconnect(rdma->sc_cm_id); } -static void __svc_rdma_free(struct work_struct *work) +/** + * svc_rdma_free - Release class-specific transport resources + * @xprt: Generic svc transport object + */ +static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = - container_of(work, struct svcxprt_rdma, sc_work); + container_of(xprt, struct svcxprt_rdma, sc_xprt); struct ib_device *device = rdma->sc_cm_id->device; + might_sleep(); + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); @@ -629,15 +639,6 @@ static void __svc_rdma_free(struct work_struct *work) kfree(rdma); } -static void svc_rdma_free(struct svc_xprt *xprt) -{ - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); - - INIT_WORK(&rdma->sc_work, __svc_rdma_free); - schedule_work(&rdma->sc_work); -} - static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 63262ef0c2e3..b51a162885bb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -383,7 +383,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ep *ep; int rc; - ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS); + ep = kzalloc_obj(*ep, XPRTRDMA_GFP_FLAGS); if (!ep) return -ENOTCONN; ep->re_xprt = &r_xprt->rx_xprt; @@ -615,8 +615,8 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), - XPRTRDMA_GFP_FLAGS); + sc = kzalloc_flex(*sc, sc_sges, ep->re_attr.cap.max_send_sge, + XPRTRDMA_GFP_FLAGS); if (!sc) return NULL; @@ -639,7 +639,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * Sends are posted. */ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; - buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS); + buf->rb_sc_ctxs = kzalloc_objs(sc, i, XPRTRDMA_GFP_FLAGS); if (!buf->rb_sc_ctxs) return -ENOMEM; @@ -822,7 +822,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS); + req = kzalloc_obj(*req, XPRTRDMA_GFP_FLAGS); if (req == NULL) goto out1; @@ -952,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt) struct ib_device *device = ep->re_id->device; struct rpcrdma_rep *rep; - rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); + rep = kzalloc_obj(*rep, XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; @@ -1362,7 +1362,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) needed += RPCRDMA_MAX_RECV_BATCH; if (atomic_inc_return(&ep->re_receiving) > 1) - goto out; + goto out_dec; /* fast path: all needed reps can be found on the free list */ wr = NULL; @@ -1385,7 +1385,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) ++count; } if (!wr) - goto out; + goto out_dec; rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); @@ -1400,9 +1400,10 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) --count; } } + +out_dec: if (atomic_dec_return(&ep->re_receiving) > 0) complete(&ep->re_done); - out: trace_xprtrdma_post_recvs(r_xprt, count); ep->re_receive_count += count; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 3aa987e7f072..2e1fe6013361 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1845,8 +1845,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); - err = kernel_bind(sock, (struct sockaddr *)&myaddr, - transport->xprt.addrlen); + err = kernel_bind(sock, (struct sockaddr_unsized *)&myaddr, + transport->xprt.addrlen); if (err == 0) { if (transport->xprt.reuseport) transport->srcport = port; @@ -2005,7 +2005,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, xs_stream_start_connect(transport); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), xprt->addrlen, 0); } /** @@ -2405,7 +2405,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... */ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), + xprt->addrlen, O_NONBLOCK); } /** diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 4d5fbacef496..b55df183e6d5 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -114,7 +114,7 @@ static int switchdev_deferred_enqueue(struct net_device *dev, { struct switchdev_deferred_item *dfitem; - dfitem = kmalloc(struct_size(dfitem, data, data_len), GFP_ATOMIC); + dfitem = kmalloc_flex(*dfitem, data, data_len, GFP_ATOMIC); if (!dfitem) return -ENOMEM; dfitem->dev = dev; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 114fef65f92e..76a1585d3f6b 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -692,7 +692,7 @@ int tipc_bcast_init(struct net *net) struct tipc_bc_base *bb = NULL; struct tipc_link *l = NULL; - bb = kzalloc(sizeof(*bb), GFP_KERNEL); + bb = kzalloc_obj(*bb); if (!bb) goto enomem; tn->bcbase = bb; diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index ae1ddbf71853..a3bd1ef17558 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -322,7 +322,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, goto rejected; } - b = kzalloc(sizeof(*b), GFP_ATOMIC); + b = kzalloc_obj(*b, GFP_ATOMIC); if (!b) return -ENOMEM; diff --git a/net/tipc/core.h b/net/tipc/core.h index 7f3fe3401c45..9ce5f9ff6cc0 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -44,6 +44,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> +#include <linux/hex.h> #include <linux/mm.h> #include <linux/timer.h> #include <linux/string.h> diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 751904f10aab..d3046a39ff72 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -460,7 +460,7 @@ static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) - atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); + atomic_add_unless(&tmp->users, -1, lim); rcu_read_unlock(); } @@ -524,7 +524,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, return -EEXIST; /* Allocate a new AEAD */ - tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + tmp = kzalloc_obj(*tmp, GFP_ATOMIC); if (unlikely(!tmp)) return -ENOMEM; @@ -560,7 +560,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, break; } - tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); + tfm_entry = kmalloc_obj(*tfm_entry); if (unlikely(!tfm_entry)) { crypto_free_aead(tfm); err = -ENOMEM; @@ -637,7 +637,7 @@ static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) if (unlikely(*dst)) return -EEXIST; - aead = kzalloc(sizeof(*aead), GFP_ATOMIC); + aead = kzalloc_obj(*aead, GFP_ATOMIC); if (unlikely(!aead)) return -ENOMEM; @@ -1219,7 +1219,7 @@ void tipc_crypto_key_flush(struct tipc_crypto *c) rx = c; tx = tipc_net(rx->net)->crypto_tx; if (cancel_delayed_work(&rx->work)) { - kfree(rx->skey); + kfree_sensitive(rx->skey); rx->skey = NULL; atomic_xchg(&rx->key_distr, 0); tipc_node_put(rx->node); @@ -1472,7 +1472,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, return -EEXIST; /* Allocate crypto */ - c = kzalloc(sizeof(*c), GFP_ATOMIC); + c = kzalloc_obj(*c, GFP_ATOMIC); if (!c) return -ENOMEM; @@ -2394,7 +2394,7 @@ static void tipc_crypto_work_rx(struct work_struct *work) break; default: synchronize_rcu(); - kfree(rx->skey); + kfree_sensitive(rx->skey); rx->skey = NULL; break; } diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 775fd4f3f072..3e54d2df5683 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -353,7 +353,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_net *tn = tipc_net(net); struct tipc_discoverer *d; - d = kmalloc(sizeof(*d), GFP_ATOMIC); + d = kmalloc_obj(*d, GFP_ATOMIC); if (!d) return -ENOMEM; d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); diff --git a/net/tipc/group.c b/net/tipc/group.c index 3e137d8c9d2f..e0e6227b433b 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -169,7 +169,7 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group *grp; u32 type = mreq->type; - grp = kzalloc(sizeof(*grp), GFP_ATOMIC); + grp = kzalloc_obj(*grp, GFP_ATOMIC); if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); @@ -306,7 +306,7 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, struct tipc_member *m; int ret; - m = kzalloc(sizeof(*m), GFP_ATOMIC); + m = kzalloc_obj(*m, GFP_ATOMIC); if (!m) return NULL; INIT_LIST_HEAD(&m->list); diff --git a/net/tipc/link.c b/net/tipc/link.c index 931f55f781a1..49dfc098d89b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -487,7 +487,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, char self_str[NODE_ID_STR_LEN] = {0,}; struct tipc_link *l; - l = kzalloc(sizeof(*l), GFP_ATOMIC); + l = kzalloc_obj(*l, GFP_ATOMIC); if (!l) return false; *link = l; diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 572b79bf76ce..a94b9b36a700 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -393,7 +393,7 @@ static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, struct tipc_peer *self = mon->self; struct tipc_peer *cur, *prev, *p; - p = kzalloc(sizeof(*p), GFP_ATOMIC); + p = kzalloc_obj(*p, GFP_ATOMIC); *peer = p; if (!p) return false; @@ -654,9 +654,9 @@ int tipc_mon_create(struct net *net, int bearer_id) if (tn->monitors[bearer_id]) return 0; - mon = kzalloc(sizeof(*mon), GFP_ATOMIC); - self = kzalloc(sizeof(*self), GFP_ATOMIC); - dom = kzalloc(sizeof(*dom), GFP_ATOMIC); + mon = kzalloc_obj(*mon, GFP_ATOMIC); + self = kzalloc_obj(*self, GFP_ATOMIC); + dom = kzalloc_obj(*dom, GFP_ATOMIC); if (!mon || !self || !dom) { kfree(mon); kfree(self); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e74940eab3a4..253c72d1366e 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -230,7 +230,7 @@ static struct publication *tipc_publ_create(struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { - struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); + struct publication *p = kzalloc_obj(*p, GFP_ATOMIC); if (!p) return NULL; @@ -261,7 +261,7 @@ static struct tipc_service *tipc_service_create(struct net *net, struct tipc_service *service; struct hlist_head *hd; - service = kzalloc(sizeof(*service), GFP_ATOMIC); + service = kzalloc_obj(*service, GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; @@ -314,7 +314,7 @@ static struct service_range *tipc_service_create_range(struct tipc_service *sc, else n = &parent->rb_right; } - sr = kzalloc(sizeof(*sr), GFP_ATOMIC); + sr = kzalloc_obj(*sr, GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; @@ -348,7 +348,8 @@ static bool tipc_service_insert_publ(struct net *net, /* Return if the publication already exists */ list_for_each_entry(_p, &sr->all_publ, all_publ) { - if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { + if (_p->key == key && _p->sk.ref == p->sk.ref && + (!_p->sk.node || _p->sk.node == node)) { pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", p->sr.type, p->sr.lower, p->sr.upper, node, p->sk.ref, key); @@ -388,7 +389,8 @@ static struct publication *tipc_service_remove_publ(struct service_range *r, u32 node = sk->node; list_for_each_entry(p, &r->all_publ, all_publ) { - if (p->key != key || (node && node != p->sk.node)) + if (p->key != key || p->sk.ref != sk->ref || + (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); @@ -888,7 +890,7 @@ int tipc_nametbl_init(struct net *net) struct name_table *nt; int i; - nt = kzalloc(sizeof(*nt), GFP_KERNEL); + nt = kzalloc_obj(*nt); if (!nt) return -ENOMEM; @@ -1156,7 +1158,7 @@ bool tipc_dest_push(struct list_head *l, u32 node, u32 port) if (tipc_dest_find(l, node, port)) return false; - dst = kmalloc(sizeof(*dst), GFP_ATOMIC); + dst = kmalloc_obj(*dst, GFP_ATOMIC); if (unlikely(!dst)) return false; dst->node = node; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 079aebb16ed8..2a786c56c8c5 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -202,8 +202,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, return -ENOMEM; } - attrbuf = kcalloc(tipc_genl_family.maxattr + 1, - sizeof(struct nlattr *), GFP_KERNEL); + attrbuf = kzalloc_objs(struct nlattr *, tipc_genl_family.maxattr + 1); if (!attrbuf) { err = -ENOMEM; goto err_out; @@ -338,9 +337,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, if (!trans_buf) return -ENOMEM; - attrbuf = kmalloc_array(tipc_genl_family.maxattr + 1, - sizeof(struct nlattr *), - GFP_KERNEL); + attrbuf = kmalloc_objs(struct nlattr *, tipc_genl_family.maxattr + 1); if (!attrbuf) { err = -ENOMEM; goto trans_out; diff --git a/net/tipc/node.c b/net/tipc/node.c index a07fb073368c..af442a5ef8f3 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -535,7 +535,7 @@ update: goto exit; } - n = kzalloc(sizeof(*n), GFP_ATOMIC); + n = kzalloc_obj(*n, GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; @@ -703,7 +703,7 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } - conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + conn = kmalloc_obj(*conn, GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 1574a83384f8..9329919fb07f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -710,7 +710,7 @@ int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen) return res; } -static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) +static int tipc_bind(struct socket *sock, struct sockaddr_unsized *skaddr, int alen) { struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr; u32 atype = ua->addrtype; @@ -726,7 +726,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) return -EACCES; } } - return tipc_sk_bind(sock, skaddr, alen); + return tipc_sk_bind(sock, (struct sockaddr *)skaddr, alen); } /** @@ -2233,6 +2233,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, if (skb_queue_empty(&sk->sk_write_queue)) break; get_random_bytes(&delay, 2); + if (tsk->conn_timeout < 4) + tsk->conn_timeout = 4; delay %= (tsk->conn_timeout / 4); delay = msecs_to_jiffies(delay + 100); sk_reset_timer(sk, &sk->sk_timer, jiffies + delay); @@ -2565,7 +2567,7 @@ static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr) * * Return: 0 on success, errno otherwise */ -static int tipc_connect(struct socket *sock, struct sockaddr *dest, +static int tipc_connect(struct socket *sock, struct sockaddr_unsized *dest, int destlen, int flags) { struct sock *sk = sock->sk; @@ -3031,10 +3033,8 @@ static void tipc_sk_remove(struct tipc_sock *tsk) struct sock *sk = &tsk->sk; struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id); - if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) { - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) __sock_put(sk); - } } static const struct rhashtable_params tsk_rht_params = { @@ -3597,7 +3597,7 @@ int __tipc_dump_start(struct netlink_callback *cb, struct net *net) struct tipc_net *tn = tipc_net(net); if (!iter) { - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc_obj(*iter); if (!iter) return -ENOMEM; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index f8490d94e323..352d304c3acd 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -143,7 +143,7 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net, pr_warn("Subscription rejected, illegal request\n"); return NULL; } - sub = kmalloc(sizeof(*sub), GFP_ATOMIC); + sub = kmalloc_obj(*sub, GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); return NULL; diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index aad7f96b6009..af530c9ed840 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -182,7 +182,7 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *s struct tipc_conn *con; int ret; - con = kzalloc(sizeof(*con), GFP_ATOMIC); + con = kzalloc_obj(*con, GFP_ATOMIC); if (!con) return ERR_PTR(-ENOMEM); @@ -325,7 +325,7 @@ void tipc_topsrv_queue_evt(struct net *net, int conid, if (!connected(con)) goto err; - e = kmalloc(sizeof(*e), GFP_ATOMIC); + e = kmalloc_obj(*e, GFP_ATOMIC); if (!e) goto err; e->inactive = (event == TIPC_SUBSCR_TIMEOUT); @@ -661,7 +661,7 @@ static int tipc_topsrv_start(struct net *net) struct tipc_topsrv *srv; int ret; - srv = kzalloc(sizeof(*srv), GFP_ATOMIC); + srv = kzalloc_obj(*srv, GFP_ATOMIC); if (!srv) return -ENOMEM; diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index b85ab0fb3b8c..2b8e385d1e51 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -309,7 +309,7 @@ static int tipc_udp_rcast_add(struct tipc_bearer *b, if (!ub) return -ENODEV; - rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC); + rcast = kmalloc_obj(*rcast, GFP_ATOMIC); if (!rcast) return -ENOMEM; @@ -675,7 +675,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, struct net_device *dev; int rmcast = 0; - ub = kzalloc(sizeof(*ub), GFP_ATOMIC); + ub = kzalloc_obj(*ub, GFP_ATOMIC); if (!ub) return -ENOMEM; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 71734411ff4c..99c8eff9783e 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -346,7 +346,7 @@ static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record; skb_frag_t *frag; - record = kmalloc(sizeof(*record), GFP_KERNEL); + record = kmalloc_obj(*record); if (!record) return -ENOMEM; @@ -373,7 +373,8 @@ static int tls_do_allocation(struct sock *sk, if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { - READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } @@ -461,7 +462,7 @@ static int tls_push_data(struct sock *sk, /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ - max_open_record_len = TLS_MAX_PAYLOAD_SIZE + + max_open_record_len = tls_ctx->tx_max_payload_len + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); @@ -1039,7 +1040,7 @@ static struct tls_offload_context_tx *alloc_offload_ctx_tx(struct tls_context *c struct tls_offload_context_tx *offload_ctx; __be64 rcd_sn; - offload_ctx = kzalloc(sizeof(*offload_ctx), GFP_KERNEL); + offload_ctx = kzalloc_obj(*offload_ctx); if (!offload_ctx) return NULL; @@ -1109,7 +1110,7 @@ int tls_set_device_offload(struct sock *sk) memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv); memcpy(ctx->tx.rec_seq, rec_seq, cipher_desc->rec_seq); - start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); + start_marker_record = kmalloc_obj(*start_marker_record); if (!start_marker_record) { rc = -ENOMEM; goto release_netdev; @@ -1223,7 +1224,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) goto release_lock; } - context = kzalloc(sizeof(*context), GFP_KERNEL); + context = kzalloc_obj(*context); if (!context) { rc = -ENOMEM; goto release_lock; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 03d508a45aae..d3c72f509baa 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -383,7 +383,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) if (!payload_len) return skb; - sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), GFP_ATOMIC); + sg_in = kmalloc_objs(*sg_in, sg_in_max_elements, GFP_ATOMIC); if (!sg_in) goto free_orig; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 39a2ab47fe72..fd39acf41a61 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -541,6 +541,28 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, return 0; } +static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + u16 payload_len = ctx->tx_max_payload_len; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < sizeof(payload_len)) + return -EINVAL; + + if (put_user(sizeof(payload_len), optlen)) + return -EFAULT; + + if (copy_to_user(optval, &payload_len, sizeof(payload_len))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -560,6 +582,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_getsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_MAX_PAYLOAD_LEN: + rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -809,6 +834,32 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, return rc; } +static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx); + u16 value; + bool tls_13 = ctx->prot_info.version == TLS_1_3_VERSION; + + if (sw_ctx && sw_ctx->open_rec) + return -EBUSY; + + if (sockptr_is_null(optval) || optlen != sizeof(value)) + return -EINVAL; + + if (copy_from_sockptr(&value, optval, sizeof(value))) + return -EFAULT; + + if (value < TLS_MIN_RECORD_SIZE_LIM - (tls_13 ? 1 : 0) || + value > TLS_MAX_PAYLOAD_SIZE) + return -EINVAL; + + ctx->tx_max_payload_len = value; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -830,6 +881,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_setsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_MAX_PAYLOAD_LEN: + lock_sock(sk); + rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen); + release_sock(sk); + break; default: rc = -ENOPROTOOPT; break; @@ -859,7 +915,7 @@ struct tls_context *tls_ctx_create(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); + ctx = kzalloc_obj(*ctx, GFP_ATOMIC); if (!ctx) return NULL; @@ -1019,6 +1075,7 @@ static int tls_init(struct sock *sk) ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; + ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE; update_sk_prot(sk, ctx); out: write_unlock_bh(&sk->sk_callback_lock); @@ -1108,6 +1165,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) goto nla_failure; } + err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN, + ctx->tx_max_payload_len); + + if (err) + goto nla_failure; + rcu_read_unlock(); nla_nest_end(skb, start); return 0; @@ -1129,6 +1192,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin) nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_MAX_PAYLOAD_LEN */ 0; return size; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d17135369980..5fe07f110fe8 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; @@ -2533,7 +2533,7 @@ void tls_sw_cancel_work_tx(struct tls_context *tls_ctx) set_bit(BIT_TX_CLOSING, &ctx->tx_bitmask); set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask); - cancel_delayed_work_sync(&ctx->tx_work.work); + disable_delayed_work_sync(&ctx->tx_work.work); } void tls_sw_release_resources_tx(struct sock *sk) @@ -2698,7 +2698,7 @@ static struct tls_sw_context_tx *init_ctx_tx(struct tls_context *ctx, struct soc struct tls_sw_context_tx *sw_ctx_tx; if (!ctx->priv_ctx_tx) { - sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); + sw_ctx_tx = kzalloc_obj(*sw_ctx_tx); if (!sw_ctx_tx) return NULL; } else { @@ -2719,7 +2719,7 @@ static struct tls_sw_context_rx *init_ctx_rx(struct tls_context *ctx) struct tls_sw_context_rx *sw_ctx_rx; if (!ctx->priv_ctx_rx) { - sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); + sw_ctx_rx = kzalloc_obj(*sw_ctx_rx); if (!sw_ctx_rx) return NULL; } else { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 833c3616d2a2..b23c33df8b46 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -733,19 +733,7 @@ static void unix_release_sock(struct sock *sk, int embrion) /* ---- Socket is dead now and most probably destroyed ---- */ - /* - * Fixme: BSD difference: In BSD all sockets connected to us get - * ECONNRESET and we die on the spot. In Linux we behave - * like files and pipes do and wait for the last - * dereference. - * - * Can't we simply set sock->err? - * - * What the above comment does talk about? --ANK(980817) - */ - - if (READ_ONCE(unix_tot_inflight)) - unix_gc(); /* Garbage collect fds */ + unix_schedule_gc(NULL); } struct unix_peercred { @@ -854,8 +842,8 @@ out: } static int unix_release(struct socket *); -static int unix_bind(struct socket *, struct sockaddr *, int); -static int unix_stream_connect(struct socket *, struct sockaddr *, +static int unix_bind(struct socket *, struct sockaddr_unsized *, int); +static int unix_stream_connect(struct socket *, struct sockaddr_unsized *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); @@ -877,7 +865,7 @@ static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); -static int unix_dgram_connect(struct socket *, struct sockaddr *, +static int unix_dgram_connect(struct socket *, struct sockaddr_unsized *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, @@ -1210,25 +1198,16 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, unix_mkname_bsd(sunaddr, addr_len); if (flags & SOCK_COREDUMP) { - const struct cred *cred; - struct cred *kcred; struct path root; - kcred = prepare_kernel_cred(&init_task); - if (!kcred) { - err = -ENOMEM; - goto fail; - } - task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cred = override_creds(kcred); - err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, - LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | - LOOKUP_NO_MAGICLINKS, &path); - put_cred(revert_creds(cred)); + scoped_with_kernel_creds() + err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, + LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | + LOOKUP_NO_MAGICLINKS, &path); path_put(&root); if (err) goto fail; @@ -1399,7 +1378,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) - err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); + err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0, NULL); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); @@ -1477,7 +1456,7 @@ out: return err; } -static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int unix_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; @@ -1523,7 +1502,7 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) unix_state_unlock(sk2); } -static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, +static int unix_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; @@ -1642,7 +1621,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) return timeo; } -static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, +static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; @@ -1671,10 +1650,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); - /* First of all allocate resources. - * If we will make it after state is locked, - * we will have to recheck all again in any case. - */ + err = prepare_peercred(&peercred); + if (err) + goto out; /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); @@ -1683,10 +1661,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; } - err = prepare_peercred(&peercred); - if (err) - goto out; - /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { @@ -1811,7 +1785,7 @@ restart: __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sock_put(other); return 0; @@ -1984,6 +1958,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + unix_peek_fpl(scm->fp); } static void unix_destruct_scm(struct sk_buff *skb) @@ -2110,8 +2086,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) return err; - wait_for_unix_gc(scm.fp); - if (msg->msg_flags & MSG_OOB) { err = -EOPNOTSUPP; goto out; @@ -2306,7 +2280,7 @@ restart_locked: scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sock_put(other); scm_destroy(&scm); return len; @@ -2379,7 +2353,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, sk_send_sigurg(other); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); return 0; out_unlock: @@ -2405,8 +2379,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) return err; - wait_for_unix_gc(scm.fp); - if (msg->msg_flags & MSG_OOB) { err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -2507,7 +2479,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, spin_unlock(&other->sk_receive_queue.lock); unix_state_unlock(other); - other->sk_data_ready(other); + READ_ONCE(other->sk_data_ready)(other); sent += size; } @@ -3111,12 +3083,15 @@ unlock: mutex_unlock(&u->iolock); if (msg) { + bool do_cmsg = READ_ONCE(u->recvmsg_inq); + scm_recv_unix(sock, msg, &scm, flags); - if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) { + if ((do_cmsg | msg->msg_get_inq) && (copied ?: err) >= 0) { msg->msg_inq = READ_ONCE(u->inq_len); - put_cmsg(msg, SOL_SOCKET, SCM_INQ, - sizeof(msg->msg_inq), &msg->msg_inq); + if (do_cmsg) + put_cmsg(msg, SOL_SOCKET, SCM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); } } else { scm_destroy(&scm); @@ -3285,9 +3260,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { - struct file *f; - int fd; - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -3297,18 +3269,7 @@ static int unix_open_file(struct sock *sk) if (!unix_sk(sk)->path.dentry) return -ENOENT; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) - return fd; - - f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); - if (IS_ERR(f)) { - put_unused_fd(fd); - return PTR_ERR(f); - } - - fd_install(fd, f); - return fd; + return FD_ADD(O_CLOEXEC, dentry_open(&unix_sk(sk)->path, O_PATH, current_cred())); } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) @@ -3839,14 +3800,12 @@ static int __net_init unix_net_init(struct net *net) goto err_sysctl; #endif - net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, - sizeof(spinlock_t), GFP_KERNEL); + net->unx.table.locks = kvmalloc_objs(spinlock_t, UNIX_HASH_SIZE); if (!net->unx.table.locks) goto err_proc; - net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, - sizeof(struct hlist_head), - GFP_KERNEL); + net->unx.table.buckets = kvmalloc_objs(struct hlist_head, + UNIX_HASH_SIZE); if (!net->unx.table.buckets) goto free_locks; diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h index 59db179df9bb..8119dbeef3a3 100644 --- a/net/unix/af_unix.h +++ b/net/unix/af_unix.h @@ -24,14 +24,13 @@ struct unix_skb_parms { #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) /* GC for SCM_RIGHTS */ -extern unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); void unix_update_edges(struct unix_sock *receiver); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); -void unix_gc(void); -void wait_for_unix_gc(struct scm_fp_list *fpl); +void unix_peek_fpl(struct scm_fp_list *fpl); +void unix_schedule_gc(struct user_struct *user); /* SOCK_DIAG */ long unix_inq_len(struct sock *sk); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 65396a4e1b07..a7967a345827 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -121,8 +121,13 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) return edge->successor->vertex; } -static bool unix_graph_maybe_cyclic; -static bool unix_graph_grouped; +enum { + UNIX_GRAPH_NOT_CYCLIC, + UNIX_GRAPH_MAYBE_CYCLIC, + UNIX_GRAPH_CYCLIC, +}; + +static unsigned char unix_graph_state; static void unix_update_graph(struct unix_vertex *vertex) { @@ -132,8 +137,7 @@ static void unix_update_graph(struct unix_vertex *vertex) if (!vertex) return; - unix_graph_maybe_cyclic = true; - unix_graph_grouped = false; + WRITE_ONCE(unix_graph_state, UNIX_GRAPH_MAYBE_CYCLIC); } static LIST_HEAD(unix_unvisited_vertices); @@ -195,8 +199,7 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } } -static DEFINE_SPINLOCK(unix_gc_lock); -unsigned int unix_tot_inflight; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(unix_gc_lock); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { @@ -222,7 +225,6 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) } while (i < fpl->count_unix); receiver->scm_stat.nr_unix_fds += fpl->count_unix; - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); @@ -253,7 +255,6 @@ void unix_del_edges(struct scm_fp_list *fpl) receiver = fpl->edges[0].successor; receiver->scm_stat.nr_unix_fds -= fpl->count_unix; } - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); @@ -287,18 +288,20 @@ int unix_prepare_fpl(struct scm_fp_list *fpl) return 0; for (i = 0; i < fpl->count_unix; i++) { - vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + vertex = kmalloc_obj(*vertex); if (!vertex) goto err; list_add(&vertex->entry, &fpl->vertices); } - fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), - GFP_KERNEL_ACCOUNT); + fpl->edges = kvmalloc_objs(*fpl->edges, fpl->count_unix, + GFP_KERNEL_ACCOUNT); if (!fpl->edges) goto err; + unix_schedule_gc(fpl->user); + return 0; err: @@ -315,6 +318,25 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } +static bool gc_in_progress; +static seqcount_t unix_peek_seq = SEQCNT_ZERO(unix_peek_seq); + +void unix_peek_fpl(struct scm_fp_list *fpl) +{ + static DEFINE_SPINLOCK(unix_peek_lock); + + if (!fpl || !fpl->count_unix) + return; + + if (!READ_ONCE(gc_in_progress)) + return; + + /* Invalidate the final refcnt check in unix_vertex_dead(). */ + spin_lock(&unix_peek_lock); + raw_write_seqcount_barrier(&unix_peek_seq); + spin_unlock(&unix_peek_lock); +} + static bool unix_vertex_dead(struct unix_vertex *vertex) { struct unix_edge *edge; @@ -348,6 +370,36 @@ static bool unix_vertex_dead(struct unix_vertex *vertex) return true; } +static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +static bool unix_scc_dead(struct list_head *scc, bool fast) +{ + struct unix_vertex *vertex; + bool scc_dead = true; + unsigned int seq; + + seq = read_seqcount_begin(&unix_peek_seq); + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + /* Don't restart DFS from this vertex. */ + list_move_tail(&vertex->entry, &unix_visited_vertices); + + /* Mark vertex as off-stack for __unix_walk_scc(). */ + if (!fast) + vertex->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); + } + + /* If MSG_PEEK intervened, defer this SCC to the next round. */ + if (read_seqcount_retry(&unix_peek_seq, seq)) + return false; + + return scc_dead; +} + static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) { struct unix_vertex *vertex; @@ -401,12 +453,11 @@ static bool unix_scc_cyclic(struct list_head *scc) return false; } -static LIST_HEAD(unix_visited_vertices); -static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; - -static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, - struct sk_buff_head *hitlist) +static unsigned long __unix_walk_scc(struct unix_vertex *vertex, + unsigned long *last_index, + struct sk_buff_head *hitlist) { + unsigned long cyclic_sccs = 0; LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); @@ -469,9 +520,7 @@ prev_vertex: } if (vertex->index == vertex->scc_index) { - struct unix_vertex *v; struct list_head scc; - bool scc_dead = true; /* SCC finalised. * @@ -480,25 +529,14 @@ prev_vertex: */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); - list_for_each_entry_reverse(v, &scc, scc_entry) { - /* Don't restart DFS from this vertex in unix_walk_scc(). */ - list_move_tail(&v->entry, &unix_visited_vertices); - - /* Mark vertex as off-stack. */ - v->index = unix_vertex_grouped_index; - - if (scc_dead) - scc_dead = unix_vertex_dead(v); - } - - if (scc_dead) { + if (unix_scc_dead(&scc, false)) { unix_collect_skb(&scc, hitlist); } else { if (unix_vertex_max_scc_index < vertex->scc_index) unix_vertex_max_scc_index = vertex->scc_index; - if (!unix_graph_maybe_cyclic) - unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + if (unix_scc_cyclic(&scc)) + cyclic_sccs++; } list_del(&scc); @@ -507,13 +545,17 @@ prev_vertex: /* Need backtracking ? */ if (!list_empty(&edge_stack)) goto prev_vertex; + + return cyclic_sccs; } +static unsigned long unix_graph_cyclic_sccs; + static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; + unsigned long cyclic_sccs = 0; - unix_graph_maybe_cyclic = false; unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; /* Visit every vertex exactly once. @@ -523,62 +565,58 @@ static void unix_walk_scc(struct sk_buff_head *hitlist) struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); - __unix_walk_scc(vertex, &last_index, hitlist); + cyclic_sccs += __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); - unix_graph_grouped = true; + WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); + WRITE_ONCE(unix_graph_state, + cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); } static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { - unix_graph_maybe_cyclic = false; + unsigned long cyclic_sccs = unix_graph_cyclic_sccs; while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; struct list_head scc; - bool scc_dead = true; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); list_add(&scc, &vertex->scc_entry); - list_for_each_entry_reverse(vertex, &scc, scc_entry) { - list_move_tail(&vertex->entry, &unix_visited_vertices); - - if (scc_dead) - scc_dead = unix_vertex_dead(vertex); - } - - if (scc_dead) + if (unix_scc_dead(&scc, true)) { + cyclic_sccs--; unix_collect_skb(&scc, hitlist); - else if (!unix_graph_maybe_cyclic) - unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + } list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); -} -static bool gc_in_progress; + WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); + WRITE_ONCE(unix_graph_state, + cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); +} -static void __unix_gc(struct work_struct *work) +static void unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; struct sk_buff *skb; spin_lock(&unix_gc_lock); - if (!unix_graph_maybe_cyclic) { + if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) { spin_unlock(&unix_gc_lock); goto skip_gc; } __skb_queue_head_init(&hitlist); - if (unix_graph_grouped) + if (unix_graph_state == UNIX_GRAPH_CYCLIC) unix_walk_scc_fast(&hitlist); else unix_walk_scc(&hitlist); @@ -595,36 +633,27 @@ skip_gc: WRITE_ONCE(gc_in_progress, false); } -static DECLARE_WORK(unix_gc_work, __unix_gc); +static DECLARE_WORK(unix_gc_work, unix_gc); -void unix_gc(void) -{ - WRITE_ONCE(gc_in_progress, true); - queue_work(system_dfl_wq, &unix_gc_work); -} - -#define UNIX_INFLIGHT_TRIGGER_GC 16000 -#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) -void wait_for_unix_gc(struct scm_fp_list *fpl) +void unix_schedule_gc(struct user_struct *user) { - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight(), and __unix_gc(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); + if (READ_ONCE(unix_graph_state) == UNIX_GRAPH_NOT_CYCLIC) + return; /* Penalise users who want to send AF_UNIX sockets * but whose sockets have not been received yet. */ - if (!fpl || !fpl->count_unix || - READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + if (user && + READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; - if (READ_ONCE(gc_in_progress)) + if (!READ_ONCE(gc_in_progress)) { + WRITE_ONCE(gc_in_progress, true); + queue_work(system_dfl_wq, &unix_gc_work); + } + + if (user && READ_ONCE(unix_graph_cyclic_sccs)) flush_work(&unix_gc_work); } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index a9ca9c3b87b3..2f7d94d682cb 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -83,6 +83,56 @@ * TCP_ESTABLISHED - connected * TCP_CLOSING - disconnecting * TCP_LISTEN - listening + * + * - Namespaces in vsock support two different modes: "local" and "global". + * Each mode defines how the namespace interacts with CIDs. + * Each namespace exposes two sysctl files: + * + * - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's + * mode, which is set at namespace creation and immutable thereafter. + * - /proc/sys/net/vsock/child_ns_mode (write-once) controls what mode future + * child namespaces will inherit when created. The initial value matches + * the namespace's own ns_mode. + * + * Changing child_ns_mode only affects newly created namespaces, not the + * current namespace or existing children. A "local" namespace cannot set + * child_ns_mode to "global". child_ns_mode is write-once, so that it may be + * configured and locked down by a namespace manager. Writing a different + * value after the first write returns -EBUSY. At namespace creation, ns_mode + * is inherited from the parent's child_ns_mode. + * + * The init_net mode is "global" and cannot be modified. The init_net + * child_ns_mode is also write-once, so an init process (e.g. systemd) can + * set it to "local" to ensure all new namespaces inherit local mode. + * + * The modes affect the allocation and accessibility of CIDs as follows: + * + * - global - access and allocation are all system-wide + * - all CID allocation from global namespaces draw from the same + * system-wide pool. + * - if one global namespace has already allocated some CID, another + * global namespace will not be able to allocate the same CID. + * - global mode AF_VSOCK sockets can reach any VM or socket in any global + * namespace, they are not contained to only their own namespace. + * - AF_VSOCK sockets in a global mode namespace cannot reach VMs or + * sockets in any local mode namespace. + * - local - access and allocation are contained within the namespace + * - CID allocation draws only from a private pool local only to the + * namespace, and does not affect the CIDs available for allocation in any + * other namespace (global or local). + * - VMs in a local namespace do not collide with CIDs in any other local + * namespace or any global namespace. For example, if a VM in a local mode + * namespace is given CID 10, then CID 10 is still available for + * allocation in any other namespace, but not in the same namespace. + * - AF_VSOCK sockets in a local mode namespace can connect only to VMs or + * other sockets within their own namespace. + * - sockets bound to VMADDR_CID_ANY in local namespaces will never resolve + * to any transport that is not compatible with local mode. There is no + * error that propagates to the user (as there is for connection attempts) + * because it is possible for some packet to reach this socket from + * a different transport that *does* support local mode. For + * example, virtio-vsock may not support local mode, but the socket + * may still accept a connection from vhost-vsock which does. */ #include <linux/compat.h> @@ -100,20 +150,31 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/net.h> +#include <linux/proc_fs.h> #include <linux/poll.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> #include <linux/stddef.h> +#include <linux/sysctl.h> #include <linux/unistd.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/af_vsock.h> +#include <net/netns/vsock.h> #include <uapi/linux/vm_sockets.h> #include <uapi/asm-generic/ioctls.h> +#define VSOCK_NET_MODE_STR_GLOBAL "global" +#define VSOCK_NET_MODE_STR_LOCAL "local" + +/* 6 chars for "global", 1 for null-terminator, and 1 more for '\n'. + * The newline is added by proc_dostring() for read operations. + */ +#define VSOCK_NET_MODE_STR_MAX 8 + static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); @@ -235,33 +296,42 @@ static void __vsock_remove_connected(struct vsock_sock *vsk) sock_put(&vsk->sk); } -static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) +static struct sock *__vsock_find_bound_socket_net(struct sockaddr_vm *addr, + struct net *net) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { - if (vsock_addr_equals_addr(addr, &vsk->local_addr)) - return sk_vsock(vsk); + struct sock *sk = sk_vsock(vsk); + + if (vsock_addr_equals_addr(addr, &vsk->local_addr) && + vsock_net_check_mode(sock_net(sk), net)) + return sk; if (addr->svm_port == vsk->local_addr.svm_port && (vsk->local_addr.svm_cid == VMADDR_CID_ANY || - addr->svm_cid == VMADDR_CID_ANY)) - return sk_vsock(vsk); + addr->svm_cid == VMADDR_CID_ANY) && + vsock_net_check_mode(sock_net(sk), net)) + return sk; } return NULL; } -static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, - struct sockaddr_vm *dst) +static struct sock * +__vsock_find_connected_socket_net(struct sockaddr_vm *src, + struct sockaddr_vm *dst, struct net *net) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_connected_sockets(src, dst), connected_table) { + struct sock *sk = sk_vsock(vsk); + if (vsock_addr_equals_addr(src, &vsk->remote_addr) && - dst->svm_port == vsk->local_addr.svm_port) { - return sk_vsock(vsk); + dst->svm_port == vsk->local_addr.svm_port && + vsock_net_check_mode(sock_net(sk), net)) { + return sk; } } @@ -304,12 +374,18 @@ void vsock_remove_connected(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_remove_connected); -struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) +/* Find a bound socket, filtering by namespace and namespace mode. + * + * Use this in transports that are namespace-aware and can provide the + * network namespace context. + */ +struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr, + struct net *net) { struct sock *sk; spin_lock_bh(&vsock_table_lock); - sk = __vsock_find_bound_socket(addr); + sk = __vsock_find_bound_socket_net(addr, net); if (sk) sock_hold(sk); @@ -317,15 +393,32 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) return sk; } +EXPORT_SYMBOL_GPL(vsock_find_bound_socket_net); + +/* Find a bound socket without namespace filtering. + * + * Use this in transports that lack namespace context. All sockets are + * treated as if in global mode. + */ +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) +{ + return vsock_find_bound_socket_net(addr, NULL); +} EXPORT_SYMBOL_GPL(vsock_find_bound_socket); -struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, - struct sockaddr_vm *dst) +/* Find a connected socket, filtering by namespace and namespace mode. + * + * Use this in transports that are namespace-aware and can provide the + * network namespace context. + */ +struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src, + struct sockaddr_vm *dst, + struct net *net) { struct sock *sk; spin_lock_bh(&vsock_table_lock); - sk = __vsock_find_connected_socket(src, dst); + sk = __vsock_find_connected_socket_net(src, dst, net); if (sk) sock_hold(sk); @@ -333,6 +426,18 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, return sk; } +EXPORT_SYMBOL_GPL(vsock_find_connected_socket_net); + +/* Find a connected socket without namespace filtering. + * + * Use this in transports that lack namespace context. All sockets are + * treated as if in global mode. + */ +struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, + struct sockaddr_vm *dst) +{ + return vsock_find_connected_socket_net(src, dst, NULL); +} EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) @@ -528,7 +633,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || - !new_transport->seqpacket_allow(remote_cid)) { + !new_transport->seqpacket_allow(vsk, remote_cid)) { module_put(new_transport->module); return -ESOCKTNOSUPPORT; } @@ -676,11 +781,11 @@ out: static int __vsock_bind_connectible(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - static u32 port; + struct net *net = sock_net(sk_vsock(vsk)); struct sockaddr_vm new_addr; - if (!port) - port = get_random_u32_above(LAST_RESERVED_PORT); + if (!net->vsock.port) + net->vsock.port = get_random_u32_above(LAST_RESERVED_PORT); vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); @@ -689,13 +794,13 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { - if (port == VMADDR_PORT_ANY || - port <= LAST_RESERVED_PORT) - port = LAST_RESERVED_PORT + 1; + if (net->vsock.port == VMADDR_PORT_ANY || + net->vsock.port <= LAST_RESERVED_PORT) + net->vsock.port = LAST_RESERVED_PORT + 1; - new_addr.svm_port = port++; + new_addr.svm_port = net->vsock.port++; - if (!__vsock_find_bound_socket(&new_addr)) { + if (!__vsock_find_bound_socket_net(&new_addr, net)) { found = true; break; } @@ -712,7 +817,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, return -EACCES; } - if (__vsock_find_bound_socket(&new_addr)) + if (__vsock_find_bound_socket_net(&new_addr, net)) return -EADDRINUSE; } @@ -987,7 +1092,7 @@ static int vsock_release(struct socket *sock) } static int -vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { int err; struct sock *sk; @@ -1314,7 +1419,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (!transport->dgram_allow(remote_addr->svm_cid, + if (!transport->dgram_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; @@ -1328,7 +1433,7 @@ out: } static int vsock_dgram_connect(struct socket *sock, - struct sockaddr *addr, int addr_len, int flags) + struct sockaddr_unsized *addr, int addr_len, int flags) { int err; struct sock *sk; @@ -1355,7 +1460,7 @@ static int vsock_dgram_connect(struct socket *sock, if (err) goto out; - if (!vsk->transport->dgram_allow(remote_addr->svm_cid, + if (!vsk->transport->dgram_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; @@ -1528,7 +1633,7 @@ static void vsock_connect_timeout(struct work_struct *work) sock_put(sk); } -static int vsock_connect(struct socket *sock, struct sockaddr *addr, +static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { int err; @@ -1585,7 +1690,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * endpoints. */ if (!transport || - !transport->stream_allow(remote_addr->svm_cid, + !transport->stream_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; @@ -1787,6 +1892,10 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, } else { newsock->state = SS_CONNECTED; sock_graft(connected, newsock); + + set_bit(SOCK_CUSTOM_SOCKOPT, + &connected->sk_socket->flags); + if (vsock_msgzerocopy_allow(vconnected->transport)) set_bit(SOCK_SUPPORT_ZC, &connected->sk_socket->flags); @@ -2658,6 +2767,189 @@ static struct miscdevice vsock_device = { .fops = &vsock_device_ops, }; +static int __vsock_net_mode_string(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + enum vsock_net_mode mode, + enum vsock_net_mode *new_mode) +{ + char data[VSOCK_NET_MODE_STR_MAX] = {0}; + struct ctl_table tmp; + int ret; + + if (!table->data || !table->maxlen || !*lenp) { + *lenp = 0; + return 0; + } + + tmp = *table; + tmp.data = data; + + if (!write) { + const char *p; + + switch (mode) { + case VSOCK_NET_MODE_GLOBAL: + p = VSOCK_NET_MODE_STR_GLOBAL; + break; + case VSOCK_NET_MODE_LOCAL: + p = VSOCK_NET_MODE_STR_LOCAL; + break; + default: + WARN_ONCE(true, "netns has invalid vsock mode"); + *lenp = 0; + return 0; + } + + strscpy(data, p, sizeof(data)); + tmp.maxlen = strlen(p); + } + + ret = proc_dostring(&tmp, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + if (*lenp >= sizeof(data)) + return -EINVAL; + + if (!strncmp(data, VSOCK_NET_MODE_STR_GLOBAL, sizeof(data))) + *new_mode = VSOCK_NET_MODE_GLOBAL; + else if (!strncmp(data, VSOCK_NET_MODE_STR_LOCAL, sizeof(data))) + *new_mode = VSOCK_NET_MODE_LOCAL; + else + return -EINVAL; + + return 0; +} + +static int vsock_net_mode_string(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net; + + if (write) + return -EPERM; + + net = container_of(table->data, struct net, vsock.mode); + + return __vsock_net_mode_string(table, write, buffer, lenp, ppos, + vsock_net_mode(net), NULL); +} + +static int vsock_net_child_mode_string(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + enum vsock_net_mode new_mode; + struct net *net; + int ret; + + net = container_of(table->data, struct net, vsock.child_ns_mode); + + ret = __vsock_net_mode_string(table, write, buffer, lenp, ppos, + vsock_net_child_mode(net), &new_mode); + if (ret) + return ret; + + if (write) { + /* Prevent a "local" namespace from escalating to "global", + * which would give nested namespaces access to global CIDs. + */ + if (vsock_net_mode(net) == VSOCK_NET_MODE_LOCAL && + new_mode == VSOCK_NET_MODE_GLOBAL) + return -EPERM; + + if (!vsock_net_set_child_mode(net, new_mode)) + return -EBUSY; + } + + return 0; +} + +static struct ctl_table vsock_table[] = { + { + .procname = "ns_mode", + .data = &init_net.vsock.mode, + .maxlen = VSOCK_NET_MODE_STR_MAX, + .mode = 0444, + .proc_handler = vsock_net_mode_string + }, + { + .procname = "child_ns_mode", + .data = &init_net.vsock.child_ns_mode, + .maxlen = VSOCK_NET_MODE_STR_MAX, + .mode = 0644, + .proc_handler = vsock_net_child_mode_string + }, +}; + +static int __net_init vsock_sysctl_register(struct net *net) +{ + struct ctl_table *table; + + if (net_eq(net, &init_net)) { + table = vsock_table; + } else { + table = kmemdup(vsock_table, sizeof(vsock_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->vsock.mode; + table[1].data = &net->vsock.child_ns_mode; + } + + net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table, + ARRAY_SIZE(vsock_table)); + if (!net->vsock.sysctl_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void vsock_sysctl_unregister(struct net *net) +{ + const struct ctl_table *table; + + table = net->vsock.sysctl_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->vsock.sysctl_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static void vsock_net_init(struct net *net) +{ + if (net_eq(net, &init_net)) + net->vsock.mode = VSOCK_NET_MODE_GLOBAL; + else + net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns); + + net->vsock.child_ns_mode = net->vsock.mode; +} + +static __net_init int vsock_sysctl_init_net(struct net *net) +{ + vsock_net_init(net); + + if (vsock_sysctl_register(net)) + return -ENOMEM; + + return 0; +} + +static __net_exit void vsock_sysctl_exit_net(struct net *net) +{ + vsock_sysctl_unregister(net); +} + +static struct pernet_operations vsock_sysctl_ops = { + .init = vsock_sysctl_init_net, + .exit = vsock_sysctl_exit_net, +}; + static int __init vsock_init(void) { int err = 0; @@ -2685,10 +2977,17 @@ static int __init vsock_init(void) goto err_unregister_proto; } + if (register_pernet_subsys(&vsock_sysctl_ops)) { + err = -ENOMEM; + goto err_unregister_sock; + } + vsock_bpf_build_proto(); return 0; +err_unregister_sock: + sock_unregister(AF_VSOCK); err_unregister_proto: proto_unregister(&vsock_proto); err_deregister_misc: @@ -2702,6 +3001,7 @@ static void __exit vsock_exit(void) misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); + unregister_pernet_subsys(&vsock_sysctl_ops); } const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 432fcbbd14d4..069386a74557 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -444,7 +444,7 @@ static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) struct hvsock *hvs; struct sock *sk = sk_vsock(vsk); - hvs = kzalloc(sizeof(*hvs), GFP_KERNEL); + hvs = kzalloc_obj(*hvs); if (!hvs) return -ENOMEM; @@ -570,7 +570,7 @@ static int hvs_dgram_enqueue(struct vsock_sock *vsk, return -EOPNOTSUPP; } -static bool hvs_dgram_allow(u32 cid, u32 port) +static bool hvs_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port) { return false; } @@ -655,7 +655,7 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg, BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE); - send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL); + send_buf = kmalloc_obj(*send_buf); if (!send_buf) return -ENOMEM; @@ -745,8 +745,11 @@ static bool hvs_stream_is_active(struct vsock_sock *vsk) return hvs->chan != NULL; } -static bool hvs_stream_allow(u32 cid, u32 port) +static bool hvs_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port) { + if (!vsock_net_mode_global(vsk)) + return false; + if (cid == VMADDR_CID_HOST) return true; diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 8c867023a2e5..77fe5b7b066c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -17,6 +17,7 @@ #include <linux/virtio_ids.h> #include <linux/virtio_config.h> #include <linux/virtio_vsock.h> +#include <linux/dma-mapping.h> #include <net/sock.h> #include <linux/mutex.h> #include <net/af_vsock.h> @@ -54,13 +55,6 @@ struct virtio_vsock { int rx_buf_nr; int rx_buf_max_nr; - /* The following fields are protected by event_lock. - * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. - */ - struct mutex event_lock; - bool event_run; - struct virtio_vsock_event event_list[8]; - u32 guest_cid; bool seqpacket_allow; @@ -74,6 +68,15 @@ struct virtio_vsock { */ struct scatterlist *out_sgs[MAX_SKB_FRAGS + 1]; struct scatterlist out_bufs[MAX_SKB_FRAGS + 1]; + + /* The following fields are protected by event_lock. + * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. + */ + struct mutex event_lock; + bool event_run; + __dma_from_device_group_begin(); + struct virtio_vsock_event event_list[8]; + __dma_from_device_group_end(); }; static u32 virtio_transport_get_local_cid(void) @@ -231,7 +234,7 @@ static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struc } static int -virtio_transport_send_pkt(struct sk_buff *skb) +virtio_transport_send_pkt(struct sk_buff *skb, struct net *net) { struct virtio_vsock_hdr *hdr; struct virtio_vsock *vsock; @@ -390,7 +393,7 @@ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, sg_init_one(&sg, event, sizeof(*event)); - return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL); + return virtqueue_add_inbuf_cache_clean(vq, &sg, 1, event, GFP_KERNEL); } /* event_lock must be held */ @@ -536,7 +539,13 @@ static bool virtio_transport_msgzerocopy_allow(void) return true; } -static bool virtio_transport_seqpacket_allow(u32 remote_cid); +bool virtio_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port) +{ + return vsock_net_mode_global(vsk); +} + +static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, + u32 remote_cid); static struct virtio_transport virtio_transport = { .transport = { @@ -593,11 +602,15 @@ static struct virtio_transport virtio_transport = { .can_msgzerocopy = virtio_transport_can_msgzerocopy, }; -static bool virtio_transport_seqpacket_allow(u32 remote_cid) +static bool +virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid) { struct virtio_vsock *vsock; bool seqpacket_allow; + if (!vsock_net_mode_global(vsk)) + return false; + seqpacket_allow = false; rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); @@ -660,7 +673,11 @@ static void virtio_transport_rx_work(struct work_struct *work) virtio_vsock_skb_put(skb, payload_len); virtio_transport_deliver_tap_pkt(skb); - virtio_transport_recv_pkt(&virtio_transport, skb); + + /* Force virtio-transport into global mode since it + * does not yet support local-mode namespacing. + */ + virtio_transport_recv_pkt(&virtio_transport, skb, NULL); } } while (!virtqueue_enable_cb(vq)); @@ -788,7 +805,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev) goto out; } - vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); + vsock = kzalloc_obj(*vsock); if (!vsock) { ret = -ENOMEM; goto out; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index dcc8a1d5851e..8a9fb23c6e85 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -28,6 +28,7 @@ static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, bool cancel_timeout); +static s64 virtio_transport_has_space(struct virtio_vsock_sock *vvs); static const struct virtio_transport * virtio_transport_get_ops(struct vsock_sock *vsk) @@ -413,7 +414,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_inc_tx_pkt(vvs, skb); - ret = t_ops->send_pkt(skb); + ret = t_ops->send_pkt(skb, info->net); if (ret < 0) break; @@ -499,9 +500,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) return 0; spin_lock_bh(&vvs->tx_lock); - ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); - if (ret > credit) - ret = credit; + ret = min_t(u32, credit, virtio_transport_has_space(vvs)); vvs->tx_cnt += ret; vvs->bytes_unsent += ret; spin_unlock_bh(&vvs->tx_lock); @@ -527,6 +526,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk) struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -822,6 +822,15 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); +static u32 virtio_transport_tx_buf_size(struct virtio_vsock_sock *vvs) +{ + /* The peer advertises its receive buffer via peer_buf_alloc, but we + * cap it to our local buf_alloc so a remote peer cannot force us to + * queue more data than our own buffer configuration allows. + */ + return min(vvs->peer_buf_alloc, vvs->buf_alloc); +} + int virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, struct msghdr *msg, @@ -831,7 +840,7 @@ virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, spin_lock_bh(&vvs->tx_lock); - if (len > vvs->peer_buf_alloc) { + if (len > virtio_transport_tx_buf_size(vvs)) { spin_unlock_bh(&vvs->tx_lock); return -EMSGSIZE; } @@ -877,12 +886,16 @@ u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data); -static s64 virtio_transport_has_space(struct vsock_sock *vsk) +static s64 virtio_transport_has_space(struct virtio_vsock_sock *vvs) { - struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; - bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + /* Use s64 arithmetic so if the peer shrinks peer_buf_alloc while + * we have bytes in flight (tx_cnt - peer_fwd_cnt), the subtraction + * does not underflow. + */ + bytes = (s64)virtio_transport_tx_buf_size(vvs) - + (vvs->tx_cnt - vvs->peer_fwd_cnt); if (bytes < 0) bytes = 0; @@ -895,7 +908,7 @@ s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) s64 bytes; spin_lock_bh(&vvs->tx_lock); - bytes = virtio_transport_has_space(vsk); + bytes = virtio_transport_has_space(vvs); spin_unlock_bh(&vvs->tx_lock); return bytes; @@ -907,7 +920,7 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, { struct virtio_vsock_sock *vvs; - vvs = kzalloc(sizeof(*vvs), GFP_KERNEL); + vvs = kzalloc_obj(*vvs); if (!vvs) return -ENOMEM; @@ -1043,12 +1056,6 @@ bool virtio_transport_stream_is_active(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); -bool virtio_transport_stream_allow(u32 cid, u32 port) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); - int virtio_transport_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm *addr) { @@ -1056,7 +1063,7 @@ int virtio_transport_dgram_bind(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); -bool virtio_transport_dgram_allow(u32 cid, u32 port) +bool virtio_transport_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port) { return false; } @@ -1067,6 +1074,7 @@ int virtio_transport_connect(struct vsock_sock *vsk) struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1082,6 +1090,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) (mode & SEND_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_SEND : 0), .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1108,6 +1117,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, .msg = msg, .pkt_len = len, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1145,6 +1155,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk, .op = VIRTIO_VSOCK_OP_RST, .reply = !!skb, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; /* Send RST only if the original pkt is not a RST pkt */ @@ -1156,15 +1167,31 @@ static int virtio_transport_reset(struct vsock_sock *vsk, /* Normally packets are associated with a socket. There may be no socket if an * attempt was made to connect to a socket that does not exist. + * + * net refers to the namespace of whoever sent the invalid message. For + * loopback, this is the namespace of the socket. For vhost, this is the + * namespace of the VM (i.e., vhost_vsock). */ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, - struct sk_buff *skb) + struct sk_buff *skb, struct net *net) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, .type = le16_to_cpu(hdr->type), .reply = true, + + /* Set sk owner to socket we are replying to (may be NULL for + * non-loopback). This keeps a reference to the sock and + * sock_net(sk) until the reply skb is freed. + */ + .vsk = vsock_sk(skb->sk), + + /* net is not defined here because we pass it directly to + * t->send_pkt(), instead of relying on + * virtio_transport_send_pkt_info() to pass it. It is not needed + * by virtio_transport_alloc_skb(). + */ }; struct sk_buff *reply; @@ -1183,7 +1210,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, if (!reply) return -ENOMEM; - return t->send_pkt(reply); + return t->send_pkt(reply, net); } /* This function should be called with sk_lock held and SOCK_DONE set */ @@ -1359,9 +1386,11 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small - * payload. + * payload. Skip non-linear (e.g. zerocopy) skbs; these carry payload + * in skb_shinfo. */ - if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue)) { + if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue) && + !skb_is_nonlinear(skb)) { struct virtio_vsock_hdr *last_hdr; struct sk_buff *last_skb; @@ -1465,6 +1494,7 @@ virtio_transport_send_response(struct vsock_sock *vsk, .remote_port = le32_to_cpu(hdr->src_port), .reply = true, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1490,7 +1520,7 @@ static bool virtio_transport_space_update(struct sock *sk, spin_lock_bh(&vvs->tx_lock); vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc); vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt); - space_available = virtio_transport_has_space(vsk); + space_available = virtio_transport_has_space(vvs); spin_unlock_bh(&vvs->tx_lock); return space_available; } @@ -1507,12 +1537,12 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, int ret; if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -EINVAL; } if (sk_acceptq_is_full(sk)) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -ENOMEM; } @@ -1520,13 +1550,13 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, * Subsequent enqueues would lead to a memory leak. */ if (sk->sk_shutdown == SHUTDOWN_MASK) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -ESHUTDOWN; } child = vsock_create_connected(sk); if (!child) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -ENOMEM; } @@ -1548,7 +1578,7 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, */ if (ret || vchild->transport != &t->transport) { release_sock(child); - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); sock_put(child); return ret; } @@ -1576,7 +1606,7 @@ static bool virtio_transport_valid_type(u16 type) * lock. */ void virtio_transport_recv_pkt(struct virtio_transport *t, - struct sk_buff *skb) + struct sk_buff *skb, struct net *net) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct sockaddr_vm src, dst; @@ -1599,24 +1629,24 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, le32_to_cpu(hdr->fwd_cnt)); if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); goto free_pkt; } /* The socket must be in connected or bound table * otherwise send reset back */ - sk = vsock_find_connected_socket(&src, &dst); + sk = vsock_find_connected_socket_net(&src, &dst, net); if (!sk) { - sk = vsock_find_bound_socket(&dst); + sk = vsock_find_bound_socket_net(&dst, net); if (!sk) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); goto free_pkt; } } if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); sock_put(sk); goto free_pkt; } @@ -1635,7 +1665,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, */ if (sock_flag(sk, SOCK_DONE) || (sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); release_sock(sk); sock_put(sk); goto free_pkt; @@ -1667,7 +1697,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, kfree_skb(skb); break; default: - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); kfree_skb(skb); break; } diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 7eccd6708d66..4296ca1183f1 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -161,7 +161,7 @@ vmci_transport_packet_init(struct vmci_transport_packet *pkt, case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ: case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE: - memcpy(&pkt->u.wait, wait, sizeof(pkt->u.wait)); + pkt->u.wait = *wait; break; case VMCI_TRANSPORT_PACKET_TYPE_REQUEST2: @@ -262,7 +262,7 @@ vmci_transport_alloc_send_control_pkt(struct sockaddr_vm *src, struct vmci_transport_packet *pkt; int err; - pkt = kmalloc(sizeof(*pkt), GFP_KERNEL); + pkt = kmalloc_obj(*pkt); if (!pkt) return -ENOMEM; @@ -646,13 +646,17 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg) return VMCI_SUCCESS; } -static bool vmci_transport_stream_allow(u32 cid, u32 port) +static bool vmci_transport_stream_allow(struct vsock_sock *vsk, u32 cid, + u32 port) { static const u32 non_socket_contexts[] = { VMADDR_CID_LOCAL, }; int i; + if (!vsock_net_mode_global(vsk)) + return false; + BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts)); for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) { @@ -682,12 +686,10 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) err = VMCI_SUCCESS; bh_process_pkt = false; - /* Ignore incoming packets from contexts without sockets, or resources - * that aren't vsock implementations. + /* Ignore incoming packets from resources that aren't vsock + * implementations. */ - - if (!vmci_transport_stream_allow(dg->src.context, -1) - || vmci_transport_peer_rid(dg->src.context) != dg->src.resource) + if (vmci_transport_peer_rid(dg->src.context) != dg->src.resource) return VMCI_ERROR_NO_ACCESS; if (VMCI_DG_SIZE(dg) < sizeof(*pkt)) @@ -749,6 +751,12 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) goto out; } + /* Ignore incoming packets from contexts without sockets. */ + if (!vmci_transport_stream_allow(vsk, dg->src.context, -1)) { + err = VMCI_ERROR_NO_ACCESS; + goto out; + } + /* We do most everything in a work queue, but let's fast path the * notification of reads and writes to help data transfer performance. * We can only do this if there is no process context code executing @@ -771,7 +779,7 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) if (!bh_process_pkt) { struct vmci_transport_recv_pkt_info *recv_pkt_info; - recv_pkt_info = kmalloc(sizeof(*recv_pkt_info), GFP_ATOMIC); + recv_pkt_info = kmalloc_obj(*recv_pkt_info, GFP_ATOMIC); if (!recv_pkt_info) { if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0) pr_err("unable to send reset\n"); @@ -1575,7 +1583,7 @@ static int vmci_transport_recv_connected(struct sock *sk, static int vmci_transport_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk) { - vsk->trans = kmalloc(sizeof(struct vmci_transport), GFP_KERNEL); + vsk->trans = kmalloc_obj(struct vmci_transport); if (!vsk->trans) return -ENOMEM; @@ -1784,8 +1792,12 @@ out: return err; } -static bool vmci_transport_dgram_allow(u32 cid, u32 port) +static bool vmci_transport_dgram_allow(struct vsock_sock *vsk, u32 cid, + u32 port) { + if (!vsock_net_mode_global(vsk)) + return false; + if (cid == VMADDR_CID_HYPERVISOR) { /* Registrations of PBRPC Servers do not modify VMX/Hypervisor * state and are allowed. diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c index 223b9660a759..a986aa6fff9b 100644 --- a/net/vmw_vsock/vsock_addr.c +++ b/net/vmw_vsock/vsock_addr.c @@ -57,7 +57,7 @@ bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, } EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); -int vsock_addr_cast(const struct sockaddr *addr, +int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr) { if (len < sizeof(**out_addr)) diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index bc2ff918b315..8068d1b6e851 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -26,7 +26,7 @@ static u32 vsock_loopback_get_local_cid(void) return VMADDR_CID_LOCAL; } -static int vsock_loopback_send_pkt(struct sk_buff *skb) +static int vsock_loopback_send_pkt(struct sk_buff *skb, struct net *net) { struct vsock_loopback *vsock = &the_vsock_loopback; int len = skb->len; @@ -46,7 +46,15 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) return 0; } -static bool vsock_loopback_seqpacket_allow(u32 remote_cid); +static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, + u32 remote_cid); + +static bool vsock_loopback_stream_allow(struct vsock_sock *vsk, u32 cid, + u32 port) +{ + return true; +} + static bool vsock_loopback_msgzerocopy_allow(void) { return true; @@ -76,7 +84,7 @@ static struct virtio_transport loopback_transport = { .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, - .stream_allow = virtio_transport_stream_allow, + .stream_allow = vsock_loopback_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, @@ -106,9 +114,10 @@ static struct virtio_transport loopback_transport = { .send_pkt = vsock_loopback_send_pkt, }; -static bool vsock_loopback_seqpacket_allow(u32 remote_cid) +static bool +vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid) { - return true; + return vsock_net_mode_global(vsk); } static void vsock_loopback_work(struct work_struct *work) @@ -130,7 +139,8 @@ static void vsock_loopback_work(struct work_struct *work) */ virtio_transport_consume_skb_sent(skb, false); virtio_transport_deliver_tap_pkt(skb); - virtio_transport_recv_pkt(&loopback_transport, skb); + virtio_transport_recv_pkt(&loopback_transport, skb, + sock_net(skb->sk)); } } diff --git a/net/wireless/core.c b/net/wireless/core.c index 54a34d8d356e..28ca4290ca99 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -34,6 +34,9 @@ /* name for sysfs, %d is appended */ #define PHY_NAME "phy" +/* maximum length of radio debugfs directory name */ +#define RADIO_DEBUGFSDIR_MAX_LEN 8 + MODULE_AUTHOR("Johannes Berg"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("wireless configuration support"); @@ -262,6 +265,8 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, rdev_stop_nan(rdev, wdev); wdev->is_running = false; + eth_zero_addr(wdev->u.nan.cluster_id); + rdev->opencount--; } @@ -344,7 +349,7 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) guard(wiphy)(&rdev->wiphy); - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); cfg80211_remove_virtual_intf(rdev, wdev); } } @@ -428,7 +433,7 @@ static void cfg80211_wiphy_work(struct work_struct *work) if (wk) { list_del_init(&wk->entry); if (!list_empty(&rdev->wiphy_work_list)) - queue_work(system_unbound_wq, work); + queue_work(system_dfl_wq, work); spin_unlock_irq(&rdev->wiphy_work_lock); trace_wiphy_work_run(&rdev->wiphy, wk); @@ -658,12 +663,8 @@ int wiphy_verify_iface_combinations(struct wiphy *wiphy, c->limits[j].max > 1)) return -EINVAL; - /* Only a single NAN can be allowed, avoid this - * check for multi-radio global combination, since it - * hold the capabilities of all radio combinations. - */ - if (!combined_radio && - WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && + /* Only a single NAN can be allowed */ + if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && c->limits[j].max > 1)) return -EINVAL; @@ -999,9 +1000,8 @@ int wiphy_register(struct wiphy *wiphy) if (wiphy->n_radio > 0) { int idx; - wiphy->radio_cfg = kcalloc(wiphy->n_radio, - sizeof(*wiphy->radio_cfg), - GFP_KERNEL); + wiphy->radio_cfg = kzalloc_objs(*wiphy->radio_cfg, + wiphy->n_radio); if (!wiphy->radio_cfg) return -ENOMEM; /* @@ -1042,6 +1042,18 @@ int wiphy_register(struct wiphy *wiphy) /* add to debugfs */ rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy), ieee80211_debugfs_dir); + if (wiphy->n_radio > 0) { + int idx; + char radio_name[RADIO_DEBUGFSDIR_MAX_LEN]; + + for (idx = 0; idx < wiphy->n_radio; idx++) { + scnprintf(radio_name, sizeof(radio_name), "radio%d", + idx); + wiphy->radio_cfg[idx].radio_debugfsdir = + debugfs_create_dir(radio_name, + rdev->wiphy.debugfsdir); + } + } cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); @@ -1051,12 +1063,12 @@ int wiphy_register(struct wiphy *wiphy) wiphy_regulatory_register(wiphy); if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { - struct regulatory_request request; - - request.wiphy_idx = get_wiphy_idx(wiphy); - request.initiator = NL80211_REGDOM_SET_BY_DRIVER; - request.alpha2[0] = '9'; - request.alpha2[1] = '9'; + struct regulatory_request request = { + .wiphy_idx = get_wiphy_idx(wiphy), + .initiator = NL80211_REGDOM_SET_BY_DRIVER, + .alpha2[0] = '9', + .alpha2[1] = '9', + }; nl80211_send_reg_change_event(&request); } @@ -1199,6 +1211,7 @@ void wiphy_unregister(struct wiphy *wiphy) /* this has nothing to do now but make sure it's gone */ cancel_work_sync(&rdev->wiphy_work); + cancel_work_sync(&rdev->rfkill_block); cancel_work_sync(&rdev->conn_work); flush_work(&rdev->event_work); cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); @@ -1356,7 +1369,8 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, } void cfg80211_leave(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) + struct wireless_dev *wdev, + int link_id) { struct net_device *dev = wdev->netdev; struct cfg80211_sched_scan_request *pos, *tmp; @@ -1365,6 +1379,7 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, cfg80211_pmsr_wdev_down(wdev); + cfg80211_stop_radar_detection(wdev); cfg80211_stop_background_radar_detection(wdev); switch (wdev->iftype) { @@ -1393,14 +1408,16 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, dev, -1, true); + cfg80211_stop_ap(rdev, dev, link_id, true); break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; case NL80211_IFTYPE_P2P_DEVICE: + cfg80211_stop_p2p_device(rdev, wdev); + break; case NL80211_IFTYPE_NAN: - /* cannot happen, has no netdev */ + cfg80211_stop_nan(rdev, wdev); break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: @@ -1414,27 +1431,34 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, } } -void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, - gfp_t gfp) +void cfg80211_stop_link(struct wiphy *wiphy, struct wireless_dev *wdev, + int link_id, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_event *ev; unsigned long flags; - trace_cfg80211_stop_iface(wiphy, wdev); + /* Only AP/GO interfaces may have a specific link_id */ + if (WARN_ON_ONCE(link_id != -1 && + wdev->iftype != NL80211_IFTYPE_AP && + wdev->iftype != NL80211_IFTYPE_P2P_GO)) + link_id = -1; + + trace_cfg80211_stop_link(wiphy, wdev, link_id); - ev = kzalloc(sizeof(*ev), gfp); + ev = kzalloc_obj(*ev, gfp); if (!ev) return; ev->type = EVENT_STOPPED; + ev->link_id = link_id; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } -EXPORT_SYMBOL(cfg80211_stop_iface); +EXPORT_SYMBOL(cfg80211_stop_link); void cfg80211_init_wdev(struct wireless_dev *wdev) { @@ -1573,7 +1597,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, break; case NETDEV_GOING_DOWN: scoped_guard(wiphy, &rdev->wiphy) { - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); cfg80211_remove_links(wdev); } /* since we just did cfg80211_leave() nothing to do there */ @@ -1698,7 +1722,7 @@ void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work) list_add_tail(&work->entry, &rdev->wiphy_work_list); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); - queue_work(system_unbound_wq, &rdev->wiphy_work); + queue_work(system_dfl_wq, &rdev->wiphy_work); } EXPORT_SYMBOL_GPL(wiphy_work_queue); diff --git a/net/wireless/core.h b/net/wireless/core.h index b6bd7f4d6385..6ac57b7b2615 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -289,6 +289,7 @@ struct cfg80211_event { u8 td_bitmap_len; } pa; }; + int link_id; }; struct cfg80211_cached_keys { @@ -489,6 +490,7 @@ cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rde struct wireless_dev *wdev, struct cfg80211_chan_def *chandef); +void cfg80211_stop_radar_detection(struct wireless_dev *wdev); void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev); void cfg80211_background_cac_done_wk(struct work_struct *work); @@ -536,7 +538,8 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num); void cfg80211_leave(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); + struct wireless_dev *wdev, + int link_id); void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); @@ -550,7 +553,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, bool signal_valid, unsigned long ts); enum ieee80211_ap_reg_power -cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len); +cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len, + u32 client_flags); #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 40e49074e2ee..f9e7fff1ef25 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -29,6 +29,24 @@ static const struct file_operations name## _ops = { \ .llseek = generic_file_llseek, \ } +#define DEBUGFS_RADIO_READONLY_FILE(name, buflen, fmt, value...) \ +static ssize_t name## _read(struct file *file, char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct wiphy_radio_cfg *radio_cfg = file->private_data; \ + char buf[buflen]; \ + int res; \ + \ + res = scnprintf(buf, buflen, fmt "\n", ##value); \ + return simple_read_from_buffer(userbuf, count, ppos, buf, res); \ +} \ + \ +static const struct file_operations name## _ops = { \ + .read = name## _read, \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", wiphy->rts_threshold); DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", @@ -38,6 +56,9 @@ DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", wiphy->retry_long); +DEBUGFS_RADIO_READONLY_FILE(radio_rts_threshold, 20, "%d", + radio_cfg->rts_threshold); + static int ht_print_chan(struct ieee80211_channel *chan, char *buf, int buf_size, int offset) { @@ -100,15 +121,27 @@ static const struct file_operations ht40allow_map_ops = { #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops) +#define DEBUGFS_RADIO_ADD(name, radio_idx) \ + debugfs_create_file(#name, 0444, radiod, \ + &rdev->wiphy.radio_cfg[radio_idx], \ + &name## _ops) + void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) { struct dentry *phyd = rdev->wiphy.debugfsdir; + struct dentry *radiod; + u8 i; DEBUGFS_ADD(rts_threshold); DEBUGFS_ADD(fragmentation_threshold); DEBUGFS_ADD(short_retry_limit); DEBUGFS_ADD(long_retry_limit); DEBUGFS_ADD(ht40allow_map); + + for (i = 0; i < rdev->wiphy.n_radio; i++) { + radiod = rdev->wiphy.radio_cfg[i].radio_debugfsdir; + DEBUGFS_RADIO_ADD(radio_rts_threshold, i); + } } struct debugfs_read_work { diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 1e3ed29f7cfc..a7024af39b40 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -69,7 +69,7 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, if (WARN_ON(!channel)) return; - ev = kzalloc(sizeof(*ev), gfp); + ev = kzalloc_obj(*ev, gfp); if (!ev) return; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 46394eb2086f..3fc175f9f868 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -1295,6 +1295,25 @@ cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rde return 0; } +void cfg80211_stop_radar_detection(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + int link_id; + + for_each_valid_link(wdev, link_id) { + struct cfg80211_chan_def chandef; + + if (!wdev->links[link_id].cac_started) + continue; + + chandef = *wdev_chandef(wdev, link_id); + rdev_end_cac(rdev, wdev->netdev, link_id); + nl80211_radar_notify(rdev, &chandef, NL80211_RADAR_CAC_ABORTED, + wdev->netdev, GFP_KERNEL); + } +} + void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev) { struct wiphy *wiphy = wdev->wiphy; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 03d07b54359a..b94231c8441c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/if.h> @@ -332,6 +332,15 @@ static int validate_nan_cluster_id(const struct nlattr *attr, return 0; } +static int validate_uhr_capa(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + + return ieee80211_uhr_capa_size_ok(data, len, false); +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -361,6 +370,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR] = { .type = NLA_U8 }, + [NL80211_PMSR_FTM_REQ_ATTR_RSTA] = { .type = NLA_FLAG }, }; static const struct nla_policy @@ -932,6 +942,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, + [NL80211_ATTR_EPP_PEER] = { .type = NLA_FLAG }, + [NL80211_ATTR_UHR_CAPABILITY] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_uhr_capa, 255), + [NL80211_ATTR_DISABLE_UHR] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1092,8 +1106,7 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, struct nlattr **attrbuf_free = NULL; if (!attrbuf) { - attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), - GFP_KERNEL); + attrbuf = kzalloc_objs(*attrbuf, NUM_NL80211_ATTR); if (!attrbuf) return -ENOMEM; attrbuf_free = attrbuf; @@ -1314,6 +1327,12 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_UHR) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHR)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -1595,7 +1614,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, if (!have_key) return NULL; - result = kzalloc(sizeof(*result), GFP_KERNEL); + result = kzalloc_obj(*result); if (!result) return ERR_PTR(-ENOMEM); @@ -1675,7 +1694,9 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) return -ENOLINK; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (wdev->connected) + if (wdev->connected || + (wiphy_ext_feature_isset(wdev->wiphy, + NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION))) return 0; return -ENOLINK; case NL80211_IFTYPE_NAN: @@ -1947,6 +1968,7 @@ nl80211_send_iftype_data(struct sk_buff *msg, { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; const struct ieee80211_sta_eht_cap *eht_cap = &iftdata->eht_cap; + const struct ieee80211_sta_uhr_cap *uhr_cap = &iftdata->uhr_cap; if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, iftdata->types_mask)) @@ -1998,6 +2020,14 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (uhr_cap->has_uhr) { + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC, + sizeof(uhr_cap->mac), &uhr_cap->mac) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY, + sizeof(uhr_cap->phy), &uhr_cap->phy)) + return -ENOBUFS; + } + if (sband->band == NL80211_BAND_6GHZ && nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, sizeof(iftdata->he_6ghz_capa), @@ -2307,6 +2337,32 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap, if (cap->ftm.non_trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED)) return -ENOBUFS; + if (cap->ftm.support_6ghz && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP, + cap->ftm.max_tx_ltf_rep)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP, + cap->ftm.max_rx_ltf_rep)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS, + cap->ftm.max_tx_sts)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS, + cap->ftm.max_rx_sts)) + return -ENOBUFS; + if (cap->ftm.max_total_ltf_tx > 0 && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX, + cap->ftm.max_total_ltf_tx)) + return -ENOBUFS; + if (cap->ftm.max_total_ltf_rx > 0 && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX, + cap->ftm.max_total_ltf_rx)) + return -ENOBUFS; + if (cap->ftm.support_rsta && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT)) + return -ENOBUFS; nla_nest_end(msg, ftm); return 0; @@ -3310,7 +3366,7 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl80211_dump_wiphy_state *state) { - struct nlattr **tb = kcalloc(NUM_NL80211_ATTR, sizeof(*tb), GFP_KERNEL); + struct nlattr **tb = kzalloc_objs(*tb, NUM_NL80211_ATTR); int ret; if (!tb) @@ -3362,7 +3418,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); if (!state) { - state = kzalloc(sizeof(*state), GFP_KERNEL); + state = kzalloc_obj(*state); if (!state) { rtnl_unlock(); return -ENOMEM; @@ -3544,6 +3600,9 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (cfg80211_chandef_is_s1g(chandef)) + chandef->width = NL80211_CHAN_WIDTH_1; + if (attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { enum nl80211_channel_type chantype; @@ -4175,6 +4234,9 @@ int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *ch if (chandef->punctured && nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, chandef->punctured)) return -ENOBUFS; + if (chandef->s1g_primary_2mhz && + nla_put_flag(msg, NL80211_ATTR_S1G_PRIMARY_2MHZ)) + return -ENOBUFS; return 0; } @@ -5270,7 +5332,7 @@ static struct cfg80211_acl_data *parse_acl_data(struct wiphy *wiphy, if (n_entries > wiphy->max_acl_mac_addrs) return ERR_PTR(-EOPNOTSUPP); - acl = kzalloc(struct_size(acl, mac_addrs, n_entries), GFP_KERNEL); + acl = kzalloc_flex(*acl, mac_addrs, n_entries); if (!acl) return ERR_PTR(-ENOMEM); acl->n_acl_entries = n_entries; @@ -6050,7 +6112,7 @@ nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs) num_elems++; } - elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); + elems = kzalloc_flex(*elems, elem, num_elems); if (!elems) return ERR_PTR(-ENOMEM); elems->cnt = num_elems; @@ -6082,7 +6144,7 @@ nl80211_parse_rnr_elems(struct wiphy *wiphy, struct nlattr *attrs, num_elems++; } - elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); + elems = kzalloc_flex(*elems, elem, num_elems); if (!elems) return ERR_PTR(-ENOMEM); elems->cnt = num_elems; @@ -6423,6 +6485,17 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) cap->datalen - 1)) return -EINVAL; } + + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_UHR_OPER, ies, ies_len); + if (cap) { + if (!cap->datalen) + return -EINVAL; + params->uhr_oper = (void *)(cap->data + 1); + if (!ieee80211_uhr_oper_size_ok((const u8 *)params->uhr_oper, + cap->datalen - 1, true)) + return -EINVAL; + } + return 0; } @@ -6464,6 +6537,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || auth_type == NL80211_AUTHTYPE_FILS_PK)) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE) + return false; return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && @@ -6481,6 +6558,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && auth_type == NL80211_AUTHTYPE_FILS_SK) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE) + return false; return true; case NL80211_CMD_START_AP: if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -6546,6 +6627,9 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params (channel->flags & IEEE80211_CHAN_NO_EHT)) return -EOPNOTSUPP; + if (params->uhr_oper && (channel->flags & IEEE80211_CHAN_NO_UHR)) + return -EOPNOTSUPP; + return 0; } @@ -6617,7 +6701,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]) != NL80211_SMPS_OFF) return -EOPNOTSUPP; - params = kzalloc(sizeof(*params), GFP_KERNEL); + params = kzalloc_obj(*params); if (!params) return -ENOMEM; @@ -6745,7 +6829,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) beacon_check.relax = true; beacon_check.reg_power = cfg80211_get_6ghz_power_type(params->beacon.tail, - params->beacon.tail_len); + params->beacon.tail_len, 0); if (!cfg80211_reg_check_beaconing(&rdev->wiphy, ¶ms->chandef, &beacon_check)) { err = -EINVAL; @@ -6910,7 +6994,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) if (!wdev->links[link_id].ap.beacon_interval) return -EINVAL; - params = kzalloc(sizeof(*params), GFP_KERNEL); + params = kzalloc_obj(*params); if (!params) return -ENOMEM; @@ -6924,7 +7008,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) beacon_check.relax = true; beacon_check.reg_power = cfg80211_get_6ghz_power_type(params->beacon.tail, - params->beacon.tail_len); + params->beacon.tail_len, 0); if (!cfg80211_reg_check_beaconing(&rdev->wiphy, &wdev->links[link_id].ap.chandef, &beacon_check)) { @@ -7128,7 +7212,8 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) break; case RATE_INFO_BW_EHT_RU: rate_flg = 0; - WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS)); + WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS) && + !(info->flags & RATE_INFO_FLAGS_UHR_MCS)); break; } @@ -7181,6 +7266,23 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, info->eht_ru_alloc)) return false; + } else if (info->flags & RATE_INFO_FLAGS_UHR_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_UHR_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi)) + return false; + if (info->bw == RATE_INFO_BW_EHT_RU && + nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, + info->eht_ru_alloc)) + return false; + if (info->flags & RATE_INFO_FLAGS_UHR_ELR_MCS && + nla_put_flag(msg, NL80211_RATE_INFO_UHR_ELR)) + return false; + if (info->flags & RATE_INFO_FLAGS_UHR_IM && + nla_put_flag(msg, NL80211_RATE_INFO_UHR_IM)) + return false; } nla_nest_end(msg, rate); @@ -7882,7 +7984,7 @@ static int nl80211_dump_station(struct sk_buff *skb, for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { sinfo.links[i] = - kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL); + kzalloc_obj(*sinfo.links[0]); if (!sinfo.links[i]) { err = -ENOMEM; goto out_err; @@ -7946,7 +8048,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { - sinfo.links[i] = kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL); + sinfo.links[i] = kzalloc_obj(*sinfo.links[0]); if (!sinfo.links[i]) { cfg80211_sinfo_release_content(&sinfo); return -ENOMEM; @@ -8054,7 +8156,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, if (params->ext_capab || params->link_sta_params.ht_capa || params->link_sta_params.vht_capa || params->link_sta_params.he_capa || - params->link_sta_params.eht_capa) + params->link_sta_params.eht_capa || + params->link_sta_params.uhr_capa) return -EINVAL; if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) return -EINVAL; @@ -8274,6 +8377,16 @@ static int nl80211_set_station_tdls(struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params->link_sta_params.eht_capa) + return -EINVAL; + + params->link_sta_params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params->link_sta_params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) params->link_sta_params.s1g_capa = nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]); @@ -8594,6 +8707,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params.link_sta_params.eht_capa) + return -EINVAL; + + params.link_sta_params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params.link_sta_params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) { params.eml_cap_present = true; params.eml_cap = @@ -8653,10 +8776,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.link_sta_params.ht_capa = NULL; params.link_sta_params.vht_capa = NULL; - /* HE and EHT require WME */ + /* HE, EHT and UHR require WME */ if (params.link_sta_params.he_capa_len || params.link_sta_params.he_6ghz_capa || - params.link_sta_params.eht_capa_len) + params.link_sta_params.eht_capa_len || + params.link_sta_params.uhr_capa_len) return -EINVAL; } @@ -8773,6 +8897,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) goto out; } } + + params.epp_peer = + nla_get_flag(info->attrs[NL80211_ATTR_EPP_PEER]); + err = rdev_add_station(rdev, dev, mac_addr, ¶ms); out: dev_put(params.vlan); @@ -10028,7 +10156,7 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) goto out; } - rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); + rd = kzalloc_flex(*rd, reg_rules, num_rules); if (!rd) { r = -ENOMEM; goto out; @@ -11391,8 +11519,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) goto free; - csa_attrs = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*csa_attrs), - GFP_KERNEL); + csa_attrs = kzalloc_objs(*csa_attrs, NL80211_ATTR_MAX + 1); if (!csa_attrs) { err = -ENOMEM; goto free; @@ -11652,7 +11779,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) bool dump_include_use_data; int err; - attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL); + attrbuf = kzalloc_objs(*attrbuf, NUM_NL80211_ATTR); if (!attrbuf) return -ENOMEM; @@ -11792,7 +11919,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) int res; bool radio_stats; - attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL); + attrbuf = kzalloc_objs(*attrbuf, NUM_NL80211_ATTR); if (!attrbuf) return -ENOMEM; @@ -11947,7 +12074,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if ((auth_type == NL80211_AUTHTYPE_SAE || auth_type == NL80211_AUTHTYPE_FILS_SK || auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || - auth_type == NL80211_AUTHTYPE_FILS_PK) && + auth_type == NL80211_AUTHTYPE_FILS_PK || + auth_type == NL80211_AUTHTYPE_EPPKE) && !info->attrs[NL80211_ATTR_AUTH_DATA]) return -EINVAL; @@ -11955,7 +12083,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (auth_type != NL80211_AUTHTYPE_SAE && auth_type != NL80211_AUTHTYPE_FILS_SK && auth_type != NL80211_AUTHTYPE_FILS_SK_PFS && - auth_type != NL80211_AUTHTYPE_FILS_PK) + auth_type != NL80211_AUTHTYPE_FILS_PK && + auth_type != NL80211_AUTHTYPE_EPPKE) return -EINVAL; req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); @@ -12235,9 +12364,6 @@ static int nl80211_process_links(struct cfg80211_registered_device *rdev, return -EINVAL; } } - - links[link_id].disabled = - nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]); } return 0; @@ -12326,6 +12452,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) req.flags |= ASSOC_REQ_DISABLE_EHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR])) + req.flags |= ASSOC_REQ_DISABLE_UHR; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&req.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -12417,13 +12546,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } - if (req.links[req.link_id].disabled) { - GENL_SET_ERR_MSG(info, - "cannot have assoc link disabled"); - err = -EINVAL; - goto free; - } - if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) req.ext_mld_capa_ops = nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]); @@ -12988,8 +13110,7 @@ static int nl80211_testmode_dump(struct sk_buff *skb, goto out_err; } } else { - attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), - GFP_KERNEL); + attrbuf = kzalloc_objs(*attrbuf, NUM_NL80211_ATTR); if (!attrbuf) { err = -ENOMEM; goto out_err; @@ -13205,6 +13326,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) connect.flags |= ASSOC_REQ_DISABLE_EHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR])) + connect.flags |= ASSOC_REQ_DISABLE_UHR; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&connect.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -14183,9 +14307,8 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, } if (n_thresholds) { - cqm_config = kzalloc(struct_size(cqm_config, rssi_thresholds, - n_thresholds), - GFP_KERNEL); + cqm_config = kzalloc_flex(*cqm_config, rssi_thresholds, + n_thresholds); if (!cqm_config) return -ENOMEM; @@ -14811,7 +14934,7 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev, struct nlattr **tb; int err; - tb = kcalloc(NUM_NL80211_ATTR, sizeof(*tb), GFP_KERNEL); + tb = kzalloc_objs(*tb, NUM_NL80211_ATTR); if (!tb) return -ENOMEM; @@ -14927,9 +15050,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (n_patterns > wowlan->n_patterns) return -EINVAL; - new_triggers.patterns = kcalloc(n_patterns, - sizeof(new_triggers.patterns[0]), - GFP_KERNEL); + new_triggers.patterns = kzalloc_objs(new_triggers.patterns[0], + n_patterns); if (!new_triggers.patterns) return -ENOMEM; @@ -15176,8 +15298,7 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, if (n_patterns > coalesce->n_patterns) return -EINVAL; - new_rule->patterns = kcalloc(n_patterns, sizeof(new_rule->patterns[0]), - GFP_KERNEL); + new_rule->patterns = kzalloc_objs(new_rule->patterns[0], n_patterns); if (!new_rule->patterns) return -ENOMEM; @@ -15255,8 +15376,7 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) if (n_rules > coalesce->n_rules) return -EINVAL; - new_coalesce = kzalloc(struct_size(new_coalesce, rules, n_rules), - GFP_KERNEL); + new_coalesce = kzalloc_flex(*new_coalesce, rules, n_rules); if (!new_coalesce) return -ENOMEM; @@ -15416,7 +15536,7 @@ static int nl80211_register_beacons(struct sk_buff *skb, struct genl_info *info) if (!(rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS)) return -EOPNOTSUPP; - nreg = kzalloc(sizeof(*nreg), GFP_KERNEL); + nreg = kzalloc_obj(*nreg); if (!nreg) return -ENOMEM; @@ -15557,7 +15677,8 @@ static int nl80211_parse_nan_band_config(struct wiphy *wiphy, static int nl80211_parse_nan_conf(struct wiphy *wiphy, struct genl_info *info, struct cfg80211_nan_conf *conf, - u32 *changed_flags) + u32 *changed_flags, + bool start) { struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; int err, rem; @@ -15604,7 +15725,7 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, return err; changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; - if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + if (attrs[NL80211_NAN_CONF_CLUSTER_ID] && start) conf->cluster_id = nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); @@ -15715,7 +15836,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL, true); if (err) return err; @@ -15772,7 +15893,7 @@ static int handle_nan_filter(struct nlattr *attr_filter, BUILD_BUG_ON(sizeof(*func->rx_filters) != sizeof(*func->tx_filters)); - filter = kcalloc(n_entries, sizeof(*func->rx_filters), GFP_KERNEL); + filter = kzalloc_objs(*func->rx_filters, n_entries); if (!filter) return -ENOMEM; @@ -15832,7 +15953,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb, if (err) return err; - func = kzalloc(sizeof(*func), GFP_KERNEL); + func = kzalloc_obj(*func); if (!func) return -ENOMEM; @@ -15971,8 +16092,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb, func->srf_num_macs = n_entries; func->srf_macs = - kcalloc(n_entries, sizeof(*func->srf_macs), - GFP_KERNEL); + kzalloc_objs(*func->srf_macs, n_entries); if (!func->srf_macs) { err = -ENOMEM; goto out; @@ -16081,7 +16201,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed, false); if (err) return err; @@ -16100,6 +16220,9 @@ void cfg80211_nan_match(struct wireless_dev *wdev, struct sk_buff *msg; void *hdr; + if (WARN_ON(wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE)) + return; + if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr)) return; @@ -16182,6 +16305,9 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev, struct nlattr *func_attr; void *hdr; + if (WARN_ON(wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE)) + return; + if (WARN_ON(!inst_id)) return; @@ -16468,7 +16594,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, return 0; } - attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL); + attrbuf = kzalloc_objs(*attrbuf, NUM_NL80211_ATTR); if (!attrbuf) return -ENOMEM; @@ -16699,7 +16825,7 @@ static int nl80211_set_qos_map(struct sk_buff *skb, if (len % 2) return -EINVAL; - qos_map = kzalloc(sizeof(struct cfg80211_qos_map), GFP_KERNEL); + qos_map = kzalloc_obj(struct cfg80211_qos_map); if (!qos_map) return -ENOMEM; @@ -17333,8 +17459,7 @@ static int nl80211_set_tid_config(struct sk_buff *skb, rem_conf) num_conf++; - tid_config = kzalloc(struct_size(tid_config, tid_conf, num_conf), - GFP_KERNEL); + tid_config = kzalloc_flex(*tid_config, tid_conf, num_conf); if (!tid_config) return -ENOMEM; @@ -17400,7 +17525,7 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) if (err) return err; - tb = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*tb), GFP_KERNEL); + tb = kzalloc_objs(*tb, NL80211_ATTR_MAX + 1); if (!tb) return -ENOMEM; @@ -17630,6 +17755,16 @@ nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params.eht_capa) + return -EINVAL; + + params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); @@ -18118,7 +18253,7 @@ static int nl80211_set_sar_specs(struct sk_buff *skb, struct genl_info *info) if (specs > rdev->wiphy.sar_capa->num_freq_ranges) return -EINVAL; - sar_spec = kzalloc(struct_size(sar_spec, sub_specs, specs), GFP_KERNEL); + sar_spec = kzalloc_flex(*sar_spec, sub_specs, specs); if (!sar_spec) return -ENOMEM; diff --git a/net/wireless/of.c b/net/wireless/of.c index de221f0edca5..60a864465331 100644 --- a/net/wireless/of.c +++ b/net/wireless/of.c @@ -98,7 +98,7 @@ void wiphy_read_of_freq_limits(struct wiphy *wiphy) } n_freq_limits = len / sizeof(u32) / 2; - freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL); + freq_limits = kzalloc_objs(*freq_limits, n_freq_limits); if (!freq_limits) { err = -ENOMEM; goto out_kfree; diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index a117f5093ca2..50e8e19aa366 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -85,11 +85,6 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } - out->ftm.burst_duration = 15; - if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) - out->ftm.burst_duration = - nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); - out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = @@ -164,6 +159,12 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) + out->ftm.burst_duration = + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); + else if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) + out->ftm.burst_duration = 15; + out->ftm.lmr_feedback = !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && @@ -186,6 +187,21 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]); } + out->ftm.rsta = !!tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA]; + if (out->ftm.rsta && !capa->ftm.support_rsta) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA], + "FTM: RSTA not supported by device"); + return -EOPNOTSUPP; + } + + if (out->ftm.rsta && !out->ftm.lmr_feedback) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA], + "FTM: RSTA set without LMR feedback"); + return -EINVAL; + } + return 0; } @@ -296,7 +312,7 @@ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) } } - req = kzalloc(struct_size(req, peers, count), GFP_KERNEL); + req = kzalloc_flex(*req, peers, count); if (!req) return -ENOMEM; req->n_peers = count; @@ -453,6 +469,7 @@ static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg, PUT(u8, NUM_BURSTS_EXP, num_bursts_exp); PUT(u8, BURST_DURATION, burst_duration); PUT(u8, FTMS_PER_BURST, ftms_per_burst); + PUT(u16, BURST_PERIOD, burst_period); PUTOPT(s32, RSSI_AVG, rssi_avg); PUTOPT(s32, RSSI_SPREAD, rssi_spread); if (res->ftm.tx_rate_valid && @@ -647,6 +664,7 @@ void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) } spin_unlock_bh(&wdev->pmsr_lock); + cancel_work_sync(&wdev->pmsr_free_wk); if (found) cfg80211_pmsr_process_abort(wdev); diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 326faea38ca3..c85eaa583a46 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -239,14 +239,14 @@ int ieee80211_radiotap_iterator_next( default: if (!iterator->current_namespace || iterator->_arg_index >= iterator->current_namespace->n_bits) { - if (iterator->current_namespace == &radiotap_ns) - return -ENOENT; align = 0; } else { align = iterator->current_namespace->align_size[iterator->_arg_index].align; size = iterator->current_namespace->align_size[iterator->_arg_index].size; } if (!align) { + if (iterator->current_namespace == &radiotap_ns) + return -ENOENT; /* skip all subsequent data */ iterator->_arg = iterator->_next_ns_data; /* give up on this namespace */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 73cab51f6379..1c5c38d18feb 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2025 Intel Corporation + * Copyright (C) 2018 - 2026 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -452,8 +452,7 @@ reg_copy_regd(const struct ieee80211_regdomain *src_regd) struct ieee80211_regdomain *regd; unsigned int i; - regd = kzalloc(struct_size(regd, reg_rules, src_regd->n_reg_rules), - GFP_KERNEL); + regd = kzalloc_flex(*regd, reg_rules, src_regd->n_reg_rules); if (!regd) return ERR_PTR(-ENOMEM); @@ -510,7 +509,7 @@ static int reg_schedule_apply(const struct ieee80211_regdomain *regdom) { struct reg_regdb_apply_request *request; - request = kzalloc(sizeof(struct reg_regdb_apply_request), GFP_KERNEL); + request = kzalloc_obj(struct reg_regdb_apply_request); if (!request) { kfree(regdom); return -ENOMEM; @@ -933,8 +932,7 @@ static int regdb_query_country(const struct fwdb_header *db, struct ieee80211_regdomain *regdom; unsigned int i; - regdom = kzalloc(struct_size(regdom, reg_rules, coll->n_rules), - GFP_KERNEL); + regdom = kzalloc_flex(*regdom, reg_rules, coll->n_rules); if (!regdom) return -ENOMEM; @@ -1100,7 +1098,7 @@ int reg_reload_regdb(void) /* reset regulatory domain */ current_regdomain = get_cfg80211_regdom(); - request = kzalloc(sizeof(*request), GFP_KERNEL); + request = kzalloc_obj(*request); if (!request) { err = -ENOMEM; goto out_unlock; @@ -1532,7 +1530,7 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, if (!num_rules) return NULL; - rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); + rd = kzalloc_flex(*rd, reg_rules, num_rules); if (!rd) return NULL; @@ -1605,6 +1603,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP; if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY) channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY; + if (rd_flags & NL80211_RRF_NO_UHR) + channel_flags |= IEEE80211_CHAN_NO_UHR; return channel_flags; } @@ -2332,8 +2332,17 @@ static void reg_process_ht_flags(struct wiphy *wiphy) if (!wiphy) return; - for (band = 0; band < NUM_NL80211_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) { + /* + * Don't apply HT flags to channels within the S1G band. + * Each bonded channel will instead be validated individually + * within cfg80211_s1g_usable(). + */ + if (band == NL80211_BAND_S1GHZ) + continue; + reg_process_ht_flags_band(wiphy, wiphy->bands[band]); + } } static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) @@ -2442,7 +2451,7 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy) list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) if (!reg_wdev_chan_valid(wiphy, wdev)) - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); } static void reg_check_chans_work(struct work_struct *work) @@ -3213,7 +3222,7 @@ static int regulatory_hint_core(const char *alpha2) { struct regulatory_request *request; - request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + request = kzalloc_obj(struct regulatory_request); if (!request) return -ENOMEM; @@ -3239,7 +3248,7 @@ int regulatory_hint_user(const char *alpha2, if (!is_world_regdom(alpha2) && !is_an_alpha2(alpha2)) return -EINVAL; - request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + request = kzalloc_obj(struct regulatory_request); if (!request) return -ENOMEM; @@ -3309,7 +3318,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) wiphy->regulatory_flags &= ~REGULATORY_CUSTOM_REG; - request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); + request = kzalloc_obj(struct regulatory_request); if (!request) return -ENOMEM; @@ -3342,7 +3351,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, enum nl80211_band band, if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) return; - request = kzalloc(sizeof(*request), GFP_KERNEL); + request = kzalloc_obj(*request); if (!request) return; @@ -3655,7 +3664,7 @@ void regulatory_hint_found_beacon(struct wiphy *wiphy, if (processing) return; - reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); + reg_beacon = kzalloc_obj(struct reg_beacon, gfp); if (!reg_beacon) return; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 90a9187a6b13..328af43ef832 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -729,7 +729,7 @@ cfg80211_parse_colocated_ap_iter(void *_data, u8 type, bss_params))) return RNR_ITER_CONTINUE; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) return RNR_ITER_ERROR; @@ -895,7 +895,7 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev, if (ret) continue; - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) continue; @@ -1085,8 +1085,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) if (!n_channels) return cfg80211_scan_6ghz(rdev, true); - request = kzalloc(struct_size(request, req.channels, n_channels), - GFP_KERNEL); + request = kzalloc_flex(*request, req.channels, n_channels); if (!request) return -ENOMEM; @@ -1959,7 +1958,7 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, ether_addr_copy(known->parent_bssid, new->parent_bssid); known->pub.max_bssid_indicator = new->pub.max_bssid_indicator; known->pub.bssid_index = new->pub.bssid_index; - known->pub.use_for &= new->pub.use_for; + known->pub.use_for = new->pub.use_for; known->pub.cannot_use_reasons = new->pub.cannot_use_reasons; known->bss_source = new->bss_source; @@ -2212,7 +2211,8 @@ struct cfg80211_inform_single_bss_data { }; enum ieee80211_ap_reg_power -cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len) +cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len, + u32 client_flags) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; struct ieee80211_he_operation *he_oper; @@ -2230,26 +2230,13 @@ cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len) if (!he_6ghz_oper) return IEEE80211_REG_UNSET_AP; - switch (u8_get_bits(he_6ghz_oper->control, - IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { - case IEEE80211_6GHZ_CTRL_REG_LPI_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: - return IEEE80211_REG_LPI_AP; - case IEEE80211_6GHZ_CTRL_REG_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: - return IEEE80211_REG_SP_AP; - case IEEE80211_6GHZ_CTRL_REG_VLP_AP: - return IEEE80211_REG_VLP_AP; - default: - return IEEE80211_REG_UNSET_AP; - } + return cfg80211_6ghz_power_type(he_6ghz_oper->control, client_flags); } static bool cfg80211_6ghz_power_type_valid(const u8 *elems, size_t elems_len, const u32 flags) { - switch (cfg80211_get_6ghz_power_type(elems, elems_len)) { + switch (cfg80211_get_6ghz_power_type(elems, elems_len, flags)) { case IEEE80211_REG_LPI_AP: return true; case IEEE80211_REG_SP_AP: @@ -2706,7 +2693,7 @@ cfg80211_defrag_mle(const struct element *mle, const u8 *ie, size_t ielen, buf_len += elem->datalen; } - res = kzalloc(struct_size(res, data, buf_len), gfp); + res = kzalloc_flex(*res, data, buf_len, gfp); if (!res) return NULL; @@ -2922,9 +2909,8 @@ cfg80211_gen_reporter_rnr(struct cfg80211_bss *source_bss, bool is_mbssid, le16_encode_bits(bss_change_count, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); - res = kzalloc(struct_size(res, data, - sizeof(ap_info) + ap_info.tbtt_info_len), - gfp); + res = kzalloc_flex(*res, data, sizeof(ap_info) + ap_info.tbtt_info_len, + gfp); if (!res) return NULL; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 3a028ff287fb..5b21432450d5 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -570,7 +570,7 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (wdev->conn) return -EINPROGRESS; - wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL); + wdev->conn = kzalloc_obj(*wdev->conn); if (!wdev->conn) return -ENOMEM; @@ -910,7 +910,7 @@ void __cfg80211_connect_result(struct net_device *dev, ssid_len = min(ssid->datalen, IEEE80211_MAX_SSID_LEN); memcpy(wdev->u.client.ssid, ssid->data, ssid_len); - wdev->u.client.ssid_len = ssid->datalen; + wdev->u.client.ssid_len = ssid_len; break; } rcu_read_unlock(); diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 62f26618f674..2e0ea69b9604 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -88,7 +88,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) struct wireless_dev *wdev; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); } static int wiphy_suspend(struct device *dev) @@ -137,7 +137,7 @@ static int wiphy_resume(struct device *dev) if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; - queue_work(system_unbound_wq, &rdev->wiphy_work); + queue_work(system_dfl_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) diff --git a/net/wireless/tests/util.c b/net/wireless/tests/util.c index 8abdaeb820ce..118f7fb4226d 100644 --- a/net/wireless/tests/util.c +++ b/net/wireless/tests/util.c @@ -17,7 +17,7 @@ int t_wiphy_init(struct kunit_resource *resource, void *ctx) struct wiphy *wiphy; struct t_wiphy_priv *priv; - ops = kzalloc(sizeof(*ops), GFP_KERNEL); + ops = kzalloc_obj(*ops); KUNIT_ASSERT_NOT_NULL(test, ops); wiphy = wiphy_new_nm(ops, sizeof(*priv), "kunit"); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2b71f1d867a0..643ccf4f0227 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3915,19 +3915,22 @@ TRACE_EVENT(cfg80211_ft_event, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->target_ap) ); -TRACE_EVENT(cfg80211_stop_iface, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev), +TRACE_EVENT(cfg80211_stop_link, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + int link_id), + TP_ARGS(wiphy, wdev, link_id), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY + __field(int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; + __entry->link_id = link_id; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, - WIPHY_PR_ARG, WDEV_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_pmsr_report, diff --git a/net/wireless/util.c b/net/wireless/util.c index 56724b33af04..b78530c3e3f8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023, 2025 Intel Corporation + * Copyright (C) 2018-2023, 2025-2026 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> @@ -1144,7 +1144,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) ev->ij.channel); break; case EVENT_STOPPED: - cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); + cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev, + ev->link_id); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.peer_addr, @@ -1203,28 +1204,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); - switch (otype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, dev, -1, true); - break; - case NL80211_IFTYPE_ADHOC: - cfg80211_leave_ibss(rdev, dev, false); - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - cfg80211_disconnect(rdev, dev, - WLAN_REASON_DEAUTH_LEAVING, true); - break; - case NL80211_IFTYPE_MESH_POINT: - /* mesh should be handled? */ - break; - case NL80211_IFTYPE_OCB: - cfg80211_leave_ocb(rdev, dev); - break; - default: - break; - } + cfg80211_leave(rdev, dev->ieee80211_ptr, -1); cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); @@ -1582,36 +1562,42 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); - result = tmp; /* and take NSS, DCM into account */ - result = (result * rate->nss) / 8; + tmp *= rate->nss; + do_div(tmp, 8); if (rate->he_dcm) - result /= 2; + do_div(tmp, 2); + + result = tmp; return result / 10000; } -static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) +static u32 _cfg80211_calculate_bitrate_eht_uhr(struct rate_info *rate) { #define SCALE 6144 - static const u32 mcs_divisors[16] = { - 102399, /* 16.666666... */ - 51201, /* 8.333333... */ - 34134, /* 5.555555... */ - 25599, /* 4.166666... */ - 17067, /* 2.777777... */ - 12801, /* 2.083333... */ - 11377, /* 1.851725... */ - 10239, /* 1.666666... */ - 8532, /* 1.388888... */ - 7680, /* 1.250000... */ - 6828, /* 1.111111... */ - 6144, /* 1.000000... */ - 5690, /* 0.926106... */ - 5120, /* 0.833333... */ - 409600, /* 66.666666... */ - 204800, /* 33.333333... */ + static const u32 mcs_divisors[] = { + [ 0] = 102399, /* 16.666666... */ + [ 1] = 51201, /* 8.333333... */ + [ 2] = 34134, /* 5.555555... */ + [ 3] = 25599, /* 4.166666... */ + [ 4] = 17067, /* 2.777777... */ + [ 5] = 12801, /* 2.083333... */ + [ 6] = 11377, /* 1.851725... */ + [ 7] = 10239, /* 1.666666... */ + [ 8] = 8532, /* 1.388888... */ + [ 9] = 7680, /* 1.250000... */ + [10] = 6828, /* 1.111111... */ + [11] = 6144, /* 1.000000... */ + [12] = 5690, /* 0.926106... */ + [13] = 5120, /* 0.833333... */ + [14] = 409600, /* 66.666666... */ + [15] = 204800, /* 33.333333... */ + [17] = 38400, /* 6.250180... */ + [19] = 19200, /* 3.125090... */ + [20] = 15360, /* 2.500000... */ + [23] = 9600, /* 1.562545... */ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; @@ -1622,8 +1608,6 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) u64 tmp; u32 result; - if (WARN_ON_ONCE(rate->mcs > 15)) - return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > @@ -1704,7 +1688,7 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) result = rates_26[rate->eht_gi]; else { - WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", + WARN(1, "invalid EHT or UHR MCS: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } @@ -1718,11 +1702,64 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) tmp *= rate->nss; do_div(tmp, 8); + /* and handle interference mitigation - 0.9x */ + if (rate->flags & RATE_INFO_FLAGS_UHR_IM) { + if (WARN(rate->nss != 1 || rate->mcs == 15, + "invalid NSS or MCS for UHR IM\n")) + return 0; + tmp *= 9000; + do_div(tmp, 10000); + } + result = tmp; return result / 10000; } +static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) +{ + if (WARN_ONCE(rate->mcs > 15, "bad EHT MCS %d\n", rate->mcs)) + return 0; + + if (WARN_ONCE(rate->flags & (RATE_INFO_FLAGS_UHR_ELR_MCS | + RATE_INFO_FLAGS_UHR_IM), + "bad EHT MCS flags 0x%x\n", rate->flags)) + return 0; + + return _cfg80211_calculate_bitrate_eht_uhr(rate); +} + +static u32 cfg80211_calculate_bitrate_uhr(struct rate_info *rate) +{ + if (rate->flags & RATE_INFO_FLAGS_UHR_ELR_MCS) { + WARN_ONCE(rate->eht_gi != NL80211_RATE_INFO_EHT_GI_1_6, + "bad UHR ELR guard interval %d\n", + rate->eht_gi); + WARN_ONCE(rate->mcs > 1, "bad UHR ELR MCS %d\n", rate->mcs); + WARN_ONCE(rate->nss != 1, "bad UHR ELR NSS %d\n", rate->nss); + WARN_ONCE(rate->bw != RATE_INFO_BW_20, + "bad UHR ELR bandwidth %d\n", + rate->bw); + WARN_ONCE(rate->flags & RATE_INFO_FLAGS_UHR_IM, + "bad UHR MCS flags 0x%x\n", rate->flags); + if (rate->mcs == 0) + return 17; + return 33; + } + + switch (rate->mcs) { + case 0 ... 15: + case 17: + case 19: + case 20: + case 23: + return _cfg80211_calculate_bitrate_eht_uhr(rate); + } + + WARN_ONCE(1, "bad UHR MCS %d\n", rate->mcs); + return 0; +} + static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) { /* For 1, 2, 4, 8 and 16 MHz channels */ @@ -1847,6 +1884,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); + if (rate->flags & RATE_INFO_FLAGS_UHR_MCS) + return cfg80211_calculate_bitrate_uhr(rate); if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) return cfg80211_calculate_bitrate_s1g(rate); @@ -2674,8 +2713,8 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, gfp_t gfp) { - link_sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, - sizeof(*link_sinfo->pertid), gfp); + link_sinfo->pertid = kzalloc_objs(*link_sinfo->pertid, + IEEE80211_NUM_TIDS + 1, gfp); if (!link_sinfo->pertid) return -ENOMEM; @@ -2685,9 +2724,8 @@ EXPORT_SYMBOL(cfg80211_link_sinfo_alloc_tid_stats); int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { - sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, - sizeof(*(sinfo->pertid)), - gfp); + sinfo->pertid = kzalloc_objs(*(sinfo->pertid), IEEE80211_NUM_TIDS + 1, + gfp); if (!sinfo->pertid) return -ENOMEM; @@ -2942,9 +2980,8 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); -static bool -ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, - u32 freq, u32 width) +bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, + u32 freq, u32 width) { const struct wiphy_radio_freq_range *r; int i; @@ -2958,6 +2995,7 @@ ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, return false; } +EXPORT_SYMBOL(ieee80211_radio_freq_range_valid); bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef) diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 1241fda78a68..5a70a0120343 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -414,8 +414,7 @@ static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, * to do it first in case the allocation fails. Don't use wext. */ if (!wdev->wext.keys) { - wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), - GFP_KERNEL); + wdev->wext.keys = kzalloc_obj(*wdev->wext.keys); if (!wdev->wext.keys) return -ENOMEM; for (i = 0; i < 4; i++) @@ -684,7 +683,7 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, idx = erq->flags & IW_ENCODE_INDEX; if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) { - if (idx < 4 || idx > 5) { + if (idx < 5 || idx > 6) { idx = wdev->wext.default_mgmt_key; if (idx < 0) return -EINVAL; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index c32a7c6903d5..7b8e94214b07 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -1101,6 +1101,10 @@ static int compat_standard_call(struct net_device *dev, return ioctl_standard_call(dev, iwr, cmd, info, handler); iwp_compat = (struct compat_iw_point *) &iwr->u.data; + + /* struct iw_point has a 32bit hole on 64bit arches. */ + memset(&iwp, 0, sizeof(iwp)); + iwp.pointer = compat_ptr(iwp_compat->pointer); iwp.length = iwp_compat->length; iwp.flags = iwp_compat->flags; diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c index 674d426a9d24..37d1147019c2 100644 --- a/net/wireless/wext-priv.c +++ b/net/wireless/wext-priv.c @@ -228,6 +228,10 @@ int compat_private_call(struct net_device *dev, struct iwreq *iwr, struct iw_point iwp; iwp_compat = (struct compat_iw_point *) &iwr->u.data; + + /* struct iw_point has a 32bit hole on 64bit arches. */ + memset(&iwp, 0, sizeof(iwp)); + iwp.pointer = compat_ptr(iwp_compat->pointer); iwp.length = iwp_compat->length; iwp.flags = iwp_compat->flags; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 655d1e0ae25f..af8762b24039 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -670,7 +670,7 @@ out: return 0; } -static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int x25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; @@ -743,7 +743,7 @@ static int x25_wait_for_connection_establishment(struct sock *sk) return rc; } -static int x25_connect(struct socket *sock, struct sockaddr *uaddr, +static int x25_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c index 21b30b56e889..3b406906eb9e 100644 --- a/net/x25/x25_forward.c +++ b/net/x25/x25_forward.c @@ -55,8 +55,7 @@ int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, /* Save the forwarding details for future traffic */ if (!same_lci){ - if ((new_frwd = kmalloc(sizeof(struct x25_forward), - GFP_ATOMIC)) == NULL){ + if ((new_frwd = kmalloc_obj(struct x25_forward, GFP_ATOMIC)) == NULL){ rc = -ENOMEM; goto out_put_nb; } diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 4608aa5b4f31..d1e8cd81c214 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -262,7 +262,7 @@ void x25_link_terminated(struct x25_neigh *nb) */ void x25_link_device_up(struct net_device *dev) { - struct x25_neigh *nb = kmalloc(sizeof(*nb), GFP_ATOMIC); + struct x25_neigh *nb = kmalloc_obj(*nb, GFP_ATOMIC); if (!nb) return; diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index 647f325ed867..b28055f852df 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -37,7 +37,7 @@ static int x25_add_route(struct x25_address *address, unsigned int sigdigits, goto out; } - rt = kmalloc(sizeof(*rt), GFP_ATOMIC); + rt = kmalloc_obj(*rt, GFP_ATOMIC); rc = -ENOMEM; if (!rt) goto out; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9f76ca591d54..066ce07c506d 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -97,7 +97,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) long npgs; int err; - umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); + umem->pgs = kvzalloc_objs(*umem->pgs, umem->npgs, + GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; @@ -249,7 +250,7 @@ struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) struct xdp_umem *umem; int err; - umem = kzalloc(sizeof(*umem), GFP_KERNEL); + umem = kzalloc_obj(*umem); if (!umem) return ERR_PTR(-ENOMEM); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 69bbcca8ac75..6149f6a79897 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -167,26 +167,32 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; + u32 num_desc; int err; - if (frags) - contd = XDP_PKT_CONTD; + if (likely(!frags)) { + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err) + goto err; + return 0; + } - err = __xsk_rcv_zc(xs, xskb, len, contd); - if (err) + contd = XDP_PKT_CONTD; + num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1; + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + err = -ENOBUFS; goto err; - if (likely(!frags)) - return 0; + } + __xsk_rcv_zc(xs, xskb, len, contd); xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; - err = __xsk_rcv_zc(xs, pos, len, contd); - if (err) - goto err; - list_del(&pos->list_node); + __xsk_rcv_zc(xs, pos, len, contd); + list_del_init(&pos->list_node); } return 0; @@ -541,12 +547,11 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { - unsigned long flags; int ret; - spin_lock_irqsave(&pool->cq_lock, flags); + spin_lock(&pool->cq->cq_cached_prod_lock); ret = xskq_prod_reserve(pool->cq); - spin_unlock_irqrestore(&pool->cq_lock, flags); + spin_unlock(&pool->cq->cq_cached_prod_lock); return ret; } @@ -597,7 +602,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, unsigned long flags; u32 idx, i; - spin_lock_irqsave(&pool->cq_lock, flags); + spin_lock_irqsave(&pool->cq_prod_lock, flags); idx = xskq_get_prod(pool->cq); if (unlikely(num_descs > 1)) { @@ -615,19 +620,18 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, descs_processed++; } xskq_prod_submit_n(pool->cq, descs_processed); - spin_unlock_irqrestore(&pool->cq_lock, flags); + spin_unlock_irqrestore(&pool->cq_prod_lock, flags); } static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) { - unsigned long flags; - - spin_lock_irqsave(&pool->cq_lock, flags); + spin_lock(&pool->cq->cq_cached_prod_lock); xskq_prod_cancel_n(pool->cq, n); - spin_unlock_irqrestore(&pool->cq_lock, flags); + spin_unlock(&pool->cq->cq_cached_prod_lock); } -static void xsk_destruct_skb(struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; @@ -1274,7 +1278,7 @@ static bool xsk_validate_queues(struct xdp_sock *xs) return xs->fq_tmp && xs->cq_tmp; } -static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; @@ -1351,6 +1355,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } if (umem_xs->queue_id != qid || umem_xs->dev != dev) { + /* One fill and completion ring required for each queue id. */ + if (!xsk_validate_queues(xs)) { + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + /* Share the umem with another socket on another qid * and/or device. */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index aa9788f20d0d..37b7a68b89b3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -12,26 +12,22 @@ void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - unsigned long flags; - if (!xs->tx) return; - spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + spin_lock(&pool->xsk_tx_list_lock); list_add_rcu(&xs->tx_list, &pool->xsk_tx_list); - spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); + spin_unlock(&pool->xsk_tx_list_lock); } void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - unsigned long flags; - if (!xs->tx) return; - spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + spin_lock(&pool->xsk_tx_list_lock); list_del_rcu(&xs->tx_list); - spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); + spin_unlock(&pool->xsk_tx_list_lock); } void xp_destroy(struct xsk_buff_pool *pool) @@ -46,8 +42,7 @@ void xp_destroy(struct xsk_buff_pool *pool) int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), - GFP_KERNEL); + pool->tx_descs = kvzalloc_objs(*pool->tx_descs, xs->tx->nentries); if (!pool->tx_descs) return -ENOMEM; @@ -63,11 +58,11 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, u32 i, entries; entries = unaligned ? umem->chunks : 0; - pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL); + pool = kvzalloc_flex(*pool, free_heads, entries); if (!pool) goto out; - pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL); + pool->heads = kvzalloc_objs(*pool->heads, umem->chunks); if (!pool->heads) goto out; @@ -94,7 +89,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); - spin_lock_init(&pool->cq_lock); + spin_lock_init(&pool->cq_prod_lock); + spin_lock_init(&xs->cq_tmp->cq_cached_prod_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; @@ -158,10 +154,6 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) } } -#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \ - NETDEV_XDP_ACT_REDIRECT | \ - NETDEV_XDP_ACT_XSK_ZEROCOPY) - int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev, u16 queue_id, u16 flags) { @@ -203,7 +195,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, /* For copy-mode, we are done. */ return 0; - if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) { + if ((netdev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) { err = -EOPNOTSUPP; goto err_unreg_pool; } @@ -254,10 +246,6 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, u16 flags; struct xdp_umem *umem = umem_xs->umem; - /* One fill and completion ring required for each queue id. */ - if (!pool->fq || !pool->cq) - return -EINVAL; - flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; @@ -339,11 +327,11 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi { struct xsk_dma_map *dma_map; - dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL); + dma_map = kzalloc_obj(*dma_map); if (!dma_map) return NULL; - dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); + dma_map->dma_pages = kvzalloc_objs(*dma_map->dma_pages, nr_pages); if (!dma_map->dma_pages) { kfree(dma_map); return NULL; @@ -431,7 +419,8 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ } } - pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL); + pool->dma_pages = kvzalloc_objs(*pool->dma_pages, + dma_map->dma_pages_cnt); if (!pool->dma_pages) return -ENOMEM; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index d2c264030017..4dd01b7d858e 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -26,7 +26,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) struct xsk_queue *q; size_t size; - q = kzalloc(sizeof(*q), GFP_KERNEL); + q = kzalloc_obj(*q); if (!q) return NULL; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 1eb8d9f8b104..ec08d9c102b1 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -46,6 +46,11 @@ struct xsk_queue { u64 invalid_descs; u64 queue_empty_descs; size_t ring_vmalloc_size; + /* Mutual exclusion of the completion ring in the SKB mode. + * Protect: when sockets share a single cq when the same netdev + * and queue id is shared. + */ + spinlock_t cq_cached_prod_lock; }; struct parsed_desc { diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index f0157702718f..4a62817a88f8 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -110,14 +110,17 @@ config XFRM_IPCOMP select CRYPTO_DEFLATE config NET_KEY - tristate "PF_KEY sockets" + tristate "PF_KEY sockets (deprecated)" select XFRM_ALGO help PF_KEYv2 socket family, compatible to KAME ones. - They are required if you are going to use IPsec tools ported - from KAME. - Say Y unless you know what you are doing. + The PF_KEYv2 socket interface is deprecated and + scheduled for removal. All maintained IKE daemons + no longer need PF_KEY sockets. Please use the netlink + interface (XFRM_USER) to configure IPsec. + + If unsure, say N. config NET_KEY_MIGRATE bool "PF_KEY MIGRATE" diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index bf744ac9d5a7..e1b11ab59f6e 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -465,7 +465,7 @@ static int espintcp_init_sk(struct sock *sk) if (sk->sk_user_data) return -EBUSY; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; @@ -536,7 +536,7 @@ static void espintcp_close(struct sock *sk, long timeout) sk->sk_prot = &tcp_prot; barrier(); - cancel_work_sync(&ctx->work); + disable_work_sync(&ctx->work); strp_done(&ctx->strp); skb_queue_purge(&ctx->out_queue); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 52ae0e034d29..550457e4c4f0 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -544,6 +544,14 @@ static int xfrm_dev_down(struct net_device *dev) return NOTIFY_DONE; } +static int xfrm_dev_unregister(struct net_device *dev) +{ + xfrm_dev_state_flush(dev_net(dev), dev, true); + xfrm_dev_policy_flush(dev_net(dev), dev, true); + + return NOTIFY_DONE; +} + static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -556,8 +564,10 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void return xfrm_api_check(dev); case NETDEV_DOWN: - case NETDEV_UNREGISTER: return xfrm_dev_down(dev); + + case NETDEV_UNREGISTER: + return xfrm_dev_unregister(dev); } return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c9ddef869aa5..4ed346e682c7 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -505,6 +505,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) async = 1; dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; + spin_lock(&x->lock); goto resume; } /* GRO call */ @@ -541,9 +542,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + + nexthdr = x->type_offload->input_tail(x, skb); } - goto lock; + goto process; } family = XFRM_SPI_SKB_CB(skb)->family; @@ -611,7 +614,12 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } -lock: +process: + seq_hi = htonl(xfrm_replay_seqhi(x, seq)); + + XFRM_SKB_CB(skb)->seq.input.low = seq; + XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { @@ -638,21 +646,13 @@ lock: goto drop_unlock; } - spin_unlock(&x->lock); - if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); - goto drop; + goto drop_unlock; } - seq_hi = htonl(xfrm_replay_seqhi(x, seq)); - - XFRM_SKB_CB(skb)->seq.input.low = seq; - XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - - if (crypto_done) { - nexthdr = x->type_offload->input_tail(x, skb); - } else { + if (!crypto_done) { + spin_unlock(&x->lock); dev_hold(skb->dev); nexthdr = x->type->input(x, skb); @@ -660,9 +660,9 @@ lock: return 0; dev_put(skb->dev); + spin_lock(&x->lock); } resume: - spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, @@ -676,7 +676,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (xfrm_replay_recheck(x, skb, seq)) { + if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 43fdc6ed8dd1..5f38dff16177 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -336,7 +336,7 @@ int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) } err = -ENOMEM; - ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); + ipcd = kzalloc_obj(*ipcd); if (!ipcd) goto out; diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 3b6d7284fc70..050a82101ca5 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -2526,8 +2526,8 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x, nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]); /* saved array is for saving 1..N seq nums from wantseq */ if (xc->reorder_win_size) { - xtfs->w_saved = kcalloc(xc->reorder_win_size, - sizeof(*xtfs->w_saved), GFP_KERNEL); + xtfs->w_saved = kzalloc_objs(*xtfs->w_saved, + xc->reorder_win_size); if (!xtfs->w_saved) { NL_SET_ERR_MSG(extack, "Cannot alloc reorder window"); return -ENOMEM; @@ -2658,8 +2658,8 @@ static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) xtfs->ra_newskb = NULL; if (xtfs->cfg.reorder_win_size) { - xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, - sizeof(*xtfs->w_saved), GFP_KERNEL); + xtfs->w_saved = kzalloc_objs(*xtfs->w_saved, + xtfs->cfg.reorder_win_size); if (!xtfs->w_saved) { kfree_sensitive(xtfs); return -ENOMEM; @@ -2677,7 +2677,7 @@ static int iptfs_init_state(struct xfrm_state *x) /* We have arrived here from xfrm_state_clone() */ xtfs = x->mode_data; } else { - xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL); + xtfs = kzalloc_obj(*xtfs); if (!xtfs) return -ENOMEM; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 62486f866975..9e1a522ab1c5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -429,7 +429,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) { struct xfrm_policy *policy; - policy = kzalloc(sizeof(struct xfrm_policy), gfp); + policy = kzalloc_obj(struct xfrm_policy, gfp); if (policy) { write_pnet(&policy->xp_net, net); @@ -765,7 +765,7 @@ xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) if (bin) return bin; - bin = kzalloc(sizeof(*bin), GFP_ATOMIC); + bin = kzalloc_obj(*bin, GFP_ATOMIC); if (!bin) return NULL; @@ -836,7 +836,7 @@ xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) { struct xfrm_pol_inexact_node *node; - node = kzalloc(sizeof(*node), GFP_ATOMIC); + node = kzalloc_obj(*node, GFP_ATOMIC); if (node) xfrm_pol_inexact_node_init(node, addr, prefixlen); @@ -3801,8 +3801,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; struct xfrm_tmpl **tpp = tp; + int i, k = 0; int ti = 0; - int i, k; sp = skb_sec_path(skb); if (!sp) @@ -3828,6 +3828,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, tpp = stp; } + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET && sp == &dummy) + /* This policy template was already checked by HW + * and secpath was removed in __xfrm_policy_check2. + */ + goto out; + /* For each tunnel xfrm, find the first matching tmpl. * For each tmpl before that, find corresponding xfrm. * Order is _important_. Later we will implement @@ -3837,7 +3843,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * verified to allow them to be skipped in future policy * checks (e.g. nested tunnels). */ - for (i = xfrm_nr-1, k = 0; i >= 0; i--) { + for (i = xfrm_nr - 1; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); if (k < 0) { if (k < -1) @@ -3853,6 +3859,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, goto reject; } +out: xfrm_pols_put(pols, npols); sp->verified_cnt = k; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9e14e453b55c..98b362d51836 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3151,6 +3151,7 @@ int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) int err; if (family == AF_INET && + (!x->dir || x->dir == XFRM_SA_DIR_OUT) && READ_ONCE(xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)) x->props.flags |= XFRM_STATE_NOPMTUDISC; diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c index 2248eda741f8..4180c317f9bc 100644 --- a/net/xfrm/xfrm_state_bpf.c +++ b/net/xfrm/xfrm_state_bpf.c @@ -68,7 +68,7 @@ bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 struct net *net = dev_net(xdp->rxq->dev); struct xfrm_state *x; - if (!opts || opts__sz < sizeof(opts->error)) + if (opts__sz < sizeof(opts->error)) return NULL; if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) { |
