From 4bc5e64e6cf37007e436970024e5998ee0935651 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 26 Nov 2021 01:13:32 +0100 Subject: efi: Move efifb_setup_from_dmi() prototype from arch headers Commit 8633ef82f101 ("drivers/firmware: consolidate EFI framebuffer setup for all arches") made the Generic System Framebuffers (sysfb) driver able to be built on non-x86 architectures. But it left the efifb_setup_from_dmi() function prototype declaration in the architecture specific headers. This could lead to the following compiler warning as reported by the kernel test robot: drivers/firmware/efi/sysfb_efi.c:70:6: warning: no previous prototype for function 'efifb_setup_from_dmi' [-Wmissing-prototypes] void efifb_setup_from_dmi(struct screen_info *si, const char *opt) ^ drivers/firmware/efi/sysfb_efi.c:70:1: note: declare 'static' if the function is not intended to be used outside of this translation unit void efifb_setup_from_dmi(struct screen_info *si, const char *opt) Fixes: 8633ef82f101 ("drivers/firmware: consolidate EFI framebuffer setup for all arches") Reported-by: kernel test robot Cc: # 5.15.x Signed-off-by: Javier Martinez Canillas Acked-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20211126001333.555514-1-javierm@redhat.com Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index dbd39b20e034..ef8dbc0a1522 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1283,4 +1283,10 @@ static inline struct efi_mokvar_table_entry *efi_mokvar_entry_find( } #endif +#ifdef CONFIG_SYSFB +extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); +#else +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { } +#endif + #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From fe415186b43df0db1f17fa3a46275fd92107fe71 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 16 Dec 2021 08:24:08 +0100 Subject: xen/console: harden hvc_xen against event channel storms The Xen console driver is still vulnerable for an attack via excessive number of events sent by the backend. Fix that by using a lateeoi event channel. For the normal domU initial console this requires the introduction of bind_evtchn_to_irq_lateeoi() as there is no xenbus device available at the time the event channel is bound to the irq. As the decision whether an interrupt was spurious or not requires to test for bytes having been read from the backend, move sending the event into the if statement, as sending an event without having found any bytes to be read is making no sense at all. This is part of XSA-391 Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich --- V2: - slightly adapt spurious irq detection (Jan Beulich) V3: - fix spurious irq detection (Jan Beulich) --- drivers/tty/hvc/hvc_xen.c | 30 +++++++++++++++++++++++++++--- drivers/xen/events/events_base.c | 6 ++++++ include/xen/events.h | 1 + 3 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 71e0dd2c0ce5..ebaf7500f48f 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -37,6 +37,8 @@ struct xencons_info { struct xenbus_device *xbdev; struct xencons_interface *intf; unsigned int evtchn; + XENCONS_RING_IDX out_cons; + unsigned int out_cons_same; struct hvc_struct *hvc; int irq; int vtermno; @@ -138,6 +140,8 @@ static int domU_read_console(uint32_t vtermno, char *buf, int len) XENCONS_RING_IDX cons, prod; int recv = 0; struct xencons_info *xencons = vtermno_to_xencons(vtermno); + unsigned int eoiflag = 0; + if (xencons == NULL) return -EINVAL; intf = xencons->intf; @@ -157,7 +161,27 @@ static int domU_read_console(uint32_t vtermno, char *buf, int len) mb(); /* read ring before consuming */ intf->in_cons = cons; - notify_daemon(xencons); + /* + * When to mark interrupt having been spurious: + * - there was no new data to be read, and + * - the backend did not consume some output bytes, and + * - the previous round with no read data didn't see consumed bytes + * (we might have a race with an interrupt being in flight while + * updating xencons->out_cons, so account for that by allowing one + * round without any visible reason) + */ + if (intf->out_cons != xencons->out_cons) { + xencons->out_cons = intf->out_cons; + xencons->out_cons_same = 0; + } + if (recv) { + notify_daemon(xencons); + } else if (xencons->out_cons_same++ > 1) { + eoiflag = XEN_EOI_FLAG_SPURIOUS; + } + + xen_irq_lateeoi(xencons->irq, eoiflag); + return recv; } @@ -386,7 +410,7 @@ static int xencons_connect_backend(struct xenbus_device *dev, if (ret) return ret; info->evtchn = evtchn; - irq = bind_evtchn_to_irq(evtchn); + irq = bind_interdomain_evtchn_to_irq_lateeoi(dev, evtchn); if (irq < 0) return irq; info->irq = irq; @@ -551,7 +575,7 @@ static int __init xen_hvc_init(void) return r; info = vtermno_to_xencons(HVC_COOKIE); - info->irq = bind_evtchn_to_irq(info->evtchn); + info->irq = bind_evtchn_to_irq_lateeoi(info->evtchn); } if (info->irq < 0) info->irq = 0; /* NO_IRQ */ diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a78704ae3618..46d9295d9a6e 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1251,6 +1251,12 @@ int bind_evtchn_to_irq(evtchn_port_t evtchn) } EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); +int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); + static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; diff --git a/include/xen/events.h b/include/xen/events.h index c204262d9fc2..344081e71584 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -17,6 +17,7 @@ struct xenbus_device; unsigned xen_evtchn_nr_channels(void); int bind_evtchn_to_irq(evtchn_port_t evtchn); +int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn); int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, irq_handler_t handler, unsigned long irqflags, const char *devname, -- cgit v1.2.3 From dfd0743f1d9ea76931510ed150334d571fbab49d Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Thu, 9 Dec 2021 15:59:37 +0100 Subject: tee: handle lookup of shm with reference count 0 Since the tee subsystem does not keep a strong reference to its idle shared memory buffers, it races with other threads that try to destroy a shared memory through a close of its dma-buf fd or by unmapping the memory. In tee_shm_get_from_id() when a lookup in teedev->idr has been successful, it is possible that the tee_shm is in the dma-buf teardown path, but that path is blocked by the teedev mutex. Since we don't have an API to tell if the tee_shm is in the dma-buf teardown path or not we must find another way of detecting this condition. Fix this by doing the reference counting directly on the tee_shm using a new refcount_t refcount field. dma-buf is replaced by using anon_inode_getfd() instead, this separates the life-cycle of the underlying file from the tee_shm. tee_shm_put() is updated to hold the mutex when decreasing the refcount to 0 and then remove the tee_shm from teedev->idr before releasing the mutex. This means that the tee_shm can never be found unless it has a refcount larger than 0. Fixes: 967c9cca2cc5 ("tee: generic TEE subsystem") Cc: stable@vger.kernel.org Reviewed-by: Greg Kroah-Hartman Reviewed-by: Lars Persson Reviewed-by: Sumit Garg Reported-by: Patrik Lantz Signed-off-by: Jens Wiklander --- drivers/tee/tee_shm.c | 174 ++++++++++++++++++------------------------------ include/linux/tee_drv.h | 4 +- 2 files changed, 68 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 8a8deb95e918..499fccba3d74 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -1,20 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2015-2016, Linaro Limited + * Copyright (c) 2015-2017, 2019-2021 Linaro Limited */ +#include #include -#include -#include #include +#include #include #include #include #include -#include #include "tee_private.h" -MODULE_IMPORT_NS(DMA_BUF); - static void release_registered_pages(struct tee_shm *shm) { if (shm->pages) { @@ -31,16 +28,8 @@ static void release_registered_pages(struct tee_shm *shm) } } -static void tee_shm_release(struct tee_shm *shm) +static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) { - struct tee_device *teedev = shm->ctx->teedev; - - if (shm->flags & TEE_SHM_DMA_BUF) { - mutex_lock(&teedev->mutex); - idr_remove(&teedev->idr, shm->id); - mutex_unlock(&teedev->mutex); - } - if (shm->flags & TEE_SHM_POOL) { struct tee_shm_pool_mgr *poolm; @@ -67,45 +56,6 @@ static void tee_shm_release(struct tee_shm *shm) tee_device_put(teedev); } -static struct sg_table *tee_shm_op_map_dma_buf(struct dma_buf_attachment - *attach, enum dma_data_direction dir) -{ - return NULL; -} - -static void tee_shm_op_unmap_dma_buf(struct dma_buf_attachment *attach, - struct sg_table *table, - enum dma_data_direction dir) -{ -} - -static void tee_shm_op_release(struct dma_buf *dmabuf) -{ - struct tee_shm *shm = dmabuf->priv; - - tee_shm_release(shm); -} - -static int tee_shm_op_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) -{ - struct tee_shm *shm = dmabuf->priv; - size_t size = vma->vm_end - vma->vm_start; - - /* Refuse sharing shared memory provided by application */ - if (shm->flags & TEE_SHM_USER_MAPPED) - return -EINVAL; - - return remap_pfn_range(vma, vma->vm_start, shm->paddr >> PAGE_SHIFT, - size, vma->vm_page_prot); -} - -static const struct dma_buf_ops tee_shm_dma_buf_ops = { - .map_dma_buf = tee_shm_op_map_dma_buf, - .unmap_dma_buf = tee_shm_op_unmap_dma_buf, - .release = tee_shm_op_release, - .mmap = tee_shm_op_mmap, -}; - struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags) { struct tee_device *teedev = ctx->teedev; @@ -140,6 +90,7 @@ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags) goto err_dev_put; } + refcount_set(&shm->refcount, 1); shm->flags = flags | TEE_SHM_POOL; shm->ctx = ctx; if (flags & TEE_SHM_DMA_BUF) @@ -153,10 +104,7 @@ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags) goto err_kfree; } - if (flags & TEE_SHM_DMA_BUF) { - DEFINE_DMA_BUF_EXPORT_INFO(exp_info); - mutex_lock(&teedev->mutex); shm->id = idr_alloc(&teedev->idr, shm, 1, 0, GFP_KERNEL); mutex_unlock(&teedev->mutex); @@ -164,28 +112,11 @@ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags) ret = ERR_PTR(shm->id); goto err_pool_free; } - - exp_info.ops = &tee_shm_dma_buf_ops; - exp_info.size = shm->size; - exp_info.flags = O_RDWR; - exp_info.priv = shm; - - shm->dmabuf = dma_buf_export(&exp_info); - if (IS_ERR(shm->dmabuf)) { - ret = ERR_CAST(shm->dmabuf); - goto err_rem; - } } teedev_ctx_get(ctx); return shm; -err_rem: - if (flags & TEE_SHM_DMA_BUF) { - mutex_lock(&teedev->mutex); - idr_remove(&teedev->idr, shm->id); - mutex_unlock(&teedev->mutex); - } err_pool_free: poolm->ops->free(poolm, shm); err_kfree: @@ -246,6 +177,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } + refcount_set(&shm->refcount, 1); shm->flags = flags | TEE_SHM_REGISTER; shm->ctx = ctx; shm->id = -1; @@ -306,22 +238,6 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } - if (flags & TEE_SHM_DMA_BUF) { - DEFINE_DMA_BUF_EXPORT_INFO(exp_info); - - exp_info.ops = &tee_shm_dma_buf_ops; - exp_info.size = shm->size; - exp_info.flags = O_RDWR; - exp_info.priv = shm; - - shm->dmabuf = dma_buf_export(&exp_info); - if (IS_ERR(shm->dmabuf)) { - ret = ERR_CAST(shm->dmabuf); - teedev->desc->ops->shm_unregister(ctx, shm); - goto err; - } - } - return shm; err: if (shm) { @@ -339,6 +255,35 @@ err: } EXPORT_SYMBOL_GPL(tee_shm_register); +static int tee_shm_fop_release(struct inode *inode, struct file *filp) +{ + tee_shm_put(filp->private_data); + return 0; +} + +static int tee_shm_fop_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct tee_shm *shm = filp->private_data; + size_t size = vma->vm_end - vma->vm_start; + + /* Refuse sharing shared memory provided by application */ + if (shm->flags & TEE_SHM_USER_MAPPED) + return -EINVAL; + + /* check for overflowing the buffer's size */ + if (vma->vm_pgoff + vma_pages(vma) > shm->size >> PAGE_SHIFT) + return -EINVAL; + + return remap_pfn_range(vma, vma->vm_start, shm->paddr >> PAGE_SHIFT, + size, vma->vm_page_prot); +} + +static const struct file_operations tee_shm_fops = { + .owner = THIS_MODULE, + .release = tee_shm_fop_release, + .mmap = tee_shm_fop_mmap, +}; + /** * tee_shm_get_fd() - Increase reference count and return file descriptor * @shm: Shared memory handle @@ -351,10 +296,11 @@ int tee_shm_get_fd(struct tee_shm *shm) if (!(shm->flags & TEE_SHM_DMA_BUF)) return -EINVAL; - get_dma_buf(shm->dmabuf); - fd = dma_buf_fd(shm->dmabuf, O_CLOEXEC); + /* matched by tee_shm_put() in tee_shm_op_release() */ + refcount_inc(&shm->refcount); + fd = anon_inode_getfd("tee_shm", &tee_shm_fops, shm, O_RDWR); if (fd < 0) - dma_buf_put(shm->dmabuf); + tee_shm_put(shm); return fd; } @@ -364,17 +310,7 @@ int tee_shm_get_fd(struct tee_shm *shm) */ void tee_shm_free(struct tee_shm *shm) { - /* - * dma_buf_put() decreases the dmabuf reference counter and will - * call tee_shm_release() when the last reference is gone. - * - * In the case of driver private memory we call tee_shm_release - * directly instead as it doesn't have a reference counter. - */ - if (shm->flags & TEE_SHM_DMA_BUF) - dma_buf_put(shm->dmabuf); - else - tee_shm_release(shm); + tee_shm_put(shm); } EXPORT_SYMBOL_GPL(tee_shm_free); @@ -481,10 +417,15 @@ struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id) teedev = ctx->teedev; mutex_lock(&teedev->mutex); shm = idr_find(&teedev->idr, id); + /* + * If the tee_shm was found in the IDR it must have a refcount + * larger than 0 due to the guarantee in tee_shm_put() below. So + * it's safe to use refcount_inc(). + */ if (!shm || shm->ctx != ctx) shm = ERR_PTR(-EINVAL); - else if (shm->flags & TEE_SHM_DMA_BUF) - get_dma_buf(shm->dmabuf); + else + refcount_inc(&shm->refcount); mutex_unlock(&teedev->mutex); return shm; } @@ -496,7 +437,24 @@ EXPORT_SYMBOL_GPL(tee_shm_get_from_id); */ void tee_shm_put(struct tee_shm *shm) { - if (shm->flags & TEE_SHM_DMA_BUF) - dma_buf_put(shm->dmabuf); + struct tee_device *teedev = shm->ctx->teedev; + bool do_release = false; + + mutex_lock(&teedev->mutex); + if (refcount_dec_and_test(&shm->refcount)) { + /* + * refcount has reached 0, we must now remove it from the + * IDR before releasing the mutex. This will guarantee that + * the refcount_inc() in tee_shm_get_from_id() never starts + * from 0. + */ + if (shm->flags & TEE_SHM_DMA_BUF) + idr_remove(&teedev->idr, shm->id); + do_release = true; + } + mutex_unlock(&teedev->mutex); + + if (do_release) + tee_shm_release(teedev, shm); } EXPORT_SYMBOL_GPL(tee_shm_put); diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index a1f03461369b..cf5999626e28 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -195,7 +195,7 @@ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, * @offset: offset of buffer in user space * @pages: locked pages from userspace * @num_pages: number of locked pages - * @dmabuf: dmabuf used to for exporting to user space + * @refcount: reference counter * @flags: defined by TEE_SHM_* in tee_drv.h * @id: unique id of a shared memory object on this device, shared * with user space @@ -214,7 +214,7 @@ struct tee_shm { unsigned int offset; struct page **pages; size_t num_pages; - struct dma_buf *dmabuf; + refcount_t refcount; u32 flags; int id; u64 sec_world_id; -- cgit v1.2.3 From b62e3317b68d9c84301940ca8ca9c35a584111b2 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Thu, 16 Dec 2021 23:19:16 +0800 Subject: net: fix typo in a comment The double 'as' in a comment is repeated, thus it should be removed. Signed-off-by: Xiang wangx Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be5cb3360b94..6aadcc0ecb5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1937,7 +1937,7 @@ enum netdev_ml_priv_type { * @udp_tunnel_nic: UDP tunnel offload state * @xdp_state: stores info on attached XDP BPF programs * - * @nested_level: Used as as a parameter of spin_lock_nested() of + * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. -- cgit v1.2.3 From ec624fe740b416fb68d536b37fb8eef46f90b5c2 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 14 Dec 2021 19:24:33 +0200 Subject: net/sched: Extend qdisc control block with tc control block BPF layer extends the qdisc control block via struct bpf_skb_data_end and because of that there is no more room to add variables to the qdisc layer control block without going over the skb->cb size. Extend the qdisc control block with a tc control block, and move all tc related variables to there as a pre-step for extending the tc control block with additional members. Signed-off-by: Paul Blakey Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 15 +++++++++++++++ include/net/sch_generic.h | 2 -- net/core/dev.c | 8 ++++---- net/sched/act_ct.c | 14 +++++++------- net/sched/cls_api.c | 6 ++++-- net/sched/cls_flower.c | 3 ++- net/sched/sch_frag.c | 3 ++- 7 files changed, 34 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index bf79f3a890af..05f18e81f3e8 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -193,4 +193,19 @@ static inline void skb_txtime_consumed(struct sk_buff *skb) skb->tstamp = ktime_set(0, 0); } +struct tc_skb_cb { + struct qdisc_skb_cb qdisc_cb; + + u16 mru; + bool post_ct; +}; + +static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) +{ + struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; + + BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); + return cb; +} + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 22179b2fda72..c70e6d2b2fdd 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -447,8 +447,6 @@ struct qdisc_skb_cb { }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; - u16 mru; - bool post_ct; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); diff --git a/net/core/dev.c b/net/core/dev.c index 2a352e668d10..c4708e2487fb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3941,8 +3941,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ - qdisc_skb_cb(skb)->mru = 0; - qdisc_skb_cb(skb)->post_ct = false; + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { @@ -5103,8 +5103,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; - qdisc_skb_cb(skb)->mru = 0; - qdisc_skb_cb(skb)->post_ct = false; + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 90866ae45573..98e248b9c0b1 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -690,10 +690,10 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, u8 family, u16 zone, bool *defrag) { enum ip_conntrack_info ctinfo; - struct qdisc_skb_cb cb; struct nf_conn *ct; int err = 0; bool frag; + u16 mru; /* Previously seen (loopback)? Ignore. */ ct = nf_ct_get(skb, &ctinfo); @@ -708,7 +708,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, return err; skb_get(skb); - cb = *qdisc_skb_cb(skb); + mru = tc_skb_cb(skb)->mru; if (family == NFPROTO_IPV4) { enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; @@ -722,7 +722,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, if (!err) { *defrag = true; - cb.mru = IPCB(skb)->frag_max_size; + mru = IPCB(skb)->frag_max_size; } } else { /* NFPROTO_IPV6 */ #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) @@ -735,7 +735,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, if (!err) { *defrag = true; - cb.mru = IP6CB(skb)->frag_max_size; + mru = IP6CB(skb)->frag_max_size; } #else err = -EOPNOTSUPP; @@ -744,7 +744,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, } if (err != -EINPROGRESS) - *qdisc_skb_cb(skb) = cb; + tc_skb_cb(skb)->mru = mru; skb_clear_hash(skb); skb->ignore_df = 1; return err; @@ -963,7 +963,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tcf_action_update_bstats(&c->common, skb); if (clear) { - qdisc_skb_cb(skb)->post_ct = false; + tc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_conntrack_put(&ct->ct_general); @@ -1048,7 +1048,7 @@ do_nat: out_push: skb_push_rcsum(skb, nh_ofs); - qdisc_skb_cb(skb)->post_ct = true; + tc_skb_cb(skb)->post_ct = true; out_clear: if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e54f0a42270c..ff8a9383bf1c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1617,12 +1617,14 @@ int tcf_classify(struct sk_buff *skb, /* If we missed on some chain */ if (ret == TC_ACT_UNSPEC && last_executed_chain) { + struct tc_skb_cb *cb = tc_skb_cb(skb); + ext = tc_skb_ext_alloc(skb); if (WARN_ON_ONCE(!ext)) return TC_ACT_SHOT; ext->chain = last_executed_chain; - ext->mru = qdisc_skb_cb(skb)->mru; - ext->post_ct = qdisc_skb_cb(skb)->post_ct; + ext->mru = cb->mru; + ext->post_ct = cb->post_ct; } return ret; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index aab13ba11767..9782b93db1b3 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -309,7 +310,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - bool post_ct = qdisc_skb_cb(skb)->post_ct; + bool post_ct = tc_skb_cb(skb)->post_ct; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index 8c06381391d6..5ded4c8672a6 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB #include #include +#include #include #include #include @@ -137,7 +138,7 @@ err: int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) { - u16 mru = qdisc_skb_cb(skb)->mru; + u16 mru = tc_skb_cb(skb)->mru; int err; if (mru && skb->len > mru + skb->dev->hard_header_len) -- cgit v1.2.3 From 3849595866166b23bf6a0cb9ff87e06423167f67 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 14 Dec 2021 19:24:34 +0200 Subject: net/sched: flow_dissector: Fix matching on zone id for invalid conns If ct rejects a flow, it removes the conntrack info from the skb. act_ct sets the post_ct variable so the dissector will see this case as an +tracked +invalid state, but the zone id is lost with the conntrack info. To restore the zone id on such cases, set the last executed zone, via the tc control block, when passing ct, and read it back in the dissector if there is no ct info on the skb (invalid connection). Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support") Signed-off-by: Paul Blakey Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- include/net/pkt_sched.h | 1 + net/core/flow_dissector.c | 3 ++- net/sched/act_ct.c | 1 + net/sched/cls_flower.c | 3 ++- 5 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c8cb7e697d47..2ecf8cfd2223 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1380,7 +1380,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, - bool post_ct); + bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 05f18e81f3e8..9e71691c491b 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -198,6 +198,7 @@ struct tc_skb_cb { u16 mru; bool post_ct; + u16 zone; /* Only valid if post_ct = true */ }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3255f57f5131..1b094c481f1d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -238,7 +238,7 @@ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, - size_t mapsize, bool post_ct) + size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; @@ -260,6 +260,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; + key->ct_zone = zone; return; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 98e248b9c0b1..ab3591408419 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1049,6 +1049,7 @@ out_push: skb_push_rcsum(skb, nh_ofs); tc_skb_cb(skb)->post_ct = true; + tc_skb_cb(skb)->zone = p->zone; out_clear: if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9782b93db1b3..ef54ed395874 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -311,6 +311,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, { struct cls_fl_head *head = rcu_dereference_bh(tp->root); bool post_ct = tc_skb_cb(skb)->post_ct; + u16 zone = tc_skb_cb(skb)->zone; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -328,7 +329,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, ARRAY_SIZE(fl_ct_info_to_flower_map), - post_ct); + post_ct, zone); skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP); -- cgit v1.2.3 From 635d448a1cce4b4ebee52b351052c70434fa90ea Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 14 Dec 2021 19:24:35 +0200 Subject: net: openvswitch: Fix matching zone id for invalid conns arriving from tc Zone id is not restored if we passed ct and ct rejected the connection, as there is no ct info on the skb. Save the zone from tc skb cb to tc skb extension and pass it on to ovs, use that info to restore the zone id for invalid connections. Fixes: d29334c15d33 ("net/sched: act_api: fix miss set post_ct for ovs after do conntrack in act_ct") Signed-off-by: Paul Blakey Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + net/openvswitch/flow.c | 8 +++++++- net/sched/cls_api.c | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2ecf8cfd2223..4507d77d6941 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -286,6 +286,7 @@ struct nf_bridge_info { struct tc_skb_ext { __u32 chain; __u16 mru; + __u16 zone; bool post_ct; }; #endif diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 9713035b89e3..6d262d9aa10e 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "conntrack.h" #include "datapath.h" @@ -860,6 +861,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, #endif bool post_ct = false; int res, err; + u16 zone = 0; /* Extract metadata from packet. */ if (tun_info) { @@ -898,6 +900,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->recirc_id = tc_ext ? tc_ext->chain : 0; OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0; post_ct = tc_ext ? tc_ext->post_ct : false; + zone = post_ct ? tc_ext->zone : 0; } else { key->recirc_id = 0; } @@ -906,8 +909,11 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, #endif err = key_extract(skb, key); - if (!err) + if (!err) { ovs_ct_fill_key(skb, key, post_ct); /* Must be after key_extract(). */ + if (post_ct && !skb_get_nfct(skb)) + key->ct_zone = zone; + } return err; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ff8a9383bf1c..35c74bdde848 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1625,6 +1625,7 @@ int tcf_classify(struct sk_buff *skb, ext->chain = last_executed_chain; ext->mru = cb->mru; ext->post_ct = cb->post_ct; + ext->zone = cb->zone; } return ret; -- cgit v1.2.3 From 8f905c0e7354ef261360fb7535ea079b1082c105 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Dec 2021 06:33:30 -0800 Subject: inet: fully convert sk->sk_rx_dst to RCU rules syzbot reported various issues around early demux, one being included in this changelog [1] sk->sk_rx_dst is using RCU protection without clearly documenting it. And following sequences in tcp_v4_do_rcv()/tcp_v6_do_rcv() are not following standard RCU rules. [a] dst_release(dst); [b] sk->sk_rx_dst = NULL; They look wrong because a delete operation of RCU protected pointer is supposed to clear the pointer before the call_rcu()/synchronize_rcu() guarding actual memory freeing. In some cases indeed, dst could be freed before [b] is done. We could cheat by clearing sk_rx_dst before calling dst_release(), but this seems the right time to stick to standard RCU annotations and debugging facilities. [1] BUG: KASAN: use-after-free in dst_check include/net/dst.h:470 [inline] BUG: KASAN: use-after-free in tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792 Read of size 2 at addr ffff88807f1cb73a by task syz-executor.5/9204 CPU: 0 PID: 9204 Comm: syz-executor.5 Not tainted 5.16.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x320 mm/kasan/report.c:247 __kasan_report mm/kasan/report.c:433 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:450 dst_check include/net/dst.h:470 [inline] tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792 ip_rcv_finish_core.constprop.0+0x15de/0x1e80 net/ipv4/ip_input.c:340 ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583 ip_sublist_rcv net/ipv4/ip_input.c:609 [inline] ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5508 [inline] __netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556 __netif_receive_skb_list net/core/dev.c:5608 [inline] netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699 gro_normal_list net/core/dev.c:5853 [inline] gro_normal_list net/core/dev.c:5849 [inline] napi_complete_done+0x1f1/0x880 net/core/dev.c:6590 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557 __napi_poll+0xaf/0x440 net/core/dev.c:7023 napi_poll net/core/dev.c:7090 [inline] net_rx_action+0x801/0xb40 net/core/dev.c:7177 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649 common_interrupt+0x52/0xc0 arch/x86/kernel/irq.c:240 asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:629 RIP: 0033:0x7f5e972bfd57 Code: 39 d1 73 14 0f 1f 80 00 00 00 00 48 8b 50 f8 48 83 e8 08 48 39 ca 77 f3 48 39 c3 73 3e 48 89 13 48 8b 50 f8 48 89 38 49 8b 0e <48> 8b 3e 48 83 c3 08 48 83 c6 08 eb bc 48 39 d1 72 9e 48 39 d0 73 RSP: 002b:00007fff8a413210 EFLAGS: 00000283 RAX: 00007f5e97108990 RBX: 00007f5e97108338 RCX: ffffffff81d3aa45 RDX: ffffffff81d3aa45 RSI: 00007f5e97108340 RDI: ffffffff81d3aa45 RBP: 00007f5e97107eb8 R08: 00007f5e97108d88 R09: 0000000093c2e8d9 R10: 0000000000000000 R11: 0000000000000000 R12: 00007f5e97107eb0 R13: 00007f5e97108338 R14: 00007f5e97107ea8 R15: 0000000000000019 Allocated by task 13: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:434 [inline] __kasan_slab_alloc+0x90/0xc0 mm/kasan/common.c:467 kasan_slab_alloc include/linux/kasan.h:259 [inline] slab_post_alloc_hook mm/slab.h:519 [inline] slab_alloc_node mm/slub.c:3234 [inline] slab_alloc mm/slub.c:3242 [inline] kmem_cache_alloc+0x202/0x3a0 mm/slub.c:3247 dst_alloc+0x146/0x1f0 net/core/dst.c:92 rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613 ip_route_input_slow+0x1817/0x3a20 net/ipv4/route.c:2340 ip_route_input_rcu net/ipv4/route.c:2470 [inline] ip_route_input_noref+0x116/0x2a0 net/ipv4/route.c:2415 ip_rcv_finish_core.constprop.0+0x288/0x1e80 net/ipv4/ip_input.c:354 ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583 ip_sublist_rcv net/ipv4/ip_input.c:609 [inline] ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5508 [inline] __netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556 __netif_receive_skb_list net/core/dev.c:5608 [inline] netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699 gro_normal_list net/core/dev.c:5853 [inline] gro_normal_list net/core/dev.c:5849 [inline] napi_complete_done+0x1f1/0x880 net/core/dev.c:6590 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557 __napi_poll+0xaf/0x440 net/core/dev.c:7023 napi_poll net/core/dev.c:7090 [inline] net_rx_action+0x801/0xb40 net/core/dev.c:7177 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Freed by task 13: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:46 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free mm/kasan/common.c:328 [inline] __kasan_slab_free+0xff/0x130 mm/kasan/common.c:374 kasan_slab_free include/linux/kasan.h:235 [inline] slab_free_hook mm/slub.c:1723 [inline] slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1749 slab_free mm/slub.c:3513 [inline] kmem_cache_free+0xbd/0x5d0 mm/slub.c:3530 dst_destroy+0x2d6/0x3f0 net/core/dst.c:127 rcu_do_batch kernel/rcu/tree.c:2506 [inline] rcu_core+0x7ab/0x1470 kernel/rcu/tree.c:2741 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 Last potentially related work creation: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 __kasan_record_aux_stack+0xf5/0x120 mm/kasan/generic.c:348 __call_rcu kernel/rcu/tree.c:2985 [inline] call_rcu+0xb1/0x740 kernel/rcu/tree.c:3065 dst_release net/core/dst.c:177 [inline] dst_release+0x79/0xe0 net/core/dst.c:167 tcp_v4_do_rcv+0x612/0x8d0 net/ipv4/tcp_ipv4.c:1712 sk_backlog_rcv include/net/sock.h:1030 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2768 release_sock+0x54/0x1b0 net/core/sock.c:3300 tcp_sendmsg+0x36/0x40 net/ipv4/tcp.c:1441 inet_sendmsg+0x99/0xe0 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 sock_write_iter+0x289/0x3c0 net/socket.c:1057 call_write_iter include/linux/fs.h:2162 [inline] new_sync_write+0x429/0x660 fs/read_write.c:503 vfs_write+0x7cd/0xae0 fs/read_write.c:590 ksys_write+0x1ee/0x250 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88807f1cb700 which belongs to the cache ip_dst_cache of size 176 The buggy address is located 58 bytes inside of 176-byte region [ffff88807f1cb700, ffff88807f1cb7b0) The buggy address belongs to the page: page:ffffea0001fc72c0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7f1cb flags: 0xfff00000000200(slab|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000200 dead000000000100 dead000000000122 ffff8881413bb780 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x112a20(GFP_ATOMIC|__GFP_NOWARN|__GFP_NORETRY|__GFP_HARDWALL), pid 5, ts 108466983062, free_ts 108048976062 prep_new_page mm/page_alloc.c:2418 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4149 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5369 alloc_pages+0x1a7/0x300 mm/mempolicy.c:2191 alloc_slab_page mm/slub.c:1793 [inline] allocate_slab mm/slub.c:1930 [inline] new_slab+0x32d/0x4a0 mm/slub.c:1993 ___slab_alloc+0x918/0xfe0 mm/slub.c:3022 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3109 slab_alloc_node mm/slub.c:3200 [inline] slab_alloc mm/slub.c:3242 [inline] kmem_cache_alloc+0x35c/0x3a0 mm/slub.c:3247 dst_alloc+0x146/0x1f0 net/core/dst.c:92 rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613 __mkroute_output net/ipv4/route.c:2564 [inline] ip_route_output_key_hash_rcu+0x921/0x2d00 net/ipv4/route.c:2791 ip_route_output_key_hash+0x18b/0x300 net/ipv4/route.c:2619 __ip_route_output_key include/net/route.h:126 [inline] ip_route_output_flow+0x23/0x150 net/ipv4/route.c:2850 ip_route_output_key include/net/route.h:142 [inline] geneve_get_v4_rt+0x3a6/0x830 drivers/net/geneve.c:809 geneve_xmit_skb drivers/net/geneve.c:899 [inline] geneve_xmit+0xc4a/0x3540 drivers/net/geneve.c:1082 __netdev_start_xmit include/linux/netdevice.h:4994 [inline] netdev_start_xmit include/linux/netdevice.h:5008 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x1eb/0x920 net/core/dev.c:3606 __dev_queue_xmit+0x299a/0x3650 net/core/dev.c:4229 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1338 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1389 free_unref_page_prepare mm/page_alloc.c:3309 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3388 qlink_free mm/kasan/quarantine.c:146 [inline] qlist_free_all+0x5a/0xc0 mm/kasan/quarantine.c:165 kasan_quarantine_reduce+0x180/0x200 mm/kasan/quarantine.c:272 __kasan_slab_alloc+0xa2/0xc0 mm/kasan/common.c:444 kasan_slab_alloc include/linux/kasan.h:259 [inline] slab_post_alloc_hook mm/slab.h:519 [inline] slab_alloc_node mm/slub.c:3234 [inline] kmem_cache_alloc_node+0x255/0x3f0 mm/slub.c:3270 __alloc_skb+0x215/0x340 net/core/skbuff.c:414 alloc_skb include/linux/skbuff.h:1126 [inline] alloc_skb_with_frags+0x93/0x620 net/core/skbuff.c:6078 sock_alloc_send_pskb+0x783/0x910 net/core/sock.c:2575 mld_newpack+0x1df/0x770 net/ipv6/mcast.c:1754 add_grhead+0x265/0x330 net/ipv6/mcast.c:1857 add_grec+0x1053/0x14e0 net/ipv6/mcast.c:1995 mld_send_initial_cr.part.0+0xf6/0x230 net/ipv6/mcast.c:2242 mld_send_initial_cr net/ipv6/mcast.c:1232 [inline] mld_dad_work+0x1d3/0x690 net/ipv6/mcast.c:2268 process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298 worker_thread+0x658/0x11f0 kernel/workqueue.c:2445 Memory state around the buggy address: ffff88807f1cb600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807f1cb680: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff88807f1cb700: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807f1cb780: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc ffff88807f1cb800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20211220143330.680945-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 3 +-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 11 +++++++---- net/ipv4/udp.c | 6 +++--- net/ipv6/tcp_ipv6.c | 11 +++++++---- net/ipv6/udp.c | 4 ++-- 8 files changed, 23 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index bea21ff70e74..d47e9658da28 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -431,7 +431,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct dst_entry *sk_rx_dst; + struct dst_entry __rcu *sk_rx_dst; int sk_rx_dst_ifindex; u32 sk_rx_dst_cookie; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0189e3cd4a7d..6b5956500436 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -154,7 +154,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); - dst_release(sk->sk_rx_dst); + dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bbb3d39c69af..2bb28bfd83bf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3012,8 +3012,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - dst_release(sk->sk_rx_dst); - sk->sk_rx_dst = NULL; + dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 246ab7b5e857..0ce46849ec3d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5787,7 +5787,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) trace_tcp_probe(sk, skb); tcp_mstamp_refresh(tp); - if (unlikely(!sk->sk_rx_dst)) + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 13d868c43284..084df223b5df 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1701,7 +1701,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) struct sock *rsk; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst; + + dst = rcu_dereference_protected(sk->sk_rx_dst, + lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); @@ -1709,8 +1712,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_rx_dst_ifindex != skb->skb_iif || !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, dst, 0)) { + RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); - sk->sk_rx_dst = NULL; } } tcp_rcv_established(sk, skb); @@ -1786,7 +1789,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { - struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); @@ -2201,7 +2204,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { - sk->sk_rx_dst = dst; + rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 23b05e28490b..15c6b450b8db 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2250,7 +2250,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old; if (dst_hold_safe(dst)) { - old = xchg(&sk->sk_rx_dst, dst); + old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst); dst_release(old); return old != dst; } @@ -2440,7 +2440,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct dst_entry *dst = skb_dst(skb); int ret; - if (unlikely(sk->sk_rx_dst != dst)) + if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); @@ -2599,7 +2599,7 @@ int udp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_efree; - dst = READ_ONCE(sk->sk_rx_dst); + dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 551fce49841d..680e6481b967 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -107,7 +107,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - sk->sk_rx_dst = dst; + rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_cookie = rt6_get_cookie(rt); } @@ -1505,7 +1505,10 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst; + + dst = rcu_dereference_protected(sk->sk_rx_dst, + lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); @@ -1513,8 +1516,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, dst, sk->sk_rx_dst_cookie) == NULL) { + RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); - sk->sk_rx_dst = NULL; } } @@ -1874,7 +1877,7 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { - struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e43b31d25fb6..a2caca6ccf11 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -956,7 +956,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct dst_entry *dst = skb_dst(skb); int ret; - if (unlikely(sk->sk_rx_dst != dst)) + if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp6_sk_rx_dst_set(sk, dst); if (!uh->check && !udp_sk(sk)->no_check6_rx) { @@ -1070,7 +1070,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_efree; - dst = READ_ONCE(sk->sk_rx_dst); + dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); -- cgit v1.2.3 From 7e5cced9ca84df52d874aca6b632f930b3dc5bc6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 20 Dec 2021 09:49:01 -0500 Subject: net: accept UFOv6 packages in virtio_net_hdr_to_skb Skb with skb->protocol 0 at the time of virtio_net_hdr_to_skb may have a protocol inferred from virtio_net_hdr with virtio_net_hdr_set_proto. Unlike TCP, UDP does not have separate types for IPv4 and IPv6. Type VIRTIO_NET_HDR_GSO_UDP is guessed to be IPv4/UDP. As of the below commit, UFOv6 packets are dropped due to not matching the protocol as obtained from dev_parse_header_protocol. Invert the test to take that L2 protocol field as starting point and pass both UFOv4 and UFOv6 for VIRTIO_NET_HDR_GSO_UDP. Fixes: 924a9bc362a5 ("net: check if protocol extracted by virtio_net_hdr_set_proto is correct") Link: https://lore.kernel.org/netdev/CABcq3pG9GRCYqFDBAJ48H1vpnnX=41u+MhQnayF1ztLH4WX0Fw@mail.gmail.com/ Reported-by: Andrew Melnichenko Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20211220144901.2784030-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 04e87f4b9417..22dd48c82560 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -7,6 +7,21 @@ #include #include +static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) +{ + switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + return protocol == cpu_to_be16(ETH_P_IP); + case VIRTIO_NET_HDR_GSO_TCPV6: + return protocol == cpu_to_be16(ETH_P_IPV6); + case VIRTIO_NET_HDR_GSO_UDP: + return protocol == cpu_to_be16(ETH_P_IP) || + protocol == cpu_to_be16(ETH_P_IPV6); + default: + return false; + } +} + static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { @@ -88,9 +103,12 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); - virtio_net_hdr_set_proto(skb, hdr); - if (protocol && protocol != skb->protocol) + if (!protocol) + virtio_net_hdr_set_proto(skb, hdr); + else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type)) return -EINVAL; + else + skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, -- cgit v1.2.3 From 1ed1d592113959f00cc552c3b9f47ca2d157768f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 20 Dec 2021 09:50:27 -0500 Subject: net: skip virtio_net_hdr_set_proto if protocol already set virtio_net_hdr_set_proto infers skb->protocol from the virtio_net_hdr gso_type, to avoid packets getting dropped for lack of a proto type. Its protocol choice is a guess, especially in the case of UFO, where the single VIRTIO_NET_HDR_GSO_UDP label covers both UFOv4 and UFOv6. Skip this best effort if the field is already initialized. Whether explicitly from userspace, or implicitly based on an earlier call to dev_parse_header_protocol (which is more robust, but was introduced after this patch). Fixes: 9d2f67e43b73 ("net/packet: fix packet drop as of virtio gso") Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20211220145027.2784293-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 22dd48c82560..a960de68ac69 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -25,6 +25,9 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, const struct virtio_net_hdr *hdr) { + if (skb->protocol) + return 0; + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: -- cgit v1.2.3 From cb8747b7d2a9e3d687a19a007575071d4b71cd05 Mon Sep 17 00:00:00 2001 From: Ismael Luceno Date: Mon, 15 Nov 2021 14:46:47 +0100 Subject: uapi: Fix undefined __always_inline on non-glibc systems This macro is defined by glibc itself, which makes the issue go unnoticed on those systems. On non-glibc systems it causes build failures on several utilities and libraries, like bpftool and objtool. Fixes: 1d509f2a6ebc ("x86/insn: Support big endian cross-compiles") Fixes: 2d7ce0e8a704 ("tools/virtio: more stubs") Fixes: 3fb321fde22d ("selftests/net: ipv6 flowlabel") Fixes: 50b3ed57dee9 ("selftests/bpf: test bpf flow dissection") Fixes: 9cacf81f8161 ("bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE") Fixes: a4b2061242ec ("tools include uapi: Grab a copy of linux/in.h") Fixes: b12d6ec09730 ("bpf: btf: add btf print functionality") Fixes: c0dd967818a2 ("tools, include: Grab a copy of linux/erspan.h") Fixes: c4b6014e8bb0 ("tools: Add copy of perf_event.h to tools/include/linux/") Signed-off-by: Ismael Luceno Acked-by: Masami Hiramatsu Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20211115134647.1921-1-ismael@iodev.co.uk Cc: Martin Schwidefsky Cc: Vasily Gorbik --- include/uapi/linux/byteorder/big_endian.h | 1 + include/uapi/linux/byteorder/little_endian.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h index 2199adc6a6c2..80aa5c41a763 100644 --- a/include/uapi/linux/byteorder/big_endian.h +++ b/include/uapi/linux/byteorder/big_endian.h @@ -9,6 +9,7 @@ #define __BIG_ENDIAN_BITFIELD #endif +#include #include #include diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h index 601c904fd5cd..cd98982e7523 100644 --- a/include/uapi/linux/byteorder/little_endian.h +++ b/include/uapi/linux/byteorder/little_endian.h @@ -9,6 +9,7 @@ #define __LITTLE_ENDIAN_BITFIELD #endif +#include #include #include -- cgit v1.2.3 From dcce50e6cc4d86a63dc0a9a6ee7d4f948ccd53a1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 8 Nov 2021 14:35:59 -0800 Subject: compiler.h: Fix annotation macro misplacement with Clang When building with Clang and CONFIG_TRACE_BRANCH_PROFILING, there are a lot of unreachable warnings, like: arch/x86/kernel/traps.o: warning: objtool: handle_xfd_event()+0x134: unreachable instruction Without an input to the inline asm, 'volatile' is ignored for some reason and Clang feels free to move the reachable() annotation away from its intended location. Fix that by re-adding the counter value to the inputs. Fixes: f1069a8756b9 ("compiler.h: Avoid using inline asm operand modifiers") Fixes: c199f64ff93c ("instrumentation.h: Avoid using inline asm operand modifiers") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/0417e96909b97a406323409210de7bf13df0b170.1636410380.git.jpoimboe@redhat.com Cc: Peter Zijlstra Cc: x86@kernel.org Cc: Vasily Gorbik Cc: Miroslav Benes --- include/linux/compiler.h | 4 ++-- include/linux/instrumentation.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 3d5af56337bd..429dcebe2b99 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -121,7 +121,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.reachable\n\t" \ ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define annotate_reachable() __annotate_reachable(__COUNTER__) @@ -129,7 +129,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define annotate_unreachable() __annotate_unreachable(__COUNTER__) diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index fa2cd8c63dcc..24359b4a9605 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -11,7 +11,7 @@ asm volatile(__stringify(c) ": nop\n\t" \ ".pushsection .discard.instr_begin\n\t" \ ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -50,7 +50,7 @@ asm volatile(__stringify(c) ": nop\n\t" \ ".pushsection .discard.instr_end\n\t" \ ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t"); \ + ".popsection\n\t" : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) #else -- cgit v1.2.3 From d7f55471db2719629f773c2d6b5742a69595bfd3 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Fri, 17 Dec 2021 10:07:54 +0800 Subject: memblock: fix memblock_phys_alloc() section mismatch error Fix modpost Section mismatch error in memblock_phys_alloc() [...] WARNING: modpost: vmlinux.o(.text.unlikely+0x1dcc): Section mismatch in reference from the function memblock_phys_alloc() to the function .init.text:memblock_phys_alloc_range() The function memblock_phys_alloc() references the function __init memblock_phys_alloc_range(). This is often because memblock_phys_alloc lacks a __init annotation or the annotation of memblock_phys_alloc_range is wrong. ERROR: modpost: Section mismatches detected. Set CONFIG_SECTION_MISMATCH_WARN_ONLY=y to allow them. [...] memblock_phys_alloc() is a one-line wrapper, make it __always_inline to avoid these section mismatches. Reported-by: k2ci Suggested-by: Mike Rapoport Signed-off-by: Jackie Liu [rppt: slightly massaged changelog ] Signed-off-by: Mike Rapoport Link: https://lore.kernel.org/r/20211217020754.2874872-1-liu.yun@linux.dev --- include/linux/memblock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8adcf1fa8096..9dc7cb239d21 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -405,8 +405,8 @@ phys_addr_t memblock_alloc_range_nid(phys_addr_t size, phys_addr_t end, int nid, bool exact_nid); phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); -static inline phys_addr_t memblock_phys_alloc(phys_addr_t size, - phys_addr_t align) +static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size, + phys_addr_t align) { return memblock_phys_alloc_range(size, align, 0, MEMBLOCK_ALLOC_ACCESSIBLE); -- cgit v1.2.3 From 5ec7d18d1813a5bead0b495045606c93873aecbb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 23 Dec 2021 13:04:30 -0500 Subject: sctp: use call_rcu to free endpoint This patch is to delay the endpoint free by calling call_rcu() to fix another use-after-free issue in sctp_sock_dump(): BUG: KASAN: use-after-free in __lock_acquire+0x36d9/0x4c20 Call Trace: __lock_acquire+0x36d9/0x4c20 kernel/locking/lockdep.c:3218 lock_acquire+0x1ed/0x520 kernel/locking/lockdep.c:3844 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline] _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168 spin_lock_bh include/linux/spinlock.h:334 [inline] __lock_sock+0x203/0x350 net/core/sock.c:2253 lock_sock_nested+0xfe/0x120 net/core/sock.c:2774 lock_sock include/net/sock.h:1492 [inline] sctp_sock_dump+0x122/0xb20 net/sctp/diag.c:324 sctp_for_each_transport+0x2b5/0x370 net/sctp/socket.c:5091 sctp_diag_dump+0x3ac/0x660 net/sctp/diag.c:527 __inet_diag_dump+0xa8/0x140 net/ipv4/inet_diag.c:1049 inet_diag_dump+0x9b/0x110 net/ipv4/inet_diag.c:1065 netlink_dump+0x606/0x1080 net/netlink/af_netlink.c:2244 __netlink_dump_start+0x59a/0x7c0 net/netlink/af_netlink.c:2352 netlink_dump_start include/linux/netlink.h:216 [inline] inet_diag_handler_cmd+0x2ce/0x3f0 net/ipv4/inet_diag.c:1170 __sock_diag_cmd net/core/sock_diag.c:232 [inline] sock_diag_rcv_msg+0x31d/0x410 net/core/sock_diag.c:263 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2477 sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:274 This issue occurs when asoc is peeled off and the old sk is freed after getting it by asoc->base.sk and before calling lock_sock(sk). To prevent the sk free, as a holder of the sk, ep should be alive when calling lock_sock(). This patch uses call_rcu() and moves sock_put and ep free into sctp_endpoint_destroy_rcu(), so that it's safe to try to hold the ep under rcu_read_lock in sctp_transport_traverse_process(). If sctp_endpoint_hold() returns true, it means this ep is still alive and we have held it and can continue to dump it; If it returns false, it means this ep is dead and can be freed after rcu_read_unlock, and we should skip it. In sctp_sock_dump(), after locking the sk, if this ep is different from tsp->asoc->ep, it means during this dumping, this asoc was peeled off before calling lock_sock(), and the sk should be skipped; If this ep is the same with tsp->asoc->ep, it means no peeloff happens on this asoc, and due to lock_sock, no peeloff will happen either until release_sock. Note that delaying endpoint free won't delay the port release, as the port release happens in sctp_endpoint_destroy() before calling call_rcu(). Also, freeing endpoint by call_rcu() makes it safe to access the sk by asoc->base.sk in sctp_assocs_seq_show() and sctp_rcv(). Thanks Jones to bring this issue up. v1->v2: - improve the changelog. - add kfree(ep) into sctp_endpoint_destroy_rcu(), as Jakub noticed. Reported-by: syzbot+9276d76e83e3bcde6c99@syzkaller.appspotmail.com Reported-by: Lee Jones Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 6 +++--- include/net/sctp/structs.h | 3 ++- net/sctp/diag.c | 12 ++++++------ net/sctp/endpointola.c | 23 +++++++++++++++-------- net/sctp/socket.c | 23 +++++++++++++++-------- 5 files changed, 41 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 189fdb9db162..d314a180ab93 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -105,6 +105,7 @@ extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); +typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *); void sctp_transport_walk_start(struct rhashtable_iter *iter); void sctp_transport_walk_stop(struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_next(struct net *net, @@ -115,9 +116,8 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, void *p); -int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - int (*cb_done)(struct sctp_transport *, void *), - struct net *net, int *pos, void *p); +int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, + struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 899c29c326ba..8dabd8800006 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1355,6 +1355,7 @@ struct sctp_endpoint { reconf_enable:1; __u8 strreset_enable; + struct rcu_head rcu; }; /* Recover the outter endpoint structure. */ @@ -1370,7 +1371,7 @@ static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base) struct sctp_endpoint *sctp_endpoint_new(struct sock *, gfp_t); void sctp_endpoint_free(struct sctp_endpoint *); void sctp_endpoint_put(struct sctp_endpoint *); -void sctp_endpoint_hold(struct sctp_endpoint *); +int sctp_endpoint_hold(struct sctp_endpoint *ep); void sctp_endpoint_add_asoc(struct sctp_endpoint *, struct sctp_association *); struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 760b367644c1..a7d623171501 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -290,9 +290,8 @@ out: return err; } -static int sctp_sock_dump(struct sctp_transport *tsp, void *p) +static int sctp_sock_dump(struct sctp_endpoint *ep, struct sctp_transport *tsp, void *p) { - struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; @@ -302,6 +301,8 @@ static int sctp_sock_dump(struct sctp_transport *tsp, void *p) int err = 0; lock_sock(sk); + if (ep != tsp->asoc->ep) + goto release; list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; @@ -344,9 +345,8 @@ release: return err; } -static int sctp_sock_filter(struct sctp_transport *tsp, void *p) +static int sctp_sock_filter(struct sctp_endpoint *ep, struct sctp_transport *tsp, void *p) { - struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; const struct inet_diag_req_v2 *r = commp->r; @@ -505,8 +505,8 @@ skip: if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; - sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, &pos, &commp); + sctp_transport_traverse_process(sctp_sock_filter, sctp_sock_dump, + net, &pos, &commp); cb->args[2] = pos; done: diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 48c9c2c7602f..efffde7f2328 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -184,6 +184,18 @@ void sctp_endpoint_free(struct sctp_endpoint *ep) } /* Final destructor for endpoint. */ +static void sctp_endpoint_destroy_rcu(struct rcu_head *head) +{ + struct sctp_endpoint *ep = container_of(head, struct sctp_endpoint, rcu); + struct sock *sk = ep->base.sk; + + sctp_sk(sk)->ep = NULL; + sock_put(sk); + + kfree(ep); + SCTP_DBG_OBJCNT_DEC(ep); +} + static void sctp_endpoint_destroy(struct sctp_endpoint *ep) { struct sock *sk; @@ -213,18 +225,13 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) if (sctp_sk(sk)->bind_hash) sctp_put_port(sk); - sctp_sk(sk)->ep = NULL; - /* Give up our hold on the sock */ - sock_put(sk); - - kfree(ep); - SCTP_DBG_OBJCNT_DEC(ep); + call_rcu(&ep->rcu, sctp_endpoint_destroy_rcu); } /* Hold a reference to an endpoint. */ -void sctp_endpoint_hold(struct sctp_endpoint *ep) +int sctp_endpoint_hold(struct sctp_endpoint *ep) { - refcount_inc(&ep->base.refcnt); + return refcount_inc_not_zero(&ep->base.refcnt); } /* Release a reference to an endpoint and clean up if there are diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 33391254fa82..ad5028a07b18 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5338,11 +5338,12 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), } EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); -int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - int (*cb_done)(struct sctp_transport *, void *), - struct net *net, int *pos, void *p) { +int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, + struct net *net, int *pos, void *p) +{ struct rhashtable_iter hti; struct sctp_transport *tsp; + struct sctp_endpoint *ep; int ret; again: @@ -5351,26 +5352,32 @@ again: tsp = sctp_transport_get_idx(net, &hti, *pos + 1); for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { - ret = cb(tsp, p); - if (ret) - break; + ep = tsp->asoc->ep; + if (sctp_endpoint_hold(ep)) { /* asoc can be peeled off */ + ret = cb(ep, tsp, p); + if (ret) + break; + sctp_endpoint_put(ep); + } (*pos)++; sctp_transport_put(tsp); } sctp_transport_walk_stop(&hti); if (ret) { - if (cb_done && !cb_done(tsp, p)) { + if (cb_done && !cb_done(ep, tsp, p)) { (*pos)++; + sctp_endpoint_put(ep); sctp_transport_put(tsp); goto again; } + sctp_endpoint_put(ep); sctp_transport_put(tsp); } return ret; } -EXPORT_SYMBOL_GPL(sctp_for_each_transport); +EXPORT_SYMBOL_GPL(sctp_transport_traverse_process); /* 7.2.1 Association Status (SCTP_STATUS) -- cgit v1.2.3 From 94ab10dd42a70acc5208a41325617e3d9cf81a70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 24 Dec 2021 21:12:48 -0800 Subject: mm: delete unsafe BUG from page_cache_add_speculative() It is not easily reproducible, but on 5.16-rc I have several times hit the VM_BUG_ON_PAGE(PageTail(page), page) in page_cache_add_speculative(): usually from filemap_get_read_batch() for an ext4 read, yesterday from next_uptodate_page() from filemap_map_pages() for a shmem fault. That BUG used to be placed where page_ref_add_unless() had succeeded, but now it is placed before folio_ref_add_unless() is attempted: that is not safe, since it is only the acquired reference which makes the page safe from racing THP collapse or split. We could keep the BUG, checking PageTail only when folio_ref_try_add_rcu() has succeeded; but I don't think it adds much value - just delete it. Link: https://lkml.kernel.org/r/8b98fc6f-3439-8614-c3f3-945c659a1aba@google.com Fixes: 020853b6f5ea ("mm: Add folio_try_get_rcu()") Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: William Kucharski Cc: Christoph Hellwig Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 605246452305..d150a9082b31 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -285,7 +285,6 @@ static inline struct inode *folio_inode(struct folio *folio) static inline bool page_cache_add_speculative(struct page *page, int count) { - VM_BUG_ON_PAGE(PageTail(page), page); return folio_ref_try_add_rcu((struct folio *)page, count); } -- cgit v1.2.3 From 595ec1973c276f6c0c1de8aca5eef8dfd81f9b49 Mon Sep 17 00:00:00 2001 From: Thibaut Sautereau Date: Fri, 24 Dec 2021 21:12:51 -0800 Subject: mm/page_alloc: fix __alloc_size attribute for alloc_pages_exact_nid The second parameter of alloc_pages_exact_nid is the one indicating the size of memory pointed by the returned pointer. Link: https://lkml.kernel.org/r/YbjEgwhn4bGblp//@coeus Fixes: abd58f38dfb4 ("mm/page_alloc: add __alloc_size attributes for better bounds checking") Signed-off-by: Thibaut Sautereau Acked-by: Kees Cook Cc: Daniel Micay Cc: Levente Polyak Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b976c4177299..8fcc38467af6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -624,7 +624,7 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1); +__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) -- cgit v1.2.3 From 7175f02c4e5f5a9430113ab9ca0fd0ce98b28a51 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sun, 26 Dec 2021 16:01:27 +0300 Subject: uapi: fix linux/nfc.h userspace compilation errors Replace sa_family_t with __kernel_sa_family_t to fix the following linux/nfc.h userspace compilation errors: /usr/include/linux/nfc.h:266:2: error: unknown type name 'sa_family_t' sa_family_t sa_family; /usr/include/linux/nfc.h:274:2: error: unknown type name 'sa_family_t' sa_family_t sa_family; Fixes: 23b7869c0fd0 ("NFC: add the NFC socket raw protocol") Fixes: d646960f7986 ("NFC: Initial LLCP support") Cc: Signed-off-by: Dmitry V. Levin Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- include/uapi/linux/nfc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index f6e3c8c9c744..aadad43d943a 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -263,7 +263,7 @@ enum nfc_sdp_attr { #define NFC_SE_ENABLED 0x1 struct sockaddr_nfc { - sa_family_t sa_family; + __kernel_sa_family_t sa_family; __u32 dev_idx; __u32 target_idx; __u32 nfc_protocol; @@ -271,7 +271,7 @@ struct sockaddr_nfc { #define NFC_LLCP_MAX_SERVICE_NAME 63 struct sockaddr_nfc_llcp { - sa_family_t sa_family; + __kernel_sa_family_t sa_family; __u32 dev_idx; __u32 target_idx; __u32 nfc_protocol; -- cgit v1.2.3 From 79b69a83705e621b258ac6d8ae6d3bfdb4b930aa Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 26 Dec 2021 13:03:47 +0100 Subject: nfc: uapi: use kernel size_t to fix user-space builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix user-space builds if it includes /usr/include/linux/nfc.h before some of other headers: /usr/include/linux/nfc.h:281:9: error: unknown type name ‘size_t’ 281 | size_t service_name_len; | ^~~~~~ Fixes: d646960f7986 ("NFC: Initial LLCP support") Cc: Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- include/uapi/linux/nfc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index aadad43d943a..4fa4e979e948 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -278,7 +278,7 @@ struct sockaddr_nfc_llcp { __u8 dsap; /* Destination SAP, if known */ __u8 ssap; /* Source SAP to be bound to */ char service_name[NFC_LLCP_MAX_SERVICE_NAME]; /* Service name URI */; - size_t service_name_len; + __kernel_size_t service_name_len; }; /* NFC socket protocols */ -- cgit v1.2.3 From 1b4e3f26f9f7553b260b8aed43967500961448a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 2 Dec 2021 15:06:14 +0000 Subject: mm: vmscan: Reduce throttling due to a failure to make progress Mike Galbraith, Alexey Avramov and Darrick Wong all reported similar problems due to reclaim throttling for excessive lengths of time. In Alexey's case, a memory hog that should go OOM quickly stalls for several minutes before stalling. In Mike and Darrick's cases, a small memcg environment stalled excessively even though the system had enough memory overall. Commit 69392a403f49 ("mm/vmscan: throttle reclaim when no progress is being made") introduced the problem although commit a19594ca4a8b ("mm/vmscan: increase the timeout if page reclaim is not making progress") made it worse. Systems at or near an OOM state that cannot be recovered must reach OOM quickly and memcg should kill tasks if a memcg is near OOM. To address this, only stall for the first zone in the zonelist, reduce the timeout to 1 tick for VMSCAN_THROTTLE_NOPROGRESS and only stall if the scan control nr_reclaimed is 0, kswapd is still active and there were excessive pages pending for writeback. If kswapd has stopped reclaiming due to excessive failures, do not stall at all so that OOM triggers relatively quickly. Similarly, if an LRU is simply congested, only lightly throttle similar to NOPROGRESS. Alexey's original case was the most straight forward for i in {1..3}; do tail /dev/zero; done On vanilla 5.16-rc1, this test stalled heavily, after the patch the test completes in a few seconds similar to 5.15. Alexey's second test case added watching a youtube video while tail runs 10 times. On 5.15, playback only jitters slightly, 5.16-rc1 stalls a lot with lots of frames missing and numerous audio glitches. With this patch applies, the video plays similarly to 5.15. [lkp@intel.com: Fix W=1 build warning] Link: https://lore.kernel.org/r/99e779783d6c7fce96448a3402061b9dc1b3b602.camel@gmx.de Link: https://lore.kernel.org/r/20211124011954.7cab9bb4@mail.inbox.lv Link: https://lore.kernel.org/r/20211022144651.19914-1-mgorman@techsingularity.net Link: https://lore.kernel.org/r/20211202150614.22440-1-mgorman@techsingularity.net Link: https://linux-regtracking.leemhuis.info/regzbot/regression/20211124011954.7cab9bb4@mail.inbox.lv/ Reported-and-tested-by: Alexey Avramov Reported-and-tested-by: Mike Galbraith Reported-and-tested-by: Darrick J. Wong Reported-by: kernel test robot Acked-by: Hugh Dickins Tracked-by: Thorsten Leemhuis Fixes: 69392a403f49 ("mm/vmscan: throttle reclaim when no progress is being made") Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + include/trace/events/vmscan.h | 4 ++- mm/vmscan.c | 64 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 59 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 58e744b78c2c..936dc0b6c226 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -277,6 +277,7 @@ enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, + VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index f25a6149d3ba..ca2e9009a651 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -30,12 +30,14 @@ #define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) #define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED) #define _VMSCAN_THROTTLE_NOPROGRESS (1 << VMSCAN_THROTTLE_NOPROGRESS) +#define _VMSCAN_THROTTLE_CONGESTED (1 << VMSCAN_THROTTLE_CONGESTED) #define show_throttle_flags(flags) \ (flags) ? __print_flags(flags, "|", \ {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \ - {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"} \ + {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"}, \ + {_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \ ) : "VMSCAN_THROTTLE_NONE" diff --git a/mm/vmscan.c b/mm/vmscan.c index fb9584641ac7..4c4d5f6cd8a3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1021,6 +1021,39 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +static bool skip_throttle_noprogress(pg_data_t *pgdat) +{ + int reclaimable = 0, write_pending = 0; + int i; + + /* + * If kswapd is disabled, reschedule if necessary but do not + * throttle as the system is likely near OOM. + */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + + /* + * If there are a lot of dirty/writeback pages then do not + * throttle as throttling will occur when the pages cycle + * towards the end of the LRU if still under writeback. + */ + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + reclaimable += zone_reclaimable_pages(zone); + write_pending += zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); + } + if (2 * write_pending <= reclaimable) + return true; + + return false; +} + void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) { wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; @@ -1056,8 +1089,16 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) } break; + case VMSCAN_THROTTLE_CONGESTED: + fallthrough; case VMSCAN_THROTTLE_NOPROGRESS: - timeout = HZ/2; + if (skip_throttle_noprogress(pgdat)) { + cond_resched(); + return; + } + + timeout = 1; + break; case VMSCAN_THROTTLE_ISOLATED: timeout = HZ/50; @@ -3321,7 +3362,7 @@ again: if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) - reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc)) @@ -3386,16 +3427,16 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) } /* - * Do not throttle kswapd on NOPROGRESS as it will throttle on - * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under - * writeback and marked for immediate reclaim at the tail of - * the LRU. + * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will + * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages + * under writeback and marked for immediate reclaim at the tail of the + * LRU. */ - if (current_is_kswapd()) + if (current_is_kswapd() || cgroup_reclaim(sc)) return; /* Throttle if making no progress at high prioities. */ - if (sc->priority < DEF_PRIORITY - 2) + if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } @@ -3415,6 +3456,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_scanned; gfp_t orig_mask; pg_data_t *last_pgdat = NULL; + pg_data_t *first_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -3478,14 +3520,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } + if (!first_pgdat) + first_pgdat = zone->zone_pgdat; + /* See comment about same check for global reclaim above */ if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); - consider_reclaim_throttle(zone->zone_pgdat, sc); } + consider_reclaim_throttle(first_pgdat, sc); + /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. -- cgit v1.2.3