From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:22:25 -0400 Subject: NFS: Fix another fsync() issue after a server reboot Currently, when the writeback code detects a server reboot, it redirties any pages that were not committed to disk, and it sets the flag NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor that dirtied the file. While this allows the file descriptor in question to redrive its own writes, it violates the fsync() requirement that we should be synchronising all writes to disk. While the problem is infrequent, we do see corner cases where an untimely server reboot causes the fsync() call to abandon its attempt to sync data to disk and causing data corruption issues due to missed error conditions or similar. In order to tighted up the client's ability to deal with this situation without introducing livelocks, add a counter that records the number of times pages are redirtied due to a server reboot-like condition, and use that in fsync() to redrive the sync to disk. Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b32ed68e7dc4..f08e581f0161 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -182,6 +182,7 @@ struct nfs_inode { /* Regular file */ struct { atomic_long_t nrequests; + atomic_long_t redirtied_pages; struct nfs_mds_commit_info commit_info; struct mutex commit_mutex; }; -- cgit v1.2.3 From 5f6277a0c15e1ea54b6fd3d78c9fff7bfe42556c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:51:45 -0400 Subject: NFS: Cleanup to remove unused flag NFS_CONTEXT_RESEND_WRITES Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f08e581f0161..7931fa472561 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -83,7 +83,6 @@ struct nfs_open_context { fmode_t mode; unsigned long flags; -#define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) -- cgit v1.2.3 From 0a90ed8d0cfa29735a221eba14d9cb6c735d35b6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Aug 2022 14:37:31 +0300 Subject: platform/x86: pmc_atom: Fix SLP_TYPx bitfield mask On Intel hardware the SLP_TYPx bitfield occupies bits 10-12 as per ACPI specification (see Table 4.13 "PM1 Control Registers Fixed Hardware Feature Control Bits" for the details). Fix the mask and other related definitions accordingly. Fixes: 93e5eadd1f6e ("x86/platform: New Intel Atom SOC power management controller driver") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220801113734.36131-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/pmc_atom.c | 2 +- include/linux/platform_data/x86/pmc_atom.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/pmc_atom.c b/drivers/platform/x86/pmc_atom.c index 154317e9910d..5c757c7f64de 100644 --- a/drivers/platform/x86/pmc_atom.c +++ b/drivers/platform/x86/pmc_atom.c @@ -232,7 +232,7 @@ static void pmc_power_off(void) pm1_cnt_port = acpi_base_addr + PM1_CNT; pm1_cnt_value = inl(pm1_cnt_port); - pm1_cnt_value &= SLEEP_TYPE_MASK; + pm1_cnt_value &= ~SLEEP_TYPE_MASK; pm1_cnt_value |= SLEEP_TYPE_S5; pm1_cnt_value |= SLEEP_ENABLE; diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index 3edfb6d4e67a..dd81f510e4cf 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -7,6 +7,8 @@ #ifndef PMC_ATOM_H #define PMC_ATOM_H +#include + /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C /* CherryTrail Power Control Unit PCI Device ID */ @@ -139,9 +141,9 @@ #define ACPI_MMIO_REG_LEN 0x100 #define PM1_CNT 0x4 -#define SLEEP_TYPE_MASK 0xFFFFECFF +#define SLEEP_TYPE_MASK GENMASK(12, 10) #define SLEEP_TYPE_S5 0x1C00 -#define SLEEP_ENABLE 0x2000 +#define SLEEP_ENABLE BIT(13) extern int pmc_atom_read(int offset, u32 *value); -- cgit v1.2.3 From 76b079ef4cc954fc2c2e0333a01855b0b2b6bdee Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:09 +0800 Subject: sched/psi: Remove unused parameter nbytes of psi_trigger_create() psi_trigger_create()'s 'nbytes' parameter is not used, so we can remove it. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- include/linux/psi.h | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/sched/psi.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi.h b/include/linux/psi.h index 89784763d19e..dd74411ac21d 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -27,7 +27,7 @@ void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res); + char *buf, enum psi_res res); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ffaccd6373f1..df7df5843b4f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3698,7 +3698,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; - new = psi_trigger_create(psi, buf, nbytes, res); + new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5ee615a59fe1..ecb4b4ff4ce0 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1087,7 +1087,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) + char *buf, enum psi_res res) { struct psi_trigger *t; enum psi_states state; @@ -1316,7 +1316,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, nbytes, res); + new = psi_trigger_create(&psi_system, buf, res); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); -- cgit v1.2.3 From d7ae5818c3fa3007dee13f9d99832e7f26b8bc44 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:10 +0800 Subject: sched/psi: Remove redundant cgroup_psi() when !CONFIG_CGROUPS cgroup_psi() is only called under CONFIG_CGROUPS. We don't need cgroup_psi() when !CONFIG_CGROUPS, so we can remove it in this case. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed53bfe7c46c..ac5d0515680e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -734,11 +734,6 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } -static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) -{ - return NULL; -} - static inline bool cgroup_psi_enabled(void) { return false; -- cgit v1.2.3 From b5a5b9d5f28d23b84f06b45c61dcad95b07d41bc Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Aug 2022 15:38:58 +0200 Subject: serial: document start_rx member at struct uart_ops Fix this doc build warning: ./include/linux/serial_core.h:397: warning: Function parameter or member 'start_rx' not described in 'uart_ops' Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/5d07ae2eec8fbad87e623160f9926b178bef2744.1660829433.git.mchehab@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index aef3145f2032..6e4f4765d209 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -141,6 +141,14 @@ struct gpio_desc; * Locking: none. * Interrupts: caller dependent. * + * @start_rx: ``void ()(struct uart_port *port)`` + * + * Start receiving characters. + * + * Locking: @port->lock taken. + * Interrupts: locally disabled. + * This call must not sleep + * * @stop_rx: ``void ()(struct uart_port *port)`` * * Stop receiving characters; the @port is in the process of being closed. -- cgit v1.2.3 From c1e5c2f0cb8a22ec2e14af92afc7006491bebabb Mon Sep 17 00:00:00 2001 From: Pablo Sun Date: Thu, 4 Aug 2022 11:48:03 +0800 Subject: usb: typec: altmodes/displayport: correct pin assignment for UFP receptacles Fix incorrect pin assignment values when connecting to a monitor with Type-C receptacle instead of a plug. According to specification, an UFP_D receptacle's pin assignment should came from the UFP_D pin assignments field (bit 23:16), while an UFP_D plug's assignments are described in the DFP_D pin assignments (bit 15:8) during Mode Discovery. For example the LG 27 UL850-W is a monitor with Type-C receptacle. The monitor responds to MODE DISCOVERY command with following DisplayPort Capability flag: dp->alt->vdo=0x140045 The existing logic only take cares of UPF_D plug case, and would take the bit 15:8 for this 0x140045 case. This results in an non-existing pin assignment 0x0 in dp_altmode_configure. To fix this problem a new set of macros are introduced to take plug/receptacle differences into consideration. Fixes: 0e3bb7d6894d ("usb: typec: Add driver for DisplayPort alternate mode") Cc: stable@vger.kernel.org Co-developed-by: Pablo Sun Co-developed-by: Macpaul Lin Reviewed-by: Guillaume Ranquet Reviewed-by: Heikki Krogerus Signed-off-by: Pablo Sun Signed-off-by: Macpaul Lin Link: https://lore.kernel.org/r/20220804034803.19486-1-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/altmodes/displayport.c | 4 ++-- include/linux/usb/typec_dp.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c index c1d8c23baa39..de66a2949e33 100644 --- a/drivers/usb/typec/altmodes/displayport.c +++ b/drivers/usb/typec/altmodes/displayport.c @@ -99,8 +99,8 @@ static int dp_altmode_configure(struct dp_altmode *dp, u8 con) case DP_STATUS_CON_UFP_D: case DP_STATUS_CON_BOTH: /* NOTE: First acting as DP source */ conf |= DP_CONF_UFP_U_AS_UFP_D; - pin_assign = DP_CAP_DFP_D_PIN_ASSIGN(dp->alt->vdo) & - DP_CAP_UFP_D_PIN_ASSIGN(dp->port->vdo); + pin_assign = DP_CAP_PIN_ASSIGN_UFP_D(dp->alt->vdo) & + DP_CAP_PIN_ASSIGN_DFP_D(dp->port->vdo); break; default: break; diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h index cfb916cccd31..8d09c2f0a9b8 100644 --- a/include/linux/usb/typec_dp.h +++ b/include/linux/usb/typec_dp.h @@ -73,6 +73,11 @@ enum { #define DP_CAP_USB BIT(7) #define DP_CAP_DFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(15, 8)) >> 8) #define DP_CAP_UFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(23, 16)) >> 16) +/* Get pin assignment taking plug & receptacle into consideration */ +#define DP_CAP_PIN_ASSIGN_UFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ + DP_CAP_UFP_D_PIN_ASSIGN(_cap_) : DP_CAP_DFP_D_PIN_ASSIGN(_cap_)) +#define DP_CAP_PIN_ASSIGN_DFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ + DP_CAP_DFP_D_PIN_ASSIGN(_cap_) : DP_CAP_UFP_D_PIN_ASSIGN(_cap_)) /* DisplayPort Status Update VDO bits */ #define DP_STATUS_CONNECTION(_status_) ((_status_) & 3) -- cgit v1.2.3 From 5535be3099717646781ce1540cf725965d680e7b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Aug 2022 22:56:40 +0200 Subject: mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know that FOLL_FORCE can be possibly dangerous, especially if there are races that can be exploited by user space. Right now, it would be sufficient to have some code that sets a PTE of a R/O-mapped shared page dirty, in order for it to erroneously become writable by FOLL_FORCE. The implications of setting a write-protected PTE dirty might not be immediately obvious to everyone. And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map a shmem page R/O while marking the pte dirty. This can be used by unprivileged user space to modify tmpfs/shmem file content even if the user does not have write permissions to the file, and to bypass memfd write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590). To fix such security issues for good, the insight is that we really only need that fancy retry logic (FOLL_COW) for COW mappings that are not writable (!VM_WRITE). And in a COW mapping, we really only broke COW if we have an exclusive anonymous page mapped. If we have something else mapped, or the mapped anonymous page might be shared (!PageAnonExclusive), we have to trigger a write fault to break COW. If we don't find an exclusive anonymous page when we retry, we have to trigger COW breaking once again because something intervened. Let's move away from this mandatory-retry + dirty handling and rely on our PageAnonExclusive() flag for making a similar decision, to use the same COW logic as in other kernel parts here as well. In case we stumble over a PTE in a COW mapping that does not map an exclusive anonymous page, COW was not properly broken and we have to trigger a fake write-fault to break COW. Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection") and commit 76aefad628aa ("mm/mprotect: fix soft-dirty check in can_change_pte_writable()"), take care of softdirty and uffd-wp manually. For example, a write() via /proc/self/mem to a uffd-wp-protected range has to fail instead of silently granting write access and bypassing the userspace fault handler. Note that FOLL_FORCE is not only used for debug access, but also triggered by applications without debug intentions, for example, when pinning pages via RDMA. This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR. Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So let's just get rid of it. Thanks to Nadav Amit for pointing out that the pte_dirty() check in FOLL_FORCE code is problematic and might be exploitable. Note 1: We don't check for the PTE being dirty because it doesn't matter for making a "was COWed" decision anymore, and whoever modifies the page has to set the page dirty either way. Note 2: Kernels before extended uffd-wp support and before PageAnonExclusive (< 5.19) can simply revert the problematic commit instead and be safe regarding UFFDIO_CONTINUE. A backport to v5.19 requires minor adjustments due to lack of vma_soft_dirty_enabled(). Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte") Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Axel Rasmussen Cc: Nadav Amit Cc: Peter Xu Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: John Hubbard Cc: Jason Gunthorpe Cc: David Laight Cc: [5.16] Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/gup.c | 68 ++++++++++++++++++++++++++++++++++++------------------ mm/huge_memory.c | 64 +++++++++++++++++++++++++++++++++----------------- 3 files changed, 89 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3bedc449c14d..982f2607180b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ diff --git a/mm/gup.c b/mm/gup.c index 732825157430..5abdaf487460 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -478,14 +478,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, return -EEXIST; } -/* - * FOLL_FORCE can write to even unwritable pte's, but only - * after we've gone through a COW cycle and they are dirty. - */ -static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ +static inline bool can_follow_write_pte(pte_t pte, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + /* If the pte is writable, we can write to the page. */ + if (pte_write(pte)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. */ + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) + return false; + return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -528,12 +556,19 @@ retry: } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { - pte_unmap_unlock(ptep, ptl); - return NULL; - } page = vm_normal_page(vma, address, pte); + + /* + * We only care about anon pages in can_follow_write_pte() and don't + * have to worry about pte_devmap() because they are never anon. + */ + if ((flags & FOLL_WRITE) && + !can_follow_write_pte(pte, page, vma, flags)) { + page = NULL; + goto out; + } + if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN @@ -986,17 +1021,6 @@ static int faultin_page(struct vm_area_struct *vma, return -EBUSY; } - /* - * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when - * necessary, even if maybe_mkwrite decided not to set pte_write. We - * can thus safely do subsequent page lookups as if they were reads. - * But only do so when looping for pte_write is futile: in some cases - * userspace may also be wanting to write to the gotten user page, - * which a read fault here might prevent (a readonly page might get - * reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags |= FOLL_COW; return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a7c1b344abe..e9414ee57c5b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1040,12 +1040,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pmd_lockptr(mm, pmd)); - /* - * When we COW a devmap PMD entry, we split it into PTEs, so we should - * not be in this function with `flags & FOLL_COW` set. - */ - WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) @@ -1395,14 +1389,42 @@ fallback: return VM_FAULT_FALLBACK; } -/* - * FOLL_FORCE can write to even unwritable pmd's, but only - * after we've gone through a COW cycle and they are dirty. - */ -static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + /* If the pmd is writable, we can write to the page. */ + if (pmd_write(pmd)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + return !userfaultfd_huge_pmd_wp(vma, pmd); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1411,12 +1433,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; + struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); - if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) - goto out; + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if ((flags & FOLL_WRITE) && + !can_follow_write_pmd(*pmd, page, vma, flags)) + return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) @@ -1424,10 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, /* Full NUMA hinting faults to serialise migration in fault paths */ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) - goto out; - - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + return NULL; if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) return ERR_PTR(-EMLINK); @@ -1444,7 +1467,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); -out: return page; } -- cgit v1.2.3 From a39c5d3ce03dd890ab6a9be44b21177cec32da55 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sun, 7 Aug 2022 15:44:42 +0000 Subject: mm: add DEVICE_ZONE to FOR_ALL_ZONES FOR_ALL_ZONES should be consistent with enum zone_type. Otherwise, __count_zid_vm_events have the potential to add count to wrong item when zid is ZONE_DEVICE. Link: https://lkml.kernel.org/r/20220807154442.GA18167@haolee.io Signed-off-by: Hao Lee Cc: David Hildenbrand Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 15 +++++++++++---- mm/vmstat.c | 9 ++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 404024486fa5..f3fc36cd2276 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -20,12 +20,19 @@ #define HIGHMEM_ZONE(xx) #endif -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE +#ifdef CONFIG_ZONE_DEVICE +#define DEVICE_ZONE(xx) xx##_DEVICE, +#else +#define DEVICE_ZONE(xx) +#endif + +#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \ + HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx) enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, - FOR_ALL_ZONES(PGALLOC), - FOR_ALL_ZONES(ALLOCSTALL), - FOR_ALL_ZONES(PGSCAN_SKIP), + FOR_ALL_ZONES(PGALLOC) + FOR_ALL_ZONES(ALLOCSTALL) + FOR_ALL_ZONES(PGSCAN_SKIP) PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, diff --git a/mm/vmstat.c b/mm/vmstat.c index 373d2730fcf2..90af9a8572f5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1168,8 +1168,15 @@ int fragmentation_index(struct zone *zone, unsigned int order) #define TEXT_FOR_HIGHMEM(xx) #endif +#ifdef CONFIG_ZONE_DEVICE +#define TEXT_FOR_DEVICE(xx) xx "_device", +#else +#define TEXT_FOR_DEVICE(xx) +#endif + #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) xx "_movable", + TEXT_FOR_HIGHMEM(xx) xx "_movable", \ + TEXT_FOR_DEVICE(xx) const char * const vmstat_text[] = { /* enum zone_stat_item counters */ -- cgit v1.2.3 From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 16:13:40 -0400 Subject: mm/uffd: reset write protection when unregister with wp-mode The motivation of this patch comes from a recent report and patchfix from David Hildenbrand on hugetlb shared handling of wr-protected page [1]. With the reproducer provided in commit message of [1], one can leverage the uffd-wp lazy-reset of ptes to trigger a hugetlb issue which can affect not only the attacker process, but also the whole system. The lazy-reset mechanism of uffd-wp was used to make unregister faster, meanwhile it has an assumption that any leftover pgtable entries should only affect the process on its own, so not only the user should be aware of anything it does, but also it should not affect outside of the process. But it seems that this is not true, and it can also be utilized to make some exploit easier. So far there's no clue showing that the lazy-reset is important to any userfaultfd users because normally the unregister will only happen once for a specific range of memory of the lifecycle of the process. Considering all above, what this patch proposes is to do explicit pte resets when unregister an uffd region with wr-protect mode enabled. It should be the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false) right before ioctl(UFFDIO_UNREGISTER) for the user. So potentially it'll make the unregister slower. From that pov it's a very slight abi change, but hopefully nothing should break with this change either. Regarding to the change itself - core of uffd write [un]protect operation is moved into a separate function (uffd_wp_range()) and it is reused in the unregister code path. Note that the new function will not check for anything, e.g. ranges or memory types, because they should have been checked during the previous UFFDIO_REGISTER or it should have failed already. It also doesn't check mmap_changing because we're with mmap write lock held anyway. I added a Fixes upon introducing of uffd-wp shmem+hugetlbfs because that's the only issue reported so far and that's the commit David's reproducer will start working (v5.19+). But the whole idea actually applies to not only file memories but also anonymous. It's just that we don't need to fix anonymous prior to v5.19- because there's no known way to exploit. IOW, this patch can also fix the issue reported in [1] as the patch 2 does. [1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/ Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: David Hildenbrand Cc: Mike Rapoport Cc: Mike Kravetz Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Axel Rasmussen Cc: Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 4 ++++ include/linux/userfaultfd_k.h | 2 ++ mm/userfaultfd.c | 29 ++++++++++++++++++----------- 3 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c44bf75f916..175de70e3adf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1601,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(mm, vma, start, vma_end - start, false); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 732b522bacb7..e1b8a915e9e9 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); +extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, + unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 07d3befc80e4..7327b2573f7c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -703,14 +703,29 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); } +void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, + unsigned long start, unsigned long len, bool enable_wp) +{ + struct mmu_gather tlb; + pgprot_t newprot; + + if (enable_wp) + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + else + newprot = vm_get_page_prot(dst_vma->vm_flags); + + tlb_gather_mmu(&tlb, dst_mm); + change_protection(&tlb, dst_vma, start, start + len, newprot, + enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + tlb_finish_mmu(&tlb); +} + int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; unsigned long page_mask; - struct mmu_gather tlb; - pgprot_t newprot; int err; /* @@ -750,15 +765,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; } - if (enable_wp) - newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); - else - newprot = vm_get_page_prot(dst_vma->vm_flags); - - tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, newprot, - enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); - tlb_finish_mmu(&tlb); + uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); err = 0; out_unlock: -- cgit v1.2.3 From cb241339b9d020c758a6647c69f8e42538c5cf88 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 21:51:09 -0700 Subject: mm/shmem: fix chattr fsflags support in tmpfs ext[234] have always allowed unimplemented chattr flags to be set, but other filesystems have tended to be stricter. Follow the stricter approach for tmpfs: I don't want to have to explain why csu attributes don't actually work, and we won't need to update the chattr(1) manpage; and it's never wrong to start off strict, relaxing later if persuaded. Allow only a (append only) i (immutable) A (no atime) and d (no dump). Although lsattr showed 'A' inherited, the NOATIME behavior was not being inherited: because nothing sync'ed FS_NOATIME_FL to S_NOATIME. Add shmem_set_inode_flags() to sync the flags, using inode_set_flags() to avoid that instant of lost immutablility during fileattr_set(). But that change switched generic/079 from passing to failing: because FS_IMMUTABLE_FL and FS_APPEND_FL had been unconventionally included in the INHERITED fsflags: remove them and generic/079 is back to passing. Link: https://lkml.kernel.org/r/2961dcb0-ddf3-b9f0-3268-12a4ff996856@google.com Fixes: e408e695f5f1 ("mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs") Signed-off-by: Hugh Dickins Cc: "Theodore Ts'o" Cc: Radoslaw Burny Cc: "Darrick J. Wong" Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 13 ++++-------- mm/shmem.c | 54 +++++++++++++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1b6c4013f691..ff0b990de83d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -29,15 +29,10 @@ struct shmem_inode_info { struct inode vfs_inode; }; -#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE -#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE -#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE - -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) - -/* Flags that are appropriate for non-directories/regular files. */ -#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE \ + (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ diff --git a/mm/shmem.c b/mm/shmem.c index 5783f11351bb..170b4078420f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2281,16 +2281,34 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -/* Mask out flags that are inappropriate for the given type of inode. */ -static unsigned shmem_mask_flags(umode_t mode, __u32 flags) +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); + +/* + * chattr's fsflags are unrelated to extended attributes, + * but tmpfs has chosen to enable them under the same config option. + */ +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +{ + unsigned int i_flags = 0; + + if (fsflags & FS_NOATIME_FL) + i_flags |= S_NOATIME; + if (fsflags & FS_APPEND_FL) + i_flags |= S_APPEND; + if (fsflags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + /* + * But FS_NODUMP_FL does not require any action in i_flags. + */ + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); +} +#else +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & SHMEM_REG_FLMASK; - else - return flags & SHMEM_OTHER_FLMASK; } +#define shmem_initxattrs NULL +#endif static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) @@ -2319,7 +2337,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, info->i_crtime = inode->i_mtime; info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; - info->fsflags = shmem_mask_flags(mode, info->fsflags); + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -2468,12 +2487,6 @@ out_unacct_blocks: static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -#ifdef CONFIG_TMPFS_XATTR -static int shmem_initxattrs(struct inode *, const struct xattr *, void *); -#else -#define shmem_initxattrs NULL -#endif - static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, @@ -3179,18 +3192,13 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; + if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) + return -EOPNOTSUPP; info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); - inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); - if (info->fsflags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; - if (info->fsflags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (info->fsflags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; - + shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = current_time(inode); return 0; } -- cgit v1.2.3 From d59b73a66e5e0682442b6d7b4965364e57078b80 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 3 Aug 2022 10:49:23 +0300 Subject: net/mlx5: Avoid false positive lockdep warning by adding lock_class_key Add a lock_class_key per mlx5 device to avoid a false positive "possible circular locking dependency" warning by lockdep, on flows which lock more than one mlx5 device, such as adding SF. kernel log: ====================================================== WARNING: possible circular locking dependency detected 5.19.0-rc8+ #2 Not tainted ------------------------------------------------------ kworker/u20:0/8 is trying to acquire lock: ffff88812dfe0d98 (&dev->intf_state_mutex){+.+.}-{3:3}, at: mlx5_init_one+0x2e/0x490 [mlx5_core] but task is already holding lock: ffff888101aa7898 (&(¬ifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&(¬ifier->n_head)->rwsem){++++}-{3:3}: down_write+0x90/0x150 blocking_notifier_chain_register+0x53/0xa0 mlx5_sf_table_init+0x369/0x4a0 [mlx5_core] mlx5_init_one+0x261/0x490 [mlx5_core] probe_one+0x430/0x680 [mlx5_core] local_pci_probe+0xd6/0x170 work_for_cpu_fn+0x4e/0xa0 process_one_work+0x7c2/0x1340 worker_thread+0x6f6/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 -> #0 (&dev->intf_state_mutex){+.+.}-{3:3}: __lock_acquire+0x2fc7/0x6720 lock_acquire+0x1c1/0x550 __mutex_lock+0x12c/0x14b0 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 bus_for_each_drv+0x123/0x1a0 __device_attach+0x1a3/0x460 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 __auxiliary_device_add+0x88/0xc0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] process_one_work+0x7c2/0x1340 worker_thread+0x59d/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(¬ifier->n_head)->rwsem); lock(&dev->intf_state_mutex); lock(&(¬ifier->n_head)->rwsem); lock(&dev->intf_state_mutex); *** DEADLOCK *** 4 locks held by kworker/u20:0/8: #0: ffff888150612938 ((wq_completion)mlx5_events){+.+.}-{0:0}, at: process_one_work+0x6e2/0x1340 #1: ffff888100cafdb8 ((work_completion)(&work->work)#3){+.+.}-{0:0}, at: process_one_work+0x70f/0x1340 #2: ffff888101aa7898 (&(¬ifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 #3: ffff88813682d0e8 (&dev->mutex){....}-{3:3}, at:__device_attach+0x76/0x460 stack backtrace: CPU: 6 PID: 8 Comm: kworker/u20:0 Not tainted 5.19.0-rc8+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_events mlx5_vhca_state_work_handler [mlx5_core] Call Trace: dump_stack_lvl+0x57/0x7d check_noncircular+0x278/0x300 ? print_circular_bug+0x460/0x460 ? lock_chain_count+0x20/0x20 ? register_lock_class+0x1880/0x1880 __lock_acquire+0x2fc7/0x6720 ? register_lock_class+0x1880/0x1880 ? register_lock_class+0x1880/0x1880 lock_acquire+0x1c1/0x550 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? lockdep_hardirqs_on_prepare+0x400/0x400 __mutex_lock+0x12c/0x14b0 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? _raw_read_unlock+0x1f/0x30 ? mutex_lock_io_nested+0x1320/0x1320 ? __ioremap_caller.constprop.0+0x306/0x490 ? mlx5_sf_dev_probe+0x269/0x370 [mlx5_core] ? iounmap+0x160/0x160 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] ? mlx5_sf_dev_remove+0x130/0x130 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 ? auxiliary_match_id+0xe9/0x140 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 ? driver_allows_async_probing+0x140/0x140 bus_for_each_drv+0x123/0x1a0 ? bus_for_each_dev+0x1a0/0x1a0 ? lockdep_hardirqs_on_prepare+0x286/0x400 ? trace_hardirqs_on+0x2d/0x100 __device_attach+0x1a3/0x460 ? device_driver_attach+0x1e0/0x1e0 ? kobject_uevent_env+0x22d/0xf10 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 ? dev_set_name+0xab/0xe0 ? __fw_devlink_link_to_suppliers+0x260/0x260 ? memset+0x20/0x40 ? lockdep_init_map_type+0x21a/0x7d0 __auxiliary_device_add+0x88/0xc0 ? auxiliary_device_init+0x86/0xa0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] ? mlx5_vhca_event_arm+0x100/0x100 [mlx5_core] ? lock_downgrade+0x6e0/0x6e0 ? lockdep_hardirqs_on_prepare+0x286/0x400 process_one_work+0x7c2/0x1340 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? pwq_dec_nr_in_flight+0x230/0x230 ? rwlock_bug.part.0+0x90/0x90 worker_thread+0x59d/0xec0 ? process_one_work+0x1340/0x1340 kthread+0x28f/0x330 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Fixes: 6a3273217469 ("net/mlx5: SF, Port function state change support") Signed-off-by: Moshe Shemesh Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++++ include/linux/mlx5/driver.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index bec8d6d0b5f6..c085b031abfc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1530,7 +1530,9 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) memcpy(&dev->profile, &profile[profile_idx], sizeof(dev->profile)); INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); + lockdep_register_key(&dev->lock_key); mutex_init(&dev->intf_state_mutex); + lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key); mutex_init(&priv->bfregs.reg_head.lock); mutex_init(&priv->bfregs.wc_head.lock); @@ -1597,6 +1599,7 @@ err_timeout_init: mutex_destroy(&priv->bfregs.wc_head.lock); mutex_destroy(&priv->bfregs.reg_head.lock); mutex_destroy(&dev->intf_state_mutex); + lockdep_unregister_key(&dev->lock_key); return err; } @@ -1618,6 +1621,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev) mutex_destroy(&priv->bfregs.wc_head.lock); mutex_destroy(&priv->bfregs.reg_head.lock); mutex_destroy(&dev->intf_state_mutex); + lockdep_unregister_key(&dev->lock_key); } static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 96b16fbe1aa4..7b7ce602c808 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -779,6 +779,7 @@ struct mlx5_core_dev { enum mlx5_device_state state; /* sync interface state */ struct mutex intf_state_mutex; + struct lock_class_key lock_key; unsigned long intf_state; struct mlx5_priv priv; struct mlx5_profile profile; -- cgit v1.2.3 From 13a8e0f6b01b14b2e28ba144e112c883f03a3db2 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 19 Aug 2022 15:16:11 -0700 Subject: Revert "driver core: Delete driver_deferred_probe_check_state()" This reverts commit 9cbffc7a59561be950ecc675d19a3d2b45202b2b. There are a few more issues to fix that have been reported in the thread for the original series [1]. We'll need to fix those before this will work. So, revert it for now. [1] - https://lore.kernel.org/lkml/20220601070707.3946847-1-saravanak@google.com/ Fixes: 9cbffc7a5956 ("driver core: Delete driver_deferred_probe_check_state()") Tested-by: Tony Lindgren Tested-by: Peng Fan Tested-by: Douglas Anderson Tested-by: Alexander Stein Reviewed-by: Tony Lindgren Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20220819221616.2107893-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 30 ++++++++++++++++++++++++++++++ include/linux/device/driver.h | 1 + 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 70f79fc71539..a8916d1bfdcb 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -274,12 +274,42 @@ static int __init deferred_probe_timeout_setup(char *str) } __setup("deferred_probe_timeout=", deferred_probe_timeout_setup); +/** + * driver_deferred_probe_check_state() - Check deferred probe state + * @dev: device to check + * + * Return: + * * -ENODEV if initcalls have completed and modules are disabled. + * * -ETIMEDOUT if the deferred probe timeout was set and has expired + * and modules are enabled. + * * -EPROBE_DEFER in other cases. + * + * Drivers or subsystems can opt-in to calling this function instead of directly + * returning -EPROBE_DEFER. + */ +int driver_deferred_probe_check_state(struct device *dev) +{ + if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) { + dev_warn(dev, "ignoring dependency for device, assuming no driver\n"); + return -ENODEV; + } + + if (!driver_deferred_probe_timeout && initcalls_done) { + dev_warn(dev, "deferred probe timeout, ignoring dependency\n"); + return -ETIMEDOUT; + } + + return -EPROBE_DEFER; +} +EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state); + static void deferred_probe_timeout_work_func(struct work_struct *work) { struct device_private *p; fw_devlink_drivers_done(); + driver_deferred_probe_timeout = 0; driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 7acaabde5396..2114d65b862f 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -242,6 +242,7 @@ driver_find_device_by_acpi_dev(struct device_driver *drv, const void *adev) extern int driver_deferred_probe_timeout; void driver_deferred_probe_add(struct device *dev); +int driver_deferred_probe_check_state(struct device *dev); void driver_init(void); /** -- cgit v1.2.3 From 7997eff82828304b780dc0a39707e1946d6f1ebf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 20 Aug 2022 17:38:37 +0200 Subject: netfilter: ebtables: reject blobs that don't provide all entry points Harshit Mogalapalli says: In ebt_do_table() function dereferencing 'private->hook_entry[hook]' can lead to NULL pointer dereference. [..] Kernel panic: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] [..] RIP: 0010:ebt_do_table+0x1dc/0x1ce0 Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 5c 16 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b 6c df 08 48 8d 7d 2c 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 88 [..] Call Trace: nf_hook_slow+0xb1/0x170 __br_forward+0x289/0x730 maybe_deliver+0x24b/0x380 br_flood+0xc6/0x390 br_dev_xmit+0xa2e/0x12c0 For some reason ebtables rejects blobs that provide entry points that are not supported by the table, but what it should instead reject is the opposite: blobs that DO NOT provide an entry point supported by the table. t->valid_hooks is the bitmask of hooks (input, forward ...) that will see packets. Providing an entry point that is not support is harmless (never called/used), but the inverse isn't: it results in a crash because the ebtables traverser doesn't expect a NULL blob for a location its receiving packets for. Instead of fixing all the individual checks, do what iptables is doing and reject all blobs that differ from the expected hooks. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Harshit Mogalapalli Reported-by: syzkaller Signed-off-by: Florian Westphal --- include/linux/netfilter_bridge/ebtables.h | 4 ---- net/bridge/netfilter/ebtable_broute.c | 8 -------- net/bridge/netfilter/ebtable_filter.c | 8 -------- net/bridge/netfilter/ebtable_nat.c | 8 -------- net/bridge/netfilter/ebtables.c | 8 +------- 5 files changed, 1 insertion(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index a13296d6c7ce..fd533552a062 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -94,10 +94,6 @@ struct ebt_table { struct ebt_replace_kernel *table; unsigned int valid_hooks; rwlock_t lock; - /* e.g. could be the table explicitly only allows certain - * matches, targets, ... 0 == let it in */ - int (*check)(const struct ebt_table_info *info, - unsigned int valid_hooks); /* the data used by the kernel */ struct ebt_table_info *private; struct nf_hook_ops *ops; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 1a11064f9990..8f19253024b0 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -36,18 +36,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)&initial_chain, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~(1 << NF_BR_BROUTING)) - return -EINVAL; - return 0; -} - static const struct ebt_table broute_table = { .name = "broute", .table = &initial_table, .valid_hooks = 1 << NF_BR_BROUTING, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index cb949436bc0e..278f324e6752 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~FILTER_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_filter = { .name = "filter", .table = &initial_table, .valid_hooks = FILTER_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 5ee0531ae506..9066f7f376d5 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~NAT_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_nat = { .name = "nat", .table = &initial_table, .valid_hooks = NAT_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index f2dbefb61ce8..9a0ae59cdc50 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1040,8 +1040,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_iterate; } - /* the table doesn't like it */ - if (t->check && (ret = t->check(newinfo, repl->valid_hooks))) + if (repl->valid_hooks != t->valid_hooks) goto free_unlock; if (repl->num_counters && repl->num_counters != t->private->nentries) { @@ -1231,11 +1230,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, if (ret != 0) goto free_chainstack; - if (table->check && table->check(newinfo, table->valid_hooks)) { - ret = -EINVAL; - goto free_chainstack; - } - table->private = newinfo; rwlock_init(&table->lock); mutex_lock(&ebt_mutex); -- cgit v1.2.3 From af67508ea6cbf0e4ea27f8120056fa2efce127dd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:56 -0700 Subject: net: Fix data-races around sysctl_fb_tunnels_only_for_init_net. While reading sysctl_fb_tunnels_only_for_init_net, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..6d3a33fd0cdb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -640,9 +640,14 @@ extern int sysctl_devconf_inherit_init_net; */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return !IS_ENABLED(CONFIG_SYSCTL) || - !sysctl_fb_tunnels_only_for_init_net || - (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1); +#if IS_ENABLED(CONFIG_SYSCTL) + int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); + + return !fb_tunnels_only_for_init_net || + (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); +#else + return true; +#endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) -- cgit v1.2.3 From a5612ca10d1aa05624ebe72633e0c8c792970833 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:57 -0700 Subject: net: Fix data-races around sysctl_devconf_inherit_init_net. While reading sysctl_devconf_inherit_init_net, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 856c395cfa63 ("net: introduce a knob to control whether to inherit devconf config") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ net/ipv4/devinet.c | 16 ++++++++++------ net/ipv6/addrconf.c | 5 ++--- 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6d3a33fd0cdb..05d6f3facd5a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -650,6 +650,15 @@ static inline bool net_has_fallback_tunnels(const struct net *net) #endif } +static inline int net_inherit_devconf(void) +{ +#if IS_ENABLED(CONFIG_SYSCTL) + return READ_ONCE(sysctl_devconf_inherit_init_net); +#else + return 0; +#endif +} + static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 92b778e423df..e8b9a9202fec 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2682,23 +2682,27 @@ static __net_init int devinet_init_net(struct net *net) #endif if (!net_eq(net, &init_net)) { - if (IS_ENABLED(CONFIG_SYSCTL) && - sysctl_devconf_inherit_init_net == 3) { + switch (net_inherit_devconf()) { + case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); - } else if (!IS_ENABLED(CONFIG_SYSCTL) || - sysctl_devconf_inherit_init_net != 2) { - /* inherit == 0 or 1: copy from init_net */ + break; + case 0: + case 1: + /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); + break; + case 2: + /* use compiled values */ + break; } - /* else inherit == 2: use compiled values */ } #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b624e3d8c5f0..e15f64f22fa8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7162,9 +7162,8 @@ static int __net_init addrconf_init_net(struct net *net) if (!dflt) goto err_alloc_dflt; - if (IS_ENABLED(CONFIG_SYSCTL) && - !net_eq(net, &init_net)) { - switch (sysctl_devconf_inherit_init_net) { + if (!net_eq(net, &init_net)) { + switch (net_inherit_devconf()) { case 1: /* copy from init_net */ memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); -- cgit v1.2.3 From 2a5840124009f133bd09fd855963551fb2cefe22 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 15 Jul 2022 12:16:22 -0700 Subject: lsm,io_uring: add LSM hooks for the new uring_cmd file op io-uring cmd support was added through ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd"), this extended the struct file_operations to allow a new command which each subsystem can use to enable command passthrough. Add an LSM specific for the command passthrough which enables LSMs to inspect the command details. This was discussed long ago without no clear pointer for something conclusive, so this enables LSMs to at least reject this new file operation. [0] https://lkml.kernel.org/r/8adf55db-7bab-f59d-d612-ed906b948d19@schaufler-ca.com Cc: stable@vger.kernel.org Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd") Signed-off-by: Luis Chamberlain Acked-by: Jens Axboe Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/lsm_hooks.h | 3 +++ include/linux/security.h | 5 +++++ io_uring/uring_cmd.c | 5 +++++ security/security.c | 4 ++++ 5 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 806448173033..60fff133c0b1 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -407,4 +407,5 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #ifdef CONFIG_IO_URING LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) +LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 84a0d7e02176..3aa6030302f5 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1582,6 +1582,9 @@ * Check whether the current task is allowed to spawn a io_uring polling * thread (IORING_SETUP_SQPOLL). * + * @uring_cmd: + * Check whether the file_operations uring_cmd is allowed to run. + * */ union security_list_options { #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); diff --git a/include/linux/security.h b/include/linux/security.h index 1bc362cb413f..7bd0c490703d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2060,6 +2060,7 @@ static inline int security_perf_event_write(struct perf_event *event) #ifdef CONFIG_SECURITY extern int security_uring_override_creds(const struct cred *new); extern int security_uring_sqpoll(void); +extern int security_uring_cmd(struct io_uring_cmd *ioucmd); #else static inline int security_uring_override_creds(const struct cred *new) { @@ -2069,6 +2070,10 @@ static inline int security_uring_sqpoll(void) { return 0; } +static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_IO_URING */ diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8e0cc2d9205e..0f7ad956ddcb 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -88,6 +89,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (!req->file->f_op->uring_cmd) return -EOPNOTSUPP; + ret = security_uring_cmd(ioucmd); + if (ret) + return ret; + if (ctx->flags & IORING_SETUP_SQE128) issue_flags |= IO_URING_F_SQE128; if (ctx->flags & IORING_SETUP_CQE32) diff --git a/security/security.c b/security/security.c index 14d30fec8a00..4b95de24bc8d 100644 --- a/security/security.c +++ b/security/security.c @@ -2660,4 +2660,8 @@ int security_uring_sqpoll(void) { return call_int_hook(uring_sqpoll, 0); } +int security_uring_cmd(struct io_uring_cmd *ioucmd) +{ + return call_int_hook(uring_cmd, 0, ioucmd); +} #endif /* CONFIG_IO_URING */ -- cgit v1.2.3 From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 26 Aug 2022 09:17:08 -0400 Subject: wait_on_bit: add an acquire memory barrier There are several places in the kernel where wait_on_bit is not followed by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read). On architectures with weak memory ordering, it may happen that memory accesses that follow wait_on_bit are reordered before wait_on_bit and they may return invalid data. Fix this class of bugs by introducing a new function "test_bit_acquire" that works like test_bit, but has acquire memory ordering semantics. Signed-off-by: Mikulas Patocka Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- Documentation/atomic_bitops.txt | 10 ++++------ arch/x86/include/asm/bitops.h | 21 +++++++++++++++++++++ include/asm-generic/bitops/generic-non-atomic.h | 14 ++++++++++++++ .../asm-generic/bitops/instrumented-non-atomic.h | 12 ++++++++++++ include/asm-generic/bitops/non-atomic.h | 1 + .../bitops/non-instrumented-non-atomic.h | 1 + include/linux/bitops.h | 1 + include/linux/buffer_head.h | 2 +- include/linux/wait_bit.h | 8 ++++---- kernel/sched/wait_bit.c | 2 +- 10 files changed, 60 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt index d8b101c97031..edea4656c5c0 100644 --- a/Documentation/atomic_bitops.txt +++ b/Documentation/atomic_bitops.txt @@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is: - RMW operations that have a return value are fully ordered. - - RMW operations that are conditional are unordered on FAILURE, - otherwise the above rules apply. In the case of test_and_set_bit_lock(), - if the bit in memory is unchanged by the operation then it is deemed to have - failed. + - RMW operations that are conditional are fully ordered. -Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and -clear_bit_unlock() which has RELEASE semantics. +Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics, +clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has +ACQUIRE semantics. Since a platform only has a single means of achieving atomic operations the same barriers as for atomic_t are used, see atomic_t.txt. diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 973c6bd17f98..0fe9de58af31 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } +static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) +{ + bool oldbit; + + asm volatile("testb %2,%1" + CC_SET(nz) + : CC_OUT(nz) (oldbit) + : "m" (((unsigned char *)addr)[nr >> 3]), + "i" (1 << (nr & 7)) + :"memory"); + + return oldbit; +} + static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; @@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr) variable_test_bit(nr, addr); } +static __always_inline bool +arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : + variable_test_bit(nr, addr); +} + /** * __ffs - find first set bit in word * @word: The word to search diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h index 3d5ebd24652b..564a8c675d85 100644 --- a/include/asm-generic/bitops/generic-non-atomic.h +++ b/include/asm-generic/bitops/generic-non-atomic.h @@ -4,6 +4,7 @@ #define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H #include +#include #ifndef _LINUX_BITOPS_H #error only can be included directly @@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr) return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } +/** + * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static __always_inline bool +generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1))); +} + /* * const_*() definitions provide good compile-time optimizations when * the passed arguments can be resolved at compile time. @@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr) #define const___test_and_set_bit generic___test_and_set_bit #define const___test_and_clear_bit generic___test_and_clear_bit #define const___test_and_change_bit generic___test_and_change_bit +#define const_test_bit_acquire generic_test_bit_acquire /** * const_test_bit - Determine whether a bit is set diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h index 988a3bbfba34..2b238b161a62 100644 --- a/include/asm-generic/bitops/instrumented-non-atomic.h +++ b/include/asm-generic/bitops/instrumented-non-atomic.h @@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr) return arch_test_bit(nr, addr); } +/** + * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static __always_inline bool +_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); + return arch_test_bit_acquire(nr, addr); +} + #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h index 5c37ced343ae..71f8d54a5195 100644 --- a/include/asm-generic/bitops/non-atomic.h +++ b/include/asm-generic/bitops/non-atomic.h @@ -13,6 +13,7 @@ #define arch___test_and_change_bit generic___test_and_change_bit #define arch_test_bit generic_test_bit +#define arch_test_bit_acquire generic_test_bit_acquire #include diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h index bdb9b1ffaee9..0ddc78dfc358 100644 --- a/include/asm-generic/bitops/non-instrumented-non-atomic.h +++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h @@ -12,5 +12,6 @@ #define ___test_and_change_bit arch___test_and_change_bit #define _test_bit arch_test_bit +#define _test_bit_acquire arch_test_bit_acquire #endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */ diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cf9bf65039f2..3b89c64bcfd8 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w); #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) +#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index def8b8d30ccc..089c9ade4325 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh) * make it consistent with folio_test_uptodate * pairs with smp_mb__before_atomic in set_buffer_uptodate */ - return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0; + return test_bit_acquire(BH_Uptodate, &bh->b_state); } #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 7dec36aecbd9..7725b7579b78 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -71,7 +71,7 @@ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, @@ -96,7 +96,7 @@ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, @@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, @@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); - if (!test_bit(bit, word)) + if (!test_bit_acquire(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index d4788f810b55..0b1cd985dc27 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); -- cgit v1.2.3 From fcab34b433e2c13e333b2f53c4a8409eadc432c7 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 10 Aug 2022 10:53:59 -0600 Subject: mm: re-allow pinning of zero pfns (again) The below referenced commit makes the same error as 1c563432588d ("mm: fix is_pinnable_page against a cma page"), re-interpreting the logic to exclude pinning of the zero page, which breaks device assignment with vfio. To avoid further subtle mistakes, split the logic into discrete tests. [akpm@linux-foundation.org: simplify comment, per John] Link: https://lkml.kernel.org/r/166015037385.760108.16881097713975517242.stgit@omen Link: https://lore.kernel.org/all/165490039431.944052.12458624139225785964.stgit@omen Fixes: f25cbb7a95a2 ("mm: add zone device coherent type memory support") Signed-off-by: Alex Williamson Suggested-by: Matthew Wilcox Suggested-by: Felix Kuehling Tested-by: Slawomir Laba Reviewed-by: John Hubbard Cc: Alex Sierra Cc: Christoph Hellwig Cc: Alistair Popple Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 982f2607180b..21f8b27bd9fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1544,9 +1544,16 @@ static inline bool is_longterm_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - return !(is_device_coherent_page(page) || - is_zone_movable_page(page) || - is_zero_pfn(page_to_pfn(page))); + /* The zero page may always be pinned */ + if (is_zero_pfn(page_to_pfn(page))) + return true; + + /* Coherent device memory must always allow eviction. */ + if (is_device_coherent_page(page)) + return false; + + /* Otherwise, non-movable zone pages can be pinned. */ + return !is_zone_movable_page(page); } #else static inline bool is_longterm_pinnable_page(struct page *page) -- cgit v1.2.3 From dbb16df6443c59e8a1ef21c2272fcf387d600ddf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 17 Aug 2022 17:21:39 +0000 Subject: Revert "memcg: cleanup racy sum avoidance code" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 96e51ccf1af33e82f429a0d6baebba29c6448d0f. Recently we started running the kernel with rstat infrastructure on production traffic and begin to see negative memcg stats values. Particularly the 'sock' stat is the one which we observed having negative value. $ grep "sock " /mnt/memory/job/memory.stat sock 253952 total_sock 18446744073708724224 Re-run after couple of seconds $ grep "sock " /mnt/memory/job/memory.stat sock 253952 total_sock 53248 For now we are only seeing this issue on large machines (256 CPUs) and only with 'sock' stat. I think the networking stack increase the stat on one cpu and decrease it on another cpu much more often. So, this negative sock is due to rstat flusher flushing the stats on the CPU that has seen the decrement of sock but missed the CPU that has increments. A typical race condition. For easy stable backport, revert is the most simple solution. For long term solution, I am thinking of two directions. First is just reduce the race window by optimizing the rstat flusher. Second is if the reader sees a negative stat value, force flush and restart the stat collection. Basically retry but limited. Link: https://lkml.kernel.org/r/20220817172139.3141101-1-shakeelb@google.com Fixes: 96e51ccf1af33e8 ("memcg: cleanup racy sum avoidance code") Signed-off-by: Shakeel Butt Cc: "Michal Koutný" Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: David Hildenbrand Cc: Yosry Ahmed Cc: Greg Thelen Cc: [5.15] Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4d31ce55b1c0..6257867fbf95 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -987,19 +987,30 @@ static inline void mod_memcg_page_state(struct page *page, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - return READ_ONCE(memcg->vmstats.state[idx]); + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return READ_ONCE(pn->lruvec_stats.state[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, -- cgit v1.2.3 From dcf8e5633e2e69ad60b730ab5905608b756a032f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 23 Aug 2022 12:59:25 -0700 Subject: tracing: Define the is_signed_type() macro once There are two definitions of the is_signed_type() macro: one in and a second definition in . As suggested by Linus, move the definition of the is_signed_type() macro into the header file. Change the definition of the is_signed_type() macro to make sure that it does not trigger any sparse warnings with future versions of sparse for bitwise types. Link: https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/ Link: https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/ Cc: Rasmus Villemoes Cc: Steven Rostedt Acked-by: Kees Cook Signed-off-by: Bart Van Assche Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 6 ++++++ include/linux/overflow.h | 1 - include/linux/trace_events.h | 2 -- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 01ce94b58b42..7713d7bcdaea 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * Whether 'type' is a signed type or an unsigned type. Supports scalar types, + * bool and also pointer types. + */ +#define is_signed_type(type) (((type)(-1)) < (__force type)1) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. diff --git a/include/linux/overflow.h b/include/linux/overflow.h index f1221d11f8e5..0eb3b192f07a 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -30,7 +30,6 @@ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ -#define is_signed_type(type) (((type)(-1)) < (type)1) #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index b18759a673c6..8401dec93c15 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); -#define is_signed_type(type) (((type)(-1)) < (type)1) - int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, -- cgit v1.2.3 From 25af7406df5915f04d5f1c8f081dabb0ead1cdcc Mon Sep 17 00:00:00 2001 From: Isaac Manjarres Date: Thu, 18 Aug 2022 18:28:51 +0100 Subject: ARM: 9229/1: amba: Fix use-after-free in amba_read_periphid() After commit f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device addition"), it became possible for amba_read_periphid() to be invoked concurrently from two threads for a particular AMBA device. Consider the case where a thread (T0) is registering an AMBA driver, and searching for all of the devices it can match with on the AMBA bus. Suppose that another thread (T1) is executing the deferred probe work, and is searching through all of the AMBA drivers on the bus for a driver that matches a particular AMBA device. Assume that both threads begin operating on the same AMBA device and the device's peripheral ID is still unknown. In this scenario, the amba_match() function will be invoked for the same AMBA device by both threads, which means amba_read_periphid() can also be invoked by both threads, and both threads will be able to manipulate the AMBA device's pclk pointer without any synchronization. It's possible that one thread will initialize the pclk pointer, then the other thread will re-initialize it, overwriting the previous value, and both will race to free the same pclk, resulting in a use-after-free for whichever thread frees the pclk last. Add a lock per AMBA device to synchronize the handling with detecting the peripheral ID to avoid the use-after-free scenario. The following KFENCE bug report helped detect this problem: ================================================================== BUG: KFENCE: use-after-free read in clk_disable+0x14/0x34 Use-after-free read at 0x(ptrval) (in kfence-#19): clk_disable+0x14/0x34 amba_read_periphid+0xdc/0x134 amba_match+0x3c/0x84 __driver_attach+0x20/0x158 bus_for_each_dev+0x74/0xc0 bus_add_driver+0x154/0x1e8 driver_register+0x88/0x11c do_one_initcall+0x8c/0x2fc kernel_init_freeable+0x190/0x220 kernel_init+0x10/0x108 ret_from_fork+0x14/0x3c 0x0 kfence-#19: 0x(ptrval)-0x(ptrval), size=36, cache=kmalloc-64 allocated by task 8 on cpu 0 at 11.629931s: clk_hw_create_clk+0x38/0x134 amba_get_enable_pclk+0x10/0x68 amba_read_periphid+0x28/0x134 amba_match+0x3c/0x84 __device_attach_driver+0x2c/0xc4 bus_for_each_drv+0x80/0xd0 __device_attach+0xb0/0x1f0 bus_probe_device+0x88/0x90 deferred_probe_work_func+0x8c/0xc0 process_one_work+0x23c/0x690 worker_thread+0x34/0x488 kthread+0xd4/0xfc ret_from_fork+0x14/0x3c 0x0 freed by task 8 on cpu 0 at 11.630095s: amba_read_periphid+0xec/0x134 amba_match+0x3c/0x84 __device_attach_driver+0x2c/0xc4 bus_for_each_drv+0x80/0xd0 __device_attach+0xb0/0x1f0 bus_probe_device+0x88/0x90 deferred_probe_work_func+0x8c/0xc0 process_one_work+0x23c/0x690 worker_thread+0x34/0x488 kthread+0xd4/0xfc ret_from_fork+0x14/0x3c 0x0 Cc: Saravana Kannan Cc: patches@armlinux.org.uk Fixes: f2d3b9a46e0e ("ARM: 9220/1: amba: Remove deferred device addition") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Isaac J. Manjarres Signed-off-by: Russell King (Oracle) --- drivers/amba/bus.c | 8 +++++++- include/linux/amba/bus.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 32b0e0b930c1..110a535648d2 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -209,6 +209,7 @@ static int amba_match(struct device *dev, struct device_driver *drv) struct amba_device *pcdev = to_amba_device(dev); struct amba_driver *pcdrv = to_amba_driver(drv); + mutex_lock(&pcdev->periphid_lock); if (!pcdev->periphid) { int ret = amba_read_periphid(pcdev); @@ -218,11 +219,14 @@ static int amba_match(struct device *dev, struct device_driver *drv) * permanent failure in reading pid and cid, simply map it to * -EPROBE_DEFER. */ - if (ret) + if (ret) { + mutex_unlock(&pcdev->periphid_lock); return -EPROBE_DEFER; + } dev_set_uevent_suppress(dev, false); kobject_uevent(&dev->kobj, KOBJ_ADD); } + mutex_unlock(&pcdev->periphid_lock); /* When driver_override is set, only bind to the matching driver */ if (pcdev->driver_override) @@ -532,6 +536,7 @@ static void amba_device_release(struct device *dev) if (d->res.parent) release_resource(&d->res); + mutex_destroy(&d->periphid_lock); kfree(d); } @@ -584,6 +589,7 @@ static void amba_device_initialize(struct amba_device *dev, const char *name) dev->dev.dma_mask = &dev->dev.coherent_dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->res.name = dev_name(&dev->dev); + mutex_init(&dev->periphid_lock); } /** diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index e94cdf235f1d..5001e14c5c06 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -67,6 +67,7 @@ struct amba_device { struct clk *pclk; struct device_dma_parameters dma_parms; unsigned int periphid; + struct mutex periphid_lock; unsigned int cid; struct amba_cs_uci_id uci; unsigned int irq[AMBA_NR_IRQS]; -- cgit v1.2.3 From 9c6d778800b921bde3bff3cff5003d1650f942d1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 26 Aug 2022 15:31:32 -0400 Subject: USB: core: Prevent nested device-reset calls Automatic kernel fuzzing revealed a recursive locking violation in usb-storage: ============================================ WARNING: possible recursive locking detected 5.18.0 #3 Not tainted -------------------------------------------- kworker/1:3/1205 is trying to acquire lock: ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at: usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 but task is already holding lock: ffff888018638db8 (&us_interface_key[i]){+.+.}-{3:3}, at: usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 ... stack backtrace: CPU: 1 PID: 1205 Comm: kworker/1:3 Not tainted 5.18.0 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_deadlock_bug kernel/locking/lockdep.c:2988 [inline] check_deadlock kernel/locking/lockdep.c:3031 [inline] validate_chain kernel/locking/lockdep.c:3816 [inline] __lock_acquire.cold+0x152/0x3ca kernel/locking/lockdep.c:5053 lock_acquire kernel/locking/lockdep.c:5665 [inline] lock_acquire+0x1ab/0x520 kernel/locking/lockdep.c:5630 __mutex_lock_common kernel/locking/mutex.c:603 [inline] __mutex_lock+0x14f/0x1610 kernel/locking/mutex.c:747 usb_stor_pre_reset+0x35/0x40 drivers/usb/storage/usb.c:230 usb_reset_device+0x37d/0x9a0 drivers/usb/core/hub.c:6109 r871xu_dev_remove+0x21a/0x270 drivers/staging/rtl8712/usb_intf.c:622 usb_unbind_interface+0x1bd/0x890 drivers/usb/core/driver.c:458 device_remove drivers/base/dd.c:545 [inline] device_remove+0x11f/0x170 drivers/base/dd.c:537 __device_release_driver drivers/base/dd.c:1222 [inline] device_release_driver_internal+0x1a7/0x2f0 drivers/base/dd.c:1248 usb_driver_release_interface+0x102/0x180 drivers/usb/core/driver.c:627 usb_forced_unbind_intf+0x4d/0xa0 drivers/usb/core/driver.c:1118 usb_reset_device+0x39b/0x9a0 drivers/usb/core/hub.c:6114 This turned out not to be an error in usb-storage but rather a nested device reset attempt. That is, as the rtl8712 driver was being unbound from a composite device in preparation for an unrelated USB reset (that driver does not have pre_reset or post_reset callbacks), its ->remove routine called usb_reset_device() -- thus nesting one reset call within another. Performing a reset as part of disconnect processing is a questionable practice at best. However, the bug report points out that the USB core does not have any protection against nested resets. Adding a reset_in_progress flag and testing it will prevent such errors in the future. Link: https://lore.kernel.org/all/CAB7eexKUpvX-JNiLzhXBDWgfg2T9e9_0Tw4HQ6keN==voRbP0g@mail.gmail.com/ Cc: stable@vger.kernel.org Reported-and-tested-by: Rondreis Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/YwkflDxvg0KWqyZK@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 10 ++++++++++ include/linux/usb.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 2633acde7ac1..d4b1e70d1498 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -6038,6 +6038,11 @@ re_enumerate: * the reset is over (using their post_reset method). * * Return: The same as for usb_reset_and_verify_device(). + * However, if a reset is already in progress (for instance, if a + * driver doesn't have pre_ or post_reset() callbacks, and while + * being unbound or re-bound during the ongoing reset its disconnect() + * or probe() routine tries to perform a second, nested reset), the + * routine returns -EINPROGRESS. * * Note: * The caller must own the device lock. For example, it's safe to use @@ -6071,6 +6076,10 @@ int usb_reset_device(struct usb_device *udev) return -EISDIR; } + if (udev->reset_in_progress) + return -EINPROGRESS; + udev->reset_in_progress = 1; + port_dev = hub->ports[udev->portnum - 1]; /* @@ -6135,6 +6144,7 @@ int usb_reset_device(struct usb_device *udev) usb_autosuspend_device(udev); memalloc_noio_restore(noio_flag); + udev->reset_in_progress = 0; return ret; } EXPORT_SYMBOL_GPL(usb_reset_device); diff --git a/include/linux/usb.h b/include/linux/usb.h index f7a9914fc97f..9ff1ad4dfad1 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -575,6 +575,7 @@ struct usb3_lpm_parameters { * @devaddr: device address, XHCI: assigned by HW, others: same as devnum * @can_submit: URBs may be submitted * @persist_enabled: USB_PERSIST enabled for this device + * @reset_in_progress: the device is being reset * @have_langid: whether string_langid is valid * @authorized: policy has said we can use it; * (user space) policy determines if we authorize this device to be @@ -662,6 +663,7 @@ struct usb_device { unsigned can_submit:1; unsigned persist_enabled:1; + unsigned reset_in_progress:1; unsigned have_langid:1; unsigned authorized:1; unsigned authenticated:1; -- cgit v1.2.3 From ec1bd37123c607ca6485beb4542a792a4db765aa Mon Sep 17 00:00:00 2001 From: Khalid Masum Date: Thu, 18 Aug 2022 10:07:38 +0600 Subject: fscache: fix misdocumented parameter This patch fixes two warnings generated by make docs. The functions fscache_use_cookie and fscache_unuse_cookie, both have a parameter named cookie. But they are documented with the name "object" with unclear description. Which generates the warning when creating docs. This commit will replace the currently misdocumented parameter names with the correct ones while adding proper descriptions. CC: Randy Dunlap Signed-off-by: Khalid Masum Signed-off-by: David Howells Link: https://lore.kernel.org/r/20220521142446.4746-1-khalid.masum.92@gmail.com/ # v1 Link: https://lore.kernel.org/r/20220818040738.12036-1-khalid.masum.92@gmail.com/ # v2 Link: https://lore.kernel.org/r/880d7d25753fb326ee17ac08005952112fcf9bdb.1657360984.git.mchehab@kernel.org/ # Mauro's version --- include/linux/fscache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 720874e6ee94..36e5dd84cf59 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -258,7 +258,7 @@ struct fscache_cookie *fscache_acquire_cookie(struct fscache_volume *volume, /** * fscache_use_cookie - Request usage of cookie attached to an object - * @object: Object description + * @cookie: The cookie representing the cache object * @will_modify: If cache is expected to be modified locally * * Request usage of the cookie attached to an object. The caller should tell @@ -274,7 +274,7 @@ static inline void fscache_use_cookie(struct fscache_cookie *cookie, /** * fscache_unuse_cookie - Cease usage of cookie attached to an object - * @object: Object description + * @cookie: The cookie representing the cache object * @aux_data: Updated auxiliary data (or NULL) * @object_size: Revised size of the object (or NULL) * -- cgit v1.2.3 From 2555283eb40df89945557273121e9393ef9b542b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 31 Aug 2022 19:06:00 +0200 Subject: mm/rmap: Fix anon_vma->degree ambiguity leading to double-reuse anon_vma->degree tracks the combined number of child anon_vmas and VMAs that use the anon_vma as their ->anon_vma. anon_vma_clone() then assumes that for any anon_vma attached to src->anon_vma_chain other than src->anon_vma, it is impossible for it to be a leaf node of the VMA tree, meaning that for such VMAs ->degree is elevated by 1 because of a child anon_vma, meaning that if ->degree equals 1 there are no VMAs that use the anon_vma as their ->anon_vma. This assumption is wrong because the ->degree optimization leads to leaf nodes being abandoned on anon_vma_clone() - an existing anon_vma is reused and no new parent-child relationship is created. So it is possible to reuse an anon_vma for one VMA while it is still tied to another VMA. This is an issue because is_mergeable_anon_vma() and its callers assume that if two VMAs have the same ->anon_vma, the list of anon_vmas attached to the VMAs is guaranteed to be the same. When this assumption is violated, vma_merge() can merge pages into a VMA that is not attached to the corresponding anon_vma, leading to dangling page->mapping pointers that will be dereferenced during rmap walks. Fix it by separately tracking the number of child anon_vmas and the number of VMAs using the anon_vma as their ->anon_vma. Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") Cc: stable@kernel.org Acked-by: Michal Hocko Acked-by: Vlastimil Babka Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 7 +++++-- mm/rmap.c | 29 ++++++++++++++++------------- 2 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf80adca980b..b89b4b86951f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -41,12 +41,15 @@ struct anon_vma { atomic_t refcount; /* - * Count of child anon_vmas and VMAs which points to this anon_vma. + * Count of child anon_vmas. Equals to the count of all anon_vmas that + * have ->parent pointing to this one, including itself. * * This counter is used for making decision about reusing anon_vma * instead of forking new one. See comments in function anon_vma_clone. */ - unsigned degree; + unsigned long num_children; + /* Count of VMAs whose ->anon_vma pointer points to this object. */ + unsigned long num_active_vmas; struct anon_vma *parent; /* Parent of this anon_vma */ diff --git a/mm/rmap.c b/mm/rmap.c index edc06c52bc82..93d5a6f793d2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -93,7 +93,8 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); - anon_vma->degree = 1; /* Reference for first vma */ + anon_vma->num_children = 0; + anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. If called @@ -201,6 +202,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; + anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } @@ -210,8 +212,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; + anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } @@ -296,19 +297,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) anon_vma_chain_link(dst, avc, anon_vma); /* - * Reuse existing anon_vma if its degree lower than two, - * that means it has no vma and only one anon_vma child. + * Reuse existing anon_vma if it has no vma and only one + * anon_vma child. * - * Do not choose parent anon_vma, otherwise first child - * will always reuse it. Root anon_vma is never reused: + * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ if (!dst->anon_vma && src->anon_vma && - anon_vma != src->anon_vma && anon_vma->degree < 2) + anon_vma->num_children < 2 && + anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) - dst->anon_vma->degree++; + dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; @@ -358,6 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; + anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; @@ -378,7 +380,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); - anon_vma->parent->degree++; + anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; @@ -410,7 +412,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { - anon_vma->parent->degree--; + anon_vma->parent->num_children--; continue; } @@ -418,7 +420,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) anon_vma_chain_free(avc); } if (vma->anon_vma) { - vma->anon_vma->degree--; + vma->anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared @@ -436,7 +438,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - VM_WARN_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->num_children); + VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); -- cgit v1.2.3 From ac56a0b48da86fd1b4389632fb7c4c8a5d86eefa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 Aug 2022 15:39:28 +0100 Subject: rxrpc: Fix ICMP/ICMP6 error handling Because rxrpc pretends to be a tunnel on top of a UDP/UDP6 socket, allowing it to siphon off UDP packets early in the handling of received UDP packets thereby avoiding the packet going through the UDP receive queue, it doesn't get ICMP packets through the UDP ->sk_error_report() callback. In fact, it doesn't appear that there's any usable option for getting hold of ICMP packets. Fix this by adding a new UDP encap hook to distribute error messages for UDP tunnels. If the hook is set, then the tunnel driver will be able to see ICMP packets. The hook provides the offset into the packet of the UDP header of the original packet that caused the notification. An alternative would be to call the ->error_handler() hook - but that requires that the skbuff be cloned (as ip_icmp_error() or ipv6_cmp_error() do, though isn't really necessary or desirable in rxrpc's case is we want to parse them there and then, not queue them). Changes ======= ver #3) - Fixed an uninitialised variable. ver #2) - Fixed some missing CONFIG_AF_RXRPC_IPV6 conditionals. Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Signed-off-by: David Howells --- include/linux/udp.h | 1 + include/net/udp_tunnel.h | 4 + net/ipv4/udp.c | 2 + net/ipv4/udp_tunnel_core.c | 1 + net/ipv6/udp.c | 5 +- net/rxrpc/ar-internal.h | 1 + net/rxrpc/local_object.c | 1 + net/rxrpc/peer_event.c | 293 +++++++++++++++++++++++++++++++++++++++------ 8 files changed, 270 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 254a2654400f..e96da4157d04 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,6 +70,7 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index afc7ce713657..72394f441dad 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -67,6 +67,9 @@ static inline int udp_sock_create(struct net *net, typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); +typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, + struct sk_buff *skb, + unsigned int udp_offset); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, @@ -80,6 +83,7 @@ struct udp_tunnel_sock_cfg { __u8 encap_type; udp_tunnel_encap_rcv_t encap_rcv; udp_tunnel_encap_err_lookup_t encap_err_lookup; + udp_tunnel_encap_err_rcv_t encap_err_rcv; udp_tunnel_encap_destroy_t encap_destroy; udp_tunnel_gro_receive_t gro_receive; udp_tunnel_gro_complete_t gro_complete; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 34eda973bbf1..cd72158e953a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -783,6 +783,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) */ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); goto out; } if (!inet->recverr) { diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 8efaf8c3fe2a..8242c8947340 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -72,6 +72,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 16c176e7c69a..3366d6a77ff2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -616,8 +616,11 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } /* Tunnels don't have an application socket: don't pass errors back */ - if (tunnel) + if (tunnel) { + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, offset); goto out; + } if (!np->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 571436064cd6..62c70709d798 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -982,6 +982,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 96ecb7356c0f..79bb02eb67b2 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -137,6 +137,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_input_packet; + tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index be032850ae8c..32561e9567fe 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -16,22 +16,105 @@ #include #include #include +#include #include "ar-internal.h" +static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int); static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); static void rxrpc_distribute_error(struct rxrpc_peer *, int, enum rxrpc_call_completion); /* - * Find the peer associated with an ICMP packet. + * Find the peer associated with an ICMPv4 packet. */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, - const struct sk_buff *skb, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, struct sockaddr_rxrpc *srx) { - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct iphdr *ip, *ip0 = ip_hdr(skb); + struct icmphdr *icmp = icmp_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); - _enter(""); + _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); + + switch (icmp->type) { + case ICMP_DEST_UNREACH: + *info = ntohs(icmp->un.frag.mtu); + fallthrough; + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + ip = (struct iphdr *)((void *)icmp + 8); + break; + default: + return NULL; + } + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ + switch (srx->transport.family) { + case AF_INET: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; +#endif + + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} + +#ifdef CONFIG_AF_RXRPC_IPV6 +/* + * Find the peer associated with an ICMPv6 packet. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, + struct sockaddr_rxrpc *srx) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); + + _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); + + switch (icmp->icmp6_type) { + case ICMPV6_DEST_UNREACH: + *info = ntohl(icmp->icmp6_mtu); + fallthrough; + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + ip = (struct ipv6hdr *)((void *)icmp + 8); + break; + default: + return NULL; + } memset(srx, 0, sizeof(*srx)); srx->transport_type = local->srx.transport_type; @@ -41,6 +124,165 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice * versa? */ + switch (srx->transport.family) { + case AF_INET: + _net("Rx ICMP6 on v4 sock"); + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, + &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); + break; + case AF_INET6: + _net("Rx ICMP6"); + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, + sizeof(struct in6_addr)); + break; + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} +#endif /* CONFIG_AF_RXRPC_IPV6 */ + +/* + * Handle an error received on the local endpoint as a tunnel. + */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, + unsigned int udp_offset) +{ + struct sock_extended_err ee; + struct sockaddr_rxrpc srx; + struct rxrpc_local *local; + struct rxrpc_peer *peer; + unsigned int info = 0; + int err; + u8 version = ip_hdr(skb)->version; + u8 type = icmp_hdr(skb)->type; + u8 code = icmp_hdr(skb)->code; + + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); + return; + } + + rxrpc_new_skb(skb, rxrpc_skb_received); + + switch (ip_hdr(skb)->version) { + case IPVERSION: + peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, + &info, &srx); + break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, + &info, &srx); + break; +#endif + default: + rcu_read_unlock(); + return; + } + + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + rcu_read_unlock(); + return; + } + + memset(&ee, 0, sizeof(ee)); + + switch (version) { + case IPVERSION: + switch (type) { + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_FRAG_NEEDED: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + default: + break; + } + + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + /* Might want to do something different with + * non-fatal errors + */ + //harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + err = EPROTO; + break; + } + + ee.ee_origin = SO_EE_ORIGIN_ICMP; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + switch (type) { + case ICMPV6_PKT_TOOBIG: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + } + + icmpv6_err_convert(type, code, &err); + + if (err == EACCES) + err = EHOSTUNREACH; + + ee.ee_origin = SO_EE_ORIGIN_ICMP6; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; +#endif + } + + trace_rxrpc_rx_icmp(peer, &ee, &srx); + + rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR); + rcu_read_unlock(); + rxrpc_put_peer(peer); +} + +/* + * Find the peer associated with a local error. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, + const struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + + _enter(""); + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + switch (srx->transport.family) { case AF_INET: srx->transport_len = sizeof(srx->transport.sin); @@ -104,10 +346,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* * Handle an MTU/fragmentation problem. */ -static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) +static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { - u32 mtu = serr->ee.ee_info; - _net("Rx ICMP Fragmentation Needed (%d)", mtu); /* wind down the local interface MTU */ @@ -148,7 +388,7 @@ void rxrpc_error_report(struct sock *sk) struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; struct rxrpc_local *local; - struct rxrpc_peer *peer; + struct rxrpc_peer *peer = NULL; struct sk_buff *skb; rcu_read_lock(); @@ -172,41 +412,20 @@ void rxrpc_error_report(struct sock *sk) } rxrpc_new_skb(skb, rxrpc_skb_received); serr = SKB_EXT_ERR(skb); - if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { - _leave("UDP empty message"); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - return; - } - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - _leave(" [no peer]"); - return; - } - - trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); - - if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && - serr->ee.ee_type == ICMP_DEST_UNREACH && - serr->ee.ee_code == ICMP_FRAG_NEEDED)) { - rxrpc_adjust_mtu(peer, serr); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - rxrpc_put_peer(peer); - _leave(" [MTU update]"); - return; + if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) { + peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (peer) { + trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); + rxrpc_store_error(peer, serr); + } } - rxrpc_store_error(peer, serr); rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); - _leave(""); } -- cgit v1.2.3 From 3261400639463a853ba2b3be8bd009c2a8089775 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 23:38:09 +0000 Subject: tcp: TX zerocopy should not sense pfmemalloc status We got a recent syzbot report [1] showing a possible misuse of pfmemalloc page status in TCP zerocopy paths. Indeed, for pages coming from user space or other layers, using page_is_pfmemalloc() is moot, and possibly could give false positives. There has been attempts to make page_is_pfmemalloc() more robust, but not using it in the first place in this context is probably better, removing cpu cycles. Note to stable teams : You need to backport 84ce071e38a6 ("net: introduce __skb_fill_page_desc_noacc") as a prereq. Race is more probable after commit c07aea3ef4d4 ("mm: add a signature in struct page") because page_is_pfmemalloc() is now using low order bit from page->lru.next, which can change more often than page->index. Low order bit should never be set for lru.next (when used as an anchor in LRU list), so KCSAN report is mostly a false positive. Backporting to older kernel versions seems not necessary. [1] BUG: KCSAN: data-race in lru_add_fn / tcp_build_frag write to 0xffffea0004a1d2c8 of 8 bytes by task 18600 on cpu 0: __list_add include/linux/list.h:73 [inline] list_add include/linux/list.h:88 [inline] lruvec_add_folio include/linux/mm_inline.h:105 [inline] lru_add_fn+0x440/0x520 mm/swap.c:228 folio_batch_move_lru+0x1e1/0x2a0 mm/swap.c:246 folio_batch_add_and_move mm/swap.c:263 [inline] folio_add_lru+0xf1/0x140 mm/swap.c:490 filemap_add_folio+0xf8/0x150 mm/filemap.c:948 __filemap_get_folio+0x510/0x6d0 mm/filemap.c:1981 pagecache_get_page+0x26/0x190 mm/folio-compat.c:104 grab_cache_page_write_begin+0x2a/0x30 mm/folio-compat.c:116 ext4_da_write_begin+0x2dd/0x5f0 fs/ext4/inode.c:2988 generic_perform_write+0x1d4/0x3f0 mm/filemap.c:3738 ext4_buffered_write_iter+0x235/0x3e0 fs/ext4/file.c:270 ext4_file_write_iter+0x2e3/0x1210 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x468/0x760 fs/read_write.c:578 ksys_write+0xe8/0x1a0 fs/read_write.c:631 __do_sys_write fs/read_write.c:643 [inline] __se_sys_write fs/read_write.c:640 [inline] __x64_sys_write+0x3e/0x50 fs/read_write.c:640 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffea0004a1d2c8 of 8 bytes by task 18611 on cpu 1: page_is_pfmemalloc include/linux/mm.h:1740 [inline] __skb_fill_page_desc include/linux/skbuff.h:2422 [inline] skb_fill_page_desc include/linux/skbuff.h:2443 [inline] tcp_build_frag+0x613/0xb20 net/ipv4/tcp.c:1018 do_tcp_sendpages+0x3e8/0xaf0 net/ipv4/tcp.c:1075 tcp_sendpage_locked net/ipv4/tcp.c:1140 [inline] tcp_sendpage+0x89/0xb0 net/ipv4/tcp.c:1150 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833 kernel_sendpage+0x184/0x300 net/socket.c:3561 sock_sendpage+0x5a/0x70 net/socket.c:1054 pipe_to_sendpage+0x128/0x160 fs/splice.c:361 splice_from_pipe_feed fs/splice.c:415 [inline] __splice_from_pipe+0x222/0x4d0 fs/splice.c:559 splice_from_pipe fs/splice.c:594 [inline] generic_splice_sendpage+0x89/0xc0 fs/splice.c:743 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:931 splice_direct_to_actor+0x305/0x620 fs/splice.c:886 do_splice_direct+0xfb/0x180 fs/splice.c:974 do_sendfile+0x3bf/0x910 fs/read_write.c:1249 __do_sys_sendfile64 fs/read_write.c:1317 [inline] __se_sys_sendfile64 fs/read_write.c:1303 [inline] __x64_sys_sendfile64+0x10c/0x150 fs/read_write.c:1303 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0xffffea0004a1d288 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 18611 Comm: syz-executor.4 Not tainted 6.0.0-rc2-syzkaller-00248-ge022620b5d05-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022 Fixes: c07aea3ef4d4 ("mm: add a signature in struct page") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Shakeel Butt Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- include/linux/skbuff.h | 21 +++++++++++++++++++++ net/core/datagram.c | 2 +- net/ipv4/tcp.c | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..18e163a3460d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2444,6 +2444,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, skb_shinfo(skb)->nr_frags = i + 1; } +/** + * skb_fill_page_desc_noacc - initialise a paged fragment in an skb + * @skb: buffer containing fragment to be initialised + * @i: paged fragment index to initialise + * @page: the page to use for this fragment + * @off: the offset to the data with @page + * @size: the length of the data + * + * Variant of skb_fill_page_desc() which does not deal with + * pfmemalloc, if page is not owned by us. + */ +static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, + struct page *page, int off, + int size) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + __skb_fill_page_desc_noacc(shinfo, i, page, off, size); + shinfo->nr_frags = i + 1; +} + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); diff --git a/net/core/datagram.c b/net/core/datagram.c index 7255531f63ae..e4ff2db40c98 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, page_ref_sub(last_head, refs); refs = 0; } - skb_fill_page_desc(skb, frag++, head, start, size); + skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e5011c136fdb..6cdfce6f2867 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1015,7 +1015,7 @@ new_segment: skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); + skb_fill_page_desc_noacc(skb, i, page, offset, copy); } if (!(flags & MSG_NO_SHARED_FRAGS)) -- cgit v1.2.3 From b30f7c8eb0780e1479a9882526e838664271f4c9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 1 Sep 2022 13:07:32 +0100 Subject: spi: mux: Fix mux interaction with fast path optimisations The spi-mux driver is rather too clever and attempts to resubmit any message that is submitted to it to the parent controller with some adjusted callbacks. This does not play at all nicely with the fast path which now sets flags on the message indicating that it's being handled through the fast path, we see async messages flagged as being on the fast path. Ideally the spi-mux code would duplicate the message but that's rather invasive and a bit fragile in that it relies on the mux knowing which fields in the message to copy. Instead teach the core that there are controllers which can't cope with the fast path and have the mux flag itself as being such a controller, ensuring that messages going via the mux don't get partially handled via the fast path. This will reduce the performance of any spi-mux connected device since we'll now always use the thread for both the actual controller and the mux controller instead of just the actual controller but given that we were always hitting the slow path anyway it's hopefully not too much of an additional cost and it allows us to keep the fast path. Fixes: ae7d2346dc89 ("spi: Don't use the message queue if possible in spi_sync") Reported-by: Casper Andersson Tested-by: Casper Andersson Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220901120732.49245-1-broonie@kernel.org Signed-off-by: Mark Brown --- drivers/spi/spi-mux.c | 1 + drivers/spi/spi.c | 2 +- include/linux/spi/spi.h | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-mux.c b/drivers/spi/spi-mux.c index f5d32ec4634e..0709e987bd5a 100644 --- a/drivers/spi/spi-mux.c +++ b/drivers/spi/spi-mux.c @@ -161,6 +161,7 @@ static int spi_mux_probe(struct spi_device *spi) ctlr->num_chipselect = mux_control_states(priv->mux); ctlr->bus_num = -1; ctlr->dev.of_node = spi->dev.of_node; + ctlr->must_async = true; ret = devm_spi_register_controller(&spi->dev, ctlr); if (ret) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8f97a3eacdea..5cbe090f7676 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -4033,7 +4033,7 @@ static int __spi_sync(struct spi_device *spi, struct spi_message *message) * guard against reentrancy from a different context. The io_mutex * will catch those cases. */ - if (READ_ONCE(ctlr->queue_empty)) { + if (READ_ONCE(ctlr->queue_empty) && !ctlr->must_async) { message->actual_length = 0; message->status = -EINPROGRESS; diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e6c73d5ff1a8..f089ee1ead58 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -469,6 +469,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * SPI_TRANS_FAIL_NO_START. * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. + * @must_async: disable all fast paths in the core * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals @@ -690,6 +691,7 @@ struct spi_controller { /* Flag for enabling opportunistic skipping of the queue in spi_sync */ bool queue_empty; + bool must_async; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) -- cgit v1.2.3 From 2aec909912da55a6e469fd6ee8412080a5433ed2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2022 11:46:38 +0200 Subject: wifi: use struct_group to copy addresses We sometimes copy all the addresses from the 802.11 header for the AAD, which may cause complaints from fortify checks. Use struct_group() to avoid the compiler warnings/errors. Change-Id: Ic3ea389105e7813b22095b295079eecdabde5045 Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 +++++--- net/mac80211/wpa.c | 4 ++-- net/wireless/lib80211_crypt_ccmp.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..b6e6d5b40774 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -310,9 +310,11 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; - u8 addr1[ETH_ALEN]; - u8 addr2[ETH_ALEN]; - u8 addr3[ETH_ALEN]; + struct_group(addrs, + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + ); __le16 seq_ctrl; u8 addr4[ETH_ALEN]; } __packed __aligned(2); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 93ec2f349748..20f742b5503b 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -351,7 +351,7 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad) * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ put_unaligned_be16(len_a, &aad[0]); put_unaligned(mask_fc, (__le16 *)&aad[2]); - memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN); + memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN); /* Mask Seq#, leave Frag# */ aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f; @@ -792,7 +792,7 @@ static void bip_aad(struct sk_buff *skb, u8 *aad) IEEE80211_FCTL_MOREDATA); put_unaligned(mask_fc, (__le16 *) &aad[0]); /* A1 || A2 || A3 */ - memcpy(aad + 2, &hdr->addr1, 3 * ETH_ALEN); + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); } diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index 6a5f08f7491e..cca5e1cf089e 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -136,7 +136,7 @@ static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, pos = (u8 *) hdr; aad[0] = pos[0] & 0x8f; aad[1] = pos[1] & 0xc7; - memcpy(aad + 2, hdr->addr1, 3 * ETH_ALEN); + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); pos = (u8 *) & hdr->seq_ctrl; aad[20] = pos[0] & 0x0f; aad[21] = 0; /* all bits masked */ -- cgit v1.2.3 From dec9b2f1e0455a151a7293c367da22ab973f713e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Sep 2022 16:59:15 +0200 Subject: debugfs: add debugfs_lookup_and_remove() There is a very common pattern of using debugfs_remove(debufs_lookup(..)) which results in a dentry leak of the dentry that was looked up. Instead of having to open-code the correct pattern of calling dput() on the dentry, create debugfs_lookup_and_remove() to handle this pattern automatically and properly without any memory leaks. Cc: stable Reported-by: Kuyo Chang Tested-by: Kuyo Chang Link: https://lore.kernel.org/r/YxIaQ8cSinDR881k@kroah.com Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 22 ++++++++++++++++++++++ include/linux/debugfs.h | 6 ++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3dcf0b8b4e93..232cfdf095ae 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -744,6 +744,28 @@ void debugfs_remove(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_remove); +/** + * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it + * @name: a pointer to a string containing the name of the item to look up. + * @parent: a pointer to the parent dentry of the item. + * + * This is the equlivant of doing something like + * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting + * handled for the directory being looked up. + */ +void debugfs_lookup_and_remove(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + + dentry = debugfs_lookup(name, parent); + if (!dentry) + return; + + debugfs_remove(dentry); + dput(dentry); +} +EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); + /** * debugfs_rename - rename a file/directory in the debugfs filesystem * @old_dir: a pointer to the parent dentry for the renamed object. This diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index c869f1e73d75..f60674692d36 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -91,6 +91,8 @@ struct dentry *debugfs_create_automount(const char *name, void debugfs_remove(struct dentry *dentry); #define debugfs_remove_recursive debugfs_remove +void debugfs_lookup_and_remove(const char *name, struct dentry *parent); + const struct file_operations *debugfs_real_fops(const struct file *filp); int debugfs_file_get(struct dentry *dentry); @@ -225,6 +227,10 @@ static inline void debugfs_remove(struct dentry *dentry) static inline void debugfs_remove_recursive(struct dentry *dentry) { } +static inline void debugfs_lookup_and_remove(const char *name, + struct dentry *parent) +{ } + const struct file_operations *debugfs_real_fops(const struct file *filp); static inline int debugfs_file_get(struct dentry *dentry) -- cgit v1.2.3 From 9ca05b0f27de928be121cccf07735819dc9e1ed3 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 29 Aug 2022 12:02:27 +0300 Subject: RDMA/mlx5: Rely on RoCE fw cap instead of devlink when setting profile When the RDMA auxiliary driver probes, it sets its profile based on devlink driverinit value. The latter might not be in sync with FW yet (In case devlink reload is not performed), thus causing a mismatch between RDMA driver and FW. This results in the following FW syndrome when the RDMA driver tries to adjust RoCE state, which fails the probe: "0xC1F678 | modify_nic_vport_context: roce_en set on a vport that doesn't support roce" To prevent this, select the PF profile based on FW RoCE capability instead of relying on devlink driverinit value. To provide backward compatibility of the RoCE disable feature, on older FW's where roce_rw is not set (FW RoCE capability is read-only), keep the current behavior e.g., rely on devlink driverinit value. Fixes: fbfa97b4d79f ("net/mlx5: Disable roce at HCA level") Reviewed-by: Shay Drory Reviewed-by: Michael Guralnik Reviewed-by: Saeed Mahameed Signed-off-by: Maher Sanalla Link: https://lore.kernel.org/r/cb34ce9a1df4a24c135cb804db87f7d2418bd6cc.1661763459.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 23 +++++++++++++++++++++-- include/linux/mlx5/driver.h | 19 ++++++++++--------- 3 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fc94a1b25485..883d7c60143e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4336,7 +4336,7 @@ static int mlx5r_probe(struct auxiliary_device *adev, dev->mdev = mdev; dev->num_ports = num_ports; - if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_init_enabled(mdev)) + if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) profile = &raw_eth_profile; else profile = &pf_profile; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index bec8d6d0b5f6..4870a88cecb7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -494,6 +494,24 @@ static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev) return err; } +bool mlx5_is_roce_on(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + &val); + + if (!err) + return val.vbool; + + mlx5_core_dbg(dev, "Failed to get param. err = %d\n", err); + return MLX5_CAP_GEN(dev, roce); +} +EXPORT_SYMBOL(mlx5_is_roce_on); + static int handle_hca_cap_2(struct mlx5_core_dev *dev, void *set_ctx) { void *set_hca_cap; @@ -597,7 +615,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix)); if (MLX5_CAP_GEN(dev, roce_rw_supported)) - MLX5_SET(cmd_hca_cap, set_hca_cap, roce, mlx5_is_roce_init_enabled(dev)); + MLX5_SET(cmd_hca_cap, set_hca_cap, roce, + mlx5_is_roce_on(dev)); max_uc_list = max_uc_list_get_devlink_param(dev); if (max_uc_list > 0) @@ -623,7 +642,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) */ static bool is_roce_fw_disabled(struct mlx5_core_dev *dev) { - return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_init_enabled(dev)) || + return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_on(dev)) || (!MLX5_CAP_GEN(dev, roce_rw_supported) && !MLX5_CAP_GEN(dev, roce)); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 96b16fbe1aa4..d6338fb449c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1279,16 +1279,17 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; -static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev) +bool mlx5_is_roce_on(struct mlx5_core_dev *dev); + +static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) { - struct devlink *devlink = priv_to_devlink(dev); - union devlink_param_value val; - int err; - - err = devlink_param_driverinit_value_get(devlink, - DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, - &val); - return err ? MLX5_CAP_GEN(dev, roce) : val.vbool; + if (MLX5_CAP_GEN(dev, roce_rw_supported)) + return MLX5_CAP_GEN(dev, roce); + + /* If RoCE cap is read-only in FW, get RoCE state from devlink + * in order to support RoCE enable/disable feature + */ + return mlx5_is_roce_on(dev); } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 9fc18f6d56d5b79d527c17a8100a0965d18345cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 Aug 2022 16:06:44 +0200 Subject: dma-mapping: mark dma_supported static Now that the remaining users in drivers are gone, this function can be marked static. Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 5 ----- kernel/dma/mapping.c | 3 +-- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 25a30906289d..0ee20b764000 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -139,7 +139,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_can_mmap(struct device *dev); -int dma_supported(struct device *dev, u64 mask); bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); @@ -248,10 +247,6 @@ static inline bool dma_can_mmap(struct device *dev) { return false; } -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 0; -} static inline bool dma_pci_p2pdma_supported(struct device *dev) { return false; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 49cbf3e33de7..27f272381cf2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -707,7 +707,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); -int dma_supported(struct device *dev, u64 mask) +static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -721,7 +721,6 @@ int dma_supported(struct device *dev, u64 mask) return 1; return ops->dma_supported(dev, mask); } -EXPORT_SYMBOL(dma_supported); bool dma_pci_p2pdma_supported(struct device *dev) { -- cgit v1.2.3 From 2f79cdfe58c13949bbbb65ba5926abfe9561d0ec Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 31 Aug 2022 09:46:12 -0700 Subject: fs: only do a memory barrier for the first set_buffer_uptodate() Commit d4252071b97d ("add barriers to buffer_uptodate and set_buffer_uptodate") added proper memory barriers to the buffer head BH_Uptodate bit, so that anybody who tests a buffer for being up-to-date will be guaranteed to actually see initialized state. However, that commit didn't _just_ add the memory barrier, it also ended up dropping the "was it already set" logic that the BUFFER_FNS() macro had. That's conceptually the right thing for a generic "this is a memory barrier" operation, but in the case of the buffer contents, we really only care about the memory barrier for the _first_ time we set the bit, in that the only memory ordering protection we need is to avoid anybody seeing uninitialized memory contents. Any other access ordering wouldn't be about the BH_Uptodate bit anyway, and would require some other proper lock (typically BH_Lock or the folio lock). A reader that races with somebody invalidating the buffer head isn't an issue wrt the memory ordering, it's a serialization issue. Now, you'd think that the buffer head operations don't matter in this day and age (and I certainly thought so), but apparently some loads still end up being heavy users of buffer heads. In particular, the kernel test robot reported that not having this bit access optimization in place caused a noticeable direct IO performance regression on ext4: fxmark.ssd_ext4_no_jnl_DWTL_54_directio.works/sec -26.5% regression although you presumably need a fast disk and a lot of cores to actually notice. Link: https://lore.kernel.org/all/Yw8L7HTZ%2FdE2%2Fo9C@xsang-OptiPlex-9020/ Reported-by: kernel test robot Tested-by: Fengwei Yin Cc: Mikulas Patocka Cc: Matthew Wilcox (Oracle) Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 089c9ade4325..df518c429667 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -137,6 +137,17 @@ BUFFER_FNS(Defer_Completion, defer_completion) static __always_inline void set_buffer_uptodate(struct buffer_head *bh) { + /* + * If somebody else already set this uptodate, they will + * have done the memory barrier, and a reader will thus + * see *some* valid buffer state. + * + * Any other serialization (with IO errors or whatever that + * might clear the bit) has to come from other state (eg BH_Lock). + */ + if (test_bit(BH_Uptodate, &bh->b_state)) + return; + /* * make it consistent with folio_mark_uptodate * pairs with smp_load_acquire in buffer_uptodate -- cgit v1.2.3 From 9cd4f1434479f1ac25c440c421fbf52069079914 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 11 Sep 2022 11:18:45 +0800 Subject: iommu/vt-d: Fix possible recursive locking in intel_iommu_init() The global rwsem dmar_global_lock was introduced by commit 3a5670e8ac932 ("iommu/vt-d: Introduce a rwsem to protect global data structures"). It is used to protect DMAR related global data from DMAR hotplug operations. The dmar_global_lock used in the intel_iommu_init() might cause recursive locking issue, for example, intel_iommu_get_resv_regions() is taking the dmar_global_lock from within a section where intel_iommu_init() already holds it via probe_acpi_namespace_devices(). Using dmar_global_lock in intel_iommu_init() could be relaxed since it is unlikely that any IO board must be hot added before the IOMMU subsystem is initialized. This eliminates the possible recursive locking issue by moving down DMAR hotplug support after the IOMMU is initialized and removing the uses of dmar_global_lock in intel_iommu_init(). Fixes: d5692d4af08cd ("iommu/vt-d: Fix suspicious RCU usage in probe_acpi_namespace_devices()") Reported-by: Robin Murphy Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/894db0ccae854b35c73814485569b634237b5538.1657034828.git.robin.murphy@arm.com Link: https://lore.kernel.org/r/20220718235325.3952426-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 7 +++++++ drivers/iommu/intel/iommu.c | 27 ++------------------------- include/linux/dmar.h | 4 +++- 3 files changed, 12 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 5a8f780e7ffd..497c912ad9e1 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -2349,6 +2349,13 @@ static int dmar_device_hotplug(acpi_handle handle, bool insert) if (!dmar_in_use()) return 0; + /* + * It's unlikely that any I/O board is hot added before the IOMMU + * subsystem is initialized. + */ + if (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled) + return -EOPNOTSUPP; + if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) { tmp = handle; } else { diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 64d30895a4c8..1f2cd43cf9bc 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3019,13 +3019,7 @@ static int __init init_dmars(void) #ifdef CONFIG_INTEL_IOMMU_SVM if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { - /* - * Call dmar_alloc_hwirq() with dmar_global_lock held, - * could cause possible lock race condition. - */ - up_write(&dmar_global_lock); ret = intel_svm_enable_prq(iommu); - down_write(&dmar_global_lock); if (ret) goto free_iommu; } @@ -3938,7 +3932,6 @@ int __init intel_iommu_init(void) force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || platform_optin_force_iommu(); - down_write(&dmar_global_lock); if (dmar_table_init()) { if (force_on) panic("tboot: Failed to initialize DMAR table\n"); @@ -3951,16 +3944,6 @@ int __init intel_iommu_init(void) goto out_free_dmar; } - up_write(&dmar_global_lock); - - /* - * The bus notifier takes the dmar_global_lock, so lockdep will - * complain later when we register it under the lock. - */ - dmar_register_bus_notifier(); - - down_write(&dmar_global_lock); - if (!no_iommu) intel_iommu_debugfs_init(); @@ -4005,11 +3988,9 @@ int __init intel_iommu_init(void) pr_err("Initialization failed\n"); goto out_free_dmar; } - up_write(&dmar_global_lock); init_iommu_pm_ops(); - down_read(&dmar_global_lock); for_each_active_iommu(iommu, drhd) { /* * The flush queue implementation does not perform @@ -4027,13 +4008,11 @@ int __init intel_iommu_init(void) "%s", iommu->name); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); } - up_read(&dmar_global_lock); bus_set_iommu(&pci_bus_type, &intel_iommu_ops); if (si_domain && !hw_pass_through) register_memory_notifier(&intel_iommu_memory_nb); - down_read(&dmar_global_lock); if (probe_acpi_namespace_devices()) pr_warn("ACPI name space devices didn't probe correctly\n"); @@ -4044,17 +4023,15 @@ int __init intel_iommu_init(void) iommu_disable_protect_mem_regions(iommu); } - up_read(&dmar_global_lock); - - pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); intel_iommu_enabled = 1; + dmar_register_bus_notifier(); + pr_info("Intel(R) Virtualization Technology for Directed I/O\n"); return 0; out_free_dmar: intel_iommu_free_dmars(); - up_write(&dmar_global_lock); return ret; } diff --git a/include/linux/dmar.h b/include/linux/dmar.h index d81a51978d01..8917a32173c4 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -65,6 +65,7 @@ struct dmar_pci_notify_info { extern struct rw_semaphore dmar_global_lock; extern struct list_head dmar_drhd_units; +extern int intel_iommu_enabled; #define for_each_drhd_unit(drhd) \ list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \ @@ -88,7 +89,8 @@ extern struct list_head dmar_drhd_units; static inline bool dmar_rcu_check(void) { return rwsem_is_locked(&dmar_global_lock) || - system_state == SYSTEM_BOOTING; + system_state == SYSTEM_BOOTING || + (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled); } #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check()) -- cgit v1.2.3