From 75354284cc3aa58f7e54d479d9bee69bd2ca828f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 6 Aug 2019 16:14:44 +0900 Subject: auxdisplay: charlcd: move charlcd.h to drivers/auxdisplay This header is included in drivers/auxdisplay/. Make it a local header. Reviewed-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada Signed-off-by: Miguel Ojeda --- include/misc/charlcd.h | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 include/misc/charlcd.h (limited to 'include') diff --git a/include/misc/charlcd.h b/include/misc/charlcd.h deleted file mode 100644 index 8cf6c18b0adb..000000000000 --- a/include/misc/charlcd.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Character LCD driver for Linux - * - * Copyright (C) 2000-2008, Willy Tarreau - * Copyright (C) 2016-2017 Glider bvba - */ - -struct charlcd { - const struct charlcd_ops *ops; - const unsigned char *char_conv; /* Optional */ - - int ifwidth; /* 4-bit or 8-bit (default) */ - int height; - int width; - int bwidth; /* Default set by charlcd_alloc() */ - int hwidth; /* Default set by charlcd_alloc() */ - - void *drvdata; /* Set by charlcd_alloc() */ -}; - -struct charlcd_ops { - /* Required */ - void (*write_cmd)(struct charlcd *lcd, int cmd); - void (*write_data)(struct charlcd *lcd, int data); - - /* Optional */ - void (*write_cmd_raw4)(struct charlcd *lcd, int cmd); /* 4-bit only */ - void (*clear_fast)(struct charlcd *lcd); - void (*backlight)(struct charlcd *lcd, int on); -}; - -struct charlcd *charlcd_alloc(unsigned int drvdata_size); -void charlcd_free(struct charlcd *lcd); - -int charlcd_register(struct charlcd *lcd); -int charlcd_unregister(struct charlcd *lcd); - -void charlcd_poke(struct charlcd *lcd); -- cgit v1.2.3 From 33dcb37cef741294b481f4d889a465b8091f11bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Jul 2019 09:26:40 +0200 Subject: dma-mapping: fix page attributes for dma_mmap_* All the way back to introducing dma_common_mmap we've defaulted to mark the pages as uncached. But this is wrong for DMA coherent devices. Later on DMA_ATTR_WRITE_COMBINE also got incorrect treatment as that flag is only treated special on the alloc side for non-coherent devices. Introduce a new dma_pgprot helper that deals with the check for coherent devices so that only the remapping cases ever reach arch_dma_mmap_pgprot and we thus ensure no aliasing of page attributes happens, which makes the powerpc version of arch_dma_mmap_pgprot obsolete and simplifies the remaining ones. Note that this means arch_dma_mmap_pgprot is a bit misnamed now, but we'll phase it out soon. Fixes: 64ccc9c033c6 ("common: dma-mapping: add support for generic dma_mmap_* calls") Reported-by: Shawn Anastasio Reported-by: Gavin Li Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas # arm64 --- arch/arm/mm/dma-mapping.c | 4 +--- arch/arm64/mm/dma-mapping.c | 4 +--- arch/powerpc/Kconfig | 1 - arch/powerpc/kernel/Makefile | 3 +-- arch/powerpc/kernel/dma-common.c | 17 ----------------- drivers/iommu/dma-iommu.c | 6 +++--- include/linux/dma-noncoherent.h | 13 +++++++++---- kernel/dma/mapping.c | 19 ++++++++++++++++++- kernel/dma/remap.c | 2 +- 9 files changed, 34 insertions(+), 35 deletions(-) delete mode 100644 arch/powerpc/kernel/dma-common.c (limited to 'include') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 6774b03aa405..d42557ee69c2 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2405,9 +2405,7 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) - return __get_dma_pgprot(attrs, prot); - return prot; + return __get_dma_pgprot(attrs, prot); } void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 1d3f0b5a9940..bd2b039f43a6 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -14,9 +14,7 @@ pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { - if (!dev_is_dma_coherent(dev) || (attrs & DMA_ATTR_WRITE_COMBINE)) - return pgprot_writecombine(prot); - return prot; + return pgprot_writecombine(prot); } void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 77f6ebf97113..d8dcd8820369 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -121,7 +121,6 @@ config PPC select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_MMAP_PGPROT select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ea0c69236789..56dfa7a2a6f2 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -49,8 +49,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ - of_platform.o prom_parse.o \ - dma-common.o + of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o diff --git a/arch/powerpc/kernel/dma-common.c b/arch/powerpc/kernel/dma-common.c deleted file mode 100644 index dc7ef6b17b69..000000000000 --- a/arch/powerpc/kernel/dma-common.c +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Contains common dma routines for all powerpc platforms. - * - * Copyright (C) 2019 Shawn Anastasio. - */ - -#include -#include - -pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, - unsigned long attrs) -{ - if (!dev_is_dma_coherent(dev)) - return pgprot_noncached(prot); - return prot; -} diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index a7f9c3edbcb2..0015fe610b23 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -574,7 +574,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, struct iova_domain *iovad = &cookie->iovad; bool coherent = dev_is_dma_coherent(dev); int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs); - pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs); + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap; struct page **pages; struct sg_table sgt; @@ -975,7 +975,7 @@ static void *iommu_dma_alloc_pages(struct device *dev, size_t size, return NULL; if (IS_ENABLED(CONFIG_DMA_REMAP) && (!coherent || PageHighMem(page))) { - pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs); + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); cpu_addr = dma_common_contiguous_remap(page, alloc_size, VM_USERMAP, prot, __builtin_return_address(0)); @@ -1035,7 +1035,7 @@ static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn, off = vma->vm_pgoff; int ret; - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index 3813211a9aad..0bff3d7fac92 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -42,13 +42,18 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, dma_addr_t dma_addr); - -#ifdef CONFIG_ARCH_HAS_DMA_MMAP_PGPROT pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs); + +#ifdef CONFIG_MMU +pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs); #else -# define arch_dma_mmap_pgprot(dev, prot, attrs) pgprot_noncached(prot) -#endif +static inline pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, + unsigned long attrs) +{ + return prot; /* no protection bits supported without page tables */ +} +#endif /* CONFIG_MMU */ #ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b945239621d8..b0038ca3aa92 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -150,6 +150,23 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, } EXPORT_SYMBOL(dma_get_sgtable_attrs); +#ifdef CONFIG_MMU +/* + * Return the page attributes used for mapping dma_alloc_* memory, either in + * kernel space if remapping is needed, or to userspace through dma_mmap_*. + */ +pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) +{ + if (dev_is_dma_coherent(dev) || + (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && + (attrs & DMA_ATTR_NON_CONSISTENT))) + return prot; + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT)) + return arch_dma_mmap_pgprot(dev, prot, attrs); + return pgprot_noncached(prot); +} +#endif /* CONFIG_MMU */ + /* * Create userspace mapping for the DMA-coherent memory. */ @@ -164,7 +181,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn; int ret = -ENXIO; - vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index a594aec07882..ffe78f0b2fe4 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -218,7 +218,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, - arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), + dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { __dma_direct_free_pages(dev, size, page); -- cgit v1.2.3 From accd2dd72c8f087441d725dd916688171519e4e6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 Aug 2019 10:23:57 +0200 Subject: PCI/ASPM: Add pcie_aspm_enabled() Add a function checking whether or not PCIe ASPM has been enabled for a given device. It will be used by the NVMe driver to decide how to handle the device during system suspend. Signed-off-by: Rafael J. Wysocki Reviewed-by: Keith Busch Acked-by: Bjorn Helgaas --- drivers/pci/pcie/aspm.c | 20 ++++++++++++++++++++ include/linux/pci.h | 2 ++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e44af7f4d37f..464f8f92653f 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1170,6 +1170,26 @@ static int pcie_aspm_get_policy(char *buffer, const struct kernel_param *kp) module_param_call(policy, pcie_aspm_set_policy, pcie_aspm_get_policy, NULL, 0644); +/** + * pcie_aspm_enabled - Check if PCIe ASPM has been enabled for a device. + * @pdev: Target device. + */ +bool pcie_aspm_enabled(struct pci_dev *pdev) +{ + struct pci_dev *bridge = pci_upstream_bridge(pdev); + bool ret; + + if (!bridge) + return false; + + mutex_lock(&aspm_lock); + ret = bridge->link_state ? !!bridge->link_state->aspm_enabled : false; + mutex_unlock(&aspm_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(pcie_aspm_enabled); + #ifdef CONFIG_PCIEASPM_DEBUG static ssize_t link_state_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..82e4cd1b7ac3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1567,8 +1567,10 @@ extern bool pcie_ports_native; #ifdef CONFIG_PCIEASPM bool pcie_aspm_support_enabled(void); +bool pcie_aspm_enabled(struct pci_dev *pdev); #else static inline bool pcie_aspm_support_enabled(void) { return false; } +static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCIEAER -- cgit v1.2.3 From 2c8ccb37b08fe364f02a9914daca474d43151453 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Fri, 9 Aug 2019 17:18:16 +0200 Subject: RDMA/siw: Change CQ flags from 64->32 bits This patch changes the driver/user shared (mmapped) CQ notification flags field from unsigned 64-bits size to unsigned 32-bits size. This enables building siw on 32-bit architectures. This patch changes the siw-abi, but as siw was only just merged in this merge window cycle, there are no released kernels with the prior abi. We are making no attempt to be binary compatible with siw user space libraries prior to the merge of siw into the upstream kernel, only moving forward with upstream kernels and upstream rdma-core provided siw libraries are we guaranteeing compatibility. Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20190809151816.13018-1-bmt@zurich.ibm.com Signed-off-by: Doug Ledford --- drivers/infiniband/sw/siw/Kconfig | 2 +- drivers/infiniband/sw/siw/siw.h | 2 +- drivers/infiniband/sw/siw/siw_qp.c | 14 ++++++++++---- drivers/infiniband/sw/siw/siw_verbs.c | 16 +++++++++++----- include/uapi/rdma/siw-abi.h | 3 ++- 5 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index dace276aea14..b622fc62f2cd 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -1,6 +1,6 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" - depends on INET && INFINIBAND && LIBCRC32C && 64BIT + depends on INET && INFINIBAND && LIBCRC32C select DMA_VIRT_OPS help This driver implements the iWARP RDMA transport over diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 03fd7b2f595f..77b1aabf6ff3 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -214,7 +214,7 @@ struct siw_wqe { struct siw_cq { struct ib_cq base_cq; spinlock_t lock; - u64 *notify; + struct siw_cq_ctrl *notify; struct siw_cqe *queue; u32 cq_put; u32 cq_get; diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c index e27bd5b35b96..0990307c5d2c 100644 --- a/drivers/infiniband/sw/siw/siw_qp.c +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -1013,18 +1013,24 @@ out: */ static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags) { - u64 cq_notify; + u32 cq_notify; if (!cq->base_cq.comp_handler) return false; - cq_notify = READ_ONCE(*cq->notify); + /* Read application shared notification state */ + cq_notify = READ_ONCE(cq->notify->flags); if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) || ((cq_notify & SIW_NOTIFY_SOLICITED) && (flags & SIW_WQE_SOLICITED))) { - /* dis-arm CQ */ - smp_store_mb(*cq->notify, SIW_NOTIFY_NOT); + /* + * CQ notification is one-shot: Since the + * current CQE causes user notification, + * the CQ gets dis-aremd and must be re-aremd + * by the user for a new notification. + */ + WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT); return true; } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 32dc79d0e898..e7f3a2379d9d 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -1049,7 +1049,7 @@ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, spin_lock_init(&cq->lock); - cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; + cq->notify = (struct siw_cq_ctrl *)&cq->queue[size]; if (udata) { struct siw_uresp_create_cq uresp = {}; @@ -1141,11 +1141,17 @@ int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) siw_dbg_cq(cq, "flags: 0x%02x\n", flags); if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) - /* CQ event for next solicited completion */ - smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); + /* + * Enable CQ event for next solicited completion. + * and make it visible to all associated producers. + */ + smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED); else - /* CQ event for any signalled completion */ - smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); + /* + * Enable CQ event for any signalled completion. + * and make it visible to all associated producers. + */ + smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL); if (flags & IB_CQ_REPORT_MISSED_EVENTS) return cq->cq_put - cq->cq_get; diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h index 7de68f1dc707..af735f55b291 100644 --- a/include/uapi/rdma/siw-abi.h +++ b/include/uapi/rdma/siw-abi.h @@ -180,6 +180,7 @@ struct siw_cqe { * to control CQ arming. */ struct siw_cq_ctrl { - __aligned_u64 notify; + __u32 flags; + __u32 pad; }; #endif -- cgit v1.2.3 From 76470ccd62f18bfa0954bec10f2329339f793914 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 13 Aug 2019 15:37:04 -0700 Subject: mm: document zone device struct page field usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/hmm: fixes for device private page migration", v3. Testing the latest linux git tree turned up a few bugs with page migration to and from ZONE_DEVICE private and anonymous pages. Hopefully it clarifies how ZONE_DEVICE private struct page uses the same mapping and index fields from the source anonymous page mapping. This patch (of 3): Struct page for ZONE_DEVICE private pages uses the page->mapping and and page->index fields while the source anonymous pages are migrated to device private memory. This is so rmap_walk() can find the page when migrating the ZONE_DEVICE private page back to system memory. ZONE_DEVICE pmem backed fsdax pages also use the page->mapping and page->index fields when files are mapped into a process address space. Add comments to struct page and remove the unused "_zd_pad_1" field to make this more clear. Link: http://lkml.kernel.org/r/20190724232700.23327-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a37a89eb7a7..6a7a1083b6fb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,7 +159,16 @@ struct page { /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; void *zone_device_data; - unsigned long _zd_pad_1; /* uses mapping */ + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. + * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ }; /** @rcu_head: You can use this to free a page by RCU. */ -- cgit v1.2.3 From ec9f02384f6053f2a5417e82b65078edc5364a8d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 13 Aug 2019 15:37:41 -0700 Subject: mm: workingset: fix vmstat counters for shadow nodes Memcg counters for shadow nodes are broken because the memcg pointer is obtained in a wrong way. The following approach is used: virt_to_page(xa_node)->mem_cgroup Since commit 4d96ba353075 ("mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages") page->mem_cgroup pointer isn't set for slab pages, so memcg_from_slab_page() should be used instead. Also I doubt that it ever worked correctly: virt_to_head_page() should be used instead of virt_to_page(). Otherwise objects residing on tail pages are not accounted, because only the head page contains a valid mem_cgroup pointer. That was a case since the introduction of these counters by the commit 68d48e6a2df5 ("mm: workingset: add vmstat counter for shadow nodes"). Link: http://lkml.kernel.org/r/20190801233532.138743-1-guro@fb.com Fixes: 4d96ba353075 ("mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages") Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Vladimir Davydov Cc: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++++++++++ mm/memcontrol.c | 20 ++++++++++++++++++++ mm/workingset.c | 10 ++++------ 3 files changed, 43 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44c41462be33..2cd4359cb38c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -668,6 +668,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) @@ -1072,6 +1073,14 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } +static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, + int val) +{ + struct page *page = virt_to_head_page(p); + + __mod_node_page_state(page_pgdat(page), idx, val); +} + static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1159,6 +1168,16 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +{ + __mod_lruvec_slab_state(p, idx, 1); +} + +static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +{ + __mod_lruvec_slab_state(p, idx, -1); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_state(struct mem_cgroup *memcg, int idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e405e058eda..6f5c0c517c49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -768,6 +768,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } +void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +{ + struct page *page = virt_to_head_page(p); + pg_data_t *pgdat = page_pgdat(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = memcg_from_slab_page(page); + + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!memcg || memcg == root_mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); + } else { + lruvec = mem_cgroup_lruvec(pgdat, memcg); + __mod_lruvec_state(lruvec, idx, val); + } + rcu_read_unlock(); +} + /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup diff --git a/mm/workingset.c b/mm/workingset.c index e0b4edcb88c8..c963831d354f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -380,14 +380,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_page_state(virt_to_page(node), - WORKINGSET_NODES); + __inc_lruvec_slab_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_page_state(virt_to_page(node), - WORKINGSET_NODES); + __dec_lruvec_slab_state(node, WORKINGSET_NODES); } } } @@ -480,7 +478,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + __dec_lruvec_slab_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -503,7 +501,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * shadow entries we were tracking ... */ xas_store(&xas, NULL); - __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3 From 0cfaee2af3a04c0be5f056cebe5f804dedc59a43 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 13 Aug 2019 15:37:47 -0700 Subject: include/asm-generic/5level-fixup.h: fix variable 'p4d' set but not used A compiler throws a warning on an arm64 system since commit 9849a5697d3d ("arch, mm: convert all architectures to use 5level-fixup.h"), mm/kasan/init.c: In function 'kasan_free_p4d': mm/kasan/init.c:344:9: warning: variable 'p4d' set but not used [-Wunused-but-set-variable] p4d_t *p4d; ^~~ because p4d_none() in "5level-fixup.h" is compiled away while it is a static inline function in "pgtable-nopud.h". However, if converted p4d_none() to a static inline there, powerpc would be unhappy as it reads those in assembler language in "arch/powerpc/include/asm/book3s/64/pgtable.h", so it needs to skip assembly include for the static inline C function. While at it, converted a few similar functions to be consistent with the ones in "pgtable-nopud.h". Link: http://lkml.kernel.org/r/20190806232917.881-1-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Arnd Bergmann Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index bb6cb347018c..f6947da70d71 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -19,9 +19,24 @@ #define p4d_alloc(mm, pgd, address) (pgd) #define p4d_offset(pgd, start) (pgd) -#define p4d_none(p4d) 0 -#define p4d_bad(p4d) 0 -#define p4d_present(p4d) 1 + +#ifndef __ASSEMBLY__ +static inline int p4d_none(p4d_t p4d) +{ + return 0; +} + +static inline int p4d_bad(p4d_t p4d) +{ + return 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + return 1; +} +#endif + #define p4d_ERROR(p4d) do { } while (0) #define p4d_clear(p4d) pgd_clear(p4d) #define p4d_val(p4d) pgd_val(p4d) -- cgit v1.2.3 From 92717d429b38e4f9f934eed7e605cc42858f1839 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 13 Aug 2019 15:37:50 -0700 Subject: Revert "Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"" Patch series "reapply: relax __GFP_THISNODE for MADV_HUGEPAGE mappings". The fixes for what was originally reported as "pathological THP behavior" we rightfully reverted to be sure not to introduced regressions at end of a merge window after a severe regression report from the kernel bot. We can safely re-apply them now that we had time to analyze the problem. The mm process worked fine, because the good fixes were eventually committed upstream without excessive delay. The regression reported by the kernel bot however forced us to revert the good fixes to be sure not to introduce regressions and to give us the time to analyze the issue further. The silver lining is that this extra time allowed to think more at this issue and also plan for a future direction to improve things further in terms of THP NUMA locality. This patch (of 2): This reverts commit 356ff8a9a78fb35d ("Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). So it reapplies 89c83fb539f954 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). Consolidation of the THP allocation flags at the same place was meant to be a clean up to easier handle otherwise scattered code which is imposing a maintenance burden. There were no real problems observed with the gfp mask consolidation but the reversion was rushed through without a larger consensus regardless. This patch brings the consolidation back because this should make the long term maintainability easier as well as it should allow future changes to be less error prone. [mhocko@kernel.org: changelog additions] Link: http://lkml.kernel.org/r/20190503223146.2312-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Cc: David Rientjes Cc: Zi Yan Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++-------- mm/huge_memory.c | 27 ++++++++++++++------------- mm/mempolicy.c | 32 +++----------------------------- mm/shmem.c | 2 +- 4 files changed, 22 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fb07b503dc45..f33881688f42 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ - alloc_pages(gfp_mask, order) -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) + alloc_pages_vma(gfp_mask, 0, vma, addr, node) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1334ede667a8..f7e388b8662d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -644,30 +644,30 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | __GFP_THISNODE | + (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + return gfp_mask | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | - (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | - (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - return GFP_TRANSHUGE_LIGHT; + return gfp_mask; } /* Caller must hold page table lock. */ @@ -739,8 +739,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1347,8 +1347,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (__transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, + haddr, numa_node_id()); } else new_page = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 547cd403ed02..9c9877a43d58 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1180,8 +1180,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, - HPAGE_PMD_ORDER); + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, + address, numa_node_id()); if (!thp) return NULL; prep_transhuge_page(thp); @@ -2083,7 +2083,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. @@ -2094,7 +2093,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, int node) { struct mempolicy *pol; struct page *page; @@ -2112,31 +2111,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; - - /* - * For hugepage allocation and non-interleave policy which - * allows the current node (or other explicitly preferred - * node) we only try to allocate from the current/preferred - * node and don't fall back to other nodes, as the cost of - * remote accesses would likely offset THP benefits. - * - * If the policy is interleave, or does not allow the current - * node in its nodemask, we allocate the standard way. - */ - if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) - hpage_node = pol->v.preferred_node; - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); - page = __alloc_pages_node(hpage_node, - gfp | __GFP_THISNODE, order); - goto out; - } - } - nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); diff --git a/mm/shmem.c b/mm/shmem.c index 626d8c74b973..2bed4761f279 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1466,7 +1466,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); -- cgit v1.2.3 From a8282608c88e08b1782141026eab61204c1e533f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 13 Aug 2019 15:37:53 -0700 Subject: Revert "mm, thp: restore node-local hugepage allocations" This reverts commit 2f0799a0ffc033b ("mm, thp: restore node-local hugepage allocations"). commit 2f0799a0ffc033b was rightfully applied to avoid the risk of a severe regression that was reported by the kernel test robot at the end of the merge window. Now we understood the regression was a false positive and was caused by a significant increase in fairness during a swap trashing benchmark. So it's safe to re-apply the fix and continue improving the code from there. The benchmark that reported the regression is very useful, but it provides a meaningful result only when there is no significant alteration in fairness during the workload. The removal of __GFP_THISNODE increased fairness. __GFP_THISNODE cannot be used in the generic page faults path for new memory allocations under the MPOL_DEFAULT mempolicy, or the allocation behavior significantly deviates from what the MPOL_DEFAULT semantics are supposed to be for THP and 4k allocations alike. Setting THP defrag to "always" or using MADV_HUGEPAGE (with THP defrag set to "madvise") has never meant to provide an implicit MPOL_BIND on the "current" node the task is running on, causing swap storms and providing a much more aggressive behavior than even zone_reclaim_node = 3. Any workload who could have benefited from __GFP_THISNODE has now to enable zone_reclaim_mode=1||2||3. __GFP_THISNODE implicitly provided the zone_reclaim_mode behavior, but it only did so if THP was enabled: if THP was disabled, there would have been no chance to get any 4k page from the current node if the current node was full of pagecache, which further shows how this __GFP_THISNODE was misplaced in MADV_HUGEPAGE. MADV_HUGEPAGE has never been intended to provide any zone_reclaim_mode semantics, in fact the two are orthogonal, zone_reclaim_mode = 1|2|3 must work exactly the same with MADV_HUGEPAGE set or not. The performance characteristic of memory depends on the hardware details. The numbers below are obtained on Naples/EPYC architecture and the N/A projection extends them to show what we should aim for in the future as a good THP NUMA locality default. The benchmark used exercises random memory seeks (note: the cost of the page faults is not part of the measurement). D0 THP | D0 4k | D1 THP | D1 4k | D2 THP | D2 4k | D3 THP | D3 4k | ... 0% | +43% | +45% | +106% | +131% | +224% | N/A | N/A D0 means distance zero (i.e. local memory), D1 means distance one (i.e. intra socket memory), D2 means distance two (i.e. inter socket memory), etc... For the guest physical memory allocated by qemu and for guest mode kernel the performance characteristic of RAM is more complex and an ideal default could be: D0 THP | D1 THP | D0 4k | D2 THP | D1 4k | D3 THP | D2 4k | D3 4k | ... 0% | +58% | +101% | N/A | +222% | N/A | N/A | N/A NOTE: the N/A are projections and haven't been measured yet, the measurement in this case is done on a 1950x with only two NUMA nodes. The THP case here means THP was used both in the host and in the guest. After applying this commit the THP NUMA locality order that we'll get out of MADV_HUGEPAGE is this: D0 THP | D1 THP | D2 THP | D3 THP | ... | D0 4k | D1 4k | D2 4k | D3 4k | ... Before this commit it was: D0 THP | D0 4k | D1 4k | D2 4k | D3 4k | ... Even if we ignore the breakage of large workloads that can't fit in a single node that the __GFP_THISNODE implicit "current node" mbind caused, the THP NUMA locality order provided by __GFP_THISNODE was still not the one we shall aim for in the long term (i.e. the first one at the top). After this commit is applied, we can introduce a new allocator multi order API and to replace those two alloc_pages_vmas calls in the page fault path, with a single multi order call: unsigned int order = (1 << HPAGE_PMD_ORDER) | (1 << 0); page = alloc_pages_multi_order(..., &order); if (!page) goto out; if (!(order & (1 << 0))) { VM_WARN_ON(order != 1 << HPAGE_PMD_ORDER); /* THP fault */ } else { VM_WARN_ON(order != 1 << 0); /* 4k fallback */ } The page allocator logic has to be altered so that when it fails on any zone with order 9, it has to try again with a order 0 before falling back to the next zone in the zonelist. After that we need to do more measurements and evaluate if adding an opt-in feature for guest mode is worth it, to swap "DN 4k | DN+1 THP" with "DN+1 THP | DN 4k" at every NUMA distance crossing. Link: http://lkml.kernel.org/r/20190503223146.2312-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Michal Hocko Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Rientjes Cc: Zi Yan Cc: Stefan Priebe - Profihost AG Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 ++ mm/huge_memory.c | 42 ++++++++++++++++++++++++++---------------- mm/mempolicy.c | 2 +- 3 files changed, 29 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5228c62af416..bac395f1d00a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7e388b8662d..738065f765ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -647,27 +647,37 @@ release: static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; + gfp_t this_node = 0; - /* Always do synchronous compaction */ - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | __GFP_THISNODE | - (vma_madvised ? 0 : __GFP_NORETRY); +#ifdef CONFIG_NUMA + struct mempolicy *pol; + /* + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not + * specified, to express a general desire to stay on the current + * node for optimistic allocation attempts. If the defrag mode + * and/or madvise hint requires the direct reclaim then we prefer + * to fallback to other node rather than node reclaim because that + * can lead to excessive reclaim even though there is free memory + * on other nodes. We expect that NUMA preferences are specified + * by memory policies. + */ + pol = get_vma_policy(vma, addr); + if (pol->mode != MPOL_BIND) + this_node = __GFP_THISNODE; + mpol_cond_put(pol); +#endif - /* Kick kcompactd and fail quickly */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return gfp_mask | __GFP_KSWAPD_RECLAIM; - - /* Synchronous compaction if madvised, otherwise kick kcompactd */ + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); - - /* Only do synchronous compaction if madvised */ + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM | this_node); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); - - return gfp_mask; + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + this_node); + return GFP_TRANSHUGE_LIGHT | this_node; } /* Caller must hold page table lock. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9c9877a43d58..65e0874fce17 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1734,7 +1734,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); -- cgit v1.2.3 From edfbcb321faf07ca970e4191abe061deeb7d3788 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Aug 2019 10:05:16 +0200 Subject: usb: add a hcd_uses_dma helper The USB buffer allocation code is the only place in the usb core (and in fact the whole kernel) that uses is_device_dma_capable, while the URB mapping code uses the uses_dma flag in struct usb_bus. Switch the buffer allocation to use the uses_dma flag used by the rest of the USB code, and create a helper in hcd.h that checks this flag as well as the CONFIG_HAS_DMA to simplify the caller a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20190811080520.21712-3-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/buffer.c | 10 +++------- drivers/usb/core/hcd.c | 4 ++-- drivers/usb/dwc2/hcd.c | 2 +- include/linux/usb.h | 2 +- include/linux/usb/hcd.h | 3 +++ 5 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c index 1a5b3dcae930..6cf22c27f2d2 100644 --- a/drivers/usb/core/buffer.c +++ b/drivers/usb/core/buffer.c @@ -66,9 +66,7 @@ int hcd_buffer_create(struct usb_hcd *hcd) char name[16]; int i, size; - if (hcd->localmem_pool || - !IS_ENABLED(CONFIG_HAS_DMA) || - !is_device_dma_capable(hcd->self.sysdev)) + if (hcd->localmem_pool || !hcd_uses_dma(hcd)) return 0; for (i = 0; i < HCD_BUFFER_POOLS; i++) { @@ -129,8 +127,7 @@ void *hcd_buffer_alloc( return gen_pool_dma_alloc(hcd->localmem_pool, size, dma); /* some USB hosts just use PIO */ - if (!IS_ENABLED(CONFIG_HAS_DMA) || - !is_device_dma_capable(bus->sysdev)) { + if (!hcd_uses_dma(hcd)) { *dma = ~(dma_addr_t) 0; return kmalloc(size, mem_flags); } @@ -160,8 +157,7 @@ void hcd_buffer_free( return; } - if (!IS_ENABLED(CONFIG_HAS_DMA) || - !is_device_dma_capable(bus->sysdev)) { + if (!hcd_uses_dma(hcd)) { kfree(addr); return; } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 2ccbc2f83570..8592c0344fe8 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1412,7 +1412,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, if (usb_endpoint_xfer_control(&urb->ep->desc)) { if (hcd->self.uses_pio_for_control) return ret; - if (IS_ENABLED(CONFIG_HAS_DMA) && hcd->self.uses_dma) { + if (hcd_uses_dma(hcd)) { if (is_vmalloc_addr(urb->setup_packet)) { WARN_ONCE(1, "setup packet is not dma capable\n"); return -EAGAIN; @@ -1446,7 +1446,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; if (urb->transfer_buffer_length != 0 && !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)) { - if (IS_ENABLED(CONFIG_HAS_DMA) && hcd->self.uses_dma) { + if (hcd_uses_dma(hcd)) { if (urb->num_sgs) { int n; diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index ee144ff8af5b..111787a137ee 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -4608,7 +4608,7 @@ static int _dwc2_hcd_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, buf = urb->transfer_buffer; - if (hcd->self.uses_dma) { + if (hcd_uses_dma(hcd)) { if (!buf && (urb->transfer_dma & 3)) { dev_err(hsotg->dev, "%s: unaligned transfer with no transfer_buffer", diff --git a/include/linux/usb.h b/include/linux/usb.h index 83d35d993e8c..e87826e23d59 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1457,7 +1457,7 @@ typedef void (*usb_complete_t)(struct urb *); * field rather than determining a dma address themselves. * * Note that transfer_buffer must still be set if the controller - * does not support DMA (as indicated by bus.uses_dma) and when talking + * does not support DMA (as indicated by hcd_uses_dma()) and when talking * to root hub. If you have to trasfer between highmem zone and the device * on such controller, create a bounce buffer or bail out with an error. * If transfer_buffer cannot be set (is in highmem) and the controller is DMA diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index bab27ccc8ff5..a20e7815d814 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -422,6 +422,9 @@ static inline bool hcd_periodic_completion_in_progress(struct usb_hcd *hcd, return hcd->high_prio_bh.completing_ep == ep; } +#define hcd_uses_dma(hcd) \ + (IS_ENABLED(CONFIG_HAS_DMA) && (hcd)->self.uses_dma) + extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb); extern int usb_hcd_check_unlink_urb(struct usb_hcd *hcd, struct urb *urb, int status); -- cgit v1.2.3 From 7b6620d7db566a46f49b4b9deab9fa061fd4b59b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Aug 2019 11:09:16 -0600 Subject: block: remove REQ_NOWAIT_INLINE We had a few issues with this code, and there's still a problem around how we deal with error handling for chained/split bios. For now, just revert the code and we'll try again with a thoroug solution. This reverts commits: e15c2ffa1091 ("block: fix O_DIRECT error handling for bio fragments") 0eb6ddfb865c ("block: Fix __blkdev_direct_IO() for bio fragments") 6a43074e2f46 ("block: properly handle IOCB_NOWAIT for async O_DIRECT IO") 893a1c97205a ("blk-mq: allow REQ_NOWAIT to return an error inline") Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++------ fs/block_dev.c | 49 +++++------------------------------------------ include/linux/blk_types.h | 5 +---- 3 files changed, 8 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index a8e6a58f5f28..0835f4d8d42e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1958,13 +1958,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); - - cookie = BLK_QC_T_NONE; - if (bio->bi_opf & REQ_NOWAIT_INLINE) - cookie = BLK_QC_T_EAGAIN; - else if (bio->bi_opf & REQ_NOWAIT) + if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return cookie; + return BLK_QC_T_NONE; } trace_block_getrq(q, bio, bio->bi_opf); diff --git a/fs/block_dev.c b/fs/block_dev.c index eb657ab94060..677cb364d33f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -345,24 +345,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct bio *bio; bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; - bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - gfp_t gfp; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - if (nowait) - gfp = GFP_NOWAIT; - else - gfp = GFP_KERNEL; - - bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool); - if (!bio) - return -EAGAIN; + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); @@ -384,7 +375,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!is_poll) blk_start_plug(&plug); - ret = 0; for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; @@ -409,14 +399,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) task_io_account_write(bio->bi_iter.bi_size); } - /* - * Tell underlying layer to not block for resource shortage. - * And if we would have blocked, return error inline instead - * of through the bio->bi_end_io() callback. - */ - if (nowait) - bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE); - + dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); @@ -428,13 +411,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) polled = true; } - dio->size += bio->bi_iter.bi_size; qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - dio->size -= bio->bi_iter.bi_size; - ret = -EAGAIN; - goto error; - } if (polled) WRITE_ONCE(iocb->ki_cookie, qc); @@ -455,19 +432,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) atomic_inc(&dio->ref); } - dio->size += bio->bi_iter.bi_size; - qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - dio->size -= bio->bi_iter.bi_size; - ret = -EAGAIN; - goto error; - } - - bio = bio_alloc(gfp, nr_pages); - if (!bio) { - ret = -EAGAIN; - goto error; - } + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); } if (!is_poll) @@ -487,7 +453,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); -out: if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) @@ -495,10 +460,6 @@ out: bio_put(&dio->bio); return ret; -error: - if (!is_poll) - blk_finish_plug(&plug); - goto out; } static ssize_t diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b1fa1557e68..feff3fe4467e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,7 +311,6 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ - __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -346,7 +345,6 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) -#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) @@ -420,13 +418,12 @@ static inline int op_stat_group(unsigned int op) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U -#define BLK_QC_T_EAGAIN -2U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { - return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN; + return cookie != BLK_QC_T_NONE; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -- cgit v1.2.3