From 4d54954a197175c0dcb3c82af0c0740d0c5f827a Mon Sep 17 00:00:00 2001 From: Sebastien Boisvert Date: Fri, 26 Oct 2018 15:02:23 -0700 Subject: include/linux/pfn_t.h: force '~' to be parsed as an unary operator Tracing the event "fs_dax:dax_pmd_insert_mapping" with perf produces this warning: [fs_dax:dax_pmd_insert_mapping] unknown op '~' It is printed in process_op (tools/lib/traceevent/event-parse.c) because '~' is parsed as a binary operator. perf reads the format of fs_dax:dax_pmd_insert_mapping ("print fmt") from /sys/kernel/debug/tracing/events/fs_dax/dax_pmd_insert_mapping/format . The format contains: ~(((u64) ~(~(((1UL) << 12)-1))) ^ \ interpreted as a binary operator by process_op(). This part is generated in the declaration of the event class dax_pmd_insert_mapping_class in include/trace/events/fs_dax.h : __print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|", PFN_FLAGS_TRACE), This patch adds a pair of parentheses in the declaration of PFN_FLAGS_MASK to make sure that '~' is parsed as a unary operator by perf. The part of the format that was problematic is now: ~(((u64) (~(~(((1UL) << 12)-1)))) Now, all the '~' are parsed as unary operators. Link: http://lkml.kernel.org/r/20181021145939.8760-1-sebhtml@videotron.qc.ca Signed-off-by: Sebastien Boisvert Acked-by: Dan Williams Cc: "Steven Rostedt (VMware)" Cc: Arnaldo Carvalho de Melo Cc: "Tzvetomir Stoyanov (VMware)" Cc: Namhyung Kim Cc: Ross Zwisler Cc: Elenie Godzaridis Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 21713dc14ce2..673546ba7342 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -10,7 +10,7 @@ * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver */ -#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) +#define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) #define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) -- cgit v1.2.3 From 74f213ea25b99fddcf34cbe07dabdb01136bcd86 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 26 Oct 2018 15:02:27 -0700 Subject: include/linux/linkage.h: align weak symbols Since WEAK() supposed to be used instead of ENTRY() to define weak symbols, but unlike ENTRY() it doesn't have ALIGN directive. It seems there is no actual reason to not have, so let's add ALIGN to WEAK() too. 
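For reference, this is roughly how the two macros read in include/linux/linkage.h once the change below is applied (the generic definitions; architectures may override them). ENTRY() is shown only for comparison, the WEAK() hunk is the one in the diff:

    #ifndef ENTRY
    #define ENTRY(name) \
            .globl name ASM_NL \
            ALIGN ASM_NL \
            name:
    #endif

    #ifndef WEAK
    #define WEAK(name) \
            .weak name ASM_NL \
            ALIGN ASM_NL \
            name:
    #endif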
Link: http://lkml.kernel.org/r/20180920135631.23833-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Will Deacon , Catalin Marinas Cc: Kyeongdon Kim Cc: Ard Biesheuvel Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/linkage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index d7618c41f74c..7c47b1a471d4 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -90,6 +90,7 @@ #ifndef WEAK #define WEAK(name) \ .weak name ASM_NL \ + ALIGN ASM_NL \ name: #endif -- cgit v1.2.3 From 5780a02fd1e87641ad6a8dd6891a1e890cf45c5d Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Fri, 26 Oct 2018 15:02:59 -0700 Subject: fs/iomap.c: change return type to vm_fault_t Change iomap_page_mkwrite() return type to vm_fault_t. see commit 1c8f422059ae ("mm: change return type to vm_fault_t") for reference. Link: http://lkml.kernel.org/r/20180827172050.GA18673@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/iomap.c | 2 +- include/linux/iomap.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..90c2febc93ac 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3555d54bf79a..9a4258154b25 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -6,6 +6,7 @@ #include #include #include +#include struct address_space; struct fiemap_extent_info; @@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, + const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, -- cgit v1.2.3 From 9b6f7e163cd0f468d1b9696b785659d3c27c8667 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Oct 2018 15:03:19 -0700 Subject: mm: rework memcg kernel stack accounting If CONFIG_VMAP_STACK is set, kernel stacks are allocated using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel stack pages are charged against corresponding memory cgroups on allocation and uncharged on releasing them. The problem is that we do cache kernel stacks in small per-cpu caches and do reuse them for new tasks, which can belong to different memory cgroups. Each stack page still holds a reference to the original cgroup, so the cgroup can't be released until the vmap area is released. To make this happen we need more than two subsequent exits without forks in between on the current cpu, which makes it very unlikely to happen. 
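(For context on the "more than two" above: the per-cpu cache is tiny. It is defined in kernel/fork.c as

    #define NR_CACHED_STACKS 2
    static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

so a stack's vm_area is only released once both cache slots on that cpu are already occupied.)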
As a result, I saw a significant number of dying cgroups (in theory, up to 2 * number_of_cpu + number_of_tasks), which can't be released even by significant memory pressure. As a cgroup structure can take a significant amount of memory (first of all, per-cpu data like memcg statistics), it leads to a noticeable waste of memory. Link: http://lkml.kernel.org/r/20180827162621.30187-1-guro@fb.com Fixes: ac496bf48d97 ("fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y") Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Andy Lutomirski Cc: Konstantin Khlebnikov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 ++++++++++- kernel/fork.c | 55 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 652f602167df..4399cc3f00e4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); void memcg_kmem_put_cache(struct kmem_cache *cachep); int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg); + +#ifdef CONFIG_MEMCG_KMEM int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); void memcg_kmem_uncharge(struct page *page, int order); -#ifdef CONFIG_MEMCG_KMEM extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id); extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else + +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_kmem_uncharge(struct page *page, int order) +{ +} + #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/kernel/fork.c b/kernel/fork.c index f0b58479534f..3c719fec46c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } + /* + * Allocated stacks are cached and later reused by new threads, + * so memcg accounting is performed manually on assigning/releasing + * stacks to tasks. Drop __GFP_ACCOUNT. + */ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, + THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK - if (task_stack_vm_area(tsk)) { + struct vm_struct *vm = task_stack_vm_area(tsk); + + if (vm) { int i; + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + -(int)(PAGE_SIZE / 1024)); + + memcg_kmem_uncharge(vm->pages[i], 0); + } + for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) != NULL) @@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) NR_KERNEL_STACK_KB, PAGE_SIZE / 1024 * account); } - - /* All stack pages belong to the same memcg. 
*/ - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } +static int memcg_charge_kernel_stack(struct task_struct *tsk) +{ +#ifdef CONFIG_VMAP_STACK + struct vm_struct *vm = task_stack_vm_area(tsk); + int ret; + + if (vm) { + int i; + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + /* + * If memcg_kmem_charge() fails, page->mem_cgroup + * pointer is NULL, and both memcg_kmem_uncharge() + * and mod_memcg_page_state() in free_thread_stack() + * will ignore this page. So it's safe. + */ + ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); + if (ret) + return ret; + + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + PAGE_SIZE / 1024); + } + } +#endif + return 0; +} + static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(tsk->state != TASK_DEAD)) @@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + if (memcg_charge_kernel_stack(tsk)) + goto free_stack; + stack_vm_area = task_stack_vm_area(tsk); err = arch_dup_task_struct(tsk, orig); -- cgit v1.2.3 From 68600f623d69da428c6163275f97ca126e1a8ec5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Oct 2018 15:03:27 -0700 Subject: mm: don't miss the last page because of round-off error I've noticed, that dying memory cgroups are often pinned in memory by a single pagecache page. Even under moderate memory pressure they sometimes stayed in such state for a long time. That looked strange. My investigation showed that the problem is caused by applying the LRU pressure balancing math: scan = div64_u64(scan * fraction[lru], denominator), where denominator = fraction[anon] + fraction[file] + 1. Because fraction[lru] is always less than denominator, if the initial scan size is 1, the result is always 0. This means the last page is not scanned and has no chances to be reclaimed. Fix this by rounding up the result of the division. In practice this change significantly improves the speed of dying cgroups reclaim. [guro@fb.com: prevent double calculation of DIV64_U64_ROUND_UP() arguments] Link: http://lkml.kernel.org/r/20180829213311.GA13501@castle Link: http://lkml.kernel.org/r/20180827162621.30187-3-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Rik van Riel Cc: Konstantin Khlebnikov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/math64.h | 3 +++ mm/vmscan.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index 837f2f2d1d34..bb2c84afb80c 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -281,4 +281,7 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ +#define DIV64_U64_ROUND_UP(ll, d) \ + ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) + #endif /* _LINUX_MATH64_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index c5ef7240cbcb..961401c46334 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2456,9 +2456,11 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. + * Make sure we don't miss the last page + * because of a round-off error. 
*/ - scan = div64_u64(scan * fraction[file], - denominator); + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); break; case SCAN_FILE: case SCAN_ANON: -- cgit v1.2.3 From 33490af3f5c15757448b6c454ca93b48a333aa1b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2018 15:03:35 -0700 Subject: mm, mmu_notifier: be explicit about range invalition non-blocking mode If invalidate_range_start() is called for !blocking mode then all callbacks have to guarantee they will no block/sleep. The same obviously applies to invalidate_range_end because this operation pairs with the former and they are called from the same context. Make sure this is appropriately documented. Link: http://lkml.kernel.org/r/20180827112623.8992-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Jerome Glisse Cc: Boris Ostrovsky Cc: David Rientjes Cc: Juergen Gross Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 133ba78820ee..698e371aafe3 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -153,7 +153,9 @@ struct mmu_notifier_ops { * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN. 0 should be returned - * otherwise. + * otherwise. Please note that if invalidate_range_start approves + * a non-blocking behavior then the same applies to + * invalidate_range_end. * */ int (*invalidate_range_start)(struct mmu_notifier *mn, -- cgit v1.2.3 From 4e15a073a168b62311db911a55c4d4f1500c2821 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 26 Oct 2018 15:03:39 -0700 Subject: Revert "mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks" Revert 5ff7091f5a2ca ("mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks"). MMU_INVALIDATE_DOES_NOT_BLOCK flags was the only one used and it is no longer needed since 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu notifiers"). We now have a full support for per range !blocking behavior so we can drop the stop gap workaround which the per notifier flag was used for. 
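For illustration only (not part of this revert): with per-range support, a notifier that needs a sleeping lock is expected to handle the non-blocking mode itself, roughly like the sketch below. The notifier and its lock are hypothetical.

    #include <linux/mmu_notifier.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(example_lock);      /* hypothetical driver-side lock */

    static int example_invalidate_range_start(struct mmu_notifier *mn,
                                              struct mm_struct *mm,
                                              unsigned long start,
                                              unsigned long end,
                                              bool blockable)
    {
            if (blockable)
                    mutex_lock(&example_lock);
            else if (!mutex_trylock(&example_lock))
                    return -EAGAIN; /* !blockable: must not sleep, so back off */

            /* ... tear down secondary mappings covering [start, end) ... */

            mutex_unlock(&example_lock);
            return 0;
    }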
Link: http://lkml.kernel.org/r/20180827112623.8992-4-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Boris Ostrovsky Cc: Jerome Glisse Cc: Juergen Gross Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/hfi1/mmu_rb.c | 1 - drivers/iommu/amd_iommu_v2.c | 1 - drivers/iommu/intel-svm.c | 1 - drivers/misc/sgi-gru/grutlbpurge.c | 1 - include/linux/mmu_notifier.h | 23 ----------------------- mm/mmu_notifier.c | 31 ------------------------------- virt/kvm/kvm_main.c | 1 - 7 files changed, 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e1c7996c018e..475b769e120c 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 58da65df03f5..fd552235bd13 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -427,7 +427,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = mn_release, .clear_flush_young = mn_clear_flush_young, .invalidate_range = mn_invalidate_range, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 4a03e5090952..db301efe126d 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -273,7 +273,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops intel_mmuops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = intel_mm_release, .change_pte = intel_change_pte, .invalidate_range = intel_invalidate_range, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index be28f05bfafa..03b49d52092e 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -261,7 +261,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 698e371aafe3..9893a6432adf 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -2,7 +2,6 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H -#include #include #include #include @@ -11,9 +10,6 @@ struct mmu_notifier; struct mmu_notifier_ops; -/* mmu_notifier_ops flags */ -#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) - #ifdef CONFIG_MMU_NOTIFIER /* @@ -30,15 +26,6 @@ struct mmu_notifier_mm { }; struct mmu_notifier_ops { - /* - * Flags to specify behavior of callbacks for this MMU notifier. - * Used to determine which context an operation may be called. 
- * - * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not - * block - */ - int flags; - /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are @@ -183,10 +170,6 @@ struct mmu_notifier_ops { * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. - * - * If this callback cannot block, and invalidate_range_{start,end} - * cannot block, mmu_notifier_ops.flags should have - * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -241,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); -extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -495,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } -static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - return false; -} - static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 82bb1a939c0e..5119ff846769 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); -/* - * Must be called while holding mm->mmap_sem for either read or write. - * The result is guaranteed to be valid until mm->mmap_sem is dropped. - */ -bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) -{ - struct mmu_notifier *mn; - int id; - bool ret = false; - - WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); - - if (!mm_has_notifiers(mm)) - return ret; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (!mn->ops->invalidate_range && - !mn->ops->invalidate_range_start && - !mn->ops->invalidate_range_end) - continue; - - if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { - ret = true; - break; - } - } - srcu_read_unlock(&srcu, id); - return ret; -} - static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 786ade1843a2..2679e476b6c3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -497,7 +497,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, -- cgit v1.2.3 From 5d7476374564645b1a2d299e242ad7b17b1104ee Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:10 -0700 Subject: mm: remove vm_insert_mixed() All callers are now converted to vmf_insert_mixed() so convert vmf_insert_mixed() from being a compatibility wrapper into the real function. 
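The caller-visible effect, sketched from the wrapper that the diff below removes (the handler and pfn are hypothetical):

    /* Before: errno-returning API, translation done by the caller/wrapper. */
    static vm_fault_t example_fault_old(struct vm_fault *vmf, pfn_t pfn)
    {
            int err = vm_insert_mixed(vmf->vma, vmf->address, pfn);

            if (err == -ENOMEM)
                    return VM_FAULT_OOM;
            if (err < 0 && err != -EBUSY)
                    return VM_FAULT_SIGBUS;
            return VM_FAULT_NOPAGE;
    }

    /* After: the translation lives inside vmf_insert_mixed() itself. */
    static vm_fault_t example_fault_new(struct vm_fault *vmf, pfn_t pfn)
    {
            return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
    }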
Link: http://lkml.kernel.org/r/20180828145728.11873-3-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +-------------- mm/memory.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index daa2b8f1e9a8..ecc6f9347756 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2506,7 +2506,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); @@ -2525,19 +2525,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) -{ - int err = vm_insert_mixed(vma, addr, pfn); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { diff --git a/mm/memory.c b/mm/memory.c index 21a5e6e4758b..200aaf291e98 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1693,13 +1693,19 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) { - return __vm_insert_mixed(vma, addr, pfn, false); + int err = __vm_insert_mixed(vma, addr, pfn, false); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_mixed); +EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a -- cgit v1.2.3 From f5e6d1d5f8f3080aa7a51acea1f77085f45abe9c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:13 -0700 Subject: mm: introduce vmf_insert_pfn_prot() Like vm_insert_pfn_prot(), but returns a vm_fault_t instead of an errno. Also unexport vm_insert_pfn_prot as it has no modular users. 
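A minimal usage sketch (hypothetical driver; the MMIO base and the pgprot choice are assumptions, not taken from this patch):

    static phys_addr_t example_mmio_base;   /* hypothetical device region */

    static vm_fault_t example_mmio_fault(struct vm_fault *vmf)
    {
            unsigned long pfn = (example_mmio_base >> PAGE_SHIFT) + vmf->pgoff;

            /* The per-page pgprot override is the reason to use the _prot variant. */
            return vmf_insert_pfn_prot(vmf->vma, vmf->address, pfn,
                                       pgprot_noncached(vmf->vma->vm_page_prot));
    }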
Link: http://lkml.kernel.org/r/20180828145728.11873-4-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 47 +++++++++++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ecc6f9347756..f1293bdc6de2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2506,6 +2506,8 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, diff --git a/mm/memory.c b/mm/memory.c index 200aaf291e98..b3eecb3aa65f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1596,21 +1596,6 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_pfn); -/** - * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * @pgprot: pgprot flags for the inserted page - * - * This is exactly like vm_insert_pfn, except that it allows drivers to - * to override pgprot on a per-page basis. - * - * This only makes sense for IO mappings, and it makes no sense for - * cow mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is - * impractical. - */ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { @@ -1640,7 +1625,37 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn_prot); + +/** + * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vmf_insert_pfn(), except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * COW mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ + int err = vm_insert_pfn_prot(vma, addr, pfn, pgprot); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL(vmf_insert_pfn_prot); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { -- cgit v1.2.3 From bc12e6ad9617831727e4201e7cbf5c3b868cc8fd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:21 -0700 Subject: mm: make vm_insert_pfn_prot() static Now this is no longer used outside mm/memory.c, make it static. 
Link: http://lkml.kernel.org/r/20180828145728.11873-6-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- mm/memory.c | 50 +++++++++++++++++++++++++------------------------- 2 files changed, 25 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f1293bdc6de2..0f5db0455e61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2504,8 +2504,6 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index b3eecb3aa65f..6365144f8267 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1572,31 +1572,7 @@ out: return retval; } -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); -} -EXPORT_SYMBOL(vm_insert_pfn); - -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +static int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { int ret; @@ -1626,6 +1602,30 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, return ret; } +/** + * vm_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + */ +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to -- cgit v1.2.3 From 67fa1666223d7c825f6651add97f0011fe155f36 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:26 -0700 Subject: mm: remove references to vm_insert_pfn() Documentation and comments. 
Link: http://lkml.kernel.org/r/20180828145728.11873-7-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/x86/pat.txt | 4 ++-- include/asm-generic/pgtable.h | 4 ++-- include/linux/hmm.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 2a4ee6302122..481d8d8536ac 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -90,12 +90,12 @@ pci proc | -- | -- | WC | Advanced APIs for drivers ------------------------- A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range, -vm_insert_pfn +vmf_insert_pfn Drivers wanting to export some pages to userspace do it by using mmap interface and a combination of 1) pgprot_noncached() -2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn() +2) io_remap_pfn_range() or remap_pfn_range() or vmf_insert_pfn() With PAT support, a new API pgprot_writecombine is being added. So, drivers can continue to use the above sequence, with either pgprot_noncached() or diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 88ebc6102c7c..5657a20e0c59 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -757,7 +757,7 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, - * vm_insert_pfn. + * vmf_insert_pfn. */ /* @@ -773,7 +773,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, /* * track_pfn_insert is called when a _new_ single pfn is established - * by vm_insert_pfn(). + * by vmf_insert_pfn(). */ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4c92e3ba3e16..dde947083d4e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -107,7 +107,7 @@ enum hmm_pfn_flag_e { * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the - * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not + * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. * -- cgit v1.2.3 From ae2b01f37044c10e975d22116755df56252b09d8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 15:04:29 -0700 Subject: mm: remove vm_insert_pfn() All callers are now converted to vmf_insert_pfn() so convert vmf_insert_pfn() from being a compatibility wrapper around vm_insert_pfn() to being a compatibility wrapper around vmf_insert_pfn_prot(). 
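With this last step the pfn-insertion entry points settle into the following shape (a sketch of the call chain resulting from the diffs in this series):

    vmf_insert_pfn(vma, addr, pfn)
        -> vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot)
            -> vm_insert_pfn_prot()   /* static helper in mm/memory.c */

with the errno to vm_fault_t translation done exactly once, inside vmf_insert_pfn_prot().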
Link: http://lkml.kernel.org/r/20180828145728.11873-8-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Nicolas Pitre Cc: Souptick Joarder Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +-------------- mm/memory.c | 54 +++++++++++++++++++++++++++++------------------------- 2 files changed, 30 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f5db0455e61..737279bb479c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2502,7 +2502,7 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); @@ -2525,19 +2525,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn) -{ - int err = vm_insert_pfn(vma, addr, pfn); - - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) diff --git a/mm/memory.c b/mm/memory.c index 6365144f8267..08653d0a795a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1602,30 +1602,6 @@ static int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, return ret; } -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); -} -EXPORT_SYMBOL(vm_insert_pfn); - /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to @@ -1638,9 +1614,10 @@ EXPORT_SYMBOL(vm_insert_pfn); * * This only makes sense for IO mappings, and it makes no sense for * COW mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is + * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. * + * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. 
*/ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, @@ -1657,6 +1634,33 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vmf_insert_pfn_prot); +/** + * vmf_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return the result of this function. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vmf_insert_pfn); + static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { /* these checks mirror the abort conditions in vm_normal_page */ -- cgit v1.2.3 From cc252eae85e09552f9c1e7ac0c3227f835efdf2d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:34 -0700 Subject: mm, slab: combine kmalloc_caches and kmalloc_dma_caches Patch series "kmalloc-reclaimable caches", v4. As discussed at LSF/MM [1] here's a patchset that introduces kmalloc-reclaimable caches (more details in the second patch) and uses them for dcache external names. That allows us to repurpose the NR_INDIRECTLY_RECLAIMABLE_BYTES counter later in the series. With patch 3/6, dcache external names are allocated from kmalloc-rcl-* caches, eliminating the need for manual accounting. More importantly, it also ensures the reclaimable kmalloc allocations are grouped in pages separate from the regular kmalloc allocations. The need for proper accounting of dcache external names has shown it's easy for misbehaving process to allocate lots of them, causing premature OOMs. Without the added grouping, it's likely that a similar workload can interleave the dcache external names allocations with regular kmalloc allocations (note: I haven't searched myself for an example of such regular kmalloc allocation, but I would be very surprised if there wasn't some). A pathological case would be e.g. one 64byte regular allocations with 63 external dcache names in a page (64x64=4096), which means the page is not freed even after reclaiming after all dcache names, and the process can thus "steal" the whole page with single 64byte allocation. If other kmalloc users similar to dcache external names become identified, they can also benefit from the new functionality simply by adding __GFP_RECLAIMABLE to the kmalloc calls. Side benefits of the patchset (that could be also merged separately) include removed branch for detecting __GFP_DMA kmalloc(), and shortening kmalloc cache names in /proc/slabinfo output. The latter is potentially an ABI break in case there are tools parsing the names and expecting the values to be in bytes. This is how /proc/slabinfo looks like after booting in virtme: ... kmalloc-rcl-4M 0 0 4194304 1 1024 : tunables 1 1 0 : slabdata 0 0 0 ... 
kmalloc-rcl-96 7 32 128 32 1 : tunables 120 60 8 : slabdata 1 1 0 kmalloc-rcl-64 25 128 64 64 1 : tunables 120 60 8 : slabdata 2 2 0 kmalloc-rcl-32 0 0 32 124 1 : tunables 120 60 8 : slabdata 0 0 0 kmalloc-4M 0 0 4194304 1 1024 : tunables 1 1 0 : slabdata 0 0 0 kmalloc-2M 0 0 2097152 1 512 : tunables 1 1 0 : slabdata 0 0 0 kmalloc-1M 0 0 1048576 1 256 : tunables 1 1 0 : slabdata 0 0 0 ... /proc/vmstat with renamed nr_indirectly_reclaimable_bytes counter: ... nr_slab_reclaimable 2817 nr_slab_unreclaimable 1781 ... nr_kernel_misc_reclaimable 0 ... /proc/meminfo with new KReclaimable counter: ... Shmem: 564 kB KReclaimable: 11260 kB Slab: 18368 kB SReclaimable: 11260 kB SUnreclaim: 7108 kB KernelStack: 1248 kB ... This patch (of 6): The kmalloc caches currently mainain separate (optional) array kmalloc_dma_caches for __GFP_DMA allocations. There are tests for __GFP_DMA in the allocation hotpaths. We can avoid the branches by combining kmalloc_caches and kmalloc_dma_caches into a single two-dimensional array where the outer dimension is cache "type". This will also allow to add kmalloc-reclaimable caches as a third type. Link: http://lkml.kernel.org/r/20180731090649.16028-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Laura Abbott Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 42 +++++++++++++++++++++++++++++++----------- mm/slab.c | 4 ++-- mm/slab_common.c | 31 ++++++++++++------------------- mm/slub.c | 13 +++++++------ 4 files changed, 52 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index ed9cbddeb4a6..2a7137043e91 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -295,12 +295,29 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ (KMALLOC_MIN_SIZE) : 16) +enum kmalloc_cache_type { + KMALLOC_NORMAL = 0, +#ifdef CONFIG_ZONE_DMA + KMALLOC_DMA, +#endif + NR_KMALLOC_TYPES +}; + #ifndef CONFIG_SLOB -extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +extern struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; + +static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) +{ + int is_dma = 0; + #ifdef CONFIG_ZONE_DMA -extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; + is_dma = !!(flags & __GFP_DMA); #endif + return is_dma; +} + /* * Figure out which kmalloc slab an allocation of a certain size * belongs to. 
@@ -501,18 +518,20 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { +#ifndef CONFIG_SLOB + unsigned int index; +#endif if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); #ifndef CONFIG_SLOB - if (!(flags & GFP_DMA)) { - unsigned int index = kmalloc_index(size); + index = kmalloc_index(size); - if (!index) - return ZERO_SIZE_PTR; + if (!index) + return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace(kmalloc_caches[index], - flags, size); - } + return kmem_cache_alloc_trace( + kmalloc_caches[kmalloc_type(flags)][index], + flags, size); #endif } return __kmalloc(size, flags); @@ -542,13 +561,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { + size <= KMALLOC_MAX_CACHE_SIZE) { unsigned int i = kmalloc_index(size); if (!i) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace(kmalloc_caches[i], + return kmem_cache_alloc_node_trace( + kmalloc_caches[kmalloc_type(flags)][i], flags, node, size); } #endif diff --git a/mm/slab.c b/mm/slab.c index d73c7a4820a4..2a5654bb3b3f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, 0, kmalloc_size(INDEX_NODE)); @@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(kmalloc_caches[INDEX_NODE], + init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], &init_kmem_cache_node[SIZE_NODE + nid], nid); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 3a7ac4f15194..d880b2a3c81b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; +struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; -EXPORT_SYMBOL(kmalloc_dma_caches); -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. 
This is necessary for slabs < 192 since we have non power @@ -1040,12 +1036,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) index = fls(size - 1); } -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & GFP_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; + return kmalloc_caches[kmalloc_type(flags)][index]; } /* @@ -1119,7 +1110,8 @@ void __init setup_kmalloc_cache_index_table(void) static void __init new_kmalloc_cache(int idx, slab_flags_t flags) { - kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + kmalloc_caches[KMALLOC_NORMAL][idx] = create_kmalloc_cache( + kmalloc_info[idx].name, kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1132,9 +1124,10 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) void __init create_kmalloc_caches(slab_flags_t flags) { int i; + int type = KMALLOC_NORMAL; for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) + if (!kmalloc_caches[type][i]) new_kmalloc_cache(i, flags); /* @@ -1142,9 +1135,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) * These have to be created immediately after the * earlier power of two caches */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1] && i == 6) new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2] && i == 7) new_kmalloc_cache(2, flags); } @@ -1153,7 +1146,7 @@ void __init create_kmalloc_caches(slab_flags_t flags) #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; + struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { unsigned int size = kmalloc_size(i); @@ -1161,8 +1154,8 @@ void __init create_kmalloc_caches(slab_flags_t flags) "dma-kmalloc-%u", size); BUG_ON(!n); - kmalloc_dma_caches[i] = create_kmalloc_cache(n, - size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( + n, size, SLAB_CACHE_DMA | flags, 0, 0); } } #endif diff --git a/mm/slub.c b/mm/slub.c index 18bd07daf4e4..e3629cd7aff1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4689,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf, static void __init resiliency_test(void) { u8 *p; + int type = KMALLOC_NORMAL; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); @@ -4701,7 +4702,7 @@ static void __init resiliency_test(void) pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", p + 16); - validate_slab_cache(kmalloc_caches[4]); + validate_slab_cache(kmalloc_caches[type][4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); @@ -4710,33 +4711,33 @@ static void __init resiliency_test(void) p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[5]); + validate_slab_cache(kmalloc_caches[type][5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", p); pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[6]); + validate_slab_cache(kmalloc_caches[type][6]); pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; pr_err("1. 
kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[7]); + validate_slab_cache(kmalloc_caches[type][7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[8]); + validate_slab_cache(kmalloc_caches[type][8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[9]); + validate_slab_cache(kmalloc_caches[type][9]); } #else #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 1291523f2c1d631fea34102fd241fb54a4e8f7a0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:38 -0700 Subject: mm, slab/slub: introduce kmalloc-reclaimable caches Kmem caches can be created with a SLAB_RECLAIM_ACCOUNT flag, which indicates they contain objects which can be reclaimed under memory pressure (typically through a shrinker). This makes the slab pages accounted as NR_SLAB_RECLAIMABLE in vmstat, which is reflected also the MemAvailable meminfo counter and in overcommit decisions. The slab pages are also allocated with __GFP_RECLAIMABLE, which is good for anti-fragmentation through grouping pages by mobility. The generic kmalloc-X caches are created without this flag, but sometimes are used also for objects that can be reclaimed, which due to varying size cannot have a dedicated kmem cache with SLAB_RECLAIM_ACCOUNT flag. A prominent example are dcache external names, which prompted the creation of a new, manually managed vmstat counter NR_INDIRECTLY_RECLAIMABLE_BYTES in commit f1782c9bc547 ("dcache: account external names as indirectly reclaimable memory"). To better handle this and any other similar cases, this patch introduces SLAB_RECLAIM_ACCOUNT variants of kmalloc caches, named kmalloc-rcl-X. They are used whenever the kmalloc() call passes __GFP_RECLAIMABLE among gfp flags. They are added to the kmalloc_caches array as a new type. Allocations with both __GFP_DMA and __GFP_RECLAIMABLE will use a dma type cache. This change only applies to SLAB and SLUB, not SLOB. This is fine, since SLOB's target are tiny system and this patch does add some overhead of kmem management objects. Link: http://lkml.kernel.org/r/20180731090649.16028-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Laura Abbott Cc: Matthew Wilcox Cc: Michal Hocko Cc: Sumit Semwal Cc: Vijayanand Jitta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 +++++++++++++++- mm/slab_common.c | 48 +++++++++++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 2a7137043e91..918f374e7156 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -295,8 +295,13 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ (KMALLOC_MIN_SIZE) : 16) +/* + * Whenever changing this, take care of that kmalloc_type() and + * create_kmalloc_caches() still work as intended. 
+ */ enum kmalloc_cache_type { KMALLOC_NORMAL = 0, + KMALLOC_RECLAIM, #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif @@ -310,12 +315,21 @@ kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) { int is_dma = 0; + int type_dma = 0; + int is_reclaimable; #ifdef CONFIG_ZONE_DMA is_dma = !!(flags & __GFP_DMA); + type_dma = is_dma * KMALLOC_DMA; #endif - return is_dma; + is_reclaimable = !!(flags & __GFP_RECLAIMABLE); + + /* + * If an allocation is both __GFP_DMA and __GFP_RECLAIMABLE, return + * KMALLOC_DMA and effectively ignore __GFP_RECLAIMABLE + */ + return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM; } /* diff --git a/mm/slab_common.c b/mm/slab_common.c index d880b2a3c81b..5b19439fd862 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1108,10 +1108,21 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, slab_flags_t flags) +static void __init +new_kmalloc_cache(int idx, int type, slab_flags_t flags) { - kmalloc_caches[KMALLOC_NORMAL][idx] = create_kmalloc_cache( - kmalloc_info[idx].name, + const char *name; + + if (type == KMALLOC_RECLAIM) { + flags |= SLAB_RECLAIM_ACCOUNT; + name = kasprintf(GFP_NOWAIT, "kmalloc-rcl-%u", + kmalloc_info[idx].size); + BUG_ON(!name); + } else { + name = kmalloc_info[idx].name; + } + + kmalloc_caches[type][idx] = create_kmalloc_cache(name, kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1123,22 +1134,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags) */ void __init create_kmalloc_caches(slab_flags_t flags) { - int i; - int type = KMALLOC_NORMAL; + int i, type; - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, flags); + for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[type][i]) + new_kmalloc_cache(i, type, flags); - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[type][1] && i == 6) - new_kmalloc_cache(1, flags); - if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[type][2] && i == 7) - new_kmalloc_cache(2, flags); + /* + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches + */ + if (KMALLOC_MIN_SIZE <= 32 && i == 6 && + !kmalloc_caches[type][1]) + new_kmalloc_cache(1, type, flags); + if (KMALLOC_MIN_SIZE <= 64 && i == 7 && + !kmalloc_caches[type][2]) + new_kmalloc_cache(2, type, flags); + } } /* Kmalloc array is now usable */ -- cgit v1.2.3 From b29940c1abd7a4c3abeb926df0a5ec84d6902d47 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 26 Oct 2018 15:05:46 -0700 Subject: mm: rename and change semantics of nr_indirectly_reclaimable_bytes The vmstat counter NR_INDIRECTLY_RECLAIMABLE_BYTES was introduced by commit eb59254608bc ("mm: introduce NR_INDIRECTLY_RECLAIMABLE_BYTES") with the goal of accounting objects that can be reclaimed, but cannot be allocated via a SLAB_RECLAIM_ACCOUNT cache. This is now possible via kmalloc() with __GFP_RECLAIMABLE flag, and the dcache external names user is converted. The counter is however still useful for accounting direct page allocations (i.e. not slab) with a shrinker, such as the ION page pool. 
So keep it, and: - change granularity to pages to be more like other counters; sub-page allocations should be able to use kmalloc - rename the counter to NR_KERNEL_MISC_RECLAIMABLE - expose the counter again in vmstat as "nr_kernel_misc_reclaimable"; we can again remove the check for not printing "hidden" counters Link: http://lkml.kernel.org/r/20180731090649.16028-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: Vijayanand Jitta Cc: Laura Abbott Cc: Sumit Semwal Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/ion/ion_page_pool.c | 8 ++++---- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 19 +++++++------------ mm/util.c | 3 +-- mm/vmstat.c | 6 +----- 5 files changed, 14 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/android/ion/ion_page_pool.c b/drivers/staging/android/ion/ion_page_pool.c index 9bc56eb48d2a..0d2a95957ee8 100644 --- a/drivers/staging/android/ion/ion_page_pool.c +++ b/drivers/staging/android/ion/ion_page_pool.c @@ -33,8 +33,8 @@ static void ion_page_pool_add(struct ion_page_pool *pool, struct page *page) pool->low_count++; } - mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, - (1 << (PAGE_SHIFT + pool->order))); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + 1 << pool->order); mutex_unlock(&pool->mutex); } @@ -53,8 +53,8 @@ static struct page *ion_page_pool_remove(struct ion_page_pool *pool, bool high) } list_del(&page->lru); - mod_node_page_state(page_pgdat(page), NR_INDIRECTLY_RECLAIMABLE_BYTES, - -(1 << (PAGE_SHIFT + pool->order))); + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE, + -(1 << pool->order)); return page; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d4b0c79d2924..7bbeba21f6a3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -180,7 +180,7 @@ enum node_stat_item { NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ - NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */ + NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 747031c2352d..20f25d06c00c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4701,6 +4701,7 @@ long si_mem_available(void) unsigned long pagecache; unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; struct zone *zone; int lru; @@ -4726,19 +4727,13 @@ long si_mem_available(void) available += pagecache; /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. */ - available += global_node_page_state(NR_SLAB_RECLAIMABLE) - - min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, - wmark_low); - - /* - * Part of the kernel memory, which can be released under memory - * pressure. 
- */ - available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> - PAGE_SHIFT; + reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) available = 0; diff --git a/mm/util.c b/mm/util.c index 470f5cd80b64..f740754f5012 100644 --- a/mm/util.c +++ b/mm/util.c @@ -678,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Part of the kernel memory, which can be released * under memory pressure. */ - free += global_node_page_state( - NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. diff --git a/mm/vmstat.c b/mm/vmstat.c index 7878da76abf2..2cec2fa4c8ae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,7 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - "", /* nr_indirectly_reclaimable */ + "nr_kernel_misc_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1706,10 +1706,6 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - /* Skip hidden vmstat items. */ - if (*vmstat_text[off] == '\0') - return 0; - seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); -- cgit v1.2.3 From 1899ad18c6072d689896badafb81267b0a1092a4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:04 -0700 Subject: mm: workingset: tell cache transitions from workingset thrashing Refaults happen during transitions between workingsets as well as in-place thrashing. Knowing the difference between the two has a range of applications, including measuring the impact of memory shortage on the system performance, as well as the ability to smarter balance pressure between the filesystem cache and the swap-backed workingset. During workingset transitions, inactive cache refaults and pushes out established active cache. When that active cache isn't stale, however, and also ends up refaulting, that's bonafide thrashing. Introduce a new page flag that tells on eviction whether the page has been active or not in its lifetime. This bit is then stored in the shadow entry, to classify refaults as transitioning or thrashing. How many page->flags does this leave us with on 32-bit? 20 bits are always page flags 21 if you have an MMU 23 with the zone bits for DMA, Normal, HighMem, Movable 29 with the sparsemem section bits 30 if PAE is enabled 31 with this patch. So on 32-bit PAE, that leaves 1 bit for distinguishing two NUMA nodes. If that's not enough, the system can switch to discontigmem and re-gain the 6 or 7 sparsemem section bits. 
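For illustration, a simplified userspace sketch of how the extra bit is packed into and recovered from the shadow entry, mirroring the pack_shadow()/unpack_shadow() changes in mm/workingset.c below. The field widths here are stand-ins, and the radix-tree exception bit and bucket_order handling are omitted.

        /* Sketch of squeezing a "workingset" bit into a shadow-entry word.
         * MEMCG_SHIFT / NODES_SHIFT are illustrative widths only. */
        #include <stdio.h>
        #include <stdbool.h>

        #define MEMCG_SHIFT  8    /* stand-in for MEM_CGROUP_ID_SHIFT */
        #define NODES_SHIFT  4    /* stand-in for NODES_SHIFT */

        static unsigned long pack_shadow(int memcgid, int nid,
                                         unsigned long eviction, bool workingset)
        {
                eviction = (eviction << MEMCG_SHIFT) | memcgid;
                eviction = (eviction << NODES_SHIFT) | nid;
                eviction = (eviction << 1) | workingset;
                return eviction;
        }

        static void unpack_shadow(unsigned long entry, int *memcgid, int *nid,
                                  unsigned long *eviction, bool *workingset)
        {
                *workingset = entry & 1;
                entry >>= 1;
                *nid = entry & ((1UL << NODES_SHIFT) - 1);
                entry >>= NODES_SHIFT;
                *memcgid = entry & ((1UL << MEMCG_SHIFT) - 1);
                *eviction = entry >> MEMCG_SHIFT;
        }

        int main(void)
        {
                unsigned long entry = pack_shadow(42, 3, 12345, true);
                unsigned long eviction;
                int memcgid, nid;
                bool workingset;

                unpack_shadow(entry, &memcgid, &nid, &eviction, &workingset);
                printf("memcg=%d nid=%d eviction=%lu workingset=%d\n",
                       memcgid, nid, eviction, workingset);
                return 0;
        }

The real entry additionally carries the radix-tree exception bit and drops the low eviction bits via bucket_order, but the bit-layout idea is the same.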
Link: http://lkml.kernel.org/r/20180828172258.3185-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + include/linux/page-flags.h | 5 ++- include/linux/swap.h | 2 +- include/trace/events/mmflags.h | 1 + mm/filemap.c | 9 ++-- mm/huge_memory.c | 1 + mm/migrate.c | 2 + mm/swap_state.c | 1 + mm/vmscan.c | 1 + mm/vmstat.c | 1 + mm/workingset.c | 95 +++++++++++++++++++++++++++--------------- 11 files changed, 77 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7bbeba21f6a3..ba51d5bf7af1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -163,6 +163,7 @@ enum node_stat_item { NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, + WORKINGSET_RESTORE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 74bee8cecf4c..4d99504f6496 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -69,13 +69,14 @@ */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ - PG_error, PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_active, + PG_workingset, PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ + PG_error, PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, @@ -280,6 +281,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) +PAGEFLAG(Workingset, workingset, PF_HEAD) + TESTCLEARFLAG(Workingset, workingset, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 8e2c11e692ba..b93740d72e78 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -296,7 +296,7 @@ struct vma_swap_readahead { /* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); -bool workingset_refault(void *shadow); +void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); /* Do not use directly, use workingset_lookup_update */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index a81cffb76d89..a1675d43777e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -88,6 +88,7 @@ {1UL << PG_dirty, "dirty" }, \ {1UL << PG_lru, "lru" }, \ {1UL << PG_active, "active" }, \ + {1UL << PG_workingset, "workingset" }, \ {1UL << PG_slab, "slab" }, \ {1UL << PG_owner_priv_1, "owner_priv_1" }, \ {1UL << PG_arch_1, "arch_1" }, \ diff --git a/mm/filemap.c b/mm/filemap.c index de6fed2a0815..7997adce5a29 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -915,12 +915,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * data from the working set, only to cache data that will * 
get overwritten with something else, is a waste of memory. */ - if (!(gfp_mask & __GFP_WRITE) && - shadow && workingset_refault(shadow)) { - SetPageActive(page); - workingset_activation(page); - } else - ClearPageActive(page); + WARN_ON_ONCE(PageActive(page)); + if (!(gfp_mask & __GFP_WRITE) && shadow) + workingset_refault(page, shadow); lru_cache_add(page); } return ret; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index deed97fba979..8ea1b36bd452 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2369,6 +2369,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | + (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | (1L << PG_dirty))); diff --git a/mm/migrate.c b/mm/migrate.c index 84381b55b2bd..1ea27b343ccd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -685,6 +685,8 @@ void migrate_page_states(struct page *newpage, struct page *page) SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index ecee9c6c4cc1..0d6a7f268d2e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -448,6 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Initiate read into locked page and return. */ + SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; diff --git a/mm/vmscan.c b/mm/vmscan.c index 961401c46334..87e9fef341d2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2145,6 +2145,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + SetPageWorkingset(page); list_add(&page->lru, &l_inactive); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 2cec2fa4c8ae..d918f6192d15 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1145,6 +1145,7 @@ const char * const vmstat_text[] = { "nr_isolated_file", "workingset_refault", "workingset_activate", + "workingset_restore", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", diff --git a/mm/workingset.c b/mm/workingset.c index 7d5fa0dd2b38..99b7f7c09b13 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -121,7 +121,7 @@ * the only thing eating into inactive list space is active pages. * * - * Activating refaulting pages + * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given @@ -134,6 +134,10 @@ * used less frequently than the refaulting page - or even not used at * all anymore. * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. @@ -141,6 +145,14 @@ * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. 
The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * * * Implementation * @@ -156,8 +168,7 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ - MEM_CGROUP_ID_SHIFT) + 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* @@ -170,23 +181,28 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) { eviction >>= bucket_order; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; + eviction = (eviction << 1) | workingset; eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, - unsigned long *evictionp) + unsigned long *evictionp, bool *workingsetp) { unsigned long entry = (unsigned long)shadow; int memcgid, nid; + bool workingset; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + workingset = entry & 1; + entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; + *workingsetp = workingset; } /** @@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg = page_memcg(page); struct pglist_data *pgdat = page_pgdat(page); + struct mem_cgroup *memcg = page_memcg(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, pgdat, eviction); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } /** * workingset_refault - evaluate the refault of a previously evicted page + * @page: the freshly allocated replacement page * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously * evicted page in the context of the node it was allocated in. - * - * Returns %true if the page should be activated, %false otherwise. */ -bool workingset_refault(void *shadow) +void workingset_refault(struct page *page, void *shadow) { unsigned long refault_distance; + struct pglist_data *pgdat; unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct pglist_data *pgdat; + bool workingset; int memcgid; - unpack_shadow(shadow, &memcgid, &pgdat, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); rcu_read_lock(); /* @@ -263,41 +280,51 @@ bool workingset_refault(void *shadow) * configurations instead. 
*/ memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) { - rcu_read_unlock(); - return false; - } + if (!mem_cgroup_disabled() && !memcg) + goto out; lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. + * Calculate the refault distance * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * inactive_age to lap a shadow entry in the field, which can + * then result in a false small refault distance, leading to a + * false activation should this old entry actually refault + * again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; inc_lruvec_state(lruvec, WORKINGSET_REFAULT); - if (refault_distance <= active_file) { - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); - rcu_read_unlock(); - return true; + /* + * Compare the distance to the existing workingset size. We + * don't act on pages that couldn't stay resident even if all + * the memory was available to the page cache. + */ + if (refault_distance > active_file) + goto out; + + SetPageActive(page); + atomic_long_inc(&lruvec->inactive_age); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + + /* Page was active prior to eviction */ + if (workingset) { + SetPageWorkingset(page); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } +out: rcu_read_unlock(); - return false; } /** -- cgit v1.2.3 From b1d29ba82cf2bc784f4c963ddd6a2cf29e229b33 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:08 -0700 Subject: delayacct: track delays from thrashing cache pages Delay accounting already measures the time a task spends in direct reclaim and waiting for swapin, but in low memory situations tasks spend can spend a significant amount of their time waiting on thrashing page cache. This isn't tracked right now. To know the full impact of memory contention on an individual task, measure the delay when waiting for a recently evicted active cache page to read back into memory. 
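For illustration, a minimal userspace sketch of the start/end accounting pattern this patch adds; the kernel variant lives in kernel/delayacct.c and uses ktime_get_ns() plus the per-task task_delay_info fields, while this sketch just mirrors the bookkeeping with clock_gettime().

        /* Sketch of the thrashing start/end delay-accounting pattern. */
        #include <stdio.h>
        #include <stdint.h>
        #include <time.h>
        #include <unistd.h>

        struct delays {
                uint64_t thrashing_start;   /* ns timestamp of current wait */
                uint64_t thrashing_delay;   /* total ns spent waiting       */
                uint32_t thrashing_count;   /* number of thrash waits       */
        };

        static uint64_t now_ns(void)
        {
                struct timespec ts;

                clock_gettime(CLOCK_MONOTONIC, &ts);
                return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
        }

        static void thrashing_start(struct delays *d)
        {
                d->thrashing_start = now_ns();
        }

        static void thrashing_end(struct delays *d)
        {
                d->thrashing_delay += now_ns() - d->thrashing_start;
                d->thrashing_count++;
        }

        int main(void)
        {
                struct delays d = { 0 };

                thrashing_start(&d);
                usleep(10000);              /* stand-in for waiting on a page */
                thrashing_end(&d);

                printf("THRASHING count=%u delay=%llu ns\n",
                       d.thrashing_count, (unsigned long long)d.thrashing_delay);
                return 0;
        }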
Also update tools/accounting/getdelays.c: [hannes@computer accounting]$ sudo ./getdelays -d -p 1 print delayacct stats ON PID 1 CPU count real total virtual total delay total delay average 50318 745000000 847346785 400533713 0.008ms IO count delay total delay average 435 122601218 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 19 12621439 0ms Link: http://lkml.kernel.org/r/20180828172258.3185-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 23 +++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 15 +++++++++++++++ mm/filemap.c | 11 +++++++++++ tools/accounting/getdelays.c | 8 +++++++- 5 files changed, 61 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 31c865d1842e..577d1b25fccd 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -57,7 +57,12 @@ struct task_delay_info { u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ + + u64 thrashing_start; + u64 thrashing_delay; /* wait for thrashing page */ + u32 freepages_count; /* total count of memory reclaim */ + u32 thrashing_count; /* total count of thrash waits */ }; #endif @@ -76,6 +81,8 @@ extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); +extern void __delayacct_thrashing_start(void); +extern void __delayacct_thrashing_end(void); static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { @@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void) __delayacct_freepages_end(); } +static inline void delayacct_thrashing_start(void) +{ + if (current->delays) + __delayacct_thrashing_start(); +} + +static inline void delayacct_thrashing_end(void) +{ + if (current->delays) + __delayacct_thrashing_end(); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} +static inline void delayacct_thrashing_start(void) +{} +static inline void delayacct_thrashing_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b7aa7bb2349f..5e8ca16a9079 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 8 +#define TASKSTATS_VERSION 9 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -164,6 +164,10 @@ struct taskstats { /* Delay waiting for memory reclaim */ __u64 freepages_count; __u64 freepages_delay_total; + + /* Delay waiting for thrashing page */ + __u64 thrashing_count; + __u64 thrashing_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac2824f0b..2a12b988c717 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < 
d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; + d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; + d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_count); } +void __delayacct_thrashing_start(void) +{ + current->delays->thrashing_start = ktime_get_ns(); +} + +void __delayacct_thrashing_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->thrashing_start, + ¤t->delays->thrashing_delay, + ¤t->delays->thrashing_count); +} diff --git a/mm/filemap.c b/mm/filemap.c index 7997adce5a29..01a841f17bf4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -1073,8 +1074,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; int ret = 0; + if (bit_nr == PG_locked && !PageSwapBacked(page) && + !PageUptodate(page) && PageWorkingset(page)) { + delayacct_thrashing_start(); + thrashing = true; + } + init_wait(wait); wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; @@ -1113,6 +1121,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, finish_wait(q, wait); + if (thrashing) + delayacct_thrashing_end(); + /* * A signal could leave PageWaiters set. Clearing it here if * !waitqueue_active would be possible (by open-coding finish_wait), diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 9f420d98b5fb..8cb504d30384 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -203,6 +203,8 @@ static void print_delayacct(struct taskstats *t) "SWAP %15s%15s%15s\n" " %15llu%15llu%15llums\n" "RECLAIM %12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "THRASHING%12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -222,7 +224,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->freepages_count, (unsigned long long)t->freepages_delay_total, - average_ms(t->freepages_delay_total, t->freepages_count)); + average_ms(t->freepages_delay_total, t->freepages_count), + "count", "delay total", "delay average", + (unsigned long long)t->thrashing_count, + (unsigned long long)t->thrashing_delay_total, + average_ms(t->thrashing_delay_total, t->thrashing_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit v1.2.3 From 8508cf3ffad4defa202b303e5b6379efc4cd9054 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:11 -0700 Subject: sched: loadavg: consolidate LOAD_INT, LOAD_FRAC, CALC_LOAD There are several definitions of those functions/macros in places that mess with fixed-point load averages. Provide an official version. 
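For illustration, a self-contained userspace sketch of the consolidated fixed-point helpers: calc_load() as added to <linux/sched/loadavg.h> by this patch, plus the LOAD_INT/LOAD_FRAC reporting macros. FSHIFT/FIXED_1 use the kernel's usual values (11 and 2048); EXP_5 = 2014 is taken from the same header.

        #include <stdio.h>

        #define FSHIFT   11
        #define FIXED_1  (1 << FSHIFT)          /* 1.0 in fixed point */
        #define EXP_5    2014                   /* 1/exp(5sec/5min)   */

        #define LOAD_INT(x)  ((x) >> FSHIFT)
        #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

        /* a1 = a0 * e + a * (1 - e), rounded up while the load is rising */
        static unsigned long
        calc_load(unsigned long load, unsigned long exp, unsigned long active)
        {
                unsigned long newload;

                newload = load * exp + active * (FIXED_1 - exp);
                if (active >= load)
                        newload += FIXED_1 - 1;

                return newload / FIXED_1;
        }

        int main(void)
        {
                unsigned long avg = 0;
                int tick;

                /* Feed 60 five-second ticks with 3 runnable tasks (3.00 load). */
                for (tick = 0; tick < 60; tick++)
                        avg = calc_load(avg, EXP_5, 3 * FIXED_1);

                printf("5-minute average after 5 minutes: %lu.%02lu\n",
                       LOAD_INT(avg), LOAD_FRAC(avg));
                return 0;
        }

After one full five-minute time constant the average has only climbed to roughly 63% of the steady-state value, which is the expected behaviour of this exponential filter.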
[akpm@linux-foundation.org: fix missed conversion in block/blk-iolatency.c] Link: http://lkml.kernel.org/r/20180828172258.3185-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 2 +- arch/powerpc/platforms/cell/spufs/sched.c | 9 +++------ arch/s390/appldata/appldata_os.c | 4 ---- block/blk-iolatency.c | 8 +++++--- drivers/cpuidle/governors/menu.c | 4 ---- fs/proc/loadavg.c | 3 --- include/linux/sched/loadavg.h | 21 +++++++++++++++++---- kernel/debug/kdb/kdb_main.c | 7 +------ kernel/sched/loadavg.c | 15 --------------- 9 files changed, 27 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 882944c36ef5..5d8e8b6bb1cc 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info) cpu = info->policy->cpu; busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); - CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); + info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1); pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", cpu, busy_spus, info->busy_spus); diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index c9ef3c532169..9fcccb4490b9 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -987,9 +987,9 @@ static void spu_calc_load(void) unsigned long active_tasks; /* fixed-point */ active_tasks = count_active_contexts() * FIXED_1; - CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); - CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); - CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); + spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks); + spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks); + spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks); } static void spusched_wake(struct timer_list *unused) @@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, } } -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int show_spu_loadavg(struct seq_file *s, void *private) { int a, b, c; diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 433a994b1a89..54f375627532 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -25,10 +25,6 @@ #include "appldata.h" - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - /* * OS data * diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 35c48d7b8f78..28f80d227528 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -153,7 +153,7 @@ struct iolatency_grp { #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC /* * These are the constants used to fake the fixed-point moving average - * calculation just like load average. The call to CALC_LOAD folds + * calculation just like load average. The call to calc_load() folds * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. 
The sampling * window size is bucketed to try to approximately calculate average * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows @@ -248,7 +248,7 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, return; /* - * CALC_LOAD takes in a number stored in fixed point representation. + * calc_load() takes in a number stored in fixed point representation. * Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and @@ -257,7 +257,9 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); + iolat->lat_avg = calc_load(iolat->lat_avg, + iolatency_exp_factors[exp_idx], + stat->rqs.mean); } static inline bool iolatency_may_queue(struct iolatency_grp *iolat, diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 575a68f31761..71979605246e 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -130,10 +130,6 @@ struct menu_device { int interval_ptr; }; - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static inline int get_loadavg(unsigned long load) { return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index d06694757201..8468baee951d 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -10,9 +10,6 @@ #include #include -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index 80bc84ba5d2a..cc9cc62bb1f8 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -22,10 +22,23 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define EXP_5 2014 /* 1/exp(5sec/5min) */ #define EXP_15 2037 /* 1/exp(5sec/15min) */ -#define CALC_LOAD(load,exp,n) \ - load *= exp; \ - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; +/* + * a1 = a0 * e + a * (1 - e) + */ +static inline unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) extern void calc_global_load(unsigned long ticks); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..bb4fe4e1a601 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define 
K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..54fbdfb2d86c 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,21 +91,6 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - unsigned long newload; - - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; - - return newload / FIXED_1; -} - #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. -- cgit v1.2.3 From 5c54f5b9edb1aa2eabbb1091c458f1b6776a1896 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:16 -0700 Subject: sched: loadavg: make calc_load_n() public It's going to be used in a later patch. Keep the churn separate. Link: http://lkml.kernel.org/r/20180828172258.3185-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/loadavg.h | 3 + kernel/sched/loadavg.c | 138 +++++++++++++++++++++--------------------- 2 files changed, 72 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h index cc9cc62bb1f8..4859bea47a7b 100644 --- a/include/linux/sched/loadavg.h +++ b/include/linux/sched/loadavg.h @@ -37,6 +37,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return newload / FIXED_1; } +extern unsigned long calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n); + #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 54fbdfb2d86c..28a516575c18 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,6 +91,75 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. 
+ */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. @@ -210,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; } -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into -- cgit v1.2.3 From eb414681d5a07d28d2ff90dc05f69ec6b232ebd2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:27 -0700 Subject: psi: pressure stall information for CPU, memory, and IO When systems are overcommitted and resources become contended, it's hard to tell exactly the impact this has on workload productivity, or how close the system is to lockups and OOM kills. In particular, when machines work multiple jobs concurrently, the impact of overcommit in terms of latency and throughput on the individual job can be enormous. In order to maximize hardware utilization without sacrificing individual job health or risk complete machine lockups, this patch implements a way to quantify resource pressure in the system. A kernel built with CONFIG_PSI=y creates files in /proc/pressure/ that expose the percentage of time the system is stalled on CPU, memory, or IO, respectively. Stall states are aggregate versions of the per-task delay accounting delays: cpu: some tasks are runnable but not executing on a CPU memory: tasks are reclaiming, or waiting for swapin or thrashing cache io: tasks are waiting for io completions These percentages of walltime can be thought of as pressure percentages, and they give a general sense of system health and productivity loss incurred by resource overcommit. They can also indicate when the system is approaching lockup scenarios and OOMs. To do this, psi keeps track of the task states associated with each CPU and samples the time they spend in stall states. Every 2 seconds, the samples are averaged across CPUs - weighted by the CPUs' non-idle time to eliminate artifacts from unused CPUs - and translated into percentages of walltime. A running average of those percentages is maintained over 10s, 1m, and 5m periods (similar to the loadaverage). 
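For illustration, a userspace sketch that re-computes the SOME/FULL pressure formulas described in the comment block of kernel/sched/psi.c below, using the two worked examples from that comment (257 number crunchers on 256 CPUs, and 1 of 4 tasks stalled on memory).

        #include <stdio.h>

        static unsigned int min_u(unsigned int a, unsigned int b)
        {
                return a < b ? a : b;
        }

        /*
         * threads = min(nr_nonidle_tasks, nr_cpus)
         * SOME    = min(nr_delayed_tasks, threads) / threads
         * FULL    = (threads - min(nr_running_tasks, threads)) / threads
         */
        static void pressure(unsigned int nr_cpus, unsigned int nr_nonidle,
                             unsigned int nr_running, unsigned int nr_delayed)
        {
                unsigned int threads = min_u(nr_nonidle, nr_cpus);
                double some = (double)min_u(nr_delayed, threads) / threads;
                double full = (double)(threads - min_u(nr_running, threads)) / threads;

                printf("cpus=%u nonidle=%u running=%u delayed=%u -> SOME=%.1f%% FULL=%.1f%%\n",
                       nr_cpus, nr_nonidle, nr_running, nr_delayed,
                       100.0 * some, 100.0 * full);
        }

        int main(void)
        {
                /* 257 number crunchers on 256 CPUs: one waits at any time. */
                pressure(256, 257, 256, 1);

                /* 4 tasks on 4 CPUs, one stalled on memory at any given time. */
                pressure(4, 4, 3, 1);
                return 0;
        }

Both results match the figures given in the psi.c comment: roughly 0.4% SOME / 0% FULL for the first case, and 25% SOME / 25% FULL for the second.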
[hannes@cmpxchg.org: doc fixlet, per Randy] Link: http://lkml.kernel.org/r/20180828205625.GA14030@cmpxchg.org [hannes@cmpxchg.org: code optimization] Link: http://lkml.kernel.org/r/20180907175015.GA8479@cmpxchg.org [hannes@cmpxchg.org: rename psi_clock() to psi_update_work(), per Peter] Link: http://lkml.kernel.org/r/20180907145404.GB11088@cmpxchg.org [hannes@cmpxchg.org: fix build] Link: http://lkml.kernel.org/r/20180913014222.GA2370@cmpxchg.org Link: http://lkml.kernel.org/r/20180828172258.3185-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/psi.txt | 64 ++++ include/linux/psi.h | 28 ++ include/linux/psi_types.h | 92 ++++++ include/linux/sched.h | 10 + init/Kconfig | 15 + kernel/fork.c | 4 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 12 +- kernel/sched/psi.c | 657 +++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 + kernel/sched/stats.h | 86 +++++ mm/compaction.c | 5 + mm/filemap.c | 15 +- mm/page_alloc.c | 9 + mm/vmscan.c | 9 + 15 files changed, 1003 insertions(+), 6 deletions(-) create mode 100644 Documentation/accounting/psi.txt create mode 100644 include/linux/psi.h create mode 100644 include/linux/psi_types.h create mode 100644 kernel/sched/psi.c (limited to 'include/linux') diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt new file mode 100644 index 000000000000..3753a82f1cf5 --- /dev/null +++ b/Documentation/accounting/psi.txt @@ -0,0 +1,64 @@ +================================ +PSI - Pressure Stall Information +================================ + +:Date: April, 2018 +:Author: Johannes Weiner + +When CPU, memory or IO devices are contended, workloads experience +latency spikes, throughput losses, and run the risk of OOM kills. + +Without an accurate measure of such contention, users are forced to +either play it safe and under-utilize their hardware resources, or +roll the dice and frequently suffer the disruptions resulting from +excessive overcommit. + +The psi feature identifies and quantifies the disruptions caused by +such resource crunches and the time impact it has on complex workloads +or even entire systems. + +Having an accurate measure of productivity losses caused by resource +scarcity aids users in sizing workloads to hardware--or provisioning +hardware according to workload demand. + +As psi aggregates this information in realtime, systems can be managed +dynamically using techniques such as load shedding, migrating jobs to +other systems or data centers, or strategically pausing or killing low +priority or restartable batch jobs. + +This allows maximizing hardware utilization without sacrificing +workload health or risking major disruptions such as OOM kills. + +Pressure interface +================== + +Pressure information for each resource is exported through the +respective file in /proc/pressure/ -- cpu, memory, and io. + +The format for CPU is as such: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +and for memory and IO: + +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 + +The "some" line indicates the share of time in which at least some +tasks are stalled on a given resource. 
+ +The "full" line indicates the share of time in which all non-idle +tasks are stalled on a given resource simultaneously. In this state +actual CPU cycles are going to waste, and a workload that spends +extended time in this state is considered to be thrashing. This has +severe impact on performance, and it's useful to distinguish this +situation from a state where some tasks are stalled but the CPU is +still doing productive work. As such, time spent in this subset of the +stall state is tracked separately and exported in the "full" averages. + +The ratios are tracked as recent trends over ten, sixty, and three +hundred second windows, which gives insight into short term events as +well as medium and long term trends. The total absolute stall time is +tracked and exported as well, to allow detection of latency spikes +which wouldn't necessarily make a dent in the time averages, or to +average trends over custom time frames. diff --git a/include/linux/psi.h b/include/linux/psi.h new file mode 100644 index 000000000000..b0daf050de58 --- /dev/null +++ b/include/linux/psi.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_PSI_H +#define _LINUX_PSI_H + +#include +#include + +#ifdef CONFIG_PSI + +extern bool psi_disabled; + +void psi_init(void); + +void psi_task_change(struct task_struct *task, int clear, int set); + +void psi_memstall_tick(struct task_struct *task, int cpu); +void psi_memstall_enter(unsigned long *flags); +void psi_memstall_leave(unsigned long *flags); + +#else /* CONFIG_PSI */ + +static inline void psi_init(void) {} + +static inline void psi_memstall_enter(unsigned long *flags) {} +static inline void psi_memstall_leave(unsigned long *flags) {} + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_H */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h new file mode 100644 index 000000000000..2cf422db5d18 --- /dev/null +++ b/include/linux/psi_types.h @@ -0,0 +1,92 @@ +#ifndef _LINUX_PSI_TYPES_H +#define _LINUX_PSI_TYPES_H + +#include +#include + +#ifdef CONFIG_PSI + +/* Tracked task states */ +enum psi_task_count { + NR_IOWAIT, + NR_MEMSTALL, + NR_RUNNING, + NR_PSI_TASK_COUNTS, +}; + +/* Task state bitmasks */ +#define TSK_IOWAIT (1 << NR_IOWAIT) +#define TSK_MEMSTALL (1 << NR_MEMSTALL) +#define TSK_RUNNING (1 << NR_RUNNING) + +/* Resources that workloads could be stalled on */ +enum psi_res { + PSI_IO, + PSI_MEM, + PSI_CPU, + NR_PSI_RESOURCES, +}; + +/* + * Pressure states for each resource: + * + * SOME: Stalled tasks & working tasks + * FULL: Stalled tasks & no working tasks + */ +enum psi_states { + PSI_IO_SOME, + PSI_IO_FULL, + PSI_MEM_SOME, + PSI_MEM_FULL, + PSI_CPU_SOME, + /* Only per-CPU, to weigh the CPU in the global average: */ + PSI_NONIDLE, + NR_PSI_STATES, +}; + +struct psi_group_cpu { + /* 1st cacheline updated by the scheduler */ + + /* Aggregator needs to know of concurrent changes */ + seqcount_t seq ____cacheline_aligned_in_smp; + + /* States of the tasks belonging to this group */ + unsigned int tasks[NR_PSI_TASK_COUNTS]; + + /* Period time sampling buckets for each state of interest (ns) */ + u32 times[NR_PSI_STATES]; + + /* Time of last task change in this group (rq_clock) */ + u64 state_start; + + /* 2nd cacheline updated by the aggregator */ + + /* Delta detection against the sampling buckets */ + u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp; +}; + +struct psi_group { + /* Protects data updated during an aggregation */ + struct mutex stat_lock; + + /* Per-cpu task state & time tracking */ + struct psi_group_cpu __percpu *pcpu; + + /* 
Periodic aggregation state */ + u64 total_prev[NR_PSI_STATES - 1]; + u64 last_update; + u64 next_update; + struct delayed_work clock_work; + + /* Total stall times and sampled pressure averages */ + u64 total[NR_PSI_STATES - 1]; + unsigned long avg[NR_PSI_STATES - 1][3]; +}; + +#else /* CONFIG_PSI */ + +struct psi_group { }; + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index adfb3f9a7597..b8fcc6b3080c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -706,6 +707,10 @@ struct task_struct { unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_remote_wakeup:1; +#ifdef CONFIG_PSI + unsigned sched_psi_wake_requeue:1; +#endif + /* Force alignment to the next boundary: */ unsigned :0; @@ -965,6 +970,10 @@ struct task_struct { kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; +#ifdef CONFIG_PSI + /* Pressure stall state */ + unsigned int psi_flags; +#endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; @@ -1391,6 +1400,7 @@ extern struct pid *cad_pid; #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ +#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ diff --git a/init/Kconfig b/init/Kconfig index 317d5ccb5191..26e639df5517 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -490,6 +490,21 @@ config TASK_IO_ACCOUNTING Say N if unsure. +config PSI + bool "Pressure stall information tracking" + help + Collect metrics that indicate how overcommitted the CPU, memory, + and IO capacity are in the system. + + If you say Y here, the kernel will create /proc/pressure/ with the + pressure statistics files cpu, memory, and io. These will indicate + the share of walltime in which some or all tasks in the system are + delayed due to contention of the respective resource. + + For more details see Documentation/accounting/psi.txt. + + Say N if unsure. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/fork.c b/kernel/fork.c index 3c719fec46c5..8f82a3bdcb8f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1822,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process( p->default_timer_slack_ns = current->timer_slack_ns; +#ifdef CONFIG_PSI + p->psi_flags = 0; +#endif + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..21fb5a5662b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3efef387797..fd2fce8a001b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + } p->sched_class->enqueue_task(rq, p, flags); } @@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } p->sched_class->dequeue_task(rq, p, flags); } @@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } @@ -3051,6 +3056,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq); rq_unlock(rq, &rf); @@ -6067,6 +6073,8 @@ void __init sched_init(void) init_schedstats(); + psi_init(); + scheduler_running = 1; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 000000000000..595414599b98 --- /dev/null +++ b/kernel/sched/psi.c @@ -0,0 +1,657 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. 
+ * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible exceution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possibe threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. 
+ * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. 
+ */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. 
As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! 
cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + psi_group_change(&psi_system, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(psi_system.pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +static int psi_show(struct seq_file *m, struct psi_group *group, + enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? 
"full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1de189bb9209..618577fc9aa8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED #include +#include struct cfs_rq; struct rt_rq; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8aea199a39b4..4904c4677000 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? 
Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { diff --git a/mm/compaction.c b/mm/compaction.c index faca45ebe62d..7c607479de4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -2068,11 +2069,15 @@ static int kcompactd(void *p) pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { + unsigned long pflags; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); wait_event_freezable(pgdat->kcompactd_wait, kcompactd_work_requested(pgdat)); + psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); } return 0; diff --git a/mm/filemap.c b/mm/filemap.c index 01a841f17bf4..41586009fa42 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -1075,11 +1076,14 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; + unsigned long pflags; int ret = 0; - if (bit_nr == PG_locked && !PageSwapBacked(page) && + if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) { - delayacct_thrashing_start(); + if (!PageSwapBacked(page)) + delayacct_thrashing_start(); + psi_memstall_enter(&pflags); thrashing = true; } @@ -1121,8 +1125,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, finish_wait(q, wait); - if (thrashing) - delayacct_thrashing_end(); + if (thrashing) { + if (!PageSwapBacked(page)) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } /* * A signal could leave PageWaiters set. 
Clearing it here if diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20f25d06c00c..f97b5a1700a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -3549,15 +3550,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned long pflags; unsigned int noreclaim_flag; if (!order) return NULL; + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3756,11 +3762,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; + unsigned long pflags; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; @@ -3772,6 +3780,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); + psi_memstall_leave(&pflags); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 87e9fef341d2..8ea87586925e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -3305,6 +3306,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, { struct zonelist *zonelist; unsigned long nr_reclaimed; + unsigned long pflags; int nid; unsigned int noreclaim_flag; struct scan_control sc = { @@ -3333,9 +3335,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3500,6 +3506,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -3510,6 +3517,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_swap = 1, }; + psi_memstall_enter(&pflags); __fs_reclaim_acquire(); count_vm_event(PAGEOUTRUN); @@ -3611,6 +3619,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); + psi_memstall_leave(&pflags); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller -- cgit v1.2.3 From 2ce7135adc9ad081aa3c49744144376ac74fea60 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:31 -0700 Subject: psi: cgroup support On a system that executes multiple cgrouped jobs and independent workloads, we don't just care about the health of the overall system, but also that of individual jobs, so that we can ensure individual job health, fairness between jobs, or prioritize some jobs over others. This patch implements pressure stall tracking for cgroups. 
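As a rough standalone illustration (an editorial sketch, not part of any patch in this series), the short C program below plugs the two scenarios from the psi.c comment above into the SOME/FULL formulas; all function and variable names are invented for the sketch.

#include <stdio.h>

static double min_d(double a, double b)
{
	return a < b ? a : b;
}

/* Evaluate the multi-CPU model from the psi.c comment:
 * threads = min(nonidle, cpus),
 * SOME = min(delayed / threads, 1),
 * FULL = (threads - min(running, threads)) / threads. */
static void pressure(double nonidle, double running, double delayed, double cpus)
{
	double threads = min_d(nonidle, cpus);
	double some = min_d(delayed / threads, 1.0);
	double full = (threads - min_d(running, threads)) / threads;

	printf("nonidle=%g running=%g delayed=%g cpus=%g -> SOME=%.1f%% FULL=%.1f%%\n",
	       nonidle, running, delayed, cpus, some * 100.0, full * 100.0);
}

int main(void)
{
	pressure(257, 256, 1, 256);	/* 257 number crunchers on 256 CPUs */
	pressure(4, 3, 1, 4);		/* 1 of 4 tasks stalled on memory */
	return 0;
}

Built with any C compiler, it prints roughly 0.4%/0% for the 257-on-256 case and 25%/25% for the one-in-four memory stall, matching the worked examples in the comment.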
In kernels with CONFIG_PSI=y, cgroup2 groups will have cpu.pressure, memory.pressure, and io.pressure files that track aggregate pressure stall times for only the tasks inside the cgroup. Link: http://lkml.kernel.org/r/20180828172258.3185-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/psi.txt | 9 +++ Documentation/admin-guide/cgroup-v2.rst | 18 +++++ include/linux/cgroup-defs.h | 4 ++ include/linux/cgroup.h | 15 ++++ include/linux/psi.h | 25 +++++++ init/Kconfig | 4 ++ kernel/cgroup/cgroup.c | 45 +++++++++++- kernel/sched/psi.c | 118 +++++++++++++++++++++++++++++--- 8 files changed, 228 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index 3753a82f1cf5..b8ca28b60215 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is tracked and exported as well, to allow detection of latency spikes which wouldn't necessarily make a dent in the time averages, or to average trends over custom time frames. + +Cgroup2 interface +================= + +In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem +mounted, pressure stall information is also tracked for tasks grouped +into cgroups. Each subdirectory in the cgroupfs mountpoint contains +cpu.pressure, memory.pressure, and io.pressure files; the format is +the same as the /proc/pressure/ files. diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index caf36105a1c7..8389d6f72a77 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -966,6 +966,12 @@ All time durations are in microseconds. $PERIOD duration. "max" for $MAX indicates no limit. If only one number is written, $MAX is updated. + cpu.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for CPU. See + Documentation/accounting/psi.txt for details. + Memory ------ @@ -1271,6 +1277,12 @@ PAGE_SIZE multiple when read back. higher than the limit for an extended period of time. This reduces the impact on the workload and memory management. + memory.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for memory. See + Documentation/accounting/psi.txt for details. + Usage Guidelines ~~~~~~~~~~~~~~~~ @@ -1408,6 +1420,12 @@ IO Interface Files 8:16 rbps=2097152 wbps=max riops=max wiops=max + io.pressure + A read-only nested-key file which exists on non-root cgroups. + + Shows pressure stall information for IO. See + Documentation/accounting/psi.txt for details. 
+ Writeback ~~~~~~~~~ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 22254c1fe1c5..5e1694fe035b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -436,6 +437,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + /* used to track pressure stalls */ + struct psi_group psi; + /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b622d6608605..9968332cceed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return &cgrp->psi; +} + static inline void cgroup_init_kthreadd(void) { /* @@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) return NULL; } +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + return NULL; +} + +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return NULL; +} + static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { diff --git a/include/linux/psi.h b/include/linux/psi.h index b0daf050de58..8e0725aac0aa 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -4,6 +4,9 @@ #include #include +struct seq_file; +struct css_set; + #ifdef CONFIG_PSI extern bool psi_disabled; @@ -16,6 +19,14 @@ void psi_memstall_tick(struct task_struct *task, int cpu); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); +int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgrp); +void psi_cgroup_free(struct cgroup *cgrp); +void cgroup_move_task(struct task_struct *p, struct css_set *to); +#endif + #else /* CONFIG_PSI */ static inline void psi_init(void) {} @@ -23,6 +34,20 @@ static inline void psi_init(void) {} static inline void psi_memstall_enter(unsigned long *flags) {} static inline void psi_memstall_leave(unsigned long *flags) {} +#ifdef CONFIG_CGROUPS +static inline int psi_cgroup_alloc(struct cgroup *cgrp) +{ + return 0; +} +static inline void psi_cgroup_free(struct cgroup *cgrp) +{ +} +static inline void cgroup_move_task(struct task_struct *p, struct css_set *to) +{ + rcu_assign_pointer(p->cgroups, to); +} +#endif + #endif /* CONFIG_PSI */ #endif /* _LINUX_PSI_H */ diff --git a/init/Kconfig b/init/Kconfig index 26e639df5517..a4112e95724a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -501,6 +501,10 @@ config PSI the share of walltime in which some or all tasks in the system are delayed due to contention of the respective resource. + In kernels with cgroup support, cgroups (cgroup2 only) will + have cpu.pressure, memory.pressure, and io.pressure files, + which aggregate pressure stalls for the grouped tasks only. + For more details see Documentation/accounting/psi.txt. Say N if unsure. 
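To poke at the interface described above, a minimal userspace reader might look like the sketch below (again an editorial illustration, not part of the patch); the cgroup path is an assumption and depends on where cgroup2 is mounted and which groups exist.

#include <stdio.h>

int main(void)
{
	/* System-wide file from the core PSI patch, and a per-cgroup file
	 * added by this patch; the cgroup name "mygroup" is hypothetical. */
	const char *paths[] = {
		"/proc/pressure/memory",
		"/sys/fs/cgroup/mygroup/memory.pressure",
	};
	char line[256];

	for (int i = 0; i < 2; i++) {
		FILE *f = fopen(paths[i], "r");

		if (!f) {
			perror(paths[i]);
			continue;
		}
		printf("%s:\n", paths[i]);
		/* Each file prints "some ..." and, except for CPU, "full ..."
		 * lines with avg10/avg60/avg300 percentages and a total in us. */
		while (fgets(line, sizeof(line), f))
			printf("  %s", line);
		fclose(f);
	}
	return 0;
}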
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..8b79318810ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - rcu_assign_pointer(task->cgroups, to_cset); + cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } @@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; } +#ifdef CONFIG_PSI +static int cgroup_io_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +} +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +} +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +} +#endif + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, +#ifdef CONFIG_PSI + { + .name = "io.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_io_pressure_show, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_memory_pressure_show, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_cpu_pressure_show, + }, +#endif { } /* terminate */ }; @@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); @@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; - ret = cgroup_bpf_inherit(cgrp); + + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_idr_free; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_psi_free: + psi_cgroup_free(cgrp); out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 595414599b98..7cdecfc010af 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->clock_work, PSI_FREQ); } +static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL; if (!task->pid) return; @@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set) task->psi_flags &= ~clear; 
task->psi_flags |= set; - psi_group_change(&psi_system, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); } void psi_memstall_tick(struct task_struct *task, int cpu) { - struct psi_group_cpu *groupc; + struct psi_group *group; + void *iter = NULL; - groupc = per_cpu_ptr(psi_system.pcpu, cpu); - write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); - write_seqcount_end(&groupc->seq); + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } } /** @@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } -static int psi_show(struct seq_file *m, struct psi_group *group, - enum psi_res res) +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; -- cgit v1.2.3 From 68d48e6a2df575b935edd420396c3cb8b6aa6ad3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:39 -0700 Subject: mm: workingset: add vmstat counter for shadow nodes Make it easier to catch bugs in the shadow node shrinker by adding a counter for the shadow nodes in circulation. 
[akpm@linux-foundation.org: assert that irqs are disabled, for __inc_lruvec_page_state()] [akpm@linux-foundation.org: s/WARN_ON_ONCE/VM_WARN_ON_ONCE/, per Johannes] Link: http://lkml.kernel.org/r/20181009184732.762-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/vmstat.c | 1 + mm/workingset.c | 14 ++++++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ba51d5bf7af1..9f0caccd5833 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -161,6 +161,7 @@ enum node_stat_item { NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + WORKINGSET_NODES, WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_RESTORE, diff --git a/mm/vmstat.c b/mm/vmstat.c index d918f6192d15..dab53430f63c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1143,6 +1143,7 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", + "workingset_nodes", "workingset_refault", "workingset_activate", "workingset_restore", diff --git a/mm/workingset.c b/mm/workingset.c index 5a72c9d5e195..7e6ef312cea5 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -377,12 +377,20 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ + VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ + if (node->count && node->count == node->exceptional) { - if (list_empty(&node->private_list)) + if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } else { - if (!list_empty(&node->private_list)) + if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_page_state(virt_to_page(node), + WORKINGSET_NODES); + } } } @@ -473,6 +481,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); + __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES); + spin_unlock(lru_lock); /* -- cgit v1.2.3 From 85cfb245060e45640fa3447f8b0bad5e8bd3bdaf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 26 Oct 2018 15:07:41 -0700 Subject: memcg: remove memcg_kmem_skip_account The flag memcg_kmem_skip_account was added during the era of opt-out kmem accounting. There is no need for such flag in the opt-in world as there aren't any __GFP_ACCOUNT allocations within memcg_create_cache_enqueue(). 
Link: http://lkml.kernel.org/r/20180919004501.178023-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 --- mm/memcontrol.c | 24 +----------------------- 2 files changed, 1 insertion(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b8fcc6b3080c..8f8a5418b627 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -724,9 +724,6 @@ struct task_struct { #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; -#ifdef CONFIG_MEMCG_KMEM - unsigned memcg_kmem_skip_account:1; -#endif #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e9ede617b89..645ede7ad1b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, struct kmem_cache *cachep) { struct memcg_kmem_cache_create_work *cw; @@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, queue_work(memcg_kmem_cache_wq, &cw->work); } -static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, - struct kmem_cache *cachep) -{ - /* - * We need to stop accounting when we kmalloc, because if the - * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_kmem_cache_create will recurse. - * - * However, it is better to enclose the whole function. Depending on - * the debugging options enabled, INIT_WORK(), for instance, can - * trigger an allocation. This too, will make us recurse. Because at - * this point we can't allow ourselves back into memcg_kmem_get_cache, - * the safest choice is to do it like this, wrapping the whole function. - */ - current->memcg_kmem_skip_account = 1; - __memcg_schedule_kmem_cache_create(memcg, cachep); - current->memcg_kmem_skip_account = 0; -} - static inline bool memcg_kmem_bypass(void) { if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) @@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (memcg_kmem_bypass()) return cachep; - if (current->memcg_kmem_skip_account) - return cachep; - memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) -- cgit v1.2.3 From f682a97a00591def7cefbb5003dc04045028e405 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 26 Oct 2018 15:07:45 -0700 Subject: mm: provide kernel parameter to allow disabling page init poisoning Patch series "Address issues slowing persistent memory initialization", v5. The main thing this patch set achieves is that it allows us to initialize each node worth of persistent memory independently. As a result we reduce page init time by about 2 minutes because instead of taking 30 to 40 seconds per node and going through each node one at a time, we process all 4 nodes in parallel in the case of a 12TB persistent memory setup spread evenly over 4 nodes. This patch (of 3): On systems with a large amount of memory it can take a significant amount of time to initialize all of the page structs with the PAGE_POISON_PATTERN value. I have seen it take over 2 minutes to initialize a system with over 12TB of RAM. 
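For readers unfamiliar with the poisoning itself, the userspace analogy below sketches the idea (not kernel code; the struct, the helper names and the 0xAA fill byte are all invented for the example): fill every entry with a known pattern up front, and any entry that is later consumed without having been properly initialized becomes easy to catch.

#include <assert.h>
#include <stdlib.h>
#include <string.h>

struct fake_page {
	unsigned long flags;
	void *mapping;
};

#define POISON_BYTE 0xAA	/* arbitrary pattern for the sketch */

static struct fake_page *alloc_poisoned(size_t n)
{
	struct fake_page *pages = malloc(n * sizeof(*pages));

	if (pages)
		memset(pages, POISON_BYTE, n * sizeof(*pages));
	return pages;
}

static int page_poisoned(const struct fake_page *page)
{
	unsigned long poison;

	memset(&poison, POISON_BYTE, sizeof(poison));
	return page->flags == poison;
}

int main(void)
{
	struct fake_page *pages = alloc_poisoned(4);

	assert(pages && page_poisoned(&pages[0]));	/* untouched entry is detected */

	pages[0].flags = 0;				/* real initialization */
	pages[0].mapping = NULL;
	assert(!page_poisoned(&pages[0]));

	free(pages);
	return 0;
}

The memset over every page struct is exactly the part that becomes expensive at terabyte scale.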
In order to work around the issue I had to disable CONFIG_DEBUG_VM and then the boot time returned to something much more reasonable as the arch_add_memory call completed in milliseconds versus seconds. However in doing that I had to disable all of the other VM debugging on the system. In order to work around a kernel that might have CONFIG_DEBUG_VM enabled on a system that has a large amount of memory I have added a new kernel parameter named "vm_debug" that can be set to "-" in order to disable it. Link: http://lkml.kernel.org/r/20180925201921.3576.84239.stgit@localhost.localdomain Reviewed-by: Pavel Tatashin Signed-off-by: Alexander Duyck Cc: Dave Hansen Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 12 +++++++ include/linux/page-flags.h | 8 +++++ mm/debug.c | 46 +++++++++++++++++++++++++ mm/memblock.c | 5 ++- mm/sparse.c | 4 +-- 5 files changed, 69 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8022d902e770..dcd082576e79 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4839,6 +4839,18 @@ This is actually a boot loader parameter; the value is passed to the kernel using a special protocol. + vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y. + May slow down system boot speed, especially when + enabled on systems with a large amount of memory. + All options are enabled by default, and this + interface is meant to allow for selectively + enabling or disabling specific virtual memory + debugging features. + + Available options are: + P Enable page structure init time poisoning + - Disable all of the above options + vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact size of . This can be used to increase the minimum size (128MB on x86). It can also be used to diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4d99504f6496..934f91ef3f54 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -163,6 +163,14 @@ static inline int PagePoisoned(const struct page *page) return page->flags == PAGE_POISON_PATTERN; } +#ifdef CONFIG_DEBUG_VM +void page_init_poison(struct page *page, size_t size); +#else +static inline void page_init_poison(struct page *page, size_t size) +{ +} +#endif + /* * Page flags policies wrt compound pages * diff --git a/mm/debug.c b/mm/debug.c index bd10aad8539a..cdacba12e09a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" @@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm) ); } +static bool page_init_poisoning __read_mostly = true; + +static int __init setup_vm_debug(char *str) +{ + bool __page_init_poisoning = true; + + /* + * Calling vm_debug with no arguments is equivalent to requesting + * to enable all debugging options we can control. + */ + if (*str++ != '=' || !*str) + goto out; + + __page_init_poisoning = false; + if (*str == '-') + goto out; + + while (*str) { + switch (tolower(*str)) { + case'p': + __page_init_poisoning = true; + break; + default: + pr_err("vm_debug option '%c' unknown. 
skipped\n", + *str); + } + + str++; + } +out: + if (page_init_poisoning && !__page_init_poisoning) + pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n"); + + page_init_poisoning = __page_init_poisoning; + + return 1; +} +__setup("vm_debug", setup_vm_debug); + +void page_init_poison(struct page *page, size_t size) +{ + if (page_init_poisoning) + memset(page, PAGE_POISON_PATTERN, size); +} +EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/memblock.c b/mm/memblock.c index 237944479d25..a85315083b5a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1444,10 +1444,9 @@ void * __init memblock_virt_alloc_try_nid_raw( ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); -#ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, PAGE_POISON_PATTERN, size); -#endif + page_init_poison(ptr, size); + return ptr; } diff --git a/mm/sparse.c b/mm/sparse.c index 10b07eea9a6e..67ad061f7fb8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -696,13 +696,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, goto out; } -#ifdef CONFIG_DEBUG_VM /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); -#endif + page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); section_mark_present(ms); sparse_init_one_section(ms, section_nr, memmap, usemap); -- cgit v1.2.3 From d483da5bc78b86fe4200d2947f193a745f711713 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 26 Oct 2018 15:07:48 -0700 Subject: mm: create non-atomic version of SetPageReserved for init use It doesn't make much sense to use the atomic SetPageReserved at init time when we are using memset to clear the memory and manipulating the page flags via simple "&=" and "|=" operations in __init_single_page. This patch adds a non-atomic version __SetPageReserved that can be used during page init and shows about a 10% improvement in initialization times on the systems I have available for testing. On those systems I saw initialization times drop from around 35 seconds to around 32 seconds to initialize a 3TB block of persistent memory. I believe the main advantage of this is that it allows for more compiler optimization as the __set_bit operation can be reordered whereas the atomic version cannot. I tried adding a bit of documentation based on f1dd2cd13c4 ("mm, memory_hotplug: do not associate hotadded memory to zones until online"). Ideally the reserved flag should be set earlier since there is a brief window where the page is initialization via __init_single_page and we have not set the PG_Reserved flag. I'm leaving that for a future patch set as that will require a more significant refactor. 
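The distinction being relied on here can be shown in plain C11 terms; the sketch below is a userspace analogy with invented names, not kernel code. While a flags word is still private to the thread initializing it, an ordinary read-modify-write is enough and leaves the compiler free to optimize; once the word is visible to other threads, an atomic RMW is required.

#include <stdatomic.h>
#include <stdio.h>

#define MY_RESERVED (1UL << 0)	/* stand-in for a page flag bit */

/* Private to the initializing thread: a plain |= is enough, and the
 * compiler may combine or reorder it with neighbouring stores. */
struct private_page {
	unsigned long flags;
};

/* Potentially visible to other threads: setters need an atomic RMW. */
struct shared_page {
	atomic_ulong flags;
};

int main(void)
{
	struct private_page priv = { 0 };
	struct shared_page shared = { 0 };

	priv.flags |= MY_RESERVED;			/* __SetPageReserved-style */
	atomic_fetch_or(&shared.flags, MY_RESERVED);	/* SetPageReserved-style */

	printf("private=%#lx shared=%#lx\n",
	       priv.flags, (unsigned long)atomic_load(&shared.flags));
	return 0;
}

The plain |= corresponds to the non-atomic __SetPageReserved() added here, the atomic_fetch_or() to the existing SetPageReserved().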
Link: http://lkml.kernel.org/r/20180925202018.3576.11607.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 1 + mm/page_alloc.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 934f91ef3f54..50ce1bddaf56 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -303,6 +303,7 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) + __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eb6c50cc8880..cee1abf85d72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1232,7 +1232,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); - SetPageReserved(page); + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } } @@ -5508,7 +5513,7 @@ not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); if (context == MEMMAP_HOTPLUG) - SetPageReserved(page); + __SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for -- cgit v1.2.3 From 966cf44f637e6aeea7e3d01ba004bf8b5beac78f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 26 Oct 2018 15:07:52 -0700 Subject: mm: defer ZONE_DEVICE page initialization to the point where we init pgmap The ZONE_DEVICE pages were being initialized in two locations. One was with the memory_hotplug lock held and another was outside of that lock. The problem with this is that it was nearly doubling the memory initialization time. Instead of doing this twice, once while holding a global lock and once without, I am opting to defer the initialization to the one outside of the lock. This allows us to avoid serializing the overhead for memory init and we can instead focus on per-node init times. One issue I encountered is that devm_memremap_pages and hmm_devmmem_pages_create were initializing only the pgmap field the same way. One wasn't initializing hmm_data, and the other was initializing it to a poison value. Since this is something that is exposed to the driver in the case of hmm I am opting for a third option and just initializing hmm_data to 0 since this is going to be exposed to unknown third party drivers. 
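The locking pattern being applied is easier to see in isolation. The sketch below is a userspace analogy with invented names (a pthread mutex standing in for the memory hotplug lock), not the actual kernel flow: keep only the cheap registration step under the global lock and run the O(nr_pages) walk afterwards, where separate nodes could do it concurrently.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

struct region {
	unsigned long start_pfn;
	unsigned long nr_pages;
	unsigned char *memmap;	/* stand-in for the page structs */
};

static void add_region(struct region *r)
{
	/* Cheap, serialized step: publish the region. */
	pthread_mutex_lock(&hotplug_lock);
	printf("registered %lu pages at pfn %lu\n", r->nr_pages, r->start_pfn);
	pthread_mutex_unlock(&hotplug_lock);

	/* Expensive step, now outside the global lock: touch every
	 * page exactly once. */
	memset(r->memmap, 0, r->nr_pages);
}

int main(void)
{
	static unsigned char memmap[1 << 20];
	struct region r = {
		.start_pfn = 0,
		.nr_pages = sizeof(memmap),
		.memmap = memmap,
	};

	add_region(&r);
	return 0;
}

In the patch itself, memmap_init_zone_device() plays the role of that second step, called after the hotplug lock has been dropped.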
[alexander.h.duyck@linux.intel.com: fix reference count for pgmap in devm_memremap_pages] Link: http://lkml.kernel.org/r/20181008233404.1909.37302.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/20180925202053.3576.66039.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Tested-by: Dan Williams Cc: Dave Hansen Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ kernel/memremap.c | 25 ++++++--------- mm/hmm.c | 12 ++++--- mm/page_alloc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 108 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 737279bb479c..33228a49d7d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..620fc4d2559a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; + struct dev_pagemap *conflict_pgmap; pgprot_t pgprot = PAGE_KERNEL; + unsigned long pgoff, order; int error, nid, is_ram; - struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -256,19 +256,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. - */ - list_del(&page->lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); devm_add_action(dev, devm_memremap_pages_release, pgmap); diff --git a/mm/hmm.c b/mm/hmm.c index c968e49f7a0c..774d684fa2b4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) resource_size_t key, align_start, align_size, align_end; struct device *device = devmem->device; int ret, nid, is_ram; - unsigned long pfn; align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(devmem->resource->start + @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) align_size >> PAGE_SHIFT, NULL); mem_hotplug_done(); - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. 
+ */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, &devmem->pagemap); - page->pgmap = &devmem->pagemap; - } return 0; error_add_memory: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cee1abf85d72..d73ff2188d72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5465,12 +5465,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; +#ifdef CONFIG_ZONE_DEVICE /* * Honor reservation requested by the driver for this ZONE_DEVICE - * memory + * memory. We limit the total number of pages to initialize to just + * those that might contain the memory mapping. We will defer the + * ZONE_DEVICE page initialization until after we have released + * the hotplug lock. */ - if (altmap && start_pfn == altmap->base_pfn) - start_pfn += altmap->reserve; + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + } +#endif for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* @@ -5537,6 +5548,81 @@ not_early: } } +#ifdef CONFIG_ZONE_DEVICE +void __ref memmap_init_zone_device(struct zone *zone, + unsigned long start_pfn, + unsigned long size, + struct dev_pagemap *pgmap) +{ + unsigned long pfn, end_pfn = start_pfn + size; + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long zone_idx = zone_idx(zone); + unsigned long start = jiffies; + int nid = pgdat->node_id; + + if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) + return; + + /* + * The call to memmap_init_zone should have already taken care + * of the pages reserved for the memmap, so we can just jump to + * the end of that region and start processing the device pages. + */ + if (pgmap->altmap_valid) { + struct vmem_altmap *altmap = &pgmap->altmap; + + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + size = end_pfn - start_pfn; + } + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer and hmm_data. It is a bug if a ZONE_DEVICE + * page is ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->hmm_data = 0; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. 
+ * + * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * because this is done early in sparse_add_one_section + */ + if (!(pfn & (pageblock_nr_pages - 1))) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } + } + + pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), + size, jiffies_to_msecs(jiffies - start)); +} + +#endif static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; -- cgit v1.2.3 From 85a06835f6f1ba79f0f00838ccd5ad840dd1eafb Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:08:50 -0700 Subject: mm: mremap: downgrade mmap_sem to read when shrinking Other than munmap, mremap might be used to shrink memory mapping too. So, it may hold write mmap_sem for long time when shrinking large mapping, as what commit ("mm: mmap: zap pages with read mmap_sem in munmap") described. The mremap() will not manipulate vmas anymore after __do_munmap() call for the mapping shrink use case, so it is safe to downgrade to read mmap_sem. So, the same optimization, which downgrades mmap_sem to read for zapping pages, is also feasible and reasonable to this case. The period of holding exclusive mmap_sem for shrinking large mapping would be reduced significantly with this optimization. MREMAP_FIXED and MREMAP_MAYMOVE are more complicated to adopt this optimization since they need manipulate vmas after do_munmap(), downgrading mmap_sem may create race window. Simple mapping shrink is the low hanging fruit, and it may cover the most cases of unmap with munmap together. [akpm@linux-foundation.org: tweak comment] [yang.shi@linux.alibaba.com: fix unsigned compare against 0 issue] Link: http://lkml.kernel.org/r/1538687672-17795-2-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1538067582-60038-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Matthew Wilcox Cc: Laurent Dufour Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/mmap.c | 4 ++-- mm/mremap.c | 20 ++++++++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 33228a49d7d2..a023c5ce71fa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2306,6 +2306,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); +extern int __do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); diff --git a/mm/mmap.c b/mm/mmap.c index 58e323c92c8e..1bfd12032664 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2687,8 +2687,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. 
* Jeremy Fitzhardinge */ -static int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf, bool downgrade) +int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, bool downgrade) { unsigned long end; struct vm_area_struct *vma, *prev, *last; diff --git a/mm/mremap.c b/mm/mremap.c index a9617e72e6b7..7f9f9180e401 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -521,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); @@ -557,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_munmap does all the needed commit accounting + * __do_munmap does all the needed commit accounting, and + * downgrades mmap_sem to read if so directed. */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); - if (ret && old_len != new_len) + int retval; + + retval = __do_munmap(mm, addr+new_len, old_len - new_len, + &uf_unmap, true); + if (retval < 0 && old_len != new_len) { + ret = retval; goto out; + /* Returning 1 indicates mmap_sem is downgraded to read. */ + } else if (retval == 1) + downgraded = true; ret = addr; goto out; } @@ -627,7 +636,10 @@ out: vm_unacct_memory(charged); locked = 0; } - up_write(¤t->mm->mmap_sem); + if (downgraded) + up_read(¤t->mm->mmap_sem); + else + up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); -- cgit v1.2.3 From cc4b8c794f476076c9ce19f43eb4d98dc4b5e155 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 26 Oct 2018 15:08:57 -0700 Subject: mm: dax: add comment for PFN_SPECIAL The comment for PFN_SPECIAL is missed in pfn_t.h. Add comment to get consistent with other pfn flags. Link: http://lkml.kernel.org/r/1538086549-100536-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: Dan Williams Reviewed-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn_t.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 673546ba7342..7bb77850c65a 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -9,6 +9,8 @@ * PFN_SG_LAST - pfn references a page and is the last scatterlist entry * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver + * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not + * get_user_pages */ #define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) -- cgit v1.2.3 From 1c2d479a119b84feacbe4de782016f1bf1ad16dc Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 26 Oct 2018 15:09:28 -0700 Subject: mm/memcontrol.c: convert mem_cgroup_id::ref to refcount_t type This will allow to use generic refcount_t interfaces to check counters overflow instead of currently existing VM_BUG_ON(). The only difference after the patch is VM_BUG_ON() may cause BUG(), while refcount_t fires with WARN(). 
But this seems not to be significant here, since such the problems are usually caught by syzbot with panic-on-warn enabled. Link: http://lkml.kernel.org/r/153910718919.7006.13400779039257185427.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Andrea Parri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- mm/memcontrol.c | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4399cc3f00e4..7ab2120155a4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -78,7 +78,7 @@ struct mem_cgroup_reclaim_cookie { struct mem_cgroup_id { int id; - atomic_t ref; + refcount_t ref; }; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 645ede7ad1b2..92d38c88250f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4299,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg) static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); - atomic_add(n, &memcg->id.ref); + refcount_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - VM_BUG_ON(atomic_read(&memcg->id.ref) < n); - if (atomic_sub_and_test(n, &memcg->id.ref)) { + if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ @@ -4523,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) } /* Online state pins memcg ID, memcg ID pins CSS */ - atomic_set(&memcg->id.ref, 1); + refcount_set(&memcg->id.ref, 1); css_get(css); return 0; } @@ -6357,7 +6355,7 @@ subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { - while (!atomic_inc_not_zero(&memcg->id.ref)) { + while (!refcount_inc_not_zero(&memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. -- cgit v1.2.3 From 907ec5fca3dc38d37737de826f06f25b063aa08e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 26 Oct 2018 15:10:15 -0700 Subject: mm: zero remaining unavailable struct pages Patch series "mm: Fix for movable_node boot option", v3. This patch series contains a fix for the movable_node boot option issue which was introduced by commit 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"). The commit breaks the option because it changed the memory gap range to reserved memblock. So, the node is marked as Normal zone even if the SRAT has Hot pluggable affinity. First and second patch fix the original issue which the commit tried to fix, then revert the commit. 
This patch (of 3): There is a kernel panic that is triggered when reading /proc/kpageflags on the kernel booted with kernel parameter 'memmap=nn[KMG]!ss[KMG]': BUG: unable to handle kernel paging request at fffffffffffffffe PGD 9b20e067 P4D 9b20e067 PUD 9b210067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 2 PID: 1728 Comm: page-types Not tainted 4.17.0-rc6-mm1-v4.17-rc6-180605-0816-00236-g2dfb086ef02c+ #160 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.fc28 04/01/2014 RIP: 0010:stable_page_flags+0x27/0x3c0 Code: 00 00 00 0f 1f 44 00 00 48 85 ff 0f 84 a0 03 00 00 41 54 55 49 89 fc 53 48 8b 57 08 48 8b 2f 48 8d 42 ff 83 e2 01 48 0f 44 c7 <48> 8b 00 f6 c4 01 0f 84 10 03 00 00 31 db 49 8b 54 24 08 4c 89 e7 RSP: 0018:ffffbbd44111fde0 EFLAGS: 00010202 RAX: fffffffffffffffe RBX: 00007fffffffeff9 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000202 RDI: ffffed1182fff5c0 RBP: ffffffffffffffff R08: 0000000000000001 R09: 0000000000000001 R10: ffffbbd44111fed8 R11: 0000000000000000 R12: ffffed1182fff5c0 R13: 00000000000bffd7 R14: 0000000002fff5c0 R15: ffffbbd44111ff10 FS: 00007efc4335a500(0000) GS:ffff93a5bfc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffffffffffffe CR3: 00000000b2a58000 CR4: 00000000001406e0 Call Trace: kpageflags_read+0xc7/0x120 proc_reg_read+0x3c/0x60 __vfs_read+0x36/0x170 vfs_read+0x89/0x130 ksys_pread64+0x71/0x90 do_syscall_64+0x5b/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7efc42e75e23 Code: 09 00 ba 9f 01 00 00 e8 ab 81 f4 ff 66 2e 0f 1f 84 00 00 00 00 00 90 83 3d 29 0a 2d 00 00 75 13 49 89 ca b8 11 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 db d3 01 00 48 89 04 24 According to kernel bisection, this problem became visible due to commit f7f99100d8d9 which changes how struct pages are initialized. Memblock layout affects the pfn ranges covered by node/zone. Consider that we have a VM with 2 NUMA nodes and each node has 4GB memory, and the default (no memmap= given) memblock layout is like below: MEMBLOCK configuration: memory size = 0x00000001fff75c00 reserved size = 0x000000000300c000 memory.cnt = 0x4 memory[0x0] [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0 memory[0x1] [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0 memory[0x2] [0x0000000100000000-0x000000013fffffff], 0x0000000040000000 bytes on node 0 flags: 0x0 memory[0x3] [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0 ... If you give memmap=1G!4G (so it just covers memory[0x2]), the range [0x100000000-0x13fffffff] is gone: MEMBLOCK configuration: memory size = 0x00000001bff75c00 reserved size = 0x000000000300c000 memory.cnt = 0x3 memory[0x0] [0x0000000000001000-0x000000000009efff], 0x000000000009e000 bytes on node 0 flags: 0x0 memory[0x1] [0x0000000000100000-0x00000000bffd6fff], 0x00000000bfed7000 bytes on node 0 flags: 0x0 memory[0x2] [0x0000000140000000-0x000000023fffffff], 0x0000000100000000 bytes on node 1 flags: 0x0 ... This causes shrinking node 0's pfn range because it is calculated by the address range of memblock.memory. So some of struct pages in the gap range are left uninitialized. We have a function zero_resv_unavail() which does zeroing the struct pages outside memblock.memory, but currently it covers only the reserved unavailable range (i.e. memblock.memory && !memblock.reserved). This patch extends it to cover all unavailable range, which fixes the reported issue. 
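For reference, below is a minimal user-space sketch of the access pattern that trips the bug, modelled on the page-types read shown in the oops above (kpageflags_read() reached through pread()). The pfn range is the gap left by the memmap=1G!4G example layout, and the file name and range constants are illustrative only; this reproducer is not part of the patch. With the fix applied, the struct pages in that gap are zeroed, so the walk completes instead of faulting.

/*
 * kpageflags-scan.c: walk /proc/kpageflags the way page-types does.
 * Needs root; one 64-bit flags word per pfn, indexed at offset pfn * 8.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Illustrative range: pfns of the removed [4G, 5G) block above. */
        uint64_t start_pfn = 0x100000, end_pfn = 0x140000;
        uint64_t pfn, flags;
        int fd = open("/proc/kpageflags", O_RDONLY);

        if (fd < 0)
                return 1;
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags))
                        break;
                if (flags)
                        printf("pfn %llx flags %llx\n",
                               (unsigned long long)pfn,
                               (unsigned long long)flags);
        }
        close(fd);
        return 0;
}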
Link: http://lkml.kernel.org/r/20181002143821.5112-2-msys.mizuma@gmail.com Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") Signed-off-by: Naoya Horiguchi Signed-off-by-by: Masayoshi Mizuma Tested-by: Oscar Salvador Tested-by: Masayoshi Mizuma Reviewed-by: Pavel Tatashin Cc: Ingo Molnar Cc: Michal Hocko Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 15 --------------- mm/page_alloc.c | 36 +++++++++++++++++++++++++----------- 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 516920549378..2acdd046df2d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) -/** - * for_each_resv_unavail_range - iterate through reserved and unavailable memory - * @i: u64 used as loop variable - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * - * Walks over unavailable but reserved (reserved && !memory) areas of memblock. - * Available as soon as memblock is initialized. - * Note: because this memory does not belong to any physical node, flags and - * nid arguments do not make sense and thus not exported as arguments. - */ -#define for_each_resv_unavail_range(i, p_start, p_end) \ - for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ - NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) - static inline void memblock_set_region_flags(struct memblock_region *r, enum memblock_flags flags) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c26d3152f9ba..6d863c5afa08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6515,29 +6515,42 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, * struct pages which are reserved in memblock allocator and their fields * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. + * + * This function also addresses a similar issue where struct pages are left + * uninitialized because the physical address range is not covered by + * memblock.memory or memblock.reserved. That could happen when memblock + * layout is manually configured via memmap=. */ void __init zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; u64 i, pgcnt; + phys_addr_t next = 0; /* - * Loop through ranges that are reserved, but do not have reported - * physical memory backing. + * Loop through unavailable ranges not covered by memblock.memory. 
*/ pgcnt = 0; - for_each_resv_unavail_range(i, &start, &end) { - for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; + for_each_mem_range(i, &memblock.memory, NULL, + NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + if (next < start) { + for (pfn = PFN_DOWN(next); pfn < PFN_UP(start); pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + continue; + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; } - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; } + next = end; } + for (pfn = PFN_DOWN(next); pfn < max_pfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + continue; + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + /* * Struct pages that do not have backing memory. This could be because @@ -6547,7 +6560,8 @@ void __init zero_resv_unavail(void) * this code can be removed. */ if (pgcnt) - pr_info("Reserved but unavailable: %lld pages", pgcnt); + pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); + } #endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ -- cgit v1.2.3 From df06b37ffe5a442503b7095b77b0a970df515459 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 26 Oct 2018 15:10:28 -0700 Subject: mm/gup: cache dev_pagemap while pinning pages Getting pages from ZONE_DEVICE memory needs to check the backing device's live-ness, which is tracked in the device's dev_pagemap metadata. This metadata is stored in a radix tree and looking it up adds measurable software overhead. This patch avoids repeating this relatively costly operation when dev_pagemap is used by caching the last dev_pagemap while getting user pages. The gup_benchmark kernel self test reports this reduces time to get user pages to as low as 1/3 of the previous time. Link: http://lkml.kernel.org/r/20181012173040.15669-1-keith.busch@intel.com Signed-off-by: Keith Busch Reviewed-by: Dan Williams Acked-by: Kirill A. 
Shutemov Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++-- include/linux/mm.h | 12 +----- mm/gup.c | 110 ++++++++++++++++++++++++++++-------------------- mm/huge_memory.c | 16 +++---- mm/nommu.c | 6 +-- 5 files changed, 79 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fdcb45999b26..4663ee96cf59 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page) } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags); + pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags); + pud_t *pud, int flags, struct dev_pagemap **pgmap); extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); @@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm) } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags) + unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, - unsigned long addr, pud_t *pud, int flags) + unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } diff --git a/include/linux/mm.h b/include/linux/mm.h index a023c5ce71fa..1e52b8fd1685 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2536,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err) return VM_FAULT_SIGBUS; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags, - unsigned int *page_mask); - -static inline struct page *follow_page(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags) -{ - unsigned int unused_page_mask; - return follow_page_mask(vma, address, foll_flags, &unused_page_mask); -} +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags); #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ diff --git a/mm/gup.c b/mm/gup.c index 08eb350e0f35..841d7ef53591 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -20,6 +20,11 @@ #include "internal.h" +struct follow_page_context { + struct dev_pagemap *pgmap; + unsigned int page_mask; +}; + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags) + unsigned long address, pmd_t *pmd, unsigned int flags, + struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -116,8 +121,8 @@ retry: * Only return device mapping pages in the FOLL_GET case since * they are only valid while holding the pgmap reference. 
*/ - pgmap = get_dev_pagemap(pte_pfn(pte), NULL); - if (pgmap) + *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); + if (*pgmap) page = pte_page(pte); else goto no_page; @@ -152,15 +157,8 @@ retry: goto retry; } - if (flags & FOLL_GET) { + if (flags & FOLL_GET) get_page(page); - - /* drop the pgmap reference now that we hold the page */ - if (pgmap) { - put_dev_pagemap(pgmap); - pgmap = NULL; - } - } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -210,7 +208,8 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -258,13 +257,13 @@ retry: } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags); + page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(pmdval))) - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); @@ -284,7 +283,7 @@ retry_locked: } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT) { int ret; @@ -307,18 +306,18 @@ retry_locked: } return ret ? ERR_PTR(ret) : - follow_page_pte(vma, address, pmd, flags); + follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; + ctx->page_mask = HPAGE_PMD_NR - 1; return page; } - static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { pud_t *pud; spinlock_t *ptl; @@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); + page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; @@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); - return follow_pmd_mask(vma, address, pud, flags, page_mask); + return follow_pmd_mask(vma, address, pud, flags, ctx); } - static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, - unsigned int flags, unsigned int *page_mask) + unsigned int flags, + struct follow_page_context *ctx) { p4d_t *p4d; struct page *page; @@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - return follow_pud_mask(vma, address, p4d, flags, page_mask); + return follow_pud_mask(vma, address, p4d, flags, ctx); } /** @@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, */ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - unsigned int *page_mask) + struct follow_page_context *ctx) { pgd_t *pgd; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; + ctx->page_mask = 0; /* make this handle hugepd */ page = follow_huge_addr(mm, address, 
flags & FOLL_WRITE); @@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return no_page_table(vma, flags); } - return follow_p4d_mask(vma, address, pgd, flags, page_mask); + return follow_p4d_mask(vma, address, pgd, flags, ctx); +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) +{ + struct follow_page_context ctx = { NULL }; + struct page *page; + + page = follow_page_mask(vma, address, foll_flags, &ctx); + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { - long i = 0; - unsigned int page_mask; + long ret = 0, i = 0; struct vm_area_struct *vma = NULL; + struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; @@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pages ? &pages[i] : NULL); if (ret) return i ? : ret; - page_mask = 0; + ctx.page_mask = 0; goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) - return i ? : -EFAULT; + if (!vma || check_vma_flags(vma, gup_flags)) { + ret = -EFAULT; + goto out; + } if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -709,23 +722,26 @@ retry: * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; + if (unlikely(fatal_signal_pending(current))) { + ret = -ERESTARTSYS; + goto out; + } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &page_mask); + + page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { - int ret; ret = faultin_page(tsk, vma, start, &foll_flags, nonblocking); switch (ret) { case 0: goto retry; + case -EBUSY: + ret = 0; + /* FALLTHRU */ case -EFAULT: case -ENOMEM: case -EHWPOISON: - return i ? i : ret; - case -EBUSY: - return i; + goto out; case -ENOENT: goto next_page; } @@ -737,27 +753,31 @@ retry: */ goto next_page; } else if (IS_ERR(page)) { - return i ? i : PTR_ERR(page); + ret = PTR_ERR(page); + goto out; } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); - page_mask = 0; + ctx.page_mask = 0; } next_page: if (vmas) { vmas[i] = vma; - page_mask = 0; + ctx.page_mask = 0; } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); - return i; +out: + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return i ? 
i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8ea1b36bd452..25c7d7509cf4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } @@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; - struct dev_pagemap *pgmap; struct page *page; assert_spin_locked(pud_lockptr(mm, pud)); @@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - pgmap = get_dev_pagemap(pfn, NULL); - if (!pgmap) + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); get_page(page); - put_dev_pagemap(pgmap); return page; } diff --git a/mm/nommu.c b/mm/nommu.c index e4aac33216ae..749276beb109 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int foll_flags) { - *page_mask = 0; return NULL; } -- cgit v1.2.3 From bc4ae27d817a4e92071ef67cb6368120cfabe7ec Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 26 Oct 2018 15:10:51 -0700 Subject: mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS The SWP_FILE flag serves two purposes: to make swap_{read,write}page() go through the filesystem, and to make swapoff() call ->swap_deactivate(). For Btrfs, we want the latter but not the former, so split this flag into two. This makes us always call ->swap_deactivate() if ->swap_activate() succeeded, not just if it didn't add any swap extents itself. This also resolves the issue of the very misleading name of SWP_FILE, which is only used for swap files over NFS. 
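To make the split concrete, here is a small stand-alone model of the flag decision that the setup_swap_extents() hunk below introduces. It is compilable on its own and is not kernel code: the flag values are copied from the new include/linux/swap.h definitions, flags_after_activate() is a hypothetical helper standing in for the relevant part of setup_swap_extents(), and the idea that a filesystem doing its own extent setup returns a positive count is an assumption about future users such as btrfs, not something this patch adds.

/* Model of the SWP_ACTIVATED / SWP_FS decision after this patch. */
#include <stdio.h>

enum {
        SWP_ACTIVATED = 1 << 7, /* set after swap_activate success */
        SWP_FS        = 1 << 8, /* swap file goes through fs */
};

/* ret models the return value of mapping->a_ops->swap_activate(). */
static unsigned int flags_after_activate(int ret)
{
        unsigned int flags = 0;

        if (ret >= 0)
                flags |= SWP_ACTIVATED; /* swapoff() will call ->swap_deactivate() */
        if (ret == 0)
                flags |= SWP_FS;        /* generic extents; swap I/O goes through the fs */
        return flags;
}

int main(void)
{
        /* NFS-style: ->swap_activate() returns 0, I/O is routed through the fs. */
        printf("fs-routed:   %#x\n", flags_after_activate(0));
        /* Assumed btrfs-style: activate builds its own extents, returns > 0. */
        printf("self-mapped: %#x\n", flags_after_activate(1));
        /* Failure: neither flag is set and swapon fails as before. */
        printf("error:       %#x\n", flags_after_activate(-1));
        return 0;
}

Teardown follows from SWP_ACTIVATED alone: ->swap_deactivate() is now called whenever activation succeeded, as the destroy_swap_extents() hunk below shows.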
Link: http://lkml.kernel.org/r/6d63d8668c4287a4f6d203d65696e96f80abdfc7.1536704650.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Nikolay Borisov Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 13 +++++++------ mm/page_io.c | 6 +++--- mm/swapfile.c | 13 ++++++++----- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index b93740d72e78..38195f5c96b1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -167,13 +167,14 @@ enum { SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ - SWP_FILE = (1 << 7), /* set after swap_activate success */ - SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ - SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ - SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ - SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ + SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ + SWP_FS = (1 << 8), /* swap file goes through fs */ + SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/page_io.c b/mm/page_io.c index 573d3663d846..a451ffa9491c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_FS) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2681e50592c5..f0c7e4c11bab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1003,7 +1003,7 @@ start_over: goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FILE)) + if (!(si->flags & SWP_FS)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, @@ -2299,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis) kfree(se); } - if (sis->flags & SWP_FILE) { + if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - sis->flags &= ~SWP_FILE; - mapping->a_ops->swap_deactivate(swap_file); + sis->flags &= ~SWP_ACTIVATED; + if (mapping->a_ops->swap_deactivate) + mapping->a_ops->swap_deactivate(swap_file); } 
} @@ -2400,8 +2401,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (ret >= 0) + sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FILE; + sis->flags |= SWP_FS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } -- cgit v1.2.3