From 92be775a3db343889ab159c44d55315c5ee0be4e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Aug 2018 21:51:53 -0700 Subject: mm: struct shrink_control: keep int fields together Patch series "Reorderings in struct shrinker and struct shrink_control". These structures are used intensively during reclaim and displace other data in the cache, so there is no reason to leave their int fields ungrouped. This patch (of 2): gfp_t is of unsigned type, so let's move nid to keep them together. Link: http://lkml.kernel.org/r/153199747930.21131.861043607301997810.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Tetsuo Handa Cc: Chris Wilson Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index b154fd2b084c..d58aaaed34a4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -12,6 +12,9 @@ struct shrink_control { gfp_t gfp_mask; + /* current node being shrunk (for NUMA aware shrinkers) */ + int nid; + /* * How many objects scan_objects should scan and try to reclaim. * This is reset before every call, so it is safe for callees @@ -26,9 +29,6 @@ struct shrink_control { */ unsigned long nr_scanned; - /* current node being shrunk (for NUMA aware shrinkers) */ - int nid; - /* current memcg being shrunk (for memcg aware shrinkers) */ struct mem_cgroup *memcg; }; -- cgit v1.2.3 From e50ef89b0f58a2cb0e7d496a97b851e0dced62a6 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Aug 2018 21:51:57 -0700 Subject: mm: struct shrinker: make flags of unsigned type Currently, there are only two flags, so unsigned is more than enough. Also, move int seeks to keep these fields together. Link: http://lkml.kernel.org/r/153199748720.21131.6476256940113102483.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Tetsuo Handa Cc: Chris Wilson Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index d58aaaed34a4..9443cafd1969 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -63,9 +63,9 @@ struct shrinker { unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); - int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ - unsigned long flags; + int seeks; /* seeks to recreate an obj */ + unsigned flags; /* These are for internal use */ struct list_head list; -- cgit v1.2.3 From 5d5e8f19544a35ae1609bd96ae6b28c9fcd1baf6 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:20 -0700 Subject: mm, swap, get_swap_pages: use entry_size instead of cluster in parameter As suggested by Matthew Wilcox, it is better to use "int entry_size" instead of "bool cluster" as the parameter that specifies whether to operate on huge or normal swap entries, because this improves the flexibility to support other swap entry sizes. Dave Hansen thinks that this improves code readability too. So in this patch, the "bool cluster" parameter of get_swap_pages() is replaced by "int entry_size", and the nr_swap_entries() trick is used to reduce the binary size when !CONFIG_TRANSPARENT_HUGE_PAGE. 
text data bss dec hex filename base 24215 2028 340 26583 67d7 mm/swapfile.o head 24123 2004 340 26467 6763 mm/swapfile.o Link: http://lkml.kernel.org/r/20180720071845.17920-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Matthew Wilcox Acked-by: Dave Hansen Cc: Daniel Jordan Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/swap_slots.c | 8 ++++---- mm/swapfile.c | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1a8bd05a335e..8e2c11e692ba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,7 +447,7 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 008ccb22fee6..63a7b4563a57 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, - cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, + cache->slots, 1); return cache->nr; } @@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, true, &entry); + get_swap_pages(1, &entry, HPAGE_PMD_NR); goto out; } @@ -350,7 +350,7 @@ repeat: goto out; } - get_swap_pages(1, false, &entry); + get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(page, entry)) { put_swap_page(page, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 043645e7f0b5..b30dd0642ccf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -940,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) { - unsigned long nr_pages = cluster ? 
SWAPFILE_CLUSTER : 1; + unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && cluster); + WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; + avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) goto noswap; @@ -961,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); + atomic_long_sub(n_goal * size, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -988,14 +988,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (cluster) { + if (size == SWAPFILE_CLUSTER) { if (!(si->flags & SWP_FILE)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret || cluster) + if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -1021,7 +1021,7 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long)(n_goal - n_ret) * nr_pages, + atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; -- cgit v1.2.3 From 93065ac753e4443840a057bfef4be71ec766fde9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2018 21:52:33 -0700 Subject: mm, oom: distinguish blockable mode for mmu notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several blockable mmu notifiers which might sleep in mmu_notifier_invalidate_range_start and that is a problem for the oom_reaper because it needs to guarantee a forward progress so it cannot depend on any sleepable locks. Currently we simply back off and mark an oom victim with blockable mmu notifiers as done after a short sleep. That can result in selecting a new oom victim prematurely because the previous one still hasn't torn its memory down yet. We can do much better though. Even if mmu notifiers use sleepable locks there is no reason to automatically assume those locks are held. Moreover majority of notifiers only care about a portion of the address space and there is absolutely zero reason to fail when we are unmapping an unrelated range. Many notifiers do really block and wait for HW which is harder to handle and we have to bail out though. This patch handles the low hanging fruit. __mmu_notifier_invalidate_range_start gets a blockable flag and callbacks are not allowed to sleep if the flag is set to false. This is achieved by using trylock instead of the sleepable lock for most callbacks and continue as long as we do not block down the call chain. I think we can improve that even further because there is a common pattern to do a range lookup first and then do something about that. The first part can be done without a sleeping lock in most cases AFAICS. The oom_reaper end then simply retries if there is at least one notifier which couldn't make any progress in !blockable mode. A retry loop is already implemented to wait for the mmap_sem and this is basically the same thing. The simplest way for driver developers to test this code path is to wrap userspace code which uses these notifiers into a memcg and set the hard limit to hit the oom. This can be done e.g. 
after the test faults in all the mmu notifier managed memory and set the hard limit to something really small. Then we are looking for a proper process tear down. [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: minor code simplification] Link: http://lkml.kernel.org/r/20180716115058.5559-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Christian König # AMD notifiers Acked-by: Leon Romanovsky # mlx and umem_odp Reported-by: David Rientjes Cc: "David (ChunMing) Zhou" Cc: Paolo Bonzini Cc: Alex Deucher Cc: David Airlie Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Doug Ledford Cc: Jason Gunthorpe Cc: Mike Marciniszyn Cc: Dennis Dalessandro Cc: Sudeep Dutt Cc: Ashutosh Dixit Cc: Dimitri Sivanich Cc: Boris Ostrovsky Cc: Juergen Gross Cc: "Jérôme Glisse" Cc: Andrea Arcangeli Cc: Felix Kuehling Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/x86.c | 7 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 43 ++++++++++++++++++++++++++------ drivers/gpu/drm/i915/i915_gem_userptr.c | 13 +++++++--- drivers/gpu/drm/radeon/radeon_mn.c | 22 ++++++++++++++--- drivers/infiniband/core/umem_odp.c | 33 +++++++++++++++++++------ drivers/infiniband/hw/hfi1/mmu_rb.c | 11 ++++++--- drivers/infiniband/hw/mlx5/odp.c | 2 +- drivers/misc/mic/scif/scif_dma.c | 7 ++++-- drivers/misc/sgi-gru/grutlbpurge.c | 7 ++++-- drivers/xen/gntdev.c | 44 ++++++++++++++++++++++++++------- include/linux/kvm_host.h | 4 +-- include/linux/mmu_notifier.h | 33 +++++++++++++++++++------ include/linux/oom.h | 2 +- include/rdma/ib_umem_odp.h | 3 ++- mm/hmm.c | 7 ++++-- mm/mmap.c | 2 +- mm/mmu_notifier.c | 19 +++++++++++--- mm/oom_kill.c | 29 +++++++++++----------- virt/kvm/kvm_main.c | 15 +++++++---- 19 files changed, 223 insertions(+), 80 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f7dff0457846..4a74a7cf0a8b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7305,8 +7305,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, + bool blockable) { unsigned long apic_address; @@ -7317,6 +7318,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + + return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index a365ea2383d1..e55508b39496 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn) * * @amn: our notifier */ -static void amdgpu_mn_read_lock(struct amdgpu_mn *amn) +static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable) { - mutex_lock(&amn->read_lock); + if (blockable) + mutex_lock(&amn->read_lock); + else if (!mutex_trylock(&amn->read_lock)) + return -EAGAIN; + if (atomic_inc_return(&amn->recursion) == 1) down_read_non_owner(&amn->lock); mutex_unlock(&amn->read_lock); + + return 0; } /** @@ -239,10 +245,11 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * Block for operations on BOs to finish and mark pages 
as accessed and * potentially dirty. */ -static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, +static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; @@ -250,17 +257,28 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end -= 1; - amdgpu_mn_read_lock(amn); + /* TODO we should be able to split locking for interval tree and + * amdgpu_mn_invalidate_node + */ + if (amdgpu_mn_read_lock(amn, blockable)) + return -EAGAIN; it = interval_tree_iter_first(&amn->objects, start, end); while (it) { struct amdgpu_mn_node *node; + if (!blockable) { + amdgpu_mn_read_unlock(amn); + return -EAGAIN; + } + node = container_of(it, struct amdgpu_mn_node, it); it = interval_tree_iter_next(it, start, end); amdgpu_mn_invalidate_node(node, start, end); } + + return 0; } /** @@ -275,10 +293,11 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, * necessitates evicting all user-mode queues of the process. The BOs * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ -static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, +static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; @@ -286,13 +305,19 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end -= 1; - amdgpu_mn_read_lock(amn); + if (amdgpu_mn_read_lock(amn, blockable)) + return -EAGAIN; it = interval_tree_iter_first(&amn->objects, start, end); while (it) { struct amdgpu_mn_node *node; struct amdgpu_bo *bo; + if (!blockable) { + amdgpu_mn_read_unlock(amn); + return -EAGAIN; + } + node = container_of(it, struct amdgpu_mn_node, it); it = interval_tree_iter_next(it, start, end); @@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, amdgpu_amdkfd_evict_userptr(mem, mm); } } + + return 0; } /** diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index dcd6e230d16a..2c9b284036d1 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo) mo->attached = false; } -static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, +static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct i915_mmu_notifier *mn = container_of(_mn, struct i915_mmu_notifier, mn); @@ -124,7 +125,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, LIST_HEAD(cancelled); if (RB_EMPTY_ROOT(&mn->objects.rb_root)) - return; + return 0; /* interval ranges are inclusive, but invalidate range is exclusive */ end--; @@ -132,6 +133,10 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, spin_lock(&mn->lock); it = interval_tree_iter_first(&mn->objects, start, end); while (it) { + if (!blockable) { + spin_unlock(&mn->lock); + return -EAGAIN; 
+ } /* The mmu_object is released late when destroying the * GEM object so it is entirely possible to gain a * reference on an object in the process of being freed @@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, if (!list_empty(&cancelled)) flush_workqueue(mn->wq); + + return 0; } static const struct mmu_notifier_ops i915_gem_userptr_notifier = { diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index abd24975c9b1..f8b35df44c60 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -118,19 +118,27 @@ static void radeon_mn_release(struct mmu_notifier *mn, * We block for all BOs between start and end to be idle and * unmap them by move them into system domain again. */ -static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, +static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); struct ttm_operation_ctx ctx = { false, false }; struct interval_tree_node *it; + int ret = 0; /* notification is exclusive, but interval is inclusive */ end -= 1; - mutex_lock(&rmn->lock); + /* TODO we should be able to split locking for interval tree and + * the tear down. + */ + if (blockable) + mutex_lock(&rmn->lock); + else if (!mutex_trylock(&rmn->lock)) + return -EAGAIN; it = interval_tree_iter_first(&rmn->objects, start, end); while (it) { @@ -138,6 +146,11 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; + if (!blockable) { + ret = -EAGAIN; + goto out_unlock; + } + node = container_of(it, struct radeon_mn_node, it); it = interval_tree_iter_next(it, start, end); @@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, } } +out_unlock: mutex_unlock(&rmn->lock); + + return ret; } static const struct mmu_notifier_ops radeon_mn_ops = { diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 182436b92ba9..6ec748eccff7 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, + true, NULL); up_read(&context->umem_rwsem); } @@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, return 0; } -static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + int ret; if (!context->invalidate_range) - return; + return 0; + + if (blockable) + down_read(&context->umem_rwsem); + else if (!down_read_trylock(&context->umem_rwsem)) + return -EAGAIN; ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end, - invalidate_range_start_trampoline, NULL); + invalidate_range_start_trampoline, + blockable, NULL); up_read(&context->umem_rwsem); + + return ret; } static int invalidate_range_end_trampoline(struct ib_umem *item, u64 
start, @@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, if (!context->invalidate_range) return; + /* + * TODO: we currently bail out if there is any sleepable work to be done + * in ib_umem_notifier_invalidate_range_start so we shouldn't really block + * here. But this is ugly and fragile. + */ down_read(&context->umem_rwsem); rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end, - invalidate_range_end_trampoline, NULL); + invalidate_range_end_trampoline, true, NULL); up_read(&context->umem_rwsem); ib_ucontext_notifier_end_account(context); } @@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 last, umem_call_back cb, + bool blockable, void *cookie) { int ret_val = 0; @@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; node = next) { + /* TODO move the blockable decision up to the callback */ + if (!blockable) + return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); ret_val = cb(umem->umem, start, last, cookie) || ret_val; diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 70aceefe14d5..e1c7996c018e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -67,9 +67,9 @@ struct mmu_rb_handler { static unsigned long mmu_node_start(struct mmu_rb_node *); static unsigned long mmu_node_last(struct mmu_rb_node *); -static void mmu_notifier_range_start(struct mmu_notifier *, +static int mmu_notifier_range_start(struct mmu_notifier *, struct mm_struct *, - unsigned long, unsigned long); + unsigned long, unsigned long, bool); static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, unsigned long, unsigned long); static void do_remove(struct mmu_rb_handler *handler, @@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, handler->ops->remove(handler->ops_arg, node); } -static void mmu_notifier_range_start(struct mmu_notifier *mn, +static int mmu_notifier_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct mmu_rb_handler *handler = container_of(mn, struct mmu_rb_handler, mn); @@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn, if (added) queue_work(handler->wq, &handler->del_work); + + return 0; } /* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f1a87a690a4c..d216e0d2921d 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) down_read(&ctx->umem_rwsem); rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, - mr_leaf_free, imr); + mr_leaf_free, true, imr); up_read(&ctx->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c index 63d6246d6dff..6369aeaa7056 100644 --- a/drivers/misc/mic/scif/scif_dma.c +++ b/drivers/misc/mic/scif/scif_dma.c @@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn, schedule_work(&scif_info.misc_work); } -static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int scif_mmu_notifier_invalidate_range_start(struct 
mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct scif_mmu_notif *mmn; mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); scif_rma_destroy_tcw(mmn, start, end - start); + + return 0; } static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index a3454eb56fbf..be28f05bfafa 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru) /* * MMUOPS notifier callout functions */ -static void gru_invalidate_range_start(struct mmu_notifier *mn, +static int gru_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool blockable) { struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, ms_notifier); @@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn, gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms, start, end, atomic_read(&gms->ms_range_active)); gru_flush_tlb_range(gms, start, end - start); + + return 0; } static void gru_invalidate_range_end(struct mmu_notifier *mn, diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index c866a62f766d..57390c7666e5 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -479,18 +479,25 @@ static const struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ +static bool in_range(struct gntdev_grant_map *map, + unsigned long start, unsigned long end) +{ + if (!map->vma) + return false; + if (map->vma->vm_start >= end) + return false; + if (map->vma->vm_end <= start) + return false; + + return true; +} + static void unmap_if_in_range(struct gntdev_grant_map *map, unsigned long start, unsigned long end) { unsigned long mstart, mend; int err; - if (!map->vma) - return; - if (map->vma->vm_start >= end) - return; - if (map->vma->vm_end <= start) - return; mstart = max(start, map->vma->vm_start); mend = min(end, map->vma->vm_end); pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", @@ -503,21 +510,40 @@ static void unmap_if_in_range(struct gntdev_grant_map *map, WARN_ON(err); } -static void mn_invl_range_start(struct mmu_notifier *mn, +static int mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool blockable) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct gntdev_grant_map *map; + int ret = 0; + + /* TODO do we really need a mutex here? 
*/ + if (blockable) + mutex_lock(&priv->lock); + else if (!mutex_trylock(&priv->lock)) + return -EAGAIN; - mutex_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { + if (in_range(map, start, end)) { + ret = -EAGAIN; + goto out_unlock; + } unmap_if_in_range(map, start, end); } list_for_each_entry(map, &priv->freeable_maps, next) { + if (in_range(map, start, end)) { + ret = -EAGAIN; + goto out_unlock; + } unmap_if_in_range(map, start, end); } + +out_unlock: mutex_unlock(&priv->lock); + + return ret; } static void mn_release(struct mmu_notifier *mn, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c7362dd2faa..0205aee44ded 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1289,8 +1289,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, bool blockable); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 392e6af82701..133ba78820ee 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -151,13 +151,15 @@ struct mmu_notifier_ops { * address space but may still be referenced by sptes until * the last refcount is dropped. * - * If both of these callbacks cannot block, and invalidate_range - * cannot block, mmu_notifier_ops.flags should have - * MMU_INVALIDATE_DOES_NOT_BLOCK set. + * If blockable argument is set to false then the callback cannot + * sleep and has to return with -EAGAIN. 0 should be returned + * otherwise. 
+ * */ - void (*invalidate_range_start)(struct mmu_notifier *mn, + int (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool blockable); void (*invalidate_range_end)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -229,8 +231,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); -extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end, bool only_end); @@ -281,7 +284,15 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_start(mm, start, end); + __mmu_notifier_invalidate_range_start(mm, start, end, true); +} + +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_invalidate_range_start(mm, start, end, false); + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -461,6 +472,12 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, { } +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + return 0; +} + static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/include/linux/oom.h b/include/linux/oom.h index 6adac113e96d..92f70e4c6252 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -95,7 +95,7 @@ static inline int check_stable_address_space(struct mm_struct *mm) return 0; } -void __oom_reap_task_mm(struct mm_struct *mm); +bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 6a17f856f841..381cdf5a9bd1 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -119,7 +119,8 @@ typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, */ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 end, - umem_call_back cb, void *cookie); + umem_call_back cb, + bool blockable, void *cookie); /* * Find first region intersecting with address range. 
diff --git a/mm/hmm.c b/mm/hmm.c index 76e7a058b32f..0b0554591610 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) up_write(&hmm->mirrors_sem); } -static void hmm_invalidate_range_start(struct mmu_notifier *mn, +static int hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); atomic_inc(&hmm->sequence); + + return 0; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, diff --git a/mm/mmap.c b/mm/mmap.c index 8d6449e74431..bb2a7e097c7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3064,7 +3064,7 @@ void exit_mmap(struct mm_struct *mm) * reliably test it. */ mutex_lock(&oom_lock); - __oom_reap_task_mm(mm); + (void)__oom_reap_task_mm(mm); mutex_unlock(&oom_lock); set_bit(MMF_OOM_SKIP, &mm->flags); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index eff6b88a993f..82bb1a939c0e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable) { struct mmu_notifier *mn; + int ret = 0; int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_range_start) - mn->ops->invalidate_range_start(mn, mm, start, end); + if (mn->ops->invalidate_range_start) { + int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); + if (_ret) { + pr_info("%pS callback failed with %d in %sblockable context.\n", + mn->ops->invalidate_range_start, _ret, + !blockable ? "non-" : ""); + ret = _ret; + } + } } srcu_read_unlock(&srcu, id); + + return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 412f43453a68..be31a1e0fe78 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -487,9 +487,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -void __oom_reap_task_mm(struct mm_struct *mm) +bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + bool ret = true; /* * Tell all users of get_user/copy_from_user etc... that the content @@ -519,12 +520,17 @@ void __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); + if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { + ret = false; + continue; + } unmap_page_range(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } } + + return ret; } static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) @@ -553,18 +559,6 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto unlock_oom; } - /* - * If the mm has invalidate_{start,end}() notifiers that could block, - * sleep to give the oom victim some more time. 
- * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time - */ - if (mm_has_blockable_invalidate_notifiers(mm)) { - up_read(&mm->mmap_sem); - schedule_timeout_idle(HZ); - goto unlock_oom; - } - /* * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't * work on the mm anymore. The check for MMF_OOM_SKIP must run @@ -579,7 +573,12 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) trace_start_task_reaping(tsk->pid); - __oom_reap_task_mm(mm); + /* failed to reap part of the address space. Try again later */ + if (!__oom_reap_task_mm(mm)) { + up_read(&mm->mmap_sem); + ret = false; + goto unlock_oom; + } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9263ead9fd32..0116b449b993 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -140,9 +140,10 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; -__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, bool blockable) { + return 0; } bool kvm_is_reserved_pfn(kvm_pfn_t pfn) @@ -360,13 +361,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, srcu_read_unlock(&kvm->srcu, idx); } -static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; + int ret; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -384,9 +387,11 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, spin_unlock(&kvm->mmu_lock); - kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); + ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable); srcu_read_unlock(&kvm->srcu, idx); + + return ret; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, -- cgit v1.2.3 From a670468f5e0b5fad4db6e4d195f15915dc2a35c1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Aug 2018 21:53:06 -0700 Subject: mm: zero out the vma in vma_init() Rather than in vm_area_alloc(). To ensure that the various oddball stack-based vmas are in a good state. Some of the callers were zeroing them out, others were not. Acked-by: Kirill A. Shutemov Cc: Russell King Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 9 ++++----- fs/hugetlbfs/inode.c | 2 -- include/linux/mm.h | 1 + kernel/fork.c | 3 ++- mm/mempolicy.c | 1 - mm/shmem.c | 1 - 6 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d9c299133111..82ab015bf42b 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * atomic helpers. Insert it into the gate_vma so that it is visible * through ptrace and /proc//mem. 
*/ -static struct vm_area_struct gate_vma = { - .vm_start = 0xffff0000, - .vm_end = 0xffff0000 + PAGE_SIZE, - .vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, -}; +static struct vm_area_struct gate_vma; static int __init gate_vma_init(void) { vma_init(&gate_vma, NULL); gate_vma.vm_page_prot = PAGE_READONLY_EXEC; + gate_vma.vm_start = 0xffff0000; + gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; + gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; return 0; } arch_initcall(gate_vma_init); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 346a146c7617..32920a10100e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -410,7 +410,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, current->mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); @@ -595,7 +594,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * allocation routines. If NUMA is configured, use page index * as input to create an allocation policy. */ - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; diff --git a/include/linux/mm.h b/include/linux/mm.h index a3cae495f9ce..3a4b87d1a59a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; + memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); diff --git a/kernel/fork.c b/kernel/fork.c index 5ee74c113381..8c760effa42e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep; struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) vma_init(vma, mm); return vma; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01f1a14facc4..4861ba738d6f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) goto put_new; /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/shmem.c b/mm/shmem.c index c48c79018a7c..fb04baacc9fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - memset(vma, 0, sizeof(*vma)); vma_init(vma, NULL); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; -- cgit v1.2.3 From 89696701ea847786f88ccc1c580fb4c3c20c4a3a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:53:24 -0700 Subject: mm: remove zone_id() and make use of zone_idx() in is_dev_zone() is_dev_zone() is using zone_id() to check if the zone is ZONE_DEVICE. 
zone_id() looks pretty much the same as zone_idx(), and while the use of zone_idx() is quite spread in the kernel, zone_id() is only being used by is_dev_zone(). This patch removes zone_id() and makes is_dev_zone() use zone_idx() to check the zone, so we do not have two things with the same functionality around. Link: http://lkml.kernel.org/r/20180730133718.28683-1-osalvador@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Pavel Tatashin Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 32699b2dc52a..3d4577a3ae7e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -755,25 +755,6 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; } -static inline int zone_id(const struct zone *zone) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - - return zone - pgdat->node_zones; -} - -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_dev_zone(const struct zone *zone) -{ - return zone_id(zone) == ZONE_DEVICE; -} -#else -static inline bool is_dev_zone(const struct zone *zone) -{ - return false; -} -#endif - #include void build_all_zonelists(pg_data_t *pgdat); @@ -824,6 +805,18 @@ static inline int local_memory_node(int node_id) { return node_id; }; */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_dev_zone(const struct zone *zone) +{ + return zone_idx(zone) == ZONE_DEVICE; +} +#else +static inline bool is_dev_zone(const struct zone *zone) +{ + return false; +} +#endif + /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than -- cgit v1.2.3 From c1093b746c0576ed81c4d568d1e39cab651d37e6 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 21 Aug 2018 21:53:32 -0700 Subject: mm: access zone->node via zone_to_nid() and zone_set_nid() zone->node is configured only when CONFIG_NUMA=y, so it is a good idea to have inline functions to access this field in order to avoid ifdef's in c files. 
Link: http://lkml.kernel.org/r/20180730101757.28058-3-osalvador@techadventures.net Signed-off-by: Pavel Tatashin Signed-off-by: Oscar Salvador Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Dan Williams Cc: David Hildenbrand Cc: Joonsoo Kim Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 --------- include/linux/mmzone.h | 26 ++++++++++++++++++++------ mm/mempolicy.c | 4 ++-- mm/mm_init.c | 9 ++------- mm/page_alloc.c | 10 ++++------ 5 files changed, 28 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a4b87d1a59a..a55f5389c491 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -960,15 +960,6 @@ static inline int page_zone_id(struct page *page) return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } -static inline int zone_to_nid(struct zone *zone) -{ -#ifdef CONFIG_NUMA - return zone->node; -#else - return 0; -#endif -} - #ifdef NODE_NOT_IN_PAGE_FLAGS extern int page_to_nid(const struct page *page); #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3d4577a3ae7e..1e22d96734e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -834,6 +834,25 @@ static inline bool populated_zone(struct zone *zone) return zone->present_pages; } +#ifdef CONFIG_NUMA +static inline int zone_to_nid(struct zone *zone) +{ + return zone->node; +} + +static inline void zone_set_nid(struct zone *zone, int nid) +{ + zone->node = nid; +} +#else +static inline int zone_to_nid(struct zone *zone) +{ + return 0; +} + +static inline void zone_set_nid(struct zone *zone, int nid) {} +#endif + extern int movable_zone; #ifdef CONFIG_HIGHMEM @@ -949,12 +968,7 @@ static inline int zonelist_zone_idx(struct zoneref *zoneref) static inline int zonelist_node_idx(struct zoneref *zoneref) { -#ifdef CONFIG_NUMA - /* zone_to_nid not available in this context */ - return zoneref->zone->node; -#else - return 0; -#endif /* CONFIG_NUMA */ + return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4861ba738d6f..da858f794eb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); - return z->zone ? z->zone->node : node; + return z->zone ? 
zone_to_nid(z->zone) : node; } default: @@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->v.nodes); - polnid = z->zone->node; + polnid = zone_to_nid(z->zone); break; default: diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b72266b4b03..6838a530789b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void) zone->name); /* Iterate the zonelist */ - for_each_zone_zonelist(zone, z, zonelist, zoneid) { -#ifdef CONFIG_NUMA - pr_cont("%d:%s ", zone->node, zone->name); -#else - pr_cont("0:%s ", zone->name); -#endif /* CONFIG_NUMA */ - } + for_each_zone_zonelist(zone, z, zonelist, zoneid) + pr_cont("%d:%s ", zone_to_nid(zone), zone->name); pr_cont("\n"); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 455cc3bfae99..6b2324bc61db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2909,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) if (!static_branch_likely(&vm_numa_stat_key)) return; - if (z->node != numa_node_id()) + if (zone_to_nid(z) != numa_node_id()) local_stat = NUMA_OTHER; - if (z->node == preferred_zone->node) + if (zone_to_nid(z) == zone_to_nid(preferred_zone)) __inc_numa_state(z, NUMA_HIT); else { __inc_numa_state(z, NUMA_MISS); @@ -5278,7 +5278,7 @@ int local_memory_node(int node) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return z->zone->node; + return zone_to_nid(z->zone); } #endif @@ -6299,9 +6299,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * And all highmem pages will be managed by the buddy system. */ zone->managed_pages = freesize; -#ifdef CONFIG_NUMA - zone->node = nid; -#endif + zone_set_nid(zone, nid); zone->name = zone_names[j]; zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); -- cgit v1.2.3 From 03e85f9d5f1f8c74f127c5f7a87575d74a78d248 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:53:43 -0700 Subject: mm/page_alloc: Introduce free_area_init_core_hotplug Currently, whenever a new node is created/re-used from the memhotplug path, we call free_area_init_node()->free_area_init_core(). But there is some code that we do not really need to run when we are coming from such a path. free_area_init_core() performs the following actions: 1) Initializes pgdat internals, such as spinlock, waitqueues and more. 2) Accounts nr_all_pages and nr_kernel_pages. These values are used later on when creating hash tables. 3) Accounts the number of managed_pages per zone, subtracting dma_reserved and memmap pages. 4) Initializes some fields of the zone structure data 5) Calls init_currently_empty_zone to initialize all the freelists 6) Calls memmap_init to initialize all pages belonging to a certain zone When called from the memhotplug path, free_area_init_core() only needs to perform actions #1 and #4. Action #2 is pointless, as the zones do not have any pages: either the node was freed or we are re-using it, and either way all zones belonging to this node should have 0 pages. For the same reason, action #3 always results in managed_pages being 0. 
Actions #5 and #6 are performed later on when onlining the pages: online_pages()->move_pfn_range_to_zone()->init_currently_empty_zone() online_pages()->move_pfn_range_to_zone()->memmap_init_zone() This patch does two things: First, it moves the node/zone initialization into their own functions, which allows us to create a small version of free_area_init_core, where we only perform: 1) Initialization of pgdat internals, such as spinlock, waitqueues and more 4) Initialization of some fields of the zone structure data These two functions are pgdat_init_internals() and zone_init_internals(). The second thing this patch does is introduce free_area_init_core_hotplug(), the memhotplug version of free_area_init_core(): Currently, we call free_area_init_node() from the memhotplug path. In there, we set some of pgdat's fields, and call calculate_node_totalpages(). calculate_node_totalpages() calculates the # of pages the node has. Since the node is either new or we are re-using it, the zones belonging to this node should not have any pages, so there is no point in calculating this now. Actually, we re-set these values to 0 later on with the calls to: reset_node_managed_pages() reset_node_present_pages() The # of pages per node and the # of pages per zone will be calculated when onlining the pages: online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_zone_range() online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_pgdat_range() Also, since free_area_init_core/free_area_init_node will now only get called during early init, let us replace __paginginit with __init, so their code gets freed up. [osalvador@techadventures.net: fix section usage] Link: http://lkml.kernel.org/r/20180731101752.GA473@techadventures.net [osalvador@suse.de: v6] Link: http://lkml.kernel.org/r/20180801122348.21588-6-osalvador@techadventures.net Link: http://lkml.kernel.org/r/20180730101757.28058-5-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Pasha Tatashin Cc: Aaron Lu Cc: Dan Williams Cc: David Hildenbrand Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + include/linux/mm.h | 2 +- mm/memory_hotplug.c | 16 +++------ mm/page_alloc.c | 78 +++++++++++++++++++++++++++++------------- 4 files changed, 61 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4e9828cda7a2..34a28227068d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,6 +319,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) static inline void remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern void __ref free_area_init_core_hotplug(int nid); extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); diff --git a/include/linux/mm.h b/include/linux/mm.h index a55f5389c491..a9e733b5fb76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2015,7 +2015,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) extern void __init pagecache_init(void); extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, unsigned long * zones_size, +extern void __init free_area_init_node(int nid, unsigned long * 
zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4eb6e824a80c..9eea6e809a4e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat) static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { struct pglist_data *pgdat; - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); @@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ + pgdat->node_id = nid; + pgdat->node_start_pfn = start_pfn; + /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, zones_size, start_pfn, zholes_size); + free_area_init_core_hotplug(nid); pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* @@ -1016,19 +1017,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) */ build_all_zonelists(pgdat); - /* - * zone->managed_pages is set to an approximate value in - * free_area_init_core(), which will cause - * /sys/device/system/node/nodeX/meminfo has wrong data. - * So reset it to 0 before any memory is onlined. - */ - reset_node_managed_pages(pgdat); - /* * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). */ + reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); return pgdat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b939bd1bff9..c677c1506d73 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __meminit set_pageblock_order(void) +void __init set_pageblock_order(void) { unsigned int order; @@ -6168,13 +6168,13 @@ void __meminit set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __meminit set_pageblock_order(void) +void __init set_pageblock_order(void) { } #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -static unsigned long __meminit calc_memmap_size(unsigned long spanned_pages, +static unsigned long __init calc_memmap_size(unsigned long spanned_pages, unsigned long present_pages) { unsigned long pages = spanned_pages; @@ -6225,19 +6225,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} #endif -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. 
- */ -static void __meminit free_area_init_core(struct pglist_data *pgdat) +static void __meminit pgdat_init_internals(struct pglist_data *pgdat) { - enum zone_type j; - int nid = pgdat->node_id; - pgdat_resize_init(pgdat); pgdat_init_numabalancing(pgdat); @@ -6250,7 +6239,54 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); +} + +static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, + unsigned long remaining_pages) +{ + zone->managed_pages = remaining_pages; + zone_set_nid(zone, nid); + zone->name = zone_names[idx]; + zone->zone_pgdat = NODE_DATA(nid); + spin_lock_init(&zone->lock); + zone_seqlock_init(zone); + zone_pcp_init(zone); +} + +/* + * Set up the zone data structures + * - init pgdat internals + * - init all zones belonging to this node + * + * NOTE: this function is only called during memory hotplug + */ +#ifdef CONFIG_MEMORY_HOTPLUG +void __ref free_area_init_core_hotplug(int nid) +{ + enum zone_type z; + pg_data_t *pgdat = NODE_DATA(nid); + + pgdat_init_internals(pgdat); + for (z = 0; z < MAX_NR_ZONES; z++) + zone_init_internals(&pgdat->node_zones[z], z, nid, 0); +} +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. + * NOTE: this function is only called during early init. + */ +static void __init free_area_init_core(struct pglist_data *pgdat) +{ + enum zone_type j; + int nid = pgdat->node_id; + pgdat_init_internals(pgdat); pgdat->per_cpu_nodestats = &boot_nodestats; for (j = 0; j < MAX_NR_ZONES; j++) { @@ -6298,13 +6334,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = freesize; - zone_set_nid(zone, nid); - zone->name = zone_names[j]; - zone->zone_pgdat = pgdat; - spin_lock_init(&zone->lock); - zone_seqlock_init(zone); - zone_pcp_init(zone); + zone_init_internals(zone, j, nid, freesize); if (!size) continue; @@ -6379,7 +6409,7 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __meminit free_area_init_node(int nid, unsigned long *zones_size, +void __init free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { @@ -6418,7 +6448,7 @@ void __meminit free_area_init_node(int nid, unsigned long *zones_size, * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. */ -void __meminit zero_resv_unavail(void) +void __init zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; -- cgit v1.2.3 From 3d8b38eb81cac81395f6a823f6bf401b327268e6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 21 Aug 2018 21:53:54 -0700 Subject: mm, oom: introduce memory.oom.group For some workloads an intervention from the OOM killer can be painful. Killing a random task can bring the workload into an inconsistent state. Historically, there are two common solutions for this problem: 1) enabling panic_on_oom, 2) using a userspace daemon to monitor OOMs and kill all outstanding processes. 
Both approaches have their downsides: rebooting on each OOM is an obvious waste of capacity, and handling all in userspace is tricky and requires a userspace agent, which will monitor all cgroups for OOMs. In most cases an in-kernel after-OOM cleaning-up mechanism can eliminate the necessity of enabling panic_on_oom. Also, it can simplify the cgroup management for userspace applications. This commit introduces a new knob for cgroup v2 memory controller: memory.oom.group. The knob determines whether the cgroup should be treated as an indivisible workload by the OOM killer. If set, all tasks belonging to the cgroup or to its descendants (if the memory cgroup is not a leaf cgroup) are killed together or not at all. To determine which cgroup has to be killed, we do traverse the cgroup hierarchy from the victim task's cgroup up to the OOMing cgroup (or root) and looking for the highest-level cgroup with memory.oom.group set. Tasks with the OOM protection (oom_score_adj set to -1000) are treated as an exception and are never killed. This patch doesn't change the OOM victim selection algorithm. Link: http://lkml.kernel.org/r/20180802003201.817-4-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: David Rientjes Cc: Tetsuo Handa Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 18 +++++++ include/linux/memcontrol.h | 18 +++++++ mm/memcontrol.c | 93 +++++++++++++++++++++++++++++++++ mm/oom_kill.c | 30 +++++++++++ 4 files changed, 159 insertions(+) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 1746131bc9cb..184193bcb262 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1072,6 +1072,24 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.oom.group + A read-write single value file which exists on non-root + cgroups. The default value is "0". + + Determines whether the cgroup should be treated as + an indivisible workload by the OOM killer. If set, + all tasks belonging to the cgroup or to its descendants + (if the memory cgroup is not a leaf cgroup) are killed + together or not at all. This can be used to avoid + partial kills to guarantee workload integrity. + + Tasks with the OOM protection (oom_score_adj set to -1000) + are treated as an exception and are never killed. + + If the OOM killer is invoked in a cgroup, it's not going + to kill any tasks outside of this cgroup, regardless + memory.oom.group values of ancestor cgroups. + memory.events A read-only flat-keyed file which exists on non-root cgroups. The following entries are defined. Unless specified diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e6c515fb698..652f602167df 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -225,6 +225,11 @@ struct mem_cgroup { */ bool use_hierarchy; + /* + * Should the OOM killer kill all belonging tasks, had it kill one? 
+ */ + bool oom_group; + /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; @@ -542,6 +547,9 @@ static inline bool task_in_memcg_oom(struct task_struct *p) } bool mem_cgroup_oom_synchronize(bool wait); +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain); +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; @@ -1001,6 +1009,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline struct mem_cgroup *mem_cgroup_get_oom_group( + struct task_struct *victim, struct mem_cgroup *oom_domain) +{ + return NULL; +} + +static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59c14c988143..4ead5a4817de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1776,6 +1776,62 @@ cleanup: return true; } +/** + * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM + * @victim: task to be killed by the OOM killer + * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM + * + * Returns a pointer to a memory cgroup, which has to be cleaned up + * by killing all belonging OOM-killable tasks. + * + * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. + */ +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain) +{ + struct mem_cgroup *oom_group = NULL; + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return NULL; + + if (!oom_domain) + oom_domain = root_mem_cgroup; + + rcu_read_lock(); + + memcg = mem_cgroup_from_task(victim); + if (memcg == root_mem_cgroup) + goto out; + + /* + * Traverse the memory cgroup hierarchy from the victim task's + * cgroup up to the OOMing cgroup (or root) to find the + * highest-level memory cgroup with oom.group set. 
+ */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + if (memcg->oom_group) + oom_group = memcg; + + if (memcg == oom_domain) + break; + } + + if (oom_group) + css_get(&oom_group->css); +out: + rcu_read_unlock(); + + return oom_group; +} + +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ + pr_info("Tasks in "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(" are going to be killed due to memory.oom.group set\n"); +} + /** * lock_page_memcg - lock a page->mem_cgroup binding * @page: the page @@ -5561,6 +5617,37 @@ static int memory_stat_show(struct seq_file *m, void *v) return 0; } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%d\n", memcg->oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, oom_group; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &oom_group); + if (ret) + return ret; + + if (oom_group != 0 && oom_group != 1) + return -EINVAL; + + memcg->oom_group = oom_group; + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5602,6 +5689,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, + { + .name = "oom.group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, { } /* terminate */ }; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 330416c67ce5..0e10b864e074 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -908,6 +908,19 @@ static void __oom_kill_process(struct task_struct *victim) } #undef K +/* + * Kill provided task unless it's secured by setting + * oom_score_adj to OOM_SCORE_ADJ_MIN. + */ +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(task); + __oom_kill_process(task); + } + return 0; +} + static void oom_kill_process(struct oom_control *oc, const char *message) { struct task_struct *p = oc->chosen; @@ -915,6 +928,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) struct task_struct *victim = p; struct task_struct *child; struct task_struct *t; + struct mem_cgroup *oom_group; unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -968,7 +982,23 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } read_unlock(&tasklist_lock); + /* + * Do we need to kill the entire memory cgroup? + * Or even one of the ancestor memory cgroups? + * Check this out before killing the victim task. + */ + oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); + __oom_kill_process(victim); + + /* + * If necessary, kill all tasks in the selected memory cgroup. + */ + if (oom_group) { + mem_cgroup_print_oom_group(oom_group); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_put(oom_group); + } } /* -- cgit v1.2.3 From 7e8a6304d5419cbf056a59de92939e5eef039c57 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 21 Aug 2018 21:53:58 -0700 Subject: /proc/meminfo: add percpu populated pages count Currently, percpu memory only exposes allocation and utilization information via debugfs. 
This more or less is only really useful for understanding the fragmentation and allocation information at a per-chunk level with a few global counters. This is also gated behind a config. BPF and cgroup, for example, have seen an increase in use causing increased use of percpu memory. Let's make it easier for someone to identify how much memory is being used. This patch adds the "Percpu" stat to meminfo to more easily look up how much percpu memory is in use. This number includes the cost for all allocated backing pages and not just insight at the per a unit, per chunk level. Metadata is excluded. I think excluding metadata is fair because the backing memory scales with the numbere of cpus and can quickly outweigh the metadata. It also makes this calculation light. Link: http://lkml.kernel.org/r/20180807184723.74919-1-dennisszhou@gmail.com Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Acked-by: Roman Gushchin Reviewed-by: Andrew Morton Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Christoph Lameter Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/meminfo.c | 2 ++ include/linux/percpu.h | 2 ++ mm/percpu.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+) (limited to 'include') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 1605acbb23b6..22b4b00dee31 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -870,6 +870,7 @@ Committed_AS: 100056 kB VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB +Percpu: 62080 kB HardwareCorrupted: 0 kB AnonHugePages: 49152 kB ShmemHugePages: 0 kB @@ -962,6 +963,8 @@ Committed_AS: The amount of memory presently allocated on the system. VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contiguous block of vmalloc area which is free + Percpu: Memory allocated to the percpu allocator used to back percpu + allocations. This stat excludes the cost of metadata. .............................................................................. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 2fb04846ed11..edda898714eb 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -121,6 +122,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", 0ul); show_val_kb(m, "VmallocChunk: ", 0ul); + show_val_kb(m, "Percpu: ", pcpu_nr_pages()); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 296bbe49d5d1..70b7123f38c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -149,4 +149,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +extern unsigned long pcpu_nr_pages(void); + #endif /* __LINUX_PERCPU_H */ diff --git a/mm/percpu.c b/mm/percpu.c index 0b6480979ac7..a749d4d96e3e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -169,6 +169,14 @@ static LIST_HEAD(pcpu_map_extend_chunks); */ int pcpu_nr_empty_pop_pages; +/* + * The number of populated pages in use by the allocator, protected by + * pcpu_lock. This number is kept per a unit per chunk (i.e. 
when a page gets + * allocated/deallocated, it is allocated/deallocated in all units of a chunk + * and increments/decrements this count by 1). + */ +static unsigned long pcpu_nr_populated; + /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between @@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; + pcpu_nr_populated += nr; if (!for_alloc) { chunk->nr_empty_pop_pages += nr; @@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, chunk->nr_populated -= nr; chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; + pcpu_nr_populated -= nr; } /* @@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); + /* include all regions of the first chunk */ + pcpu_nr_populated += PFN_DOWN(size_sum); + pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); @@ -2745,6 +2758,22 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ +/* + * pcpu_nr_pages - calculate total number of populated backing pages + * + * This reflects the number of pages populated to back chunks. Metadata is + * excluded in the number exposed in meminfo as the number of backing pages + * scales with the number of cpus and can quickly outweigh the memory used for + * metadata. It also keeps this calculation nice and simple. + * + * RETURNS: + * Total number of populated backing pages in use by the allocator. + */ +unsigned long pcpu_nr_pages(void) +{ + return pcpu_nr_populated * pcpu_nr_units; +} + /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up -- cgit v1.2.3 From 5df66d306ec9b1952d3d183fe4e2ced1a7b2bddb Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:54:06 -0700 Subject: mm: fix comment for NODEMASK_ALLOC Currently, NODEMASK_ALLOC allocates a nodemask_t with kmalloc when NODES_SHIFT is higher than 8, otherwise it declares it within the stack. The comment says that the reasoning behind this, is that nodemask_t will be 256 bytes when NODES_SHIFT is higher than 8, but this is not true. For example, NODES_SHIFT = 9 will give us a 64 bytes nodemask_t. Let us fix up the comment for that. Another thing is that it might make sense to let values lower than 128bytes be allocated in the stack. Although this all depends on the depth of the stack (and this changes from function to function), I think that 64 bytes is something we can easily afford. So we could even bump the limit by 1 (from > 8 to > 9). Link: http://lkml.kernel.org/r/20180820085516.9687-1-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 1fbde8a880d9..5a30ad594ccc 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -518,7 +518,7 @@ static inline int node_random(const nodemask_t *mask) * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. 
*/ -#if NODES_SHIFT > 8 /* nodemask_t > 256 bytes */ +#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) -- cgit v1.2.3 From 891ae71dc4fbd2454a3fa569e115a7ca86630949 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:37 -0700 Subject: proc: spread "const" a bit Link: http://lkml.kernel.org/r/20180627200614.GB18434@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 4 ++-- include/linux/proc_fs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c9d97818a421..5185d7f6a51e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -113,12 +113,12 @@ static inline void *__PDE_DATA(const struct inode *inode) return PDE(inode)->data; } -static inline struct pid *proc_pid(struct inode *inode) +static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } -static inline struct task_struct *get_proc_task(struct inode *inode) +static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 626fc65c4336..d0e1f1522a78 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -129,7 +129,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)); /* get the associated pid namespace for a file in procfs */ -static inline struct pid_namespace *proc_pid_ns(struct inode *inode) +static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) { return inode->i_sb->s_fs_info; } -- cgit v1.2.3 From a8dd9c4df18edc873d244790d163564a5d17626b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:54:51 -0700 Subject: proc/kcore: don't grab lock for kclist_add() Patch series "/proc/kcore improvements", v4. This series makes a few improvements to /proc/kcore. It fixes a couple of small issues in v3 but is otherwise the same. Patches 1, 2, and 3 are prep patches. Patch 4 is a fix/cleanup. Patch 5 is another prep patch. Patches 6 and 7 are optimizations to ->read(). Patch 8 makes it possible to enable CRASH_CORE on any architecture, which is needed for patch 9. Patch 9 adds vmcoreinfo to /proc/kcore. This patch (of 9): kclist_add() is only called at init time, so there's no point in grabbing any locks. We're also going to replace the rwlock with a rwsem, which we don't want to try grabbing during early boot. While we're here, mark kclist_add() with __init so that we'll get a warning if it's called from non-init code. 
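For illustration, a minimal sketch of the kind of caller this change targets (hypothetical registration function; the region mirrors what fs/proc/kcore.c itself registers, and is not part of this patch). With kclist_add() now lock-free and __init, any such registration must happen from init context only:

	#include <linux/kcore.h>
	#include <linux/init.h>

	static struct kcore_list kcore_vmalloc_region;	/* hypothetical list entry */

	static int __init register_kcore_regions(void)
	{
		/* Init-time only: kclist_add() no longer takes kclist_lock. */
		kclist_add(&kcore_vmalloc_region, (void *)VMALLOC_START,
			   VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
		return 0;
	}
	fs_initcall(register_kcore_regions);

A non-init caller would now trip a section-mismatch warning at build time, which is the point of the __init annotation.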
Link: http://lkml.kernel.org/r/98208db1faf167aa8b08eebfa968d95c70527739.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Reviewed-by: Bhupesh Sharma Tested-by: Bhupesh Sharma Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 7 +++---- include/linux/kcore.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 66c373230e60..b0b9a76f28d6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -62,16 +62,15 @@ static LIST_HEAD(kclist_head); static DEFINE_RWLOCK(kclist_lock); static int kcore_need_update = 1; -void -kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +/* This doesn't grab kclist_lock, so it should only be used at init time. */ +void __init kclist_add(struct kcore_list *new, void *addr, size_t size, + int type) { new->addr = (unsigned long)addr; new->size = size; new->type = type; - write_lock(&kclist_lock); list_add_tail(&new->list, &kclist_head); - write_unlock(&kclist_lock); } static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 8de55e4b5ee9..c20f296438fb 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -35,7 +35,7 @@ struct vmcoredd_node { }; #ifdef CONFIG_PROC_KCORE -extern void kclist_add(struct kcore_list *, void *, size_t, int type); +void __init kclist_add(struct kcore_list *, void *, size_t, int type); #else static inline void kclist_add(struct kcore_list *new, void *addr, size_t size, int type) -- cgit v1.2.3 From 23c85094fe1895caefdd19ef624ee687ec5f4507 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:20 -0700 Subject: proc/kcore: add vmcoreinfo note to /proc/kcore The vmcoreinfo information is useful for runtime debugging tools, not just for crash dumps. A lot of this information can be determined by other means, but this is much more convenient, and it only adds a page at most to the file. Link: http://lkml.kernel.org/r/fddbcd08eed76344863303878b12de1c1e2a04b6.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Kconfig | 1 + fs/proc/kcore.c | 18 ++++++++++++++++-- include/linux/crash_core.h | 2 ++ kernel/crash_core.c | 4 ++-- 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 0eaeb41453f5..817c02b13b1d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -31,6 +31,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU + select CRASH_CORE help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. 
No modifications can be diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 758c14e46a44..80464432dfe6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,6 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar */ +#include #include #include #include @@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, } *phdrs_len = *nphdr * sizeof(struct elf_phdr); - *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 4)) + + *notes_len = (4 * sizeof(struct elf_note) + + 3 * ALIGN(sizeof(CORE_STR), 4) + + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + - ALIGN(arch_task_struct_size, 4)); + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + *notes_len); return *data_offset + size; @@ -406,6 +410,16 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) sizeof(prpsinfo)); append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, arch_task_struct_size); + /* + * vmcoreinfo_size is mostly constant after init time, but it + * can be changed by crash_save_vmcoreinfo(). Racing here with a + * panic on another CPU before the machine goes down is insanely + * unlikely, but it's better to not leave potential buffer + * overflows lying around, regardless. + */ + append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, + vmcoreinfo_data, + min(vmcoreinfo_size, notes_len - i)); tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index b511f6d24b42..525510a9f965 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index e7b4025c7b24..a683bfb0530f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,8 +14,8 @@ #include /* vmcoreinfo stuff */ -static unsigned char *vmcoreinfo_data; -static size_t vmcoreinfo_size; +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ -- cgit v1.2.3 From 96c6a32ccb55a366054fd82cc63523bb7f7493d3 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 21 Aug 2018 21:55:24 -0700 Subject: include/asm-generic/bug.h: clarify valid uses of WARN() Explicitly state that WARN*() should be used only for recoverable kernel issues/bugs and that it should not be used for any kind of invalid external inputs or transient conditions. Motivation: it's a very useful capability to be able to understand if a particular kernel splat means a kernel bug or simply an invalid user-space program. For the former one wants to notify kernel developers, while notifying kernel developers for the latter is annoying. Even a kernel developer may not know what to do with a WARNING in an unfamiliar subsystem. This is especially critical for any automated testing systems that may use panic_on_warn and mail kernel developers. 
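To make the intended split concrete, here is a sketch of the rule being documented, using made-up names (struct foo_dev, FOO_CMD_RESET, foo_reset() are hypothetical and not from this patch): invalid external input gets pr_err()-style reporting, while WARN_ON() is reserved for violated kernel-internal invariants.

	#include <linux/fs.h>
	#include <linux/ioctl.h>
	#include <linux/errno.h>
	#include <linux/printk.h>
	#include <linux/bug.h>

	#define FOO_CMD_RESET	_IO('f', 0)	/* hypothetical ioctl number */

	struct foo_dev { int dummy; };

	static int foo_reset(struct foo_dev *dev) { return 0; }

	static long foo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
	{
		struct foo_dev *dev = file->private_data;

		/* Invalid external input: report and fail, do not WARN. */
		if (cmd != FOO_CMD_RESET) {
			pr_err_ratelimited("foo: unknown ioctl %#x\n", cmd);
			return -ENOTTY;
		}

		/* Violated kernel-internal invariant: this is what WARN_ON() is for. */
		if (WARN_ON(!dev))
			return -ENODEV;

		return foo_reset(dev);
	}
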
The clear separation also serves as an additional documentation: is it a condition that must never occur because of additional checks/logic elsewhere? or is it simply a check for invalid inputs or unfortunate conditions? Use of pr_err() for user messages also leads to better error messages. "Something is wrong in file foo on line X" is not particularly useful message for end user. pr_err() forces developers to write more meaningful error messages for user. As of now we are almost there. We are doing systematic kernel testing with panic_on_warn and are not seeing massive amounts of false positives. But every now and then another WARN on ENOMEM or invalid inputs pops up and leads to a lengthy argument each time. The goal of this change is to officially document the rules. Link: http://lkml.kernel.org/r/20180620103716.61636-1-dvyukov@gmail.com Signed-off-by: Dmitry Vyukov Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index a7613e1b0c87..20561a60db9c 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -75,9 +75,19 @@ struct bug_entry { /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report - * significant issues that need prompt attention if they should ever - * appear at runtime. Use the versions with printk format strings - * to provide better diagnostics. + * significant kernel issues that need prompt attention if they should ever + * appear at runtime. + * + * Do not use these macros when checking for invalid external inputs + * (e.g. invalid system call arguments, or invalid data coming from + * network/devices), and on transient conditions like ENOMEM or EAGAIN. + * These macros should be used for recoverable kernel issues only. + * For invalid external inputs, transient conditions, etc use + * pr_err[_once/_ratelimited]() followed by dump_stack(), if necessary. + * Do not include "BUG"/"WARNING" in format strings manually to make these + * conditions distinguishable from kernel issues. + * + * Use the versions with printk format strings to provide better diagnostics. */ #ifndef __WARN_TAINT extern __printf(3, 4) -- cgit v1.2.3 From cedc5b6aab493f6b1b1d381dccc0cc082da7d3d8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 21 Aug 2018 21:55:28 -0700 Subject: kernel.h: documentation for roundup() vs round_up() Things like 3619dec5103d ("dh key: fix rounding up KDF output length") expose the lack of explicit documentation for roundup() vs round_up(). At least we can try to document it better if anyone goes looking. Link: http://lkml.kernel.org/r/20180703041950.GA43464@beast Signed-off-by: Kees Cook Cc: Ingo Molnar Cc: Greg Kroah-Hartman Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 941dc0a5a877..d6aac75b51ba 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -85,7 +85,23 @@ * arguments just once each. */ #define __round_mask(x, y) ((__typeof__(x))((y)-1)) +/** + * round_up - round up to next specified power of 2 + * @x: the value to round + * @y: multiple to round up to (must be a power of 2) + * + * Rounds @x up to next multiple of @y (which must be a power of 2). 
+ * To perform arbitrary rounding up, use roundup() below. + */ #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +/** + * round_down - round down to next specified power of 2 + * @x: the value to round + * @y: multiple to round down to (must be a power of 2) + * + * Rounds @x down to next multiple of @y (which must be a power of 2). + * To perform arbitrary rounding down, use rounddown() below. + */ #define round_down(x, y) ((x) & ~__round_mask(x, y)) /** @@ -110,13 +126,30 @@ # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d) #endif -/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */ +/** + * roundup - round up to the next specified multiple + * @x: the value to up + * @y: multiple to round up to + * + * Rounds @x up to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_up(). + * + * The `const' here prevents gcc-3.3 from calling __divdi3 + */ #define roundup(x, y) ( \ { \ const typeof(y) __y = y; \ (((x) + (__y - 1)) / __y) * __y; \ } \ ) +/** + * rounddown - round down to next specified multiple + * @x: the value to round + * @y: multiple to round down to + * + * Rounds @x down to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_down(). + */ #define rounddown(x, y) ( \ { \ typeof(x) __x = (x); \ -- cgit v1.2.3 From e58dd0de5eadf145895b13451a1fef8ef03946eb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 21 Aug 2018 21:55:31 -0700 Subject: bdi: use refcount_t for reference counting instead atomic_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This permits avoiding accidental refcounter overflows that might lead to use-after-free situations. 
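For reference, a minimal sketch of the refcount_t idiom this series converts such counters to (hypothetical struct foo, illustrative only; the actual conversion is in the diff below): set to 1 at allocation, saturating increments on get, free on the final put.

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct foo {
		refcount_t ref;
		/* ... payload ... */
	};

	static struct foo *foo_alloc(void)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (f)
			refcount_set(&f->ref, 1);	/* one reference for the caller */
		return f;
	}

	static void foo_get(struct foo *f)
	{
		refcount_inc(&f->ref);		/* saturates instead of wrapping to 0 */
	}

	static void foo_put(struct foo *f)
	{
		if (refcount_dec_and_test(&f->ref))
			kfree(f);
	}
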
Link: http://lkml.kernel.org/r/20180703200141.28415-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Suggested-by: Peter Zijlstra Cc: Jens Axboe Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 3 ++- include/linux/backing-dev.h | 4 ++-- mm/backing-dev.c | 12 ++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 24251762c20c..9a6bc0951cfa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -12,6 +12,7 @@ #include #include #include +#include struct page; struct device; @@ -75,7 +76,7 @@ enum wb_reason { */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ - atomic_t refcnt; /* nr of attached wb's and blkg */ + refcount_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK struct backing_dev_info *__bdi; /* the associated bdi, set to NULL diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 72ca0f3d39f3..c28a47cbe355 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -404,13 +404,13 @@ static inline bool inode_cgwb_enabled(struct inode *inode) static inline struct bdi_writeback_congested * wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) { - atomic_inc(&bdi->wb_congested->refcnt); + refcount_inc(&bdi->wb_congested->refcnt); return bdi->wb_congested; } static inline void wb_congested_put(struct bdi_writeback_congested *congested) { - if (atomic_dec_and_test(&congested->refcnt)) + if (refcount_dec_and_test(&congested->refcnt)) kfree(congested); } diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2e5d3df0853d..55a233d75f39 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -438,10 +438,10 @@ retry: if (new_congested) { /* !found and storage for new one already allocated, insert */ congested = new_congested; - new_congested = NULL; rb_link_node(&congested->rb_node, parent, node); rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - goto found; + spin_unlock_irqrestore(&cgwb_lock, flags); + return congested; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -451,13 +451,13 @@ retry: if (!new_congested) return NULL; - atomic_set(&new_congested->refcnt, 0); + refcount_set(&new_congested->refcnt, 1); new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; found: - atomic_inc(&congested->refcnt); + refcount_inc(&congested->refcnt); spin_unlock_irqrestore(&cgwb_lock, flags); kfree(new_congested); return congested; @@ -474,7 +474,7 @@ void wb_congested_put(struct bdi_writeback_congested *congested) unsigned long flags; local_irq_save(flags); - if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { + if (!refcount_dec_and_lock(&congested->refcnt, &cgwb_lock)) { local_irq_restore(flags); return; } @@ -804,7 +804,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!bdi->wb_congested) return -ENOMEM; - atomic_set(&bdi->wb_congested->refcnt, 1); + refcount_set(&bdi->wb_congested->refcnt, 1); err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (err) { -- cgit v1.2.3 From fc37191272a972d449bab3d8247cf52cadf5d4e6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 21 Aug 2018 21:55:38 -0700 Subject: userns: use refcount_t for reference counting instead atomic_t refcount_t type and corresponding 
API should be used instead of atomic_t wh en the variable is used as a reference counter. This avoids accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/20180703200141.28415-6-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Reviewed-by: Andrew Morton Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/user.h | 5 +++-- kernel/user.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 96fe289c4c6e..39ad98c09c58 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include #include struct key; @@ -12,7 +13,7 @@ struct key; * Some day this will be a full-fledged user tracking system.. */ struct user_struct { - atomic_t __count; /* reference count */ + refcount_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_FANOTIFY @@ -59,7 +60,7 @@ extern struct user_struct root_user; extern struct user_struct * alloc_uid(kuid_t); static inline struct user_struct *get_uid(struct user_struct *u) { - atomic_inc(&u->__count); + refcount_inc(&u->__count); return u; } extern void free_uid(struct user_struct *); diff --git a/kernel/user.c b/kernel/user.c index 36288d840675..5f65ef195259 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = REFCOUNT_INIT(1), .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, @@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { - atomic_inc(&user->__count); + refcount_inc(&user->__count); return user; } } @@ -170,7 +170,7 @@ void free_uid(struct user_struct *up) return; local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + if (refcount_dec_and_lock(&up->__count, &uidhash_lock)) free_user(up, flags); else local_irq_restore(flags); @@ -191,7 +191,7 @@ struct user_struct *alloc_uid(kuid_t uid) goto out_unlock; new->uid = uid; - atomic_set(&new->__count, 1); + refcount_set(&new->__count, 1); ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); -- cgit v1.2.3 From 203583990cb675aeddff6d7623f68268068c078c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 21 Aug 2018 21:55:45 -0700 Subject: linux/compiler.h: don't use bool Appararently, it's possible to have a non-trivial TU include a few headers, including linux/build_bug.h, without ending up with linux/types.h. So the 0day bot sent me config: um-x86_64_defconfig (attached as .config) >> include/linux/compiler.h:316:3: error: unknown type name 'bool'; did you mean '_Bool'? bool __cond = !(condition); \ for something I'm working on. Rather than contributing to the #include madness and including linux/types.h from compiler.h, just use int. 
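For context, a minimal sketch of the kind of translation unit the 0day report describes (hypothetical file; the report's exact include chain may differ): it reaches linux/compiler.h via linux/build_bug.h without ever including linux/types.h, so a bare 'bool' inside __compiletime_assert() is an unknown type.

	/* foo_check.c -- hypothetical minimal TU, for illustration only. */
	#include <linux/build_bug.h>	/* BUILD_BUG_ON(); pulls in linux/compiler.h */

	void foo_check(void)
	{
		/*
		 * Expands __compiletime_assert(); with 'bool' used there, this
		 * failed to compile when linux/types.h was never included.
		 */
		BUILD_BUG_ON(sizeof(long) < sizeof(int));
	}
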
Link: http://lkml.kernel.org/r/20180817101036.20969-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Christopher Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 42506e4d1f53..c8eab637a2a7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -313,7 +313,7 @@ unsigned long read_word_at_a_time(const void *addr) #ifdef __OPTIMIZE__ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ - bool __cond = !(condition); \ + int __cond = !(condition); \ extern void prefix ## suffix(void) __compiletime_error(msg); \ if (__cond) \ prefix ## suffix(); \ -- cgit v1.2.3 From a2e514453861dd39b53b7a50b6771bd3f9852078 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 21 Aug 2018 21:55:52 -0700 Subject: kernel/hung_task.c: allow to set checking interval separately from timeout Currently task hung checking interval is equal to timeout, as the result hung is detected anywhere between timeout and 2*timeout. This is fine for most interactive environments, but this hurts automated testing setups (syzbot). In an automated setup we need to strictly order CPU lockup < RCU stall < workqueue lockup < task hung < silent loss, so that RCU stall is not detected as task hung and task hung is not detected as silent machine loss. The large variance in task hung detection timeout requires setting silent machine loss timeout to a very large value (e.g. if task hung is 3 mins, then silent loss need to be set to ~7 mins). The additional 3 minutes significantly reduce testing efficiency because usually we crash kernel within a minute, and this can add hours to bug localization process as it needs to do dozens of tests. Allow setting checking interval separately from timeout. This allows to set timeout to, say, 3 minutes, but checking interval to 10 secs. The interval is controlled via a new hung_task_check_interval_secs sysctl, similar to the existing hung_task_timeout_secs sysctl. The default value of 0 results in the current behavior: checking interval is equal to timeout. [akpm@linux-foundation.org: update hung_task_timeout_max's comment] Link: http://lkml.kernel.org/r/20180611111004.203513-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Paul E. McKenney Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 15 ++++++++++++++- include/linux/sched.h | 1 + include/linux/sched/sysctl.h | 1 + kernel/fork.c | 1 + kernel/hung_task.c | 15 ++++++++++++++- kernel/sysctl.c | 13 ++++++++++++- 6 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 59585030cbaf..9d38e75b19a0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -38,6 +38,7 @@ show up in /proc/sys/kernel: - hung_task_panic - hung_task_check_count - hung_task_timeout_secs +- hung_task_check_interval_secs - hung_task_warnings - hyperv_record_panic_msg - kexec_load_disabled @@ -355,7 +356,7 @@ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. hung_task_timeout_secs: -Check interval. When a task in D state did not get scheduled +When a task in D state did not get scheduled for more than this value report a warning. This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. 
@@ -364,6 +365,18 @@ Possible values to set are in range {0..LONG_MAX/HZ}. ============================================================== +hung_task_check_interval_secs: + +Hung task check interval. If hung task checking is enabled +(see hung_task_timeout_secs), the check is done every +hung_task_check_interval_secs seconds. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + +0 (default): means use hung_task_timeout_secs as checking interval. +Possible values to set are in range {0..LONG_MAX/HZ}. + +============================================================== + hung_task_warnings: The maximum number of warnings to report. During a check interval diff --git a/include/linux/sched.h b/include/linux/sched.h index 789923fbee3a..58eb3a2bc695 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -853,6 +853,7 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; + unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 913488d828cb..a9c32daeb9d8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -10,6 +10,7 @@ struct ctl_table; extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, diff --git a/kernel/fork.c b/kernel/fork.c index 8c760effa42e..8bcef2413f1b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1302,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; + tsk->last_switch_time = 0; #endif tsk->mm = NULL; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 32b479468e4d..b9132d1269ef 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +/* + * Zero (default value) means use sysctl_hung_task_timeout_secs: + */ +unsigned long __read_mostly sysctl_hung_task_check_interval_secs; + int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; @@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; + t->last_switch_time = jiffies; return; } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return; trace_sched_process_hang(t); @@ -245,8 +253,13 @@ static int watchdog(void *dummy) for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; - long t = hung_timeout_jiffies(hung_last_checked, timeout); + unsigned long interval = sysctl_hung_task_check_interval_secs; + long t; + if (interval == 0) + interval = timeout; + interval = min_t(unsigned long, interval, timeout); + t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0)) check_hung_uninterruptible_tasks(timeout); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f22f76b7a138..dbdcb6673b27 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -145,7 +145,10 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; 
static const int cap_last_cap = CAP_LAST_CAP; -/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ #ifdef CONFIG_DETECT_HUNG_TASK static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #endif @@ -1090,6 +1093,14 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dohung_task_timeout_secs, .extra2 = &hung_task_timeout_max, }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, -- cgit v1.2.3 From f922c4abdf7648523589abee9460c87f51630d2f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:04 -0700 Subject: module: allow symbol exports to be disabled To allow existing C code to be incorporated into the decompressor or the UEFI stub, introduce a CPP macro that turns all EXPORT_SYMBOL_xxx declarations into nops, and #define it in places where such exports are undesirable. Note that this gets rid of a rather dodgy redefine of linux/export.h's header guard. Link: http://lkml.kernel.org/r/20180704083651.24360-3-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Nicolas Pitre Acked-by: Michael Ellerman Reviewed-by: Will Deacon Acked-by: Ingo Molnar Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. Hallyn" Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/boot/compressed/kaslr.c | 5 +---- drivers/firmware/efi/libstub/Makefile | 1 + include/linux/export.h | 11 ++++++++++- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 302517929932..d1e19f358b6e 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -23,11 +23,8 @@ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL * which is meaningless and will cause compiling error in some cases. - * So do not include linux/export.h and define EXPORT_SYMBOL(sym) - * as empty. 
*/ -#define _LINUX_EXPORT_H -#define EXPORT_SYMBOL(sym) +#define __DISABLE_EXPORTS #include "misc.h" #include "error.h" diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 88c322d7c71e..14c40a7750d1 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -24,6 +24,7 @@ KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ -D__NO_FORTIFY \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) \ + -D__DISABLE_EXPORTS GCOV_PROFILE := n KASAN_SANITIZE := n diff --git a/include/linux/export.h b/include/linux/export.h index b768d6dd3c90..ea7df303d68d 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -66,7 +66,16 @@ extern struct module __this_module; __attribute__((section("___ksymtab" sec "+" #sym), used)) \ = { (unsigned long)&sym, __kstrtab_##sym } -#if defined(__KSYM_DEPS__) +#if defined(__DISABLE_EXPORTS) + +/* + * Allow symbol exports to be disabled completely so that C code may + * be reused in other execution contexts such as the UEFI stub or the + * decompressor. + */ +#define __EXPORT_SYMBOL(sym, sec) + +#elif defined(__KSYM_DEPS__) /* * For fine grained build dependencies, we want to tell the build system -- cgit v1.2.3 From 7290d58095712a89f845e1bca05334796dd49ed2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:09 -0700 Subject: module: use relative references for __ksymtab entries An ordinary arm64 defconfig build has ~64 KB worth of __ksymtab entries, each consisting of two 64-bit fields containing absolute references, to the symbol itself and to a char array containing its name, respectively. When we build the same configuration with KASLR enabled, we end up with an additional ~192 KB of relocations in the .init section, i.e., one 24 byte entry for each absolute reference, which all need to be processed at boot time. Given how the struct kernel_symbol that describes each entry is completely local to module.c (except for the references emitted by EXPORT_SYMBOL() itself), we can easily modify it to contain two 32-bit relative references instead. This reduces the size of the __ksymtab section by 50% for all 64-bit architectures, and gets rid of the runtime relocations entirely for architectures implementing KASLR, either via standard PIE linking (arm64) or using custom host tools (x86). Note that the binary search involving __ksymtab contents relies on each section being sorted by symbol name. This is implemented based on the input section names, not the names in the ksymtab entries, so this patch does not interfere with that. Given that the use of place-relative relocations requires support both in the toolchain and in the module loader, we cannot enable this feature for all architectures. So make it dependent on whether CONFIG_HAVE_ARCH_PREL32_RELOCATIONS is defined. Link: http://lkml.kernel.org/r/20180704083651.24360-4-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Jessica Yu Acked-by: Michael Ellerman Reviewed-by: Will Deacon Acked-by: Ingo Molnar Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. 
Hallyn" Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/export.h | 5 ----- include/asm-generic/export.h | 12 +++++++++-- include/linux/compiler.h | 19 ++++++++++++++++++ include/linux/export.h | 46 ++++++++++++++++++++++++++++++++----------- kernel/module.c | 32 ++++++++++++++++++++++++------ 6 files changed, 91 insertions(+), 24 deletions(-) delete mode 100644 arch/x86/include/asm/export.h (limited to 'include') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index de690c2d2e33..a0ab9ab61c75 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -8,5 +8,6 @@ generated-y += xen-hypercalls.h generic-y += dma-contiguous.h generic-y += early_ioremap.h +generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h deleted file mode 100644 index 2a51d66689c5..000000000000 --- a/arch/x86/include/asm/export.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_64BIT -#define KSYM_ALIGN 16 -#endif -#include diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 68efb950a918..4d73e6e3c66c 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -5,12 +5,10 @@ #define KSYM_FUNC(x) x #endif #ifdef CONFIG_64BIT -#define __put .quad #ifndef KSYM_ALIGN #define KSYM_ALIGN 8 #endif #else -#define __put .long #ifndef KSYM_ALIGN #define KSYM_ALIGN 4 #endif @@ -19,6 +17,16 @@ #define KCRC_ALIGN 4 #endif +.macro __put, val, name +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + .long \val - ., \name - . +#elif defined(CONFIG_64BIT) + .quad \val, \name +#else + .long \val, \name +#endif +.endm + /* * note on .section use: @progbits vs %progbits nastiness doesn't matter, * since we immediately emit into those sections anyway. diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c8eab637a2a7..681d866efb1e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -280,6 +280,25 @@ unsigned long read_word_at_a_time(const void *addr) #endif /* __KERNEL__ */ +/* + * Force the compiler to emit 'sym' as a symbol, so that we can reference + * it from inline assembler. Necessary in case 'sym' could be inlined + * otherwise, or eliminated entirely due to lack of references that are + * visible to the compiler. 
+ */ +#define __ADDRESSABLE(sym) \ + static void * __attribute__((section(".discard.addressable"), used)) \ + __PASTE(__addressable_##sym, __LINE__) = (void *)&sym; + +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + #endif /* __ASSEMBLY__ */ #ifndef __optimize diff --git a/include/linux/export.h b/include/linux/export.h index ea7df303d68d..ae072bc5aacf 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -18,12 +18,6 @@ #define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x) #ifndef __ASSEMBLY__ -struct kernel_symbol -{ - unsigned long value; - const char *name; -}; - #ifdef MODULE extern struct module __this_module; #define THIS_MODULE (&__this_module) @@ -54,17 +48,47 @@ extern struct module __this_module; #define __CRC_SYMBOL(sym, sec) #endif +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#include +/* + * Emit the ksymtab entry as a pair of relative references: this reduces + * the size by half on 64-bit architectures, and eliminates the need for + * absolute relocations that require runtime processing on relocatable + * kernels. + */ +#define __KSYMTAB_ENTRY(sym, sec) \ + __ADDRESSABLE(sym) \ + asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ + " .balign 8 \n" \ + "__ksymtab_" #sym ": \n" \ + " .long " #sym "- . \n" \ + " .long __kstrtab_" #sym "- . \n" \ + " .previous \n") + +struct kernel_symbol { + int value_offset; + int name_offset; +}; +#else +#define __KSYMTAB_ENTRY(sym, sec) \ + static const struct kernel_symbol __ksymtab_##sym \ + __attribute__((section("___ksymtab" sec "+" #sym), used)) \ + = { (unsigned long)&sym, __kstrtab_##sym } + +struct kernel_symbol { + unsigned long value; + const char *name; +}; +#endif + /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL(sym, sec) \ extern typeof(sym) sym; \ __CRC_SYMBOL(sym, sec) \ static const char __kstrtab_##sym[] \ - __attribute__((section("__ksymtab_strings"), aligned(1))) \ + __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #sym; \ - static const struct kernel_symbol __ksymtab_##sym \ - __used \ - __attribute__((section("___ksymtab" sec "+" #sym), used)) \ - = { (unsigned long)&sym, __kstrtab_##sym } + __KSYMTAB_ENTRY(sym, sec) #if defined(__DISABLE_EXPORTS) diff --git a/kernel/module.c b/kernel/module.c index b046a32520d8..6746c85511fe 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -529,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms, return true; } +static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +static const char *kernel_symbol_name(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return offset_to_ptr(&sym->name_offset); +#else + return sym->name; +#endif +} + static int cmp_name(const void *va, const void *vb) { const char *a; const struct kernel_symbol *b; a = va; b = vb; - return strcmp(a, b->name); + return strcmp(a, kernel_symbol_name(b)); } static bool find_symbol_in_section(const struct symsearch *syms, @@ -2170,7 +2188,7 @@ void *__symbol_get(const char *symbol) sym = NULL; preempt_enable(); - return sym ? (void *)sym->value : NULL; + return sym ? 
(void *)kernel_symbol_value(sym) : NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2200,10 +2218,12 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + if (find_symbol(kernel_symbol_name(s), &owner, NULL, + true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", - mod->name, s->name, module_name(owner)); + mod->name, kernel_symbol_name(s), + module_name(owner)); return -ENOEXEC; } } @@ -2252,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { - sym[i].st_value = ksym->value; + sym[i].st_value = kernel_symbol_value(ksym); break; } @@ -2516,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value, ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); - return ks != NULL && ks->value == value; + return ks != NULL && kernel_symbol_value(ks) == value; } /* As per nm */ -- cgit v1.2.3 From 1b1eeca7e4c19fa76d409d4c7b338dba21f2df45 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:13 -0700 Subject: init: allow initcall tables to be emitted using relative references Allow the initcall tables to be emitted using relative references that are only half the size on 64-bit architectures and don't require fixups at runtime on relocatable kernels. Link: http://lkml.kernel.org/r/20180704083651.24360-5-ard.biesheuvel@linaro.org Acked-by: James Morris Acked-by: Sergey Senozhatsky Acked-by: Petr Mladek Acked-by: Michael Ellerman Acked-by: Ingo Molnar Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Russell King Cc: "Serge E. Hallyn" Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 44 +++++++++++++++++++++++++++++++++----------- init/main.c | 32 ++++++++++++++++---------------- kernel/printk/printk.c | 16 +++++++++------- security/security.c | 17 ++++++++++------- 4 files changed, 68 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/init.h b/include/linux/init.h index bc27cf03c41e..2538d176dd1f 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -116,8 +116,24 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -extern initcall_t __con_initcall_start[], __con_initcall_end[]; -extern initcall_t __security_initcall_start[], __security_initcall_end[]; +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +typedef int initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return offset_to_ptr(entry); +} +#else +typedef initcall_t initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return *entry; +} +#endif + +extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; +extern initcall_entry_t __security_initcall_start[], __security_initcall_end[]; /* Used for contructor calls. */ typedef void (*ctor_fn_t)(void); @@ -167,9 +183,20 @@ extern bool initcall_debug; * as KEEP() in the linker script. 
*/ -#define __define_initcall(fn, id) \ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define ___define_initcall(fn, id, __sec) \ + __ADDRESSABLE(fn) \ + asm(".section \"" #__sec ".init\", \"a\" \n" \ + "__initcall_" #fn #id ": \n" \ + ".long " #fn " - . \n" \ + ".previous \n"); +#else +#define ___define_initcall(fn, id, __sec) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" #id ".init"))) = fn; + __attribute__((__section__(#__sec ".init"))) = fn; +#endif + +#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id) /* * Early initcalls run before initializing SMP. @@ -208,13 +235,8 @@ extern bool initcall_debug; #define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn -#define console_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.con_initcall.init) = fn - -#define security_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.security_initcall.init) = fn +#define console_initcall(fn) ___define_initcall(fn,, .con_initcall) +#define security_initcall(fn) ___define_initcall(fn,, .security_initcall) struct obs_kernel_param { const char *str; diff --git a/init/main.c b/init/main.c index b729e1f22838..3a6ce89e128f 100644 --- a/init/main.c +++ b/init/main.c @@ -902,18 +902,18 @@ int __init_or_module do_one_initcall(initcall_t fn) } -extern initcall_t __initcall_start[]; -extern initcall_t __initcall0_start[]; -extern initcall_t __initcall1_start[]; -extern initcall_t __initcall2_start[]; -extern initcall_t __initcall3_start[]; -extern initcall_t __initcall4_start[]; -extern initcall_t __initcall5_start[]; -extern initcall_t __initcall6_start[]; -extern initcall_t __initcall7_start[]; -extern initcall_t __initcall_end[]; - -static initcall_t *initcall_levels[] __initdata = { +extern initcall_entry_t __initcall_start[]; +extern initcall_entry_t __initcall0_start[]; +extern initcall_entry_t __initcall1_start[]; +extern initcall_entry_t __initcall2_start[]; +extern initcall_entry_t __initcall3_start[]; +extern initcall_entry_t __initcall4_start[]; +extern initcall_entry_t __initcall5_start[]; +extern initcall_entry_t __initcall6_start[]; +extern initcall_entry_t __initcall7_start[]; +extern initcall_entry_t __initcall_end[]; + +static initcall_entry_t *initcall_levels[] __initdata = { __initcall0_start, __initcall1_start, __initcall2_start, @@ -939,7 +939,7 @@ static char *initcall_level_names[] __initdata = { static void __init do_initcall_level(int level) { - initcall_t *fn; + initcall_entry_t *fn; strcpy(initcall_command_line, saved_command_line); parse_args(initcall_level_names[level], @@ -950,7 +950,7 @@ static void __init do_initcall_level(int level) trace_initcall_level(initcall_level_names[level]); for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) - do_one_initcall(*fn); + do_one_initcall(initcall_from_entry(fn)); } static void __init do_initcalls(void) @@ -981,11 +981,11 @@ static void __init do_basic_setup(void) static void __init do_pre_smp_initcalls(void) { - initcall_t *fn; + initcall_entry_t *fn; trace_initcall_level("early"); for (fn = __initcall_start; fn < __initcall0_start; fn++) - do_one_initcall(*fn); + do_one_initcall(initcall_from_entry(fn)); } /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 90b6ab01db59..918f386b2f6e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2788,7 +2788,8 @@ EXPORT_SYMBOL(unregister_console); void __init console_init(void) { int ret; - initcall_t *call; + initcall_t 
call; + initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); @@ -2797,13 +2798,14 @@ void __init console_init(void) * set up the console device so that later boot sequences can * inform about problems etc.. */ - call = __con_initcall_start; + ce = __con_initcall_start; trace_initcall_level("console"); - while (call < __con_initcall_end) { - trace_initcall_start((*call)); - ret = (*call)(); - trace_initcall_finish((*call), ret); - call++; + while (ce < __con_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } diff --git a/security/security.c b/security/security.c index 47cfff01d7ec..736e78da1ab9 100644 --- a/security/security.c +++ b/security/security.c @@ -48,14 +48,17 @@ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] = static void __init do_security_initcalls(void) { int ret; - initcall_t *call; - call = __security_initcall_start; + initcall_t call; + initcall_entry_t *ce; + + ce = __security_initcall_start; trace_initcall_level("security"); - while (call < __security_initcall_end) { - trace_initcall_start((*call)); - ret = (*call) (); - trace_initcall_finish((*call), ret); - call++; + while (ce < __security_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } -- cgit v1.2.3 From c9d8b55fa0191623fccb9ed67d2ff8f9159e9a89 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:18 -0700 Subject: PCI: Add support for relative addressing in quirk tables Allow the PCI quirk tables to be emitted in a way that avoids absolute references to the hook functions. This reduces the size of the entries, and, more importantly, makes them invariant under runtime relocation (e.g., for KASLR) Link: http://lkml.kernel.org/r/20180704083651.24360-6-ard.biesheuvel@linaro.org Acked-by: Bjorn Helgaas Acked-by: Michael Ellerman Acked-by: Ingo Molnar Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. 
Hallyn" Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pci/quirks.c | 12 +++++++++--- include/linux/pci.h | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 46f58a9771d7..ef7143a274e0 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -66,9 +66,15 @@ static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, f->vendor == (u16) PCI_ANY_ID) && (f->device == dev->device || f->device == (u16) PCI_ANY_ID)) { - calltime = fixup_debug_start(dev, f->hook); - f->hook(dev); - fixup_debug_report(dev, calltime, f->hook); + void (*hook)(struct pci_dev *dev); +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + hook = offset_to_ptr(&f->hook_offset); +#else + hook = f->hook; +#endif + calltime = fixup_debug_start(dev, hook); + hook(dev); + fixup_debug_report(dev, calltime, hook); } } diff --git a/include/linux/pci.h b/include/linux/pci.h index 9b87f1936906..e72ca8dd6241 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1809,7 +1809,11 @@ struct pci_fixup { u16 device; /* Or PCI_ANY_ID */ u32 class; /* Or PCI_ANY_ID */ unsigned int class_shift; /* should be 0, 8, 16 */ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + int hook_offset; +#else void (*hook)(struct pci_dev *dev); +#endif }; enum pci_fixup_pass { @@ -1823,12 +1827,28 @@ enum pci_fixup_pass { pci_fixup_suspend_late, /* pci_device_suspend_late() */ }; +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) \ + __ADDRESSABLE(hook) \ + asm(".section " #sec ", \"a\" \n" \ + ".balign 16 \n" \ + ".short " #vendor ", " #device " \n" \ + ".long " #class ", " #class_shift " \n" \ + ".long " #hook " - . \n" \ + ".previous \n"); +#define DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) \ + __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) +#else /* Anonymous variables would be nice... */ #define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, class, \ class_shift, hook) \ static const struct pci_fixup __PASTE(__pci_fixup_##name,__LINE__) __used \ __attribute__((__section__(#section), aligned((sizeof(void *))))) \ = { vendor, device, class, class_shift, hook }; +#endif #define DECLARE_PCI_FIXUP_CLASS_EARLY(vendor, device, class, \ class_shift, hook) \ -- cgit v1.2.3 From 46e0c9be206fa7b11aca75da2d6b8535d0139752 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:22 -0700 Subject: kernel: tracepoints: add support for relative references To avoid the need for relocating absolute references to tracepoint structures at boot time when running relocatable kernels (which may take a disproportionate amount of space), add the option to emit these tables as relative references instead. Link: http://lkml.kernel.org/r/20180704083651.24360-7-ard.biesheuvel@linaro.org Acked-by: Michael Ellerman Acked-by: Ingo Molnar Acked-by: Steven Rostedt (VMware) Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. 
Hallyn" Cc: Sergey Senozhatsky Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracepoint.h | 19 ++++++++++++++---- kernel/tracepoint.c | 49 ++++++++++++++++++++++++---------------------- 2 files changed, 41 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d9a084c72541..7f2e16e76ac4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -249,6 +249,19 @@ extern void syscall_unregfunc(void); return static_key_false(&__tracepoint_##name.key); \ } +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define __TRACEPOINT_ENTRY(name) \ + asm(" .section \"__tracepoints_ptrs\", \"a\" \n" \ + " .balign 4 \n" \ + " .long __tracepoint_" #name " - . \n" \ + " .previous \n") +#else +#define __TRACEPOINT_ENTRY(name) \ + static struct tracepoint * const __tracepoint_ptr_##name __used \ + __attribute__((section("__tracepoints_ptrs"))) = \ + &__tracepoint_##name +#endif + /* * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration @@ -258,11 +271,9 @@ extern void syscall_unregfunc(void); static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"))) = \ + __attribute__((section("__tracepoints"), used)) = \ { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ - static struct tracepoint * const __tracepoint_ptr_##name __used \ - __attribute__((section("__tracepoints_ptrs"))) = \ - &__tracepoint_##name; + __TRACEPOINT_ENTRY(name); #define DEFINE_TRACE(name) \ DEFINE_TRACE_FN(name, NULL, NULL); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 96db841bf0fc..bf2c06ef9afc 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -371,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + if (!begin) + return; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { + const int *iter; + + for (iter = (const int *)begin; iter < (const int *)end; iter++) + fct(offset_to_ptr(iter), priv); + } else { + struct tracepoint * const *iter; + + for (iter = begin; iter < end; iter++) + fct(*iter, priv); + } +} + #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { @@ -435,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); * Ensure the tracer unregistered the module's probes before the module * teardown is performed. Prevents leaks of probe and data pointers. */ -static void tp_module_going_check_quiescent(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - WARN_ON_ONCE((*iter)->funcs); + WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) @@ -494,8 +509,9 @@ static void tracepoint_module_going(struct module *mod) * Called the going notifier before checking for * quiescence. 
*/ - tp_module_going_check_quiescent(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + for_each_tracepoint_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints, + tp_module_going_check_quiescent, NULL); break; } } @@ -547,19 +563,6 @@ static __init int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, - void (*fct)(struct tracepoint *tp, void *priv), - void *priv) -{ - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - fct(*iter, priv); -} - /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback -- cgit v1.2.3 From 9144d75e22cad3c89e6b2ccab551db9ee28d250a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 21 Aug 2018 21:57:03 -0700 Subject: include/linux/bitops.h: introduce BITS_PER_TYPE net_dim.h has a rather useful extension to BITS_PER_BYTE to compute the number of bits in a type (BITS_PER_BYTE * sizeof(T)), so promote the macro to bitops.h, alongside BITS_PER_BYTE, for wider usage. Link: http://lkml.kernel.org/r/20180706094458.14116-1-chris@chris-wilson.co.uk Signed-off-by: Chris Wilson Reviewed-by: Jani Nikula Cc: Randy Dunlap Cc: Andy Gospodarek Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 3 ++- include/linux/net_dim.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index af419012d77d..7ddb1349394d 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,7 +4,8 @@ #include #include -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h index db99240d00bd..c79e859408e6 100644 --- a/include/linux/net_dim.h +++ b/include/linux/net_dim.h @@ -363,7 +363,6 @@ static inline void net_dim_sample(u16 event_ctr, } #define NET_DIM_NEVENTS 64 -#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1)) static inline void net_dim_calc_stats(struct net_dim_sample *start, -- cgit v1.2.3 From feba04fd2cf8f6a74865338df2e3e1e94d6cfd13 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 21 Aug 2018 21:57:11 -0700 Subject: lib: add crc64 calculation routines Patch series "add crc64 calculation as kernel library", v5. This patchset adds basic implementation of crc64 calculation as a Linux kernel library. Since bcache already does crc64 by itself, this patchset also modifies bcache code to use the new crc64 library routine. Currently bcache is the only user of crc64 calculation, another potential user is bcachefs which is on the way to be in mainline kernel. Therefore it makes sense to make crc64 calculation to be a public library. bcache uses crc64 as storage checksum, if a change of crc lib routines results an inconsistent result, the unmatched checksum may make bcache 'think' the on-disk is corrupted, such a change should be avoided or detected as early as possible. 
Therefore a patch is being prepared which adds a crc test framework, to check consistency of different calculations. This patch (of 2): Add the re-write crc64 calculation routines for Linux kernel. The CRC64 polynomical arithmetic follows ECMA-182 specification, inspired by CRC paper of Dr. Ross N. Williams (see http://www.ross.net/crc/download/crc_v3.txt) and other public domain implementations. All the changes work in this way, - When Linux kernel is built, host program lib/gen_crc64table.c will be compiled to lib/gen_crc64table and executed. - The output of gen_crc64table execution is an array called as lookup table (a.k.a POLY 0x42f0e1eba9ea369) which contain 256 64-bit long numbers, this table is dumped into header file lib/crc64table.h. - Then the header file is included by lib/crc64.c for normal 64bit crc calculation. - Function declaration of the crc64 calculation routines is placed in include/linux/crc64.h Currently bcache is the only user of crc64_be(), another potential user is bcachefs which is on the way to be in mainline kernel. Therefore it makes sense to move crc64 calculation into lib/crc64.c as public code. [colyli@suse.de: fix review comments from v4] Link: http://lkml.kernel.org/r/20180726053352.2781-2-colyli@suse.de Link: http://lkml.kernel.org/r/20180718165545.1622-2-colyli@suse.de Signed-off-by: Coly Li Co-developed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko Reviewed-by: Hannes Reinecke Cc: Greg Kroah-Hartman Cc: Andy Shevchenko Cc: Michael Lyle Cc: Kent Overstreet Cc: Thomas Gleixner Cc: Kate Stewart Cc: Eric Biggers Cc: Randy Dunlap Cc: Noah Massey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crc64.h | 11 +++++++++ lib/.gitignore | 2 ++ lib/Kconfig | 8 ++++++ lib/Makefile | 11 +++++++++ lib/crc64.c | 56 ++++++++++++++++++++++++++++++++++++++++++ lib/gen_crc64table.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 156 insertions(+) create mode 100644 include/linux/crc64.h create mode 100644 lib/crc64.c create mode 100644 lib/gen_crc64table.c (limited to 'include') diff --git a/include/linux/crc64.h b/include/linux/crc64.h new file mode 100644 index 000000000000..c756e65a1b58 --- /dev/null +++ b/include/linux/crc64.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * See lib/crc64.c for the related specification and polynomial arithmetic. + */ +#ifndef _LINUX_CRC64_H +#define _LINUX_CRC64_H + +#include + +u64 __pure crc64_be(u64 crc, const void *p, size_t len); +#endif /* _LINUX_CRC64_H */ diff --git a/lib/.gitignore b/lib/.gitignore index 09aae85418ab..f2a39c9e5485 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -2,5 +2,7 @@ # Generated files # gen_crc32table +gen_crc64table crc32table.h +crc64table.h oid_registry_data.c diff --git a/lib/Kconfig b/lib/Kconfig index 706836ec314d..9c10b9852563 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -170,6 +170,14 @@ config CRC32_BIT endchoice +config CRC64 + tristate "CRC64 functions" + help + This option is provided for the case where no in-kernel-tree + modules require CRC64 functions, but a module built outside + the kernel tree does. Such modules that use library CRC64 + functions require M here. 
+ config CRC4 tristate "CRC4 functions" help diff --git a/lib/Makefile b/lib/Makefile index d95bb2525101..9baefb6cb1a1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_CRC16) += crc16.o obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o obj-$(CONFIG_CRC32) += crc32.o +obj-$(CONFIG_CRC64) += crc64.o obj-$(CONFIG_CRC32_SELFTEST) += crc32test.o obj-$(CONFIG_CRC4) += crc4.o obj-$(CONFIG_CRC7) += crc7.o @@ -217,7 +218,9 @@ obj-$(CONFIG_FONT_SUPPORT) += fonts/ obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o hostprogs-y := gen_crc32table +hostprogs-y += gen_crc64table clean-files := crc32table.h +clean-files += crc64table.h $(obj)/crc32.o: $(obj)/crc32table.h @@ -227,6 +230,14 @@ quiet_cmd_crc32 = GEN $@ $(obj)/crc32table.h: $(obj)/gen_crc32table $(call cmd,crc32) +$(obj)/crc64.o: $(obj)/crc64table.h + +quiet_cmd_crc64 = GEN $@ + cmd_crc64 = $< > $@ + +$(obj)/crc64table.h: $(obj)/gen_crc64table + $(call cmd,crc64) + # # Build a fast OID lookip registry from include/linux/oid_registry.h # diff --git a/lib/crc64.c b/lib/crc64.c new file mode 100644 index 000000000000..0ef8ae6ac047 --- /dev/null +++ b/lib/crc64.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Normal 64-bit CRC calculation. + * + * This is a basic crc64 implementation following ECMA-182 specification, + * which can be found from, + * http://www.ecma-international.org/publications/standards/Ecma-182.htm + * + * Dr. Ross N. Williams has a great document to introduce the idea of CRC + * algorithm, here the CRC64 code is also inspired by the table-driven + * algorithm and detail example from this paper. This paper can be found + * from, + * http://www.ross.net/crc/download/crc_v3.txt + * + * crc64table[256] is the lookup table of a table-driven 64-bit CRC + * calculation, which is generated by gen_crc64table.c in kernel build + * time. The polynomial of crc64 arithmetic is from ECMA-182 specification + * as well, which is defined as, + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 + * + * Copyright 2018 SUSE Linux. + * Author: Coly Li + */ + +#include +#include +#include "crc64table.h" + +MODULE_DESCRIPTION("CRC64 calculations"); +MODULE_LICENSE("GPL v2"); + +/** + * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64 + * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation, + or the previous crc64 value if computing incrementally. + * @p: pointer to buffer over which CRC64 is run + * @len: length of buffer @p + */ +u64 __pure crc64_be(u64 crc, const void *p, size_t len) +{ + size_t i, t; + + const unsigned char *_p = p; + + for (i = 0; i < len; i++) { + t = ((crc >> 56) ^ (*_p++)) & 0xFF; + crc = crc64table[t] ^ (crc << 8); + } + + return crc; +} +EXPORT_SYMBOL_GPL(crc64_be); diff --git a/lib/gen_crc64table.c b/lib/gen_crc64table.c new file mode 100644 index 000000000000..9011926e4162 --- /dev/null +++ b/lib/gen_crc64table.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate lookup table for the table-driven CRC64 calculation. + * + * gen_crc64table is executed in kernel build time and generates + * lib/crc64table.h. This header is included by lib/crc64.c for + * the table-driven CRC64 calculation. 
+ * + * See lib/crc64.c for more information about which specification + * and polynomial arithmetic that gen_crc64table.c follows to + * generate the lookup table. + * + * Copyright 2018 SUSE Linux. + * Author: Coly Li + */ +#include +#include + +#include + +#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL + +static uint64_t crc64_table[256] = {0}; + +static void generate_crc64_table(void) +{ + uint64_t i, j, c, crc; + + for (i = 0; i < 256; i++) { + crc = 0; + c = i << 56; + + for (j = 0; j < 8; j++) { + if ((crc ^ c) & 0x8000000000000000ULL) + crc = (crc << 1) ^ CRC64_ECMA182_POLY; + else + crc <<= 1; + c <<= 1; + } + + crc64_table[i] = crc; + } +} + +static void print_crc64_table(void) +{ + int i; + + printf("/* this file is generated - do not edit */\n\n"); + printf("#include \n"); + printf("#include \n\n"); + printf("static const u64 ____cacheline_aligned crc64table[256] = {\n"); + for (i = 0; i < 256; i++) { + printf("\t0x%016" PRIx64 "ULL", crc64_table[i]); + if (i & 0x1) + printf(",\n"); + else + printf(", "); + } + printf("};\n"); +} + +int main(int argc, char *argv[]) +{ + generate_crc64_table(); + print_crc64_table(); + return 0; +} -- cgit v1.2.3 From cbf6898fd69455092c43cd573b38d42c86ddb1e0 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:59:01 -0700 Subject: autofs: add AUTOFS_EXP_FORCED flag The userspace automount(8) daemon is meant to perform a forced expire when sent a SIGUSR2. But since the expiration is routed through the kernel and the kernel doesn't send an expire request if the mount is busy this hasn't worked at least since autofs version 5. Add an AUTOFS_EXP_FORCED flag to allow implemention of the feature and bump the protocol version so user space can check if it's implemented if needed. Link: http://lkml.kernel.org/r/152937734715.21213.6594007182776598970.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/expire.c | 62 +++++++++++++++++++++++++++++++++++--------- include/uapi/linux/auto_fs.h | 8 +++--- 2 files changed, 55 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index dfb666c5b8a2..d441244b79df 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -29,7 +29,8 @@ static inline int autofs_can_expire(struct dentry *dentry, } /* Check a mount point for busyness */ -static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +static int autofs_mount_busy(struct vfsmount *mnt, + struct dentry *dentry, unsigned int how) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; @@ -50,6 +51,12 @@ static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) goto done; } + /* Not a submount, has a forced expire been requested */ + if (how & AUTOFS_EXP_FORCED) { + status = 0; + goto done; + } + /* Update the expiry counter if fs is busy */ if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; @@ -189,6 +196,10 @@ static int autofs_direct_busy(struct vfsmount *mnt, { pr_debug("top %p %pd\n", top, top); + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return 0; + /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { struct autofs_info *ino; @@ -235,7 +246,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, * If the fs is busy update the expiry counter. 
*/ if (d_mountpoint(p)) { - if (autofs_mount_busy(mnt, p)) { + if (autofs_mount_busy(mnt, p, how)) { top_ino->last_used = jiffies; dput(p); return 1; @@ -258,6 +269,10 @@ static int autofs_tree_busy(struct vfsmount *mnt, } } + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return 0; + /* Timeout of a tree mount is ultimately determined by its top dentry */ if (!autofs_can_expire(top, timeout, how)) return 1; @@ -280,9 +295,15 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt, if (d_mountpoint(p)) { /* Can we umount this guy */ - if (autofs_mount_busy(mnt, p)) + if (autofs_mount_busy(mnt, p, how)) continue; + /* This isn't a submount so if a forced expire + * has been requested, user space handles busy + * mounts */ + if (how & AUTOFS_EXP_FORCED) + return p; + /* Can we expire this guy */ if (autofs_can_expire(p, timeout, how)) return p; @@ -361,9 +382,15 @@ static struct dentry *should_expire(struct dentry *dentry, pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ - if (autofs_mount_busy(mnt, dentry)) + if (autofs_mount_busy(mnt, dentry, how)) return NULL; + /* This isn't a submount so if a forced expire + * has been requested, user space handles busy + * mounts */ + if (how & AUTOFS_EXP_FORCED) + return dentry; + /* Can we expire this guy */ if (autofs_can_expire(dentry, timeout, how)) return dentry; @@ -372,6 +399,11 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); + + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return dentry; + /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. @@ -386,10 +418,13 @@ static struct dentry *should_expire(struct dentry *dentry, /* Case 2: tree mount, expire iff entire tree is not busy */ if (!(how & AUTOFS_EXP_LEAVES)) { - /* Path walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; + /* Not a forced expire? */ + if (!(how & AUTOFS_EXP_FORCED)) { + /* ref-walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + } if (!autofs_tree_busy(mnt, dentry, timeout, how)) return dentry; @@ -398,12 +433,15 @@ static struct dentry *should_expire(struct dentry *dentry, * (autofs-4.1). */ } else { - /* Path walk currently on this dentry? */ struct dentry *expired; - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; + /* Not a forced expire? */ + if (!(how & AUTOFS_EXP_FORCED)) { + /* ref-walk currently on this dentry? 
*/ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + } expired = autofs_check_leaves(mnt, dentry, timeout, how); if (expired) { diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index e13eec3dfb2f..df31aa9c9a8c 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 2 +#define AUTOFS_PROTO_SUBVERSION 3 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed @@ -90,8 +90,10 @@ enum { /* autofs version 4 and later definitions */ /* Mask for expire behaviour */ -#define AUTOFS_EXP_IMMEDIATE 1 -#define AUTOFS_EXP_LEAVES 2 +#define AUTOFS_EXP_NORMAL 0x00 +#define AUTOFS_EXP_IMMEDIATE 0x01 +#define AUTOFS_EXP_LEAVES 0x02 +#define AUTOFS_EXP_FORCED 0x04 #define AUTOFS_TYPE_ANY 0U #define AUTOFS_TYPE_INDIRECT 1U -- cgit v1.2.3 From 52cba1a274818c3ee6299c1a1b476e7bc5037a2f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 21:59:51 -0700 Subject: signal: make force_sigsegv() void Patch series "signal: refactor some functions", v3. This series refactors a bunch of functions in signal.c to simplify parts of the code. The greatest single change is declaring the static do_sigpending() helper as void which makes it possible to remove a bunch of unnecessary checks in the syscalls later on. This patch (of 17): force_sigsegv() returned 0 unconditionally so it doesn't make sense to have it return at all. In addition, there are no callers that check force_sigsegv()'s return value. Link: http://lkml.kernel.org/r/20180602103653.18181-2-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Al Viro Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/signal.h | 2 +- kernel/signal.c | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 113d1ad1ced7..e138ac16c650 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -314,7 +314,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_ptrace_errno_trap(int errno, void __user *addr); extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sigsegv(int, struct task_struct *); +extern void force_sigsegv(int sig, struct task_struct *p); extern int force_sig_info(int, struct siginfo *, struct task_struct *); extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); diff --git a/kernel/signal.c b/kernel/signal.c index 8d8a940422a8..8a828baa0f93 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1458,8 +1458,7 @@ send_sig(int sig, struct task_struct *p, int priv) return send_sig_info(sig, __si_special(priv), p); } -void -force_sig(int sig, struct task_struct *p) +void force_sig(int sig, struct task_struct *p) { force_sig_info(sig, SEND_SIG_PRIV, p); } @@ -1470,8 +1469,7 @@ force_sig(int sig, struct task_struct *p) * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. 
*/ -int -force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig, struct task_struct *p) { if (sig == SIGSEGV) { unsigned long flags; @@ -1480,7 +1478,6 @@ force_sigsegv(int sig, struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } force_sig(SIGSEGV, p); - return 0; } int force_sig_fault(int sig, int code, void __user *addr -- cgit v1.2.3 From 67a48a24478841525ba73ec6d64caaf079cf3b7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:34 -0700 Subject: signal: make unhandled_signal() return bool unhandled_signal() already behaves like a boolean function. Let's actually declare it as such too. All callers treat it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-13-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 +- kernel/signal.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3c5200137b24..fa23cae66758 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -287,7 +287,7 @@ static inline void disallow_signal(int sig) extern struct kmem_cache *sighand_cachep; -int unhandled_signal(struct task_struct *tsk, int sig); +extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) diff --git a/kernel/signal.c b/kernel/signal.c index 7eec2dba597e..c10c09fc4ec3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -505,13 +505,15 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } -int unhandled_signal(struct task_struct *tsk, int sig) +bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) - return 1; + return true; + if (handler != SIG_IGN && handler != SIG_DFL) - return 0; + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } -- cgit v1.2.3 From 20ab7218d2507b2dbec9c053c2f1b96054e266b6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:54 -0700 Subject: signal: make get_signal() return bool make get_signal() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-18-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. 
Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 +- kernel/signal.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/signal.h b/include/linux/signal.h index fa23cae66758..e3d9e0640e6e 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -265,7 +265,7 @@ extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; -extern int get_signal(struct ksignal *ksig); +extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); diff --git a/kernel/signal.c b/kernel/signal.c index 9f9b4183178e..786bacc60649 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2287,7 +2287,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal(struct ksignal *ksig) +bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2297,7 +2297,7 @@ int get_signal(struct ksignal *ksig) task_work_run(); if (unlikely(uprobe_deny_signal())) - return 0; + return false; /* * Do this once, we can't return to user-mode if freezing() == T. -- cgit v1.2.3 From dc2c8c84def6ce450c63529e08c1db100020994e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 22:01:52 -0700 Subject: ipc: get rid of ids->tables_initialized hack In sysvipc we have an ids->tables_initialized regarding the rhashtable, introduced in 0cfb6aee70bd ("ipc: optimize semget/shmget/msgget for lots of keys") It's there, specifically, to prevent nil pointer dereferences, from using an uninitialized api. Considering how rhashtable_init() can fail (probably due to ENOMEM, if anything), this made the overall ipc initialization capable of failure as well. That alone is ugly, but fine, however I've spotted a few issues regarding the semantics of tables_initialized (however unlikely they may be): - There is inconsistency in what we return to userspace: ipc_addid() returns ENOSPC which is certainly _wrong_, while ipc_obtain_object_idr() returns EINVAL. - After we started using rhashtables, ipc_findkey() can return nil upon !tables_initialized, but the caller expects nil for when the ipc structure isn't found, and can therefore call into ipcget() callbacks. Now that rhashtable initialization cannot fail, we can properly get rid of the hack altogether. 
[manfred@colorfullife.com: commit id extended to 12 digits] Link: http://lkml.kernel.org/r/20180712185241.4017-10-manfred@colorfullife.com Signed-off-by: Davidlohr Bueso Signed-off-by: Manfred Spraul Cc: Dmitry Vyukov Cc: Herbert Xu Cc: Kees Cook Cc: Michael Kerrisk Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 - ipc/util.c | 23 ++++++++--------------- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6cea726612b7..e9ccdfdb1928 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -16,7 +16,6 @@ struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; - bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; int max_id; diff --git a/ipc/util.c b/ipc/util.c index bd1863b6ed39..7c1387c9510d 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -126,7 +126,6 @@ int ipc_init_ids(struct ipc_ids *ids) if (err) return err; idr_init(&ids->ipcs_idr); - ids->tables_initialized = true; ids->max_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; @@ -179,19 +178,16 @@ void __init ipc_init_proc_interface(const char *path, const char *header, */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { - struct kern_ipc_perm *ipcp = NULL; + struct kern_ipc_perm *ipcp; - if (likely(ids->tables_initialized)) - ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, + ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, ipc_kht_params); + if (!ipcp) + return NULL; - if (ipcp) { - rcu_read_lock(); - ipc_lock_object(ipcp); - return ipcp; - } - - return NULL; + rcu_read_lock(); + ipc_lock_object(ipcp); + return ipcp; } /* @@ -269,7 +265,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) if (limit > IPCMNI) limit = IPCMNI; - if (!ids->tables_initialized || ids->in_use >= limit) + if (ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); @@ -578,9 +574,6 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) struct kern_ipc_perm *out; int lid = ipcid_to_idx(id); - if (unlikely(!ids->tables_initialized)) - return ERR_PTR(-EINVAL); - out = idr_find(&ids->ipcs_idr, lid); if (!out) return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 27c331a174614208d0b539019583990967ad9479 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 21 Aug 2018 22:02:00 -0700 Subject: ipc/util.c: further variable name cleanups The varable names got a mess, thus standardize them again: id: user space id. Called semid, shmid, msgid if the type is known. Most functions use "id" already. idx: "index" for the idr lookup Right now, some functions use lid, ipc_addid() already uses idx as the variable name. 
seq: sequence number, to avoid quick collisions of the user space id key: user space key, used for the rhash tree Link: http://lkml.kernel.org/r/20180712185241.4017-12-manfred@colorfullife.com Signed-off-by: Manfred Spraul Cc: Dmitry Vyukov Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Herbert Xu Cc: Kees Cook Cc: Michael Kerrisk Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 +- ipc/msg.c | 6 +++--- ipc/sem.c | 6 +++--- ipc/shm.c | 4 ++-- ipc/util.c | 26 +++++++++++++------------- ipc/util.h | 10 +++++----- 6 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e9ccdfdb1928..6ab8c1bada3f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -18,7 +18,7 @@ struct ipc_ids { unsigned short seq; struct rw_semaphore rwsem; struct idr ipcs_idr; - int max_id; + int max_idx; #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif diff --git a/ipc/msg.c b/ipc/msg.c index ee78dad90460..883642cf2b27 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -456,7 +456,7 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, int cmd, struct msginfo *msginfo) { int err; - int max_id; + int max_idx; /* * We must not return kernel stack data. @@ -483,9 +483,9 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_id = ipc_get_maxid(&msg_ids(ns)); + max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); - return (max_id < 0) ? 0 : max_id; + return (max_idx < 0) ? 0 : max_idx; } static int msgctl_stat(struct ipc_namespace *ns, int msqid, diff --git a/ipc/sem.c b/ipc/sem.c index e4df102f3404..26f8e37fcdcb 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1294,7 +1294,7 @@ static int semctl_info(struct ipc_namespace *ns, int semid, int cmd, void __user *p) { struct seminfo seminfo; - int max_id; + int max_idx; int err; err = security_sem_semctl(NULL, cmd); @@ -1318,11 +1318,11 @@ static int semctl_info(struct ipc_namespace *ns, int semid, seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } - max_id = ipc_get_maxid(&sem_ids(ns)); + max_idx = ipc_get_maxidx(&sem_ids(ns)); up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; - return (max_id < 0) ? 0 : max_id; + return (max_idx < 0) ? 
0 : max_idx; } static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, diff --git a/ipc/shm.c b/ipc/shm.c index c4fb359f1e53..b0eb3757ab89 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -948,7 +948,7 @@ static int shmctl_ipc_info(struct ipc_namespace *ns, shminfo->shmall = ns->shm_ctlall; shminfo->shmmin = SHMMIN; down_read(&shm_ids(ns).rwsem); - err = ipc_get_maxid(&shm_ids(ns)); + err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; @@ -968,7 +968,7 @@ static int shmctl_shm_info(struct ipc_namespace *ns, shm_info->shm_tot = ns->shm_tot; shm_info->swap_attempts = 0; shm_info->swap_successes = 0; - err = ipc_get_maxid(&shm_ids(ns)); + err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; diff --git a/ipc/util.c b/ipc/util.c index a2aae8c1410d..6f30ba80ca15 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -119,7 +119,7 @@ void ipc_init_ids(struct ipc_ids *ids) init_rwsem(&ids->rwsem); rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); - ids->max_id = -1; + ids->max_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif @@ -237,7 +237,7 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. The permissions object is - * initialised and the first free entry is set up and the id assigned + * initialised and the first free entry is set up and the index assigned * is returned. The 'new' entry is returned in a locked state on success. * * On failure the entry is not locked and a negative err-code is returned. @@ -291,8 +291,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) } ids->in_use++; - if (idx > ids->max_id) - ids->max_id = idx; + if (idx > ids->max_idx) + ids->max_idx = idx; return idx; } @@ -431,20 +431,20 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) */ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { - int lid = ipcid_to_idx(ipcp->id); + int idx = ipcid_to_idx(ipcp->id); - idr_remove(&ids->ipcs_idr, lid); + idr_remove(&ids->ipcs_idr, idx); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; - if (unlikely(lid == ids->max_id)) { + if (unlikely(idx == ids->max_idx)) { do { - lid--; - if (lid == -1) + idx--; + if (idx == -1) break; - } while (!idr_find(&ids->ipcs_idr, lid)); - ids->max_id = lid; + } while (!idr_find(&ids->ipcs_idr, idx)); + ids->max_idx = idx; } } @@ -564,9 +564,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; - int lid = ipcid_to_idx(id); + int idx = ipcid_to_idx(id); - out = idr_find(&ids->ipcs_idr, lid); + out = idr_find(&ids->ipcs_idr, idx); if (!out) return ERR_PTR(-EINVAL); diff --git a/ipc/util.h b/ipc/util.h index 6c5c77c61f85..e74564fe3375 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -113,12 +113,12 @@ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); /** - * ipc_get_maxid - get the last assigned id + * ipc_get_maxidx - get the highest assigned index * @ids: ipc identifier set * * Called with ipc_ids.rwsem held for reading. 
*/ -static inline int ipc_get_maxid(struct ipc_ids *ids) +static inline int ipc_get_maxidx(struct ipc_ids *ids) { if (ids->in_use == 0) return -1; @@ -126,7 +126,7 @@ static inline int ipc_get_maxid(struct ipc_ids *ids) if (ids->in_use == IPCMNI) return IPCMNI - 1; - return ids->max_id; + return ids->max_idx; } /* @@ -172,9 +172,9 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); -static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) +static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int id) { - return uid / SEQ_MULTIPLIER != ipcp->seq; + return ipcid_to_seqx(id) != ipcp->seq; } static inline void ipc_lock_object(struct kern_ipc_perm *perm) -- cgit v1.2.3