diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/events/core.c | 83 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/locking/lockdep_proc.c | 5 | ||||
-rw-r--r-- | kernel/memremap.c | 194 | ||||
-rw-r--r-- | kernel/power/Kconfig | 6 | ||||
-rw-r--r-- | kernel/resource.c | 39 | ||||
-rw-r--r-- | kernel/sched/core.c | 4 |
7 files changed, 229 insertions, 103 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 785d708f8553..eea9d52b010c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2553,6 +2553,9 @@ unlock: return ret; } +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx); + /* * Attach a performance event to a context. * @@ -2567,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); + WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); + if (event->cpu != -1) event->cpu = cpu; @@ -4360,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return 0; /* @@ -4391,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ @@ -4411,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) return false; } -/* Called under the same ctx::mutex as perf_install_in_context() */ static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + lockdep_assert_held(&ctx->mutex); + + if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { @@ -4465,12 +4471,20 @@ static void _free_event(struct perf_event *event) if (event->destroy) event->destroy(event); - if (event->ctx) - put_ctx(event->ctx); - + /* + * Must be after ->destroy(), due to uprobe_perf_close() using + * hw.target. + */ if (event->hw.target) put_task_struct(event->hw.target); + /* + * perf_event_free_task() relies on put_ctx() being 'last', in particular + * all task references must be cleaned up. + */ + if (event->ctx) + put_ctx(event->ctx); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -4650,8 +4664,17 @@ again: mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { + void *var = &child->ctx->refcount; + list_del(&child->child_list); free_event(child); + + /* + * Wake any perf_event_free_task() waiting for this event to be + * freed. + */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); } no_ctx: @@ -10930,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } - if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { - err = -EBUSY; - goto err_context; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -11022,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open, move_group = 0; } } + + /* + * Failure to create exclusive events returns -EBUSY. + */ + err = -EBUSY; + if (!exclusive_event_installable(group_leader, ctx)) + goto err_locked; + + for_each_sibling_event(sibling, group_leader) { + if (!exclusive_event_installable(sibling, ctx)) + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } @@ -11058,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open, * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { - /* exclusive and group stuff are assumed mutually exclusive */ - WARN_ON_ONCE(move_group); - err = -EBUSY; goto err_locked; } @@ -11527,11 +11554,11 @@ static void perf_free_event(struct perf_event *event, } /* - * Free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. + * Free a context as created by inheritance by perf_event_init_task() below, + * used by fork() in case of fail. * - * Not all locks are strictly required, but take them anyway to be nice and - * help out with the lockdep assertions. + * Even though the task has never lived, the context and events have been + * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { @@ -11561,7 +11588,23 @@ void perf_event_free_task(struct task_struct *task) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); - put_ctx(ctx); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } } diff --git a/kernel/fork.c b/kernel/fork.c index ef1e05a68827..d8ae0f1b4148 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -677,7 +677,6 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 9c49ec645d8b..65b6a1600c8f 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -210,6 +210,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, sum_forward_deps = 0; +#ifdef CONFIG_PROVE_LOCKING list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) @@ -241,13 +242,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; -#ifdef CONFIG_PROVE_LOCKING sum_forward_deps += lockdep_count_forward_deps(class); -#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif + +#endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", diff --git a/kernel/memremap.c b/kernel/memremap.c index 6e1970719dc2..bea6f887adad 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,41 +11,39 @@ #include <linux/types.h> #include <linux/wait_bit.h> #include <linux/xarray.h> -#include <linux/hmm.h> static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); +static atomic_t devmap_managed_enable; + +static void devmap_managed_enable_put(void *data) { - struct page *page = device_private_entry_to_page(entry); - struct hmm_devmem *devmem; + if (atomic_dec_and_test(&devmap_managed_enable)) + static_branch_disable(&devmap_managed_key); +} - devmem = container_of(page->pgmap, typeof(*devmem), pagemap); +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + if (!pgmap->ops || !pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return -EINVAL; + } - /* - * The page_fault() callback must migrate page back to system memory - * so that CPU can access it. This might fail for various reasons - * (device issue, device was unsafely unplugged, ...). When such - * error conditions happen, the callback must return VM_FAULT_SIGBUS. - * - * Note that because memory cgroup charges are accounted to the device - * memory, this should never fail because of memory restrictions (but - * allocation of regular system page might still fail because we are - * out of memory). - * - * There is a more in-depth description of what that callback can and - * cannot do, in include/linux/memremap.h - */ - return devmem->page_fault(vma, addr, page, flags, pmdp); + if (atomic_inc_return(&devmap_managed_enable) == 1) + static_branch_enable(&devmap_managed_key); + return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); } -#endif /* CONFIG_DEVICE_PRIVATE */ +#else +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + return -EINVAL; +} +#endif /* CONFIG_DEV_PAGEMAP_OPS */ static void pgmap_array_delete(struct resource *res) { @@ -56,14 +54,8 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - const struct resource *res = &pgmap->res; - struct vmem_altmap *altmap = &pgmap->altmap; - unsigned long pfn; - - pfn = res->start >> PAGE_SHIFT; - if (pgmap->altmap_valid) - pfn += vmem_altmap_offset(altmap); - return pfn; + return (pgmap->res.start >> PAGE_SHIFT) + + vmem_altmap_offset(pgmap_altmap(pgmap)); } static unsigned long pfn_end(struct dev_pagemap *pgmap) @@ -83,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + +static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -92,10 +102,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + dev_pagemap_kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->cleanup(pgmap->ref); + dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -111,7 +121,7 @@ static void devm_memremap_pages_release(void *data) align_size >> PAGE_SHIFT, NULL); } else { arch_remove_memory(nid, align_start, align_size, - pgmap->altmap_valid ? &pgmap->altmap : NULL); + pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(align_start), align_size); } mem_hotplug_done(); @@ -122,20 +132,29 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized + * 1/ At a minimum the res and type members of @pgmap must be initialized * by the caller before passing it to this function * - * 2/ The altmap field may optionally be initialized, in which case altmap_valid - * must be set to true + * 2/ The altmap field may optionally be initialized, in which case + * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * - * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped - * at devm_memremap_pages_release() time, or if this routine fails. + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at + * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -144,22 +163,66 @@ static void devm_memremap_pages_release(void *data) void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; - struct vmem_altmap *altmap = pgmap->altmap_valid ? - &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; struct mhp_restrictions restrictions = { /* * We do not want any optional features only our own memmap */ - .altmap = altmap, + .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + bool need_devmap_managed = true; + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { + WARN(1, "Device private memory not supported\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { + WARN(1, "Missing migrate_to_ram method\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_FS_DAX: + if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || + IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + WARN(1, "File system DAX not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_PCI_P2PDMA: + need_devmap_managed = false; + break; + default: + WARN(1, "Invalid pgmap type %d\n", pgmap->type); + break; + } + + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } + } - if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); + if (need_devmap_managed) { + error = devmap_managed_enable_get(dev, pgmap); + if (error) + return ERR_PTR(error); } align_start = res->start & ~(SECTION_SIZE - 1); @@ -241,7 +304,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, altmap); + align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); } mem_hotplug_done(); @@ -271,9 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); - + dev_pagemap_kill(pgmap); + dev_pagemap_cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); @@ -287,7 +349,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ - return altmap->reserve + altmap->free; + if (altmap) + return altmap->reserve + altmap->free; + return 0; } void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) @@ -329,28 +393,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_enable; - -/* - * Toggle the static key for ->page_free() callbacks when dev_pagemap - * pages go idle. - */ -void dev_pagemap_get_ops(void) -{ - if (atomic_inc_return(&devmap_enable) == 1) - static_branch_enable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); - -void dev_pagemap_put_ops(void) -{ - if (atomic_dec_and_test(&devmap_enable)) - static_branch_disable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); - void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -366,7 +408,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page); } else if (!count) __put_page(page); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ff8592ddedee..d3667b4075c1 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -66,7 +66,7 @@ config HIBERNATION need to run mkswap against the swap partition used for the suspend. It also works with swap files to a limited extent (for details see - <file:Documentation/power/swsusp-and-swap-files.txt>). + <file:Documentation/power/swsusp-and-swap-files.rst>). Right now you may boot without resuming and resume later but in the meantime you cannot use the swap partition(s)/file(s) involved in @@ -75,7 +75,7 @@ config HIBERNATION MOUNT any journaled filesystems mounted before the suspend or they will get corrupted in a nasty way. - For more information take a look at <file:Documentation/power/swsusp.txt>. + For more information take a look at <file:Documentation/power/swsusp.rst>. config ARCH_SAVE_PAGE_KEYS bool @@ -256,7 +256,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/power/apm-acpi.txt> + and more information, read <file:Documentation/power/apm-acpi.rst> and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. diff --git a/kernel/resource.c b/kernel/resource.c index 158f04ec1d4f..d22423e85cf8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); +#ifdef CONFIG_DEVICE_PRIVATE +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. + */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + resource_size_t end, addr; + struct resource *res; + + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); + addr = end - size + 1UL; + + for (; addr > size && addr >= base->start; addr -= size) { + if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) + continue; + + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (!res) + return ERR_PTR(-ENOMEM); + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + return res; + } + + return ERR_PTR(-ERANGE); +} +EXPORT_SYMBOL_GPL(devm_request_free_mem_region); +#endif /* CONFIG_DEVICE_PRIVATE */ + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa43ce3962e7..2b037f195473 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2399,6 +2399,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; + preempt_disable(); if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -2412,7 +2413,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ if (!(p->state & state)) - return false; + goto out; success = 1; cpu = task_cpu(p); @@ -2526,6 +2527,7 @@ unlock: out: if (success) ttwu_stat(p, cpu, wake_flags); + preempt_enable(); return success; } |