diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/events/core.c | 372 | ||||
-rw-r--r-- | kernel/irq/manage.c | 9 | ||||
-rw-r--r-- | kernel/memremap.c | 30 | ||||
-rw-r--r-- | kernel/module.c | 4 | ||||
-rw-r--r-- | kernel/power/process.c | 12 | ||||
-rw-r--r-- | kernel/power/suspend.c | 6 | ||||
-rw-r--r-- | kernel/resource.c | 5 | ||||
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 3 | ||||
-rw-r--r-- | kernel/sched/cpufreq.c | 37 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 26 | ||||
-rw-r--r-- | kernel/sched/rt.c | 4 | ||||
-rw-r--r-- | kernel/sched/sched.h | 48 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 36 | ||||
-rw-r--r-- | kernel/trace/power-traces.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 17 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 13 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 6 |
19 files changed, 430 insertions, 206 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 5946460b2425..614614821f00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -64,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event) * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * - * This is because we need a ctx->lock serialized variable (ctx->is_active) - * to reliably determine if a particular task/context is scheduled in. The - * task_curr() use in task_function_call() is racy in that a remote context - * switch is not a single atomic operation. - * - * As is, the situation is 'safe' because we set rq->curr before we do the - * actual context switch. This means that task_curr() will fail early, but - * we'll continue spinning on ctx->is_active until we've passed - * perf_event_task_sched_out(). - * - * Without this ctx->lock serialized variable we could have race where we find - * the task (and hence the context) would not be active while in fact they are. - * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ @@ -212,7 +212,7 @@ static int event_function(void *info) */ if (ctx->task) { if (ctx->task != current) { - ret = -EAGAIN; + ret = -ESRCH; goto unlock; } @@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da return; } -again: if (task == TASK_TOMBSTONE) return; +again: if (!task_function_call(task, event_function, &efs)) return; @@ -289,13 +289,15 @@ again: * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task != TASK_TOMBSTONE) { - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto again; - } - func(event, NULL, ctx, data); + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; } + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } @@ -314,6 +316,7 @@ again: enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -321,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -1288,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1645,7 +1656,7 @@ out: static bool is_orphaned_event(struct perf_event *event) { - return event->state == PERF_EVENT_STATE_EXIT; + return event->state == PERF_EVENT_STATE_DEAD; } static inline int pmu_filter_match(struct perf_event *event) @@ -1690,14 +1701,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1732,7 +1743,6 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL -#define DETACH_STATE 0x02UL /* * Cross CPU call to remove a performance event @@ -1752,8 +1762,6 @@ __perf_remove_from_context(struct perf_event *event, if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (flags & DETACH_STATE) - event->state = PERF_EVENT_STATE_EXIT; if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; @@ -2063,14 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2097,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { - struct perf_event_context *ctx = info; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + bool activate = true; + int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); - /* - * If we hit the 'wrong' task, we've since scheduled and - * everything should be sorted, nothing to do! - */ task_ctx = ctx; - if (ctx->task != current) + + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; goto unlock; + } /* - * If task_ctx is set, it had better be to us. + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. */ - WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); + } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } - ctx_resched(cpuctx, task_ctx); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } + unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context + * Attach a performance event to a context. + * + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = NULL; + struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); @@ -2147,40 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx, if (event->cpu != -1) event->cpu = cpu; + if (!task) { + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) + return; + /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. - * - * So what we do is we add the event to the list here, which will allow - * a future context switch to DTRT and then send a racy IPI. If the IPI - * fails to hit the right task, this means a context switch must have - * happened and that will have taken care of business. */ - raw_spin_lock_irq(&ctx->lock); - task = ctx->task; +again: /* - * Worse, we cannot even rely on the ctx actually existing anymore. If - * between find_get_context() and perf_install_in_context() the task - * went through perf_event_exit_task() its dead and we should not be - * adding new events. + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. */ - if (task == TASK_TOMBSTONE) { + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { + /* + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). + */ raw_spin_unlock_irq(&ctx->lock); return; } - update_context_time(ctx); + raw_spin_unlock_irq(&ctx->lock); /* - * Update cgrp time only if current cgrp matches event->cgrp. - * Must be done before calling add_event_to_ctx(). + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. */ - update_cgrp_time_from_event(event); - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - - if (task) - task_function_call(task, __perf_install_in_context, ctx); - else - cpu_function_call(cpu, __perf_install_in_context, ctx); + goto again; } /* @@ -2219,17 +2265,18 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - update_context_time(ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + __perf_event_mark_enabled(event); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) { - perf_cgroup_set_timestamp(current, ctx); // XXX ? + if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - } + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2237,8 +2284,10 @@ static void __perf_event_enable(struct perf_event *event, * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; + } task_ctx = cpuctx->task_ctx; if (ctx->task) @@ -2344,24 +2393,33 @@ static void ctx_sched_out(struct perf_event_context *ctx, } ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!ctx->is_active) cpuctx->task_ctx = NULL; } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2641,18 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); -} - /* * Called with IRQs disabled */ @@ -2735,7 +2781,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) return; - ctx->is_active |= event_type; + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; @@ -2743,18 +2789,24 @@ ctx_sched_in(struct perf_event_context *ctx, WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -3120,6 +3172,7 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) enabled |= event_enable_on_exec(event, ctx); @@ -3537,12 +3590,22 @@ static void unaccount_event(struct perf_event *event) if (has_branch_stack(event)) dec = true; - if (dec) - static_key_slow_dec_deferred(&perf_sched_events); + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3752,30 +3815,42 @@ static void put_event(struct perf_event *event) */ int perf_event_release_kernel(struct perf_event *event) { - struct perf_event_context *ctx; + struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. + */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); - perf_event_ctx_unlock(event, ctx); + perf_remove_from_context(event, DETACH_GROUP); + raw_spin_lock_irq(&ctx->lock); /* - * At this point we must have event->state == PERF_EVENT_STATE_EXIT, - * either from the above perf_remove_from_context() or through - * perf_event_exit_event(). + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * Therefore, anybody acquiring event->child_mutex after the below - * loop _must_ also see this, most importantly inherit_event() which - * will avoid placing more children on the list. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + + perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); @@ -3830,8 +3905,8 @@ again: } mutex_unlock(&event->child_mutex); - /* Must be the last reference */ - put_event(event); +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3988,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -7769,8 +7844,28 @@ static void account_event(struct perf_event *event) if (is_cgroup_event(event)) inc = true; - if (inc) - static_key_slow_inc(&perf_sched_events.key); + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -8389,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8509,7 +8613,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. + */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8563,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8577,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8695,7 +8810,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* @@ -9206,7 +9321,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9282,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; @@ -9315,9 +9428,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..e79e60f50bce 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1609,6 +1609,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1702,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; diff --git a/kernel/memremap.c b/kernel/memremap.c index 2c468dea60bc..6cf54615a9c4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } @@ -114,7 +114,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) @@ -136,8 +136,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } @@ -268,13 +270,16 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; + int error, nid, is_ram; unsigned long pfn; - int error, nid; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, "System RAM"); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -312,8 +317,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); align_end = align_start + align_size - 1; for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; @@ -349,8 +352,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, for_each_device_pfn(pfn, page_map) { struct page *page = pfn_to_page(pfn); - /* ZONE_DEVICE pages must never appear on a slab lru */ - list_force_poison(&page->lru); + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); page->pgmap = pgmap; } devres_add(dev, page_map); diff --git a/kernel/module.c b/kernel/module.c index 9537da37ce87..794ebe8e878d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -984,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -3313,6 +3315,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3389,6 +3392,7 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); + ftrace_module_enable(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..df058bed53ce 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..230a77225e2e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); diff --git a/kernel/resource.c b/kernel/resource.c index 09c0597840b0..3669d1bfc425 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1083,9 +1083,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..9507522164ac 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..41f6b2215aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> @@ -5096,6 +5097,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..928c4ba32f68 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c979d0e1..2037cf432a45 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -420,7 +420,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..e2987a7e489d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..27f5b03cbdbe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..faf7e2758dd0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1738,3 +1738,51 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca592f977b2..57a6eea84694 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_module_enable(struct module *mod) +void ftrace_module_enable(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } - -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - ftrace_module_enable(mod); - break; - case MODULE_STATE_GOING: - ftrace_release_mod(mod); - break; - default: - break; - } - - return 0; -} -#else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5098,10 +5068,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..81b87451c0ea 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f333e57c4614..05ddc0820771 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_generic_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - field = __find_event_field(&ftrace_common_fields, name); + field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -171,8 +171,10 @@ static int trace_define_generic_fields(void) { int ret; - __generic_field(int, cpu, FILTER_OTHER); - __generic_field(char *, comm, FILTER_PTR_STRING); + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); return ret; } @@ -869,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f93a219b18da..6816302542b2 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); - if (!strcmp(field->name, "comm")) { - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (field->filter_type == FILTER_STATIC_STRING) { + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - if (!strcmp(field->name, "cpu")) + if (field->filter_type == FILTER_CPU) fn = filter_pred_cpu; else fn = select_comparison_fn(pred->op, field->size, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 202df6cffcca..2a1abbaca10e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); |