From fd3cbdc0d1b5254a2e8793df58c409b469899a3f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 10 Aug 2014 08:53:39 +0200 Subject: jump_label: Fix small typos in the documentation Was reading through the documentation of this code and noticed a few typos, missing commas, etc. Cc: Jason Baron Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Borislav Petkov Cc: Andrew Morton Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Mel Gorman Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 784304b222b3..98f923b6a0ea 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -8,28 +8,28 @@ * Copyright (C) 2011-2012 Peter Zijlstra * * Jump labels provide an interface to generate dynamic branches using - * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_key_false(&key))" statement is a unconditional branch (which + * self-modifying code. Assuming toolchain and architecture support, the result + * of a "if (static_key_false(&key))" statement is an unconditional branch (which * defaults to false - and the true block is placed out of line). * * However at runtime we can change the branch target using * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key - * object and for as long as there are references all branches referring to + * object, and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the static_key_slow_{inc,dec}() functions + * Since this relies on modifying code, the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). - * OTOH, since the affected branches are unconditional their runtime overhead + * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * - * When the control is directly exposed to userspace it is prudent to delay the + * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * - * Lacking toolchain and or architecture support, it falls back to a simple + * Lacking toolchain and or architecture support, jump labels fall back to a simple * conditional branch. * * struct static_key my_key = STATIC_KEY_INIT_TRUE; @@ -43,8 +43,7 @@ * * Not initializing the key (static data is initialized to 0s anyway) is the * same as using STATIC_KEY_INIT_FALSE. - * -*/ + */ #include #include -- cgit v1.2.3 From fadfe7be6e50de7f03913833b33c56cd8fb66bac Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 1 Aug 2014 14:33:02 +0200 Subject: perf: Add queued work to remove orphaned child events In cases when the owner task exits before the workload and the workload made some forks, all the events stay in until the last workload process exits. Thats' because each child event holds parent reference. We want to release all children events once the parent is gone, because at that time there's no process to read them anyway, so they're just eating resources. This removal races with process exit, which removes all events and fork, which clone events. To be clear of those two, adding work queue to remove orphaned child for context in case such event is detected. Using delayed work queue (with delay == 1), because we queue this work under perf scheduler callbacks. Normal work queue tries to wake up the queue process, which deadlocks on rq->lock in this place. Also preventing clones from abandoned parent event. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Mark Rutland Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Mark Rutland Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1406896382-18404-4-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 +++ kernel/events/core.c | 87 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 707617a8c0f6..ef5b62bdb103 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,6 +52,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -507,6 +508,9 @@ struct perf_event_context { int nr_cgroups; /* cgroup evts */ int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; + + struct delayed_work orphans_remove; + bool orphans_remove_sched; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index bbb3ca22f07c..a25460559b4f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -46,6 +46,8 @@ #include +static struct workqueue_struct *perf_wq; + struct remote_function_call { struct task_struct *p; int (*func)(void *info); @@ -1381,6 +1383,45 @@ out: perf_event__header_size(tmp); } +/* + * User event without the task. + */ +static bool is_orphaned_event(struct perf_event *event) +{ + return event && !is_kernel_event(event) && !event->owner; +} + +/* + * Event has a parent but parent's task finished and it's + * alive only because of children holding refference. + */ +static bool is_orphaned_child(struct perf_event *event) +{ + return is_orphaned_event(event->parent); +} + +static void orphans_remove_work(struct work_struct *work); + +static void schedule_orphans_remove(struct perf_event_context *ctx) +{ + if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) + return; + + if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { + get_ctx(ctx); + ctx->orphans_remove_sched = true; + } +} + +static int __init perf_workqueue_init(void) +{ + perf_wq = create_singlethread_workqueue("perf"); + WARN(!perf_wq, "failed to create perf workqueue\n"); + return perf_wq ? 0 : -1; +} + +core_initcall(perf_workqueue_init); + static inline int event_filter_match(struct perf_event *event) { @@ -1430,6 +1471,9 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + perf_pmu_enable(event->pmu); } @@ -1732,6 +1776,9 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + out: perf_pmu_enable(event->pmu); @@ -3074,6 +3121,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); + INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3405,6 +3453,42 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +/* + * Remove all orphanes events from the context. + */ +static void orphans_remove_work(struct work_struct *work) +{ + struct perf_event_context *ctx; + struct perf_event *event, *tmp; + + ctx = container_of(work, struct perf_event_context, + orphans_remove.work); + + mutex_lock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { + struct perf_event *parent_event = event->parent; + + if (!is_orphaned_child(event)) + continue; + + perf_remove_from_context(event, true); + + mutex_lock(&parent_event->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent_event->child_mutex); + + free_event(event); + put_event(parent_event); + } + + raw_spin_lock_irq(&ctx->lock); + ctx->orphans_remove_sched = false; + raw_spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -7709,7 +7793,8 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; - if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + if (is_orphaned_event(parent_event) || + !atomic_long_inc_not_zero(&parent_event->refcount)) { free_event(child_event); return NULL; } -- cgit v1.2.3 From 770eee1fd38c70a009b321f5dbe64358f42511fd Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 11 Aug 2014 21:27:12 +0200 Subject: perf/x86: Fix data source encoding issues for load latency/precise store This patch fixes issues introuduce by Andi's previous patch 'Revamp PEBS' series. This patch fixes the following: - precise_store_data_hsw() encode the mem op type whenever we can - precise_store_data_hsw set the default data source correctly - 0 is not a valid init value for data source. Define PERF_MEM_NA as the default value This bug was actually introduced by commit 722e76e60f2775c21b087ff12c5e678cf0ebcaaf Author: Stephane Eranian Date: Thu May 15 17:56:44 2014 +0200 fix Haswell precise store data source encoding Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1407785233-32193-4-git-send-email-eranian@google.com Cc: Arnaldo Carvalho de Melo Cc: ak@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 11 +++++++---- include/linux/perf_event.h | 9 ++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index a9b60f32064f..67919ce0f76a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -113,9 +113,12 @@ static u64 precise_store_data_hsw(struct perf_event *event, u64 status) union perf_mem_data_src dse; u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; - dse.val = 0; - dse.mem_op = PERF_MEM_OP_NA; - dse.mem_lvl = PERF_MEM_LVL_NA; + dse.val = PERF_MEM_NA; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + dse.mem_op = PERF_MEM_OP_STORE; + else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW) + dse.mem_op = PERF_MEM_OP_LOAD; /* * L1 info only valid for following events: @@ -126,7 +129,7 @@ static u64 precise_store_data_hsw(struct perf_event *event, u64 status) * MEM_UOPS_RETIRED.ALL_STORES */ if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) - return dse.mem_lvl; + return dse.val; if (status & 1) dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ef5b62bdb103..f0a1036b1911 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -608,6 +608,13 @@ struct perf_sample_data { u64 txn; }; +/* default value for data source */ +#define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ + PERF_MEM_S(LVL, NA) |\ + PERF_MEM_S(SNOOP, NA) |\ + PERF_MEM_S(LOCK, NA) |\ + PERF_MEM_S(TLB, NA)) + static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { @@ -620,7 +627,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->regs_user.regs = NULL; data->stack_user_size = 0; data->weight = 0; - data->data_src.val = 0; + data->data_src.val = PERF_MEM_NA; data->txn = 0; } -- cgit v1.2.3 From 2e39465abc4b7856a0ea6fcf4f6b4668bb5db877 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Aug 2014 12:07:15 +0200 Subject: locking: Remove deprecated smp_mb__() barriers Its been a while and there are no in-tree users left, so remove the deprecated barriers. Signed-off-by: Peter Zijlstra Cc: Chen, Gong Cc: Jacob Pan Cc: Joe Perches Cc: John Sullivan Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Srinivas Pandruvada Cc: Theodore Ts'o Signed-off-by: Ingo Molnar --- include/linux/atomic.h | 36 ------------------------------------ include/linux/bitops.h | 20 -------------------- kernel/sched/core.c | 16 ---------------- 3 files changed, 72 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic.h b/include/linux/atomic.h index fef3a809e7cf..5b08a8540ecf 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -3,42 +3,6 @@ #define _LINUX_ATOMIC_H #include -/* - * Provide __deprecated wrappers for the new interface, avoid flag day changes. - * We need the ugly external functions to break header recursion hell. - */ -#ifndef smp_mb__before_atomic_inc -static inline void __deprecated smp_mb__before_atomic_inc(void) -{ - extern void __smp_mb__before_atomic(void); - __smp_mb__before_atomic(); -} -#endif - -#ifndef smp_mb__after_atomic_inc -static inline void __deprecated smp_mb__after_atomic_inc(void) -{ - extern void __smp_mb__after_atomic(void); - __smp_mb__after_atomic(); -} -#endif - -#ifndef smp_mb__before_atomic_dec -static inline void __deprecated smp_mb__before_atomic_dec(void) -{ - extern void __smp_mb__before_atomic(void); - __smp_mb__before_atomic(); -} -#endif - -#ifndef smp_mb__after_atomic_dec -static inline void __deprecated smp_mb__after_atomic_dec(void) -{ - extern void __smp_mb__after_atomic(void); - __smp_mb__after_atomic(); -} -#endif - /** * atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cbc5833fb221..be5fd38bd5a0 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -32,26 +32,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include -/* - * Provide __deprecated wrappers for the new interface, avoid flag day changes. - * We need the ugly external functions to break header recursion hell. - */ -#ifndef smp_mb__before_clear_bit -static inline void __deprecated smp_mb__before_clear_bit(void) -{ - extern void __smp_mb__before_atomic(void); - __smp_mb__before_atomic(); -} -#endif - -#ifndef smp_mb__after_clear_bit -static inline void __deprecated smp_mb__after_clear_bit(void) -{ - extern void __smp_mb__after_atomic(void); - __smp_mb__after_atomic(); -} -#endif - #define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ (bit) < (size); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575a2208..76c518c9b3a7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,22 +90,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef smp_mb__before_atomic -void __smp_mb__before_atomic(void) -{ - smp_mb__before_atomic(); -} -EXPORT_SYMBOL(__smp_mb__before_atomic); -#endif - -#ifdef smp_mb__after_atomic -void __smp_mb__after_atomic(void) -{ - smp_mb__after_atomic(); -} -EXPORT_SYMBOL(__smp_mb__after_atomic); -#endif - void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; -- cgit v1.2.3 From 7608a43d8f2e02f8b532f8e11481d7ecf8b5d3f9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:54 -0700 Subject: locking/mutexes: Use MUTEX_SPIN_ON_OWNER when appropriate 4badad35 ("locking/mutex: Disable optimistic spinning on some architectures") added a ARCH_SUPPORTS_ATOMIC_RMW flag to disable the mutex optimistic feature on specific archs. Because CONFIG_MUTEX_SPIN_ON_OWNER only depended on DEBUG and SMP, it was ok to have the ->owner field conditional a bit flexible. However by adding a new variable to the matter, we can waste space with the unused field, ie: CONFIG_SMP && (!CONFIG_MUTEX_SPIN_ON_OWNER && !CONFIG_DEBUG_MUTEX). Signed-off-by: Davidlohr Bueso Acked-by: Jason Low Signed-off-by: Peter Zijlstra Cc: aswin@hp.com Cc: Davidlohr Bueso Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Tim Chen Link: http://lkml.kernel.org/r/1406752916-3341-5-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 2 +- kernel/locking/mutex.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 8d5535c58cc2..e4c29418f407 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -52,7 +52,7 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER) struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..5cda397607f2 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,7 +16,7 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline void mutex_set_owner(struct mutex *lock) { lock->owner = current; -- cgit v1.2.3 From 214e0aed639ef40987bf6159fad303171a6de31e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:55 -0700 Subject: locking/Documentation: Move locking related docs into Documentation/locking/ Specifically: Documentation/locking/lockdep-design.txt Documentation/locking/lockstat.txt Documentation/locking/mutex-design.txt Documentation/locking/rt-mutex-design.txt Documentation/locking/rt-mutex.txt Documentation/locking/spinlocks.txt Documentation/locking/ww-mutex-design.txt Signed-off-by: Davidlohr Bueso Acked-by: Randy Dunlap Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Cc: aswin@hp.com Cc: Alexei Starovoitov Cc: Al Viro Cc: Andrew Morton Cc: Chris Mason Cc: Dan Streetman Cc: David Airlie Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Low Cc: Josef Bacik Cc: Kees Cook Cc: Linus Torvalds Cc: Lubomir Rintel Cc: Masanari Iida Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Tim Chen Cc: Vineet Gupta Cc: fengguang.wu@intel.com Link: http://lkml.kernel.org/r/1406752916-3341-6-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- Documentation/00-INDEX | 2 + Documentation/DocBook/kernel-locking.tmpl | 2 +- Documentation/lockdep-design.txt | 286 ----------- Documentation/locking/lockdep-design.txt | 286 +++++++++++ Documentation/locking/lockstat.txt | 178 +++++++ Documentation/locking/mutex-design.txt | 157 ++++++ Documentation/locking/rt-mutex-design.txt | 781 ++++++++++++++++++++++++++++++ Documentation/locking/rt-mutex.txt | 79 +++ Documentation/locking/spinlocks.txt | 167 +++++++ Documentation/locking/ww-mutex-design.txt | 344 +++++++++++++ Documentation/lockstat.txt | 178 ------- Documentation/mutex-design.txt | 157 ------ Documentation/rt-mutex-design.txt | 781 ------------------------------ Documentation/rt-mutex.txt | 79 --- Documentation/spinlocks.txt | 167 ------- Documentation/ww-mutex-design.txt | 344 ------------- MAINTAINERS | 4 +- drivers/gpu/drm/drm_modeset_lock.c | 2 +- include/linux/lockdep.h | 2 +- include/linux/mutex.h | 2 +- include/linux/rwsem.h | 2 +- kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- lib/Kconfig.debug | 4 +- 24 files changed, 2005 insertions(+), 2003 deletions(-) delete mode 100644 Documentation/lockdep-design.txt create mode 100644 Documentation/locking/lockdep-design.txt create mode 100644 Documentation/locking/lockstat.txt create mode 100644 Documentation/locking/mutex-design.txt create mode 100644 Documentation/locking/rt-mutex-design.txt create mode 100644 Documentation/locking/rt-mutex.txt create mode 100644 Documentation/locking/spinlocks.txt create mode 100644 Documentation/locking/ww-mutex-design.txt delete mode 100644 Documentation/lockstat.txt delete mode 100644 Documentation/mutex-design.txt delete mode 100644 Documentation/rt-mutex-design.txt delete mode 100644 Documentation/rt-mutex.txt delete mode 100644 Documentation/spinlocks.txt delete mode 100644 Documentation/ww-mutex-design.txt (limited to 'include/linux') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 27e67a98b7be..1750fcef1ab4 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -287,6 +287,8 @@ local_ops.txt - semantics and behavior of local atomic operations. lockdep-design.txt - documentation on the runtime locking correctness validator. +locking/ + - directory with info about kernel locking primitives lockstat.txt - info on collecting statistics on locks (and contention). lockup-watchdogs.txt diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index e584ee12a1e7..7c9cc4846cb6 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1972,7 +1972,7 @@ machines due to caching. - Documentation/spinlocks.txt: + Documentation/locking/spinlocks.txt: Linus Torvalds' spinlocking tutorial in the kernel sources. diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt deleted file mode 100644 index 5dbc99c04f6e..000000000000 --- a/Documentation/lockdep-design.txt +++ /dev/null @@ -1,286 +0,0 @@ -Runtime locking correctness validator -===================================== - -started by Ingo Molnar -additions by Arjan van de Ven - -Lock-class ----------- - -The basic object the validator operates upon is a 'class' of locks. - -A class of locks is a group of locks that are logically the same with -respect to locking rules, even if the locks may have multiple (possibly -tens of thousands of) instantiations. For example a lock in the inode -struct is one class, while each inode has its own instantiation of that -lock class. - -The validator tracks the 'state' of lock-classes, and it tracks -dependencies between different lock-classes. The validator maintains a -rolling proof that the state and the dependencies are correct. - -Unlike an lock instantiation, the lock-class itself never goes away: when -a lock-class is used for the first time after bootup it gets registered, -and all subsequent uses of that lock-class will be attached to this -lock-class. - -State ------ - -The validator tracks lock-class usage history into 4n + 1 separate state bits: - -- 'ever held in STATE context' -- 'ever held as readlock in STATE context' -- 'ever held with STATE enabled' -- 'ever held as readlock with STATE enabled' - -Where STATE can be either one of (kernel/lockdep_states.h) - - hardirq - - softirq - - reclaim_fs - -- 'ever used' [ == !unused ] - -When locking rules are violated, these state bits are presented in the -locking error messages, inside curlies. A contrived example: - - modprobe/2287 is trying to acquire lock: - (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 - - but task is already holding lock: - (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 - - -The bit position indicates STATE, STATE-read, for each of the states listed -above, and the character displayed in each indicates: - - '.' acquired while irqs disabled and not in irq context - '-' acquired in irq context - '+' acquired with irqs enabled - '?' acquired in irq context with irqs enabled. - -Unused mutexes cannot be part of the cause of an error. - - -Single-lock state rules: ------------------------- - -A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The -following states are exclusive, and only one of them is allowed to be -set for any lock-class: - - and - and - -The validator detects and reports lock usage that violate these -single-lock state rules. - -Multi-lock dependency rules: ----------------------------- - -The same lock-class must not be acquired twice, because this could lead -to lock recursion deadlocks. - -Furthermore, two locks may not be taken in different order: - - -> - -> - -because this could lead to lock inversion deadlocks. (The validator -finds such dependencies in arbitrary complexity, i.e. there can be any -other locking sequence between the acquire-lock operations, the -validator will still track all dependencies between locks.) - -Furthermore, the following usage based lock dependencies are not allowed -between any two lock-classes: - - -> - -> - -The first rule comes from the fact the a hardirq-safe lock could be -taken by a hardirq context, interrupting a hardirq-unsafe lock - and -thus could result in a lock inversion deadlock. Likewise, a softirq-safe -lock could be taken by an softirq context, interrupting a softirq-unsafe -lock. - -The above rules are enforced for any locking sequence that occurs in the -kernel: when acquiring a new lock, the validator checks whether there is -any rule violation between the new lock and any of the held locks. - -When a lock-class changes its state, the following aspects of the above -dependency rules are enforced: - -- if a new hardirq-safe lock is discovered, we check whether it - took any hardirq-unsafe lock in the past. - -- if a new softirq-safe lock is discovered, we check whether it took - any softirq-unsafe lock in the past. - -- if a new hardirq-unsafe lock is discovered, we check whether any - hardirq-safe lock took it in the past. - -- if a new softirq-unsafe lock is discovered, we check whether any - softirq-safe lock took it in the past. - -(Again, we do these checks too on the basis that an interrupt context -could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which -could lead to a lock inversion deadlock - even if that lock scenario did -not trigger in practice yet.) - -Exception: Nested data dependencies leading to nested locking -------------------------------------------------------------- - -There are a few cases where the Linux kernel acquires more than one -instance of the same lock-class. Such cases typically happen when there -is some sort of hierarchy within objects of the same type. In these -cases there is an inherent "natural" ordering between the two objects -(defined by the properties of the hierarchy), and the kernel grabs the -locks in this fixed order on each of the objects. - -An example of such an object hierarchy that results in "nested locking" -is that of a "whole disk" block-dev object and a "partition" block-dev -object; the partition is "part of" the whole device and as long as one -always takes the whole disk lock as a higher lock than the partition -lock, the lock ordering is fully correct. The validator does not -automatically detect this natural ordering, as the locking rule behind -the ordering is not static. - -In order to teach the validator about this correct usage model, new -versions of the various locking primitives were added that allow you to -specify a "nesting level". An example call, for the block device mutex, -looks like this: - -enum bdev_bd_mutex_lock_class -{ - BD_MUTEX_NORMAL, - BD_MUTEX_WHOLE, - BD_MUTEX_PARTITION -}; - - mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION); - -In this case the locking is done on a bdev object that is known to be a -partition. - -The validator treats a lock that is taken in such a nested fashion as a -separate (sub)class for the purposes of validation. - -Note: When changing code to use the _nested() primitives, be careful and -check really thoroughly that the hierarchy is correctly mapped; otherwise -you can get false positives or false negatives. - -Proof of 100% correctness: --------------------------- - -The validator achieves perfect, mathematical 'closure' (proof of locking -correctness) in the sense that for every simple, standalone single-task -locking sequence that occurred at least once during the lifetime of the -kernel, the validator proves it with a 100% certainty that no -combination and timing of these locking sequences can cause any class of -lock related deadlock. [*] - -I.e. complex multi-CPU and multi-task locking scenarios do not have to -occur in practice to prove a deadlock: only the simple 'component' -locking chains have to occur at least once (anytime, in any -task/context) for the validator to be able to prove correctness. (For -example, complex deadlocks that would normally need more than 3 CPUs and -a very unlikely constellation of tasks, irq-contexts and timings to -occur, can be detected on a plain, lightly loaded single-CPU system as -well!) - -This radically decreases the complexity of locking related QA of the -kernel: what has to be done during QA is to trigger as many "simple" -single-task locking dependencies in the kernel as possible, at least -once, to prove locking correctness - instead of having to trigger every -possible combination of locking interaction between CPUs, combined with -every possible hardirq and softirq nesting scenario (which is impossible -to do in practice). - -[*] assuming that the validator itself is 100% correct, and no other - part of the system corrupts the state of the validator in any way. - We also assume that all NMI/SMM paths [which could interrupt - even hardirq-disabled codepaths] are correct and do not interfere - with the validator. We also assume that the 64-bit 'chain hash' - value is unique for every lock-chain in the system. Also, lock - recursion must not be higher than 20. - -Performance: ------------- - -The above rules require _massive_ amounts of runtime checking. If we did -that for every lock taken and for every irqs-enable event, it would -render the system practically unusably slow. The complexity of checking -is O(N^2), so even with just a few hundred lock-classes we'd have to do -tens of thousands of checks for every event. - -This problem is solved by checking any given 'locking scenario' (unique -sequence of locks taken after each other) only once. A simple stack of -held locks is maintained, and a lightweight 64-bit hash value is -calculated, which hash is unique for every lock chain. The hash value, -when the chain is validated for the first time, is then put into a hash -table, which hash-table can be checked in a lockfree manner. If the -locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. - -Troubleshooting: ----------------- - -The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes. -Exceeding this number will trigger the following lockdep warning: - - (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - -By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical -desktop systems have less than 1,000 lock classes, so this warning -normally results from lock-class leakage or failure to properly -initialize locks. These two problems are illustrated below: - -1. Repeated module loading and unloading while running the validator - will result in lock-class leakage. The issue here is that each - load of the module will create a new set of lock classes for - that module's locks, but module unloading does not remove old - classes (see below discussion of reuse of lock classes for why). - Therefore, if that module is loaded and unloaded repeatedly, - the number of lock classes will eventually reach the maximum. - -2. Using structures such as arrays that have large numbers of - locks that are not explicitly initialized. For example, - a hash table with 8192 buckets where each bucket has its own - spinlock_t will consume 8192 lock classes -unless- each spinlock - is explicitly initialized at runtime, for example, using the - run-time spin_lock_init() as opposed to compile-time initializers - such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize - the per-bucket spinlocks would guarantee lock-class overflow. - In contrast, a loop that called spin_lock_init() on each lock - would place all 8192 locks into a single lock class. - - The moral of this story is that you should always explicitly - initialize your locks. - -One might argue that the validator should be modified to allow -lock classes to be reused. However, if you are tempted to make this -argument, first review the code and think through the changes that would -be required, keeping in mind that the lock classes to be removed are -likely to be linked into the lock-dependency graph. This turns out to -be harder to do than to say. - -Of course, if you do run out of lock classes, the next thing to do is -to find the offending lock classes. First, the following command gives -you the number of lock classes currently in use along with the maximum: - - grep "lock-classes" /proc/lockdep_stats - -This command produces the following output on a modest system: - - lock-classes: 748 [max: 8191] - -If the number allocated (748 above) increases continually over time, -then there is likely a leak. The following command can be used to -identify the leaking lock classes: - - grep "BD" /proc/lockdep - -Run the command and save the output, then compare against the output from -a later run of this command to identify the leakers. This same output -can also help you find situations where runtime lock initialization has -been omitted. diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt new file mode 100644 index 000000000000..5dbc99c04f6e --- /dev/null +++ b/Documentation/locking/lockdep-design.txt @@ -0,0 +1,286 @@ +Runtime locking correctness validator +===================================== + +started by Ingo Molnar +additions by Arjan van de Ven + +Lock-class +---------- + +The basic object the validator operates upon is a 'class' of locks. + +A class of locks is a group of locks that are logically the same with +respect to locking rules, even if the locks may have multiple (possibly +tens of thousands of) instantiations. For example a lock in the inode +struct is one class, while each inode has its own instantiation of that +lock class. + +The validator tracks the 'state' of lock-classes, and it tracks +dependencies between different lock-classes. The validator maintains a +rolling proof that the state and the dependencies are correct. + +Unlike an lock instantiation, the lock-class itself never goes away: when +a lock-class is used for the first time after bootup it gets registered, +and all subsequent uses of that lock-class will be attached to this +lock-class. + +State +----- + +The validator tracks lock-class usage history into 4n + 1 separate state bits: + +- 'ever held in STATE context' +- 'ever held as readlock in STATE context' +- 'ever held with STATE enabled' +- 'ever held as readlock with STATE enabled' + +Where STATE can be either one of (kernel/lockdep_states.h) + - hardirq + - softirq + - reclaim_fs + +- 'ever used' [ == !unused ] + +When locking rules are violated, these state bits are presented in the +locking error messages, inside curlies. A contrived example: + + modprobe/2287 is trying to acquire lock: + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + but task is already holding lock: + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + +The bit position indicates STATE, STATE-read, for each of the states listed +above, and the character displayed in each indicates: + + '.' acquired while irqs disabled and not in irq context + '-' acquired in irq context + '+' acquired with irqs enabled + '?' acquired in irq context with irqs enabled. + +Unused mutexes cannot be part of the cause of an error. + + +Single-lock state rules: +------------------------ + +A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The +following states are exclusive, and only one of them is allowed to be +set for any lock-class: + + and + and + +The validator detects and reports lock usage that violate these +single-lock state rules. + +Multi-lock dependency rules: +---------------------------- + +The same lock-class must not be acquired twice, because this could lead +to lock recursion deadlocks. + +Furthermore, two locks may not be taken in different order: + + -> + -> + +because this could lead to lock inversion deadlocks. (The validator +finds such dependencies in arbitrary complexity, i.e. there can be any +other locking sequence between the acquire-lock operations, the +validator will still track all dependencies between locks.) + +Furthermore, the following usage based lock dependencies are not allowed +between any two lock-classes: + + -> + -> + +The first rule comes from the fact the a hardirq-safe lock could be +taken by a hardirq context, interrupting a hardirq-unsafe lock - and +thus could result in a lock inversion deadlock. Likewise, a softirq-safe +lock could be taken by an softirq context, interrupting a softirq-unsafe +lock. + +The above rules are enforced for any locking sequence that occurs in the +kernel: when acquiring a new lock, the validator checks whether there is +any rule violation between the new lock and any of the held locks. + +When a lock-class changes its state, the following aspects of the above +dependency rules are enforced: + +- if a new hardirq-safe lock is discovered, we check whether it + took any hardirq-unsafe lock in the past. + +- if a new softirq-safe lock is discovered, we check whether it took + any softirq-unsafe lock in the past. + +- if a new hardirq-unsafe lock is discovered, we check whether any + hardirq-safe lock took it in the past. + +- if a new softirq-unsafe lock is discovered, we check whether any + softirq-safe lock took it in the past. + +(Again, we do these checks too on the basis that an interrupt context +could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which +could lead to a lock inversion deadlock - even if that lock scenario did +not trigger in practice yet.) + +Exception: Nested data dependencies leading to nested locking +------------------------------------------------------------- + +There are a few cases where the Linux kernel acquires more than one +instance of the same lock-class. Such cases typically happen when there +is some sort of hierarchy within objects of the same type. In these +cases there is an inherent "natural" ordering between the two objects +(defined by the properties of the hierarchy), and the kernel grabs the +locks in this fixed order on each of the objects. + +An example of such an object hierarchy that results in "nested locking" +is that of a "whole disk" block-dev object and a "partition" block-dev +object; the partition is "part of" the whole device and as long as one +always takes the whole disk lock as a higher lock than the partition +lock, the lock ordering is fully correct. The validator does not +automatically detect this natural ordering, as the locking rule behind +the ordering is not static. + +In order to teach the validator about this correct usage model, new +versions of the various locking primitives were added that allow you to +specify a "nesting level". An example call, for the block device mutex, +looks like this: + +enum bdev_bd_mutex_lock_class +{ + BD_MUTEX_NORMAL, + BD_MUTEX_WHOLE, + BD_MUTEX_PARTITION +}; + + mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION); + +In this case the locking is done on a bdev object that is known to be a +partition. + +The validator treats a lock that is taken in such a nested fashion as a +separate (sub)class for the purposes of validation. + +Note: When changing code to use the _nested() primitives, be careful and +check really thoroughly that the hierarchy is correctly mapped; otherwise +you can get false positives or false negatives. + +Proof of 100% correctness: +-------------------------- + +The validator achieves perfect, mathematical 'closure' (proof of locking +correctness) in the sense that for every simple, standalone single-task +locking sequence that occurred at least once during the lifetime of the +kernel, the validator proves it with a 100% certainty that no +combination and timing of these locking sequences can cause any class of +lock related deadlock. [*] + +I.e. complex multi-CPU and multi-task locking scenarios do not have to +occur in practice to prove a deadlock: only the simple 'component' +locking chains have to occur at least once (anytime, in any +task/context) for the validator to be able to prove correctness. (For +example, complex deadlocks that would normally need more than 3 CPUs and +a very unlikely constellation of tasks, irq-contexts and timings to +occur, can be detected on a plain, lightly loaded single-CPU system as +well!) + +This radically decreases the complexity of locking related QA of the +kernel: what has to be done during QA is to trigger as many "simple" +single-task locking dependencies in the kernel as possible, at least +once, to prove locking correctness - instead of having to trigger every +possible combination of locking interaction between CPUs, combined with +every possible hardirq and softirq nesting scenario (which is impossible +to do in practice). + +[*] assuming that the validator itself is 100% correct, and no other + part of the system corrupts the state of the validator in any way. + We also assume that all NMI/SMM paths [which could interrupt + even hardirq-disabled codepaths] are correct and do not interfere + with the validator. We also assume that the 64-bit 'chain hash' + value is unique for every lock-chain in the system. Also, lock + recursion must not be higher than 20. + +Performance: +------------ + +The above rules require _massive_ amounts of runtime checking. If we did +that for every lock taken and for every irqs-enable event, it would +render the system practically unusably slow. The complexity of checking +is O(N^2), so even with just a few hundred lock-classes we'd have to do +tens of thousands of checks for every event. + +This problem is solved by checking any given 'locking scenario' (unique +sequence of locks taken after each other) only once. A simple stack of +held locks is maintained, and a lightweight 64-bit hash value is +calculated, which hash is unique for every lock chain. The hash value, +when the chain is validated for the first time, is then put into a hash +table, which hash-table can be checked in a lockfree manner. If the +locking chain occurs again later on, the hash table tells us that we +dont have to validate the chain again. + +Troubleshooting: +---------------- + +The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes. +Exceeding this number will trigger the following lockdep warning: + + (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + +By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical +desktop systems have less than 1,000 lock classes, so this warning +normally results from lock-class leakage or failure to properly +initialize locks. These two problems are illustrated below: + +1. Repeated module loading and unloading while running the validator + will result in lock-class leakage. The issue here is that each + load of the module will create a new set of lock classes for + that module's locks, but module unloading does not remove old + classes (see below discussion of reuse of lock classes for why). + Therefore, if that module is loaded and unloaded repeatedly, + the number of lock classes will eventually reach the maximum. + +2. Using structures such as arrays that have large numbers of + locks that are not explicitly initialized. For example, + a hash table with 8192 buckets where each bucket has its own + spinlock_t will consume 8192 lock classes -unless- each spinlock + is explicitly initialized at runtime, for example, using the + run-time spin_lock_init() as opposed to compile-time initializers + such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize + the per-bucket spinlocks would guarantee lock-class overflow. + In contrast, a loop that called spin_lock_init() on each lock + would place all 8192 locks into a single lock class. + + The moral of this story is that you should always explicitly + initialize your locks. + +One might argue that the validator should be modified to allow +lock classes to be reused. However, if you are tempted to make this +argument, first review the code and think through the changes that would +be required, keeping in mind that the lock classes to be removed are +likely to be linked into the lock-dependency graph. This turns out to +be harder to do than to say. + +Of course, if you do run out of lock classes, the next thing to do is +to find the offending lock classes. First, the following command gives +you the number of lock classes currently in use along with the maximum: + + grep "lock-classes" /proc/lockdep_stats + +This command produces the following output on a modest system: + + lock-classes: 748 [max: 8191] + +If the number allocated (748 above) increases continually over time, +then there is likely a leak. The following command can be used to +identify the leaking lock classes: + + grep "BD" /proc/lockdep + +Run the command and save the output, then compare against the output from +a later run of this command to identify the leakers. This same output +can also help you find situations where runtime lock initialization has +been omitted. diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt new file mode 100644 index 000000000000..7428773a1e69 --- /dev/null +++ b/Documentation/locking/lockstat.txt @@ -0,0 +1,178 @@ + +LOCK STATISTICS + +- WHAT + +As the name suggests, it provides statistics on locks. + +- WHY + +Because things like lock contention can severely impact performance. + +- HOW + +Lockdep already has hooks in the lock functions and maps lock instances to +lock classes. We build on that (see Documentation/lokcing/lockdep-design.txt). +The graph below shows the relation between the lock functions and the various +hooks therein. + + __acquire + | + lock _____ + | \ + | __contended + | | + | + | _______/ + |/ + | + __acquired + | + . + + . + | + __release + | + unlock + +lock, unlock - the regular lock functions +__* - the hooks +<> - states + +With these hooks we provide the following statistics: + + con-bounces - number of lock contention that involved x-cpu data + contentions - number of lock acquisitions that had to wait + wait time min - shortest (non-0) time we ever had to wait for a lock + max - longest time we ever had to wait for a lock + total - total time we spend waiting on this lock + avg - average time spent waiting on this lock + acq-bounces - number of lock acquisitions that involved x-cpu data + acquisitions - number of times we took the lock + hold time min - shortest (non-0) time we ever held the lock + max - longest time we ever held the lock + total - total time this lock was held + avg - average time this lock was held + +These numbers are gathered per lock class, per read/write state (when +applicable). + +It also tracks 4 contention points per class. A contention point is a call site +that had to wait on lock acquisition. + + - CONFIGURATION + +Lock statistics are enabled via CONFIG_LOCK_STAT. + + - USAGE + +Enable collection of statistics: + +# echo 1 >/proc/sys/kernel/lock_stat + +Disable collection of statistics: + +# echo 0 >/proc/sys/kernel/lock_stat + +Look at the current lock statistics: + +( line numbers not part of actual output, done for clarity in the explanation + below ) + +# less /proc/lock_stat + +01 lock_stat version 0.4 +02----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg +04----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +05 +06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99 +07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77 +08 --------------- +09 &mm->mmap_sem 1 [] khugepaged_scan_mm_slot+0x57/0x280 +19 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 +11 &mm->mmap_sem 34 [] vm_mmap_pgoff+0x87/0xd0 +12 &mm->mmap_sem 17 [] vm_munmap+0x41/0x80 +13 --------------- +14 &mm->mmap_sem 1 [] dup_mmap+0x2a/0x3f0 +15 &mm->mmap_sem 60 [] SyS_mprotect+0xe9/0x250 +16 &mm->mmap_sem 41 [] __do_page_fault+0x1d4/0x510 +17 &mm->mmap_sem 68 [] vm_mmap_pgoff+0x87/0xd0 +18 +19............................................................................................................................................................................................................................. +20 +21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48 +22 --------------- +23 unix_table_lock 45 [] unix_create1+0x16e/0x1b0 +24 unix_table_lock 47 [] unix_release_sock+0x31/0x250 +25 unix_table_lock 15 [] unix_find_other+0x117/0x230 +26 unix_table_lock 5 [] unix_autobind+0x11f/0x1b0 +27 --------------- +28 unix_table_lock 39 [] unix_release_sock+0x31/0x250 +29 unix_table_lock 49 [] unix_create1+0x16e/0x1b0 +30 unix_table_lock 20 [] unix_find_other+0x117/0x230 +31 unix_table_lock 4 [] unix_autobind+0x11f/0x1b0 + + +This excerpt shows the first two lock class statistics. Line 01 shows the +output version - each time the format changes this will be updated. Line 02-04 +show the header with column descriptions. Lines 05-18 and 20-31 show the actual +statistics. These statistics come in two parts; the actual stats separated by a +short separator (line 08, 13) from the contention points. + +The first lock (05-18) is a read/write lock, and shows two lines above the +short separator. The contention points don't match the column descriptors, +they have two: contentions and [] symbol. The second set of contention +points are the points we're contending with. + +The integer part of the time values is in us. + +Dealing with nested locks, subclasses may appear: + +32........................................................................................................................................................................................................................... +33 +34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82 +35 --------- +36 &rq->lock 645 [] task_rq_lock+0x43/0x75 +37 &rq->lock 297 [] try_to_wake_up+0x127/0x25a +38 &rq->lock 360 [] select_task_rq_fair+0x1f0/0x74a +39 &rq->lock 428 [] scheduler_tick+0x46/0x1fb +40 --------- +41 &rq->lock 77 [] task_rq_lock+0x43/0x75 +42 &rq->lock 174 [] try_to_wake_up+0x127/0x25a +43 &rq->lock 4715 [] double_rq_lock+0x42/0x54 +44 &rq->lock 893 [] schedule+0x157/0x7b8 +45 +46........................................................................................................................................................................................................................... +47 +48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84 +49 ----------- +50 &rq->lock/1 11526 [] double_rq_lock+0x4f/0x54 +51 ----------- +52 &rq->lock/1 5645 [] double_rq_lock+0x42/0x54 +53 &rq->lock/1 1224 [] schedule+0x157/0x7b8 +54 &rq->lock/1 4336 [] double_rq_lock+0x4f/0x54 +55 &rq->lock/1 181 [] try_to_wake_up+0x127/0x25a + +Line 48 shows statistics for the second subclass (/1) of &rq->lock class +(subclass starts from 0), since in this case, as line 50 suggests, +double_rq_lock actually acquires a nested lock of two spinlocks. + +View the top contending locks: + +# grep : /proc/lock_stat | head + clockevents_lock: 2926159 2947636 0.15 46882.81 1784540466.34 605.41 3381345 3879161 0.00 2260.97 53178395.68 13.71 + tick_broadcast_lock: 346460 346717 0.18 2257.43 39364622.71 113.54 3642919 4242696 0.00 2263.79 49173646.60 11.59 + &mapping->i_mmap_mutex: 203896 203899 3.36 645530.05 31767507988.39 155800.21 3361776 8893984 0.17 2254.15 14110121.02 1.59 + &rq->lock: 135014 136909 0.18 606.09 842160.68 6.15 1540728 10436146 0.00 728.72 17606683.41 1.69 + &(&zone->lru_lock)->rlock: 93000 94934 0.16 59.18 188253.78 1.98 1199912 3809894 0.15 391.40 3559518.81 0.93 + tasklist_lock-W: 40667 41130 0.23 1189.42 428980.51 10.43 270278 510106 0.16 653.51 3939674.91 7.72 + tasklist_lock-R: 21298 21305 0.20 1310.05 215511.12 10.12 186204 241258 0.14 1162.33 1179779.23 4.89 + rcu_node_1: 47656 49022 0.16 635.41 193616.41 3.95 844888 1865423 0.00 764.26 1656226.96 0.89 + &(&dentry->d_lockref.lock)->rlock: 39791 40179 0.15 1302.08 88851.96 2.21 2790851 12527025 0.10 1910.75 3379714.27 0.27 + rcu_node_0: 29203 30064 0.16 786.55 1555573.00 51.74 88963 244254 0.00 398.87 428872.51 1.76 + +Clear the statistics: + +# echo 0 > /proc/lock_stat diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt new file mode 100644 index 000000000000..ee231ed09ec6 --- /dev/null +++ b/Documentation/locking/mutex-design.txt @@ -0,0 +1,157 @@ +Generic Mutex Subsystem + +started by Ingo Molnar +updated by Davidlohr Bueso + +What are mutexes? +----------------- + +In the Linux kernel, mutexes refer to a particular locking primitive +that enforces serialization on shared memory systems, and not only to +the generic term referring to 'mutual exclusion' found in academia +or similar theoretical text books. Mutexes are sleeping locks which +behave similarly to binary semaphores, and were introduced in 2006[1] +as an alternative to these. This new data structure provided a number +of advantages, including simpler interfaces, and at that time smaller +code (see Disadvantages). + +[1] http://lwn.net/Articles/164802/ + +Implementation +-------------- + +Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h +and implemented in kernel/locking/mutex.c. These locks use a three +state atomic counter (->count) to represent the different possible +transitions that can occur during the lifetime of a lock: + + 1: unlocked + 0: locked, no waiters + negative: locked, with potential waiters + +In its most basic form it also includes a wait-queue and a spinlock +that serializes access to it. CONFIG_SMP systems can also include +a pointer to the lock task owner (->owner) as well as a spinner MCS +lock (->osq), both described below in (ii). + +When acquiring a mutex, there are three possible paths that can be +taken, depending on the state of the lock: + +(i) fastpath: tries to atomically acquire the lock by decrementing the + counter. If it was already taken by another task it goes to the next + possible path. This logic is architecture specific. On x86-64, the + locking fastpath is 2 instructions: + + 0000000000000e10 : + e21: f0 ff 0b lock decl (%rbx) + e24: 79 08 jns e2e + + the unlocking fastpath is equally tight: + + 0000000000000bc0 : + bc8: f0 ff 07 lock incl (%rdi) + bcb: 7f 0a jg bd7 + + +(ii) midpath: aka optimistic spinning, tries to spin for acquisition + while the lock owner is running and there are no other tasks ready + to run that have higher priority (need_resched). The rationale is + that if the lock owner is running, it is likely to release the lock + soon. The mutex spinners are queued up using MCS lock so that only + one spinner can compete for the mutex. + + The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock + with the desirable properties of being fair and with each cpu trying + to acquire the lock spinning on a local variable. It avoids expensive + cacheline bouncing that common test-and-set spinlock implementations + incur. An MCS-like lock is specially tailored for optimistic spinning + for sleeping lock implementation. An important feature of the customized + MCS lock is that it has the extra property that spinners are able to exit + the MCS spinlock queue when they need to reschedule. This further helps + avoid situations where MCS spinners that need to reschedule would continue + waiting to spin on mutex owner, only to go directly to slowpath upon + obtaining the MCS lock. + + +(iii) slowpath: last resort, if the lock is still unable to be acquired, + the task is added to the wait-queue and sleeps until woken up by the + unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE. + +While formally kernel mutexes are sleepable locks, it is path (ii) that +makes them more practically a hybrid type. By simply not interrupting a +task and busy-waiting for a few cycles instead of immediately sleeping, +the performance of this lock has been seen to significantly improve a +number of workloads. Note that this technique is also used for rw-semaphores. + +Semantics +--------- + +The mutex subsystem checks and enforces the following rules: + + - Only one task can hold the mutex at a time. + - Only the owner can unlock the mutex. + - Multiple unlocks are not permitted. + - Recursive locking/unlocking is not permitted. + - A mutex must only be initialized via the API (see below). + - A task may not exit with a mutex held. + - Memory areas where held locks reside must not be freed. + - Held mutexes must not be reinitialized. + - Mutexes may not be used in hardware or software interrupt + contexts such as tasklets and timers. + +These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled. +In addition, the mutex debugging code also implements a number of other +features that make lock debugging easier and faster: + + - Uses symbolic names of mutexes, whenever they are printed + in debug output. + - Point-of-acquire tracking, symbolic lookup of function names, + list of all locks held in the system, printout of them. + - Owner tracking. + - Detects self-recursing locks and prints out all relevant info. + - Detects multi-task circular deadlocks and prints out all affected + locks and tasks (and only those tasks). + + +Interfaces +---------- +Statically define the mutex: + DEFINE_MUTEX(name); + +Dynamically initialize the mutex: + mutex_init(mutex); + +Acquire the mutex, uninterruptible: + void mutex_lock(struct mutex *lock); + void mutex_lock_nested(struct mutex *lock, unsigned int subclass); + int mutex_trylock(struct mutex *lock); + +Acquire the mutex, interruptible: + int mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass); + int mutex_lock_interruptible(struct mutex *lock); + +Acquire the mutex, interruptible, if dec to 0: + int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); + +Unlock the mutex: + void mutex_unlock(struct mutex *lock); + +Test if the mutex is taken: + int mutex_is_locked(struct mutex *lock); + +Disadvantages +------------- + +Unlike its original design and purpose, 'struct mutex' is larger than +most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice +as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the +'struct rw_semaphore' variant. Larger structure sizes mean more CPU +cache and memory footprint. + +When to use mutexes +------------------- + +Unless the strict semantics of mutexes are unsuitable and/or the critical +region prevents the lock from being shared, always prefer them to any other +locking primitive. diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt new file mode 100644 index 000000000000..8666070d3189 --- /dev/null +++ b/Documentation/locking/rt-mutex-design.txt @@ -0,0 +1,781 @@ +# +# Copyright (c) 2006 Steven Rostedt +# Licensed under the GNU Free Documentation License, Version 1.2 +# + +RT-mutex implementation design +------------------------------ + +This document tries to describe the design of the rtmutex.c implementation. +It doesn't describe the reasons why rtmutex.c exists. For that please see +Documentation/rt-mutex.txt. Although this document does explain problems +that happen without this code, but that is in the concept to understand +what the code actually is doing. + +The goal of this document is to help others understand the priority +inheritance (PI) algorithm that is used, as well as reasons for the +decisions that were made to implement PI in the manner that was done. + + +Unbounded Priority Inversion +---------------------------- + +Priority inversion is when a lower priority process executes while a higher +priority process wants to run. This happens for several reasons, and +most of the time it can't be helped. Anytime a high priority process wants +to use a resource that a lower priority process has (a mutex for example), +the high priority process must wait until the lower priority process is done +with the resource. This is a priority inversion. What we want to prevent +is something called unbounded priority inversion. That is when the high +priority process is prevented from running by a lower priority process for +an undetermined amount of time. + +The classic example of unbounded priority inversion is where you have three +processes, let's call them processes A, B, and C, where A is the highest +priority process, C is the lowest, and B is in between. A tries to grab a lock +that C owns and must wait and lets C run to release the lock. But in the +meantime, B executes, and since B is of a higher priority than C, it preempts C, +but by doing so, it is in fact preempting A which is a higher priority process. +Now there's no way of knowing how long A will be sleeping waiting for C +to release the lock, because for all we know, B is a CPU hog and will +never give C a chance to release the lock. This is called unbounded priority +inversion. + +Here's a little ASCII art to show the problem. + + grab lock L1 (owned by C) + | +A ---+ + C preempted by B + | +C +----+ + +B +--------> + B now keeps A from running. + + +Priority Inheritance (PI) +------------------------- + +There are several ways to solve this issue, but other ways are out of scope +for this document. Here we only discuss PI. + +PI is where a process inherits the priority of another process if the other +process blocks on a lock owned by the current process. To make this easier +to understand, let's use the previous example, with processes A, B, and C again. + +This time, when A blocks on the lock owned by C, C would inherit the priority +of A. So now if B becomes runnable, it would not preempt C, since C now has +the high priority of A. As soon as C releases the lock, it loses its +inherited priority, and A then can continue with the resource that C had. + +Terminology +----------- + +Here I explain some terminology that is used in this document to help describe +the design that is used to implement PI. + +PI chain - The PI chain is an ordered series of locks and processes that cause + processes to inherit priorities from a previous process that is + blocked on one of its locks. This is described in more detail + later in this document. + +mutex - In this document, to differentiate from locks that implement + PI and spin locks that are used in the PI code, from now on + the PI locks will be called a mutex. + +lock - In this document from now on, I will use the term lock when + referring to spin locks that are used to protect parts of the PI + algorithm. These locks disable preemption for UP (when + CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from + entering critical sections simultaneously. + +spin lock - Same as lock above. + +waiter - A waiter is a struct that is stored on the stack of a blocked + process. Since the scope of the waiter is within the code for + a process being blocked on the mutex, it is fine to allocate + the waiter on the process's stack (local variable). This + structure holds a pointer to the task, as well as the mutex that + the task is blocked on. It also has the plist node structures to + place the task in the waiter_list of a mutex as well as the + pi_list of a mutex owner task (described below). + + waiter is sometimes used in reference to the task that is waiting + on a mutex. This is the same as waiter->task. + +waiters - A list of processes that are blocked on a mutex. + +top waiter - The highest priority process waiting on a specific mutex. + +top pi waiter - The highest priority process waiting on one of the mutexes + that a specific process owns. + +Note: task and process are used interchangeably in this document, mostly to + differentiate between two processes that are being described together. + + +PI chain +-------- + +The PI chain is a list of processes and mutexes that may cause priority +inheritance to take place. Multiple chains may converge, but a chain +would never diverge, since a process can't be blocked on more than one +mutex at a time. + +Example: + + Process: A, B, C, D, E + Mutexes: L1, L2, L3, L4 + + A owns: L1 + B blocked on L1 + B owns L2 + C blocked on L2 + C owns L3 + D blocked on L3 + D owns L4 + E blocked on L4 + +The chain would be: + + E->L4->D->L3->C->L2->B->L1->A + +To show where two chains merge, we could add another process F and +another mutex L5 where B owns L5 and F is blocked on mutex L5. + +The chain for F would be: + + F->L5->B->L1->A + +Since a process may own more than one mutex, but never be blocked on more than +one, the chains merge. + +Here we show both chains: + + E->L4->D->L3->C->L2-+ + | + +->B->L1->A + | + F->L5-+ + +For PI to work, the processes at the right end of these chains (or we may +also call it the Top of the chain) must be equal to or higher in priority +than the processes to the left or below in the chain. + +Also since a mutex may have more than one process blocked on it, we can +have multiple chains merge at mutexes. If we add another process G that is +blocked on mutex L2: + + G->L2->B->L1->A + +And once again, to show how this can grow I will show the merging chains +again. + + E->L4->D->L3->C-+ + +->L2-+ + | | + G-+ +->B->L1->A + | + F->L5-+ + + +Plist +----- + +Before I go further and talk about how the PI chain is stored through lists +on both mutexes and processes, I'll explain the plist. This is similar to +the struct list_head functionality that is already in the kernel. +The implementation of plist is out of scope for this document, but it is +very important to understand what it does. + +There are a few differences between plist and list, the most important one +being that plist is a priority sorted linked list. This means that the +priorities of the plist are sorted, such that it takes O(1) to retrieve the +highest priority item in the list. Obviously this is useful to store processes +based on their priorities. + +Another difference, which is important for implementation, is that, unlike +list, the head of the list is a different element than the nodes of a list. +So the head of the list is declared as struct plist_head and nodes that will +be added to the list are declared as struct plist_node. + + +Mutex Waiter List +----------------- + +Every mutex keeps track of all the waiters that are blocked on itself. The mutex +has a plist to store these waiters by priority. This list is protected by +a spin lock that is located in the struct of the mutex. This lock is called +wait_lock. Since the modification of the waiter list is never done in +interrupt context, the wait_lock can be taken without disabling interrupts. + + +Task PI List +------------ + +To keep track of the PI chains, each process has its own PI list. This is +a list of all top waiters of the mutexes that are owned by the process. +Note that this list only holds the top waiters and not all waiters that are +blocked on mutexes owned by the process. + +The top of the task's PI list is always the highest priority task that +is waiting on a mutex that is owned by the task. So if the task has +inherited a priority, it will always be the priority of the task that is +at the top of this list. + +This list is stored in the task structure of a process as a plist called +pi_list. This list is protected by a spin lock also in the task structure, +called pi_lock. This lock may also be taken in interrupt context, so when +locking the pi_lock, interrupts must be disabled. + + +Depth of the PI Chain +--------------------- + +The maximum depth of the PI chain is not dynamic, and could actually be +defined. But is very complex to figure it out, since it depends on all +the nesting of mutexes. Let's look at the example where we have 3 mutexes, +L1, L2, and L3, and four separate functions func1, func2, func3 and func4. +The following shows a locking order of L1->L2->L3, but may not actually +be directly nested that way. + +void func1(void) +{ + mutex_lock(L1); + + /* do anything */ + + mutex_unlock(L1); +} + +void func2(void) +{ + mutex_lock(L1); + mutex_lock(L2); + + /* do something */ + + mutex_unlock(L2); + mutex_unlock(L1); +} + +void func3(void) +{ + mutex_lock(L2); + mutex_lock(L3); + + /* do something else */ + + mutex_unlock(L3); + mutex_unlock(L2); +} + +void func4(void) +{ + mutex_lock(L3); + + /* do something again */ + + mutex_unlock(L3); +} + +Now we add 4 processes that run each of these functions separately. +Processes A, B, C, and D which run functions func1, func2, func3 and func4 +respectively, and such that D runs first and A last. With D being preempted +in func4 in the "do something again" area, we have a locking that follows: + +D owns L3 + C blocked on L3 + C owns L2 + B blocked on L2 + B owns L1 + A blocked on L1 + +And thus we have the chain A->L1->B->L2->C->L3->D. + +This gives us a PI depth of 4 (four processes), but looking at any of the +functions individually, it seems as though they only have at most a locking +depth of two. So, although the locking depth is defined at compile time, +it still is very difficult to find the possibilities of that depth. + +Now since mutexes can be defined by user-land applications, we don't want a DOS +type of application that nests large amounts of mutexes to create a large +PI chain, and have the code holding spin locks while looking at a large +amount of data. So to prevent this, the implementation not only implements +a maximum lock depth, but also only holds at most two different locks at a +time, as it walks the PI chain. More about this below. + + +Mutex owner and flags +--------------------- + +The mutex structure contains a pointer to the owner of the mutex. If the +mutex is not owned, this owner is set to NULL. Since all architectures +have the task structure on at least a four byte alignment (and if this is +not true, the rtmutex.c code will be broken!), this allows for the two +least significant bits to be used as flags. This part is also described +in Documentation/rt-mutex.txt, but will also be briefly described here. + +Bit 0 is used as the "Pending Owner" flag. This is described later. +Bit 1 is used as the "Has Waiters" flags. This is also described later + in more detail, but is set whenever there are waiters on a mutex. + + +cmpxchg Tricks +-------------- + +Some architectures implement an atomic cmpxchg (Compare and Exchange). This +is used (when applicable) to keep the fast path of grabbing and releasing +mutexes short. + +cmpxchg is basically the following function performed atomically: + +unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C) +{ + unsigned long T = *A; + if (*A == *B) { + *A = *C; + } + return T; +} +#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c) + +This is really nice to have, since it allows you to only update a variable +if the variable is what you expect it to be. You know if it succeeded if +the return value (the old value of A) is equal to B. + +The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If +the architecture does not support CMPXCHG, then this macro is simply set +to fail every time. But if CMPXCHG is supported, then this will +help out extremely to keep the fast path short. + +The use of rt_mutex_cmpxchg with the flags in the owner field help optimize +the system for architectures that support it. This will also be explained +later in this document. + + +Priority adjustments +-------------------- + +The implementation of the PI code in rtmutex.c has several places that a +process must adjust its priority. With the help of the pi_list of a +process this is rather easy to know what needs to be adjusted. + +The functions implementing the task adjustments are rt_mutex_adjust_prio, +__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock +to already be taken), rt_mutex_getprio, and rt_mutex_setprio. + +rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio. + +rt_mutex_getprio returns the priority that the task should have. Either the +task's own normal priority, or if a process of a higher priority is waiting on +a mutex owned by the task, then that higher priority should be returned. +Since the pi_list of a task holds an order by priority list of all the top +waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs +to compare the top pi waiter to its own normal priority, and return the higher +priority back. + +(Note: if looking at the code, you will notice that the lower number of + prio is returned. This is because the prio field in the task structure + is an inverse order of the actual priority. So a "prio" of 5 is + of higher priority than a "prio" of 10.) + +__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the +result does not equal the task's current priority, then rt_mutex_setprio +is called to adjust the priority of the task to the new priority. +Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the +actual change in priority. + +It is interesting to note that __rt_mutex_adjust_prio can either increase +or decrease the priority of the task. In the case that a higher priority +process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio +would increase/boost the task's priority. But if a higher priority task +were for some reason to leave the mutex (timeout or signal), this same function +would decrease/unboost the priority of the task. That is because the pi_list +always contains the highest priority task that is waiting on a mutex owned +by the task, so we only need to compare the priority of that top pi waiter +to the normal priority of the given task. + + +High level overview of the PI chain walk +---------------------------------------- + +The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain. + +The implementation has gone through several iterations, and has ended up +with what we believe is the best. It walks the PI chain by only grabbing +at most two locks at a time, and is very efficient. + +The rt_mutex_adjust_prio_chain can be used either to boost or lower process +priorities. + +rt_mutex_adjust_prio_chain is called with a task to be checked for PI +(de)boosting (the owner of a mutex that a process is blocking on), a flag to +check for deadlocking, the mutex that the task owns, and a pointer to a waiter +that is the process's waiter struct that is blocked on the mutex (although this +parameter may be NULL for deboosting). + +For this explanation, I will not mention deadlock detection. This explanation +will try to stay at a high level. + +When this function is called, there are no locks held. That also means +that the state of the owner and lock can change when entered into this function. + +Before this function is called, the task has already had rt_mutex_adjust_prio +performed on it. This means that the task is set to the priority that it +should be at, but the plist nodes of the task's waiter have not been updated +with the new priorities, and that this task may not be in the proper locations +in the pi_lists and wait_lists that the task is blocked on. This function +solves all that. + +A loop is entered, where task is the owner to be checked for PI changes that +was passed by parameter (for the first iteration). The pi_lock of this task is +taken to prevent any more changes to the pi_list of the task. This also +prevents new tasks from completing the blocking on a mutex that is owned by this +task. + +If the task is not blocked on a mutex then the loop is exited. We are at +the top of the PI chain. + +A check is now done to see if the original waiter (the process that is blocked +on the current mutex) is the top pi waiter of the task. That is, is this +waiter on the top of the task's pi_list. If it is not, it either means that +there is another process higher in priority that is blocked on one of the +mutexes that the task owns, or that the waiter has just woken up via a signal +or timeout and has left the PI chain. In either case, the loop is exited, since +we don't need to do any more changes to the priority of the current task, or any +task that owns a mutex that this current task is waiting on. A priority chain +walk is only needed when a new top pi waiter is made to a task. + +The next check sees if the task's waiter plist node has the priority equal to +the priority the task is set at. If they are equal, then we are done with +the loop. Remember that the function started with the priority of the +task adjusted, but the plist nodes that hold the task in other processes +pi_lists have not been adjusted. + +Next, we look at the mutex that the task is blocked on. The mutex's wait_lock +is taken. This is done by a spin_trylock, because the locking order of the +pi_lock and wait_lock goes in the opposite direction. If we fail to grab the +lock, the pi_lock is released, and we restart the loop. + +Now that we have both the pi_lock of the task as well as the wait_lock of +the mutex the task is blocked on, we update the task's waiter's plist node +that is located on the mutex's wait_list. + +Now we release the pi_lock of the task. + +Next the owner of the mutex has its pi_lock taken, so we can update the +task's entry in the owner's pi_list. If the task is the highest priority +process on the mutex's wait_list, then we remove the previous top waiter +from the owner's pi_list, and replace it with the task. + +Note: It is possible that the task was the current top waiter on the mutex, + in which case the task is not yet on the pi_list of the waiter. This + is OK, since plist_del does nothing if the plist node is not on any + list. + +If the task was not the top waiter of the mutex, but it was before we +did the priority updates, that means we are deboosting/lowering the +task. In this case, the task is removed from the pi_list of the owner, +and the new top waiter is added. + +Lastly, we unlock both the pi_lock of the task, as well as the mutex's +wait_lock, and continue the loop again. On the next iteration of the +loop, the previous owner of the mutex will be the task that will be +processed. + +Note: One might think that the owner of this mutex might have changed + since we just grab the mutex's wait_lock. And one could be right. + The important thing to remember is that the owner could not have + become the task that is being processed in the PI chain, since + we have taken that task's pi_lock at the beginning of the loop. + So as long as there is an owner of this mutex that is not the same + process as the tasked being worked on, we are OK. + + Looking closely at the code, one might be confused. The check for the + end of the PI chain is when the task isn't blocked on anything or the + task's waiter structure "task" element is NULL. This check is + protected only by the task's pi_lock. But the code to unlock the mutex + sets the task's waiter structure "task" element to NULL with only + the protection of the mutex's wait_lock, which was not taken yet. + Isn't this a race condition if the task becomes the new owner? + + The answer is No! The trick is the spin_trylock of the mutex's + wait_lock. If we fail that lock, we release the pi_lock of the + task and continue the loop, doing the end of PI chain check again. + + In the code to release the lock, the wait_lock of the mutex is held + the entire time, and it is not let go when we grab the pi_lock of the + new owner of the mutex. So if the switch of a new owner were to happen + after the check for end of the PI chain and the grabbing of the + wait_lock, the unlocking code would spin on the new owner's pi_lock + but never give up the wait_lock. So the PI chain loop is guaranteed to + fail the spin_trylock on the wait_lock, release the pi_lock, and + try again. + + If you don't quite understand the above, that's OK. You don't have to, + unless you really want to make a proof out of it ;) + + +Pending Owners and Lock stealing +-------------------------------- + +One of the flags in the owner field of the mutex structure is "Pending Owner". +What this means is that an owner was chosen by the process releasing the +mutex, but that owner has yet to wake up and actually take the mutex. + +Why is this important? Why can't we just give the mutex to another process +and be done with it? + +The PI code is to help with real-time processes, and to let the highest +priority process run as long as possible with little latencies and delays. +If a high priority process owns a mutex that a lower priority process is +blocked on, when the mutex is released it would be given to the lower priority +process. What if the higher priority process wants to take that mutex again. +The high priority process would fail to take that mutex that it just gave up +and it would need to boost the lower priority process to run with full +latency of that critical section (since the low priority process just entered +it). + +There's no reason a high priority process that gives up a mutex should be +penalized if it tries to take that mutex again. If the new owner of the +mutex has not woken up yet, there's no reason that the higher priority process +could not take that mutex away. + +To solve this, we introduced Pending Ownership and Lock Stealing. When a +new process is given a mutex that it was blocked on, it is only given +pending ownership. This means that it's the new owner, unless a higher +priority process comes in and tries to grab that mutex. If a higher priority +process does come along and wants that mutex, we let the higher priority +process "steal" the mutex from the pending owner (only if it is still pending) +and continue with the mutex. + + +Taking of a mutex (The walk through) +------------------------------------ + +OK, now let's take a look at the detailed walk through of what happens when +taking a mutex. + +The first thing that is tried is the fast taking of the mutex. This is +done when we have CMPXCHG enabled (otherwise the fast taking automatically +fails). Only when the owner field of the mutex is NULL can the lock be +taken with the CMPXCHG and nothing else needs to be done. + +If there is contention on the lock, whether it is owned or pending owner +we go about the slow path (rt_mutex_slowlock). + +The slow path function is where the task's waiter structure is created on +the stack. This is because the waiter structure is only needed for the +scope of this function. The waiter structure holds the nodes to store +the task on the wait_list of the mutex, and if need be, the pi_list of +the owner. + +The wait_lock of the mutex is taken since the slow path of unlocking the +mutex also takes this lock. + +We then call try_to_take_rt_mutex. This is where the architecture that +does not implement CMPXCHG would always grab the lock (if there's no +contention). + +try_to_take_rt_mutex is used every time the task tries to grab a mutex in the +slow path. The first thing that is done here is an atomic setting of +the "Has Waiters" flag of the mutex's owner field. Yes, this could really +be false, because if the mutex has no owner, there are no waiters and +the current task also won't have any waiters. But we don't have the lock +yet, so we assume we are going to be a waiter. The reason for this is to +play nice for those architectures that do have CMPXCHG. By setting this flag +now, the owner of the mutex can't release the mutex without going into the +slow unlock path, and it would then need to grab the wait_lock, which this +code currently holds. So setting the "Has Waiters" flag forces the owner +to synchronize with this code. + +Now that we know that we can't have any races with the owner releasing the +mutex, we check to see if we can take the ownership. This is done if the +mutex doesn't have a owner, or if we can steal the mutex from a pending +owner. Let's look at the situations we have here. + + 1) Has owner that is pending + ---------------------------- + + The mutex has a owner, but it hasn't woken up and the mutex flag + "Pending Owner" is set. The first check is to see if the owner isn't the + current task. This is because this function is also used for the pending + owner to grab the mutex. When a pending owner wakes up, it checks to see + if it can take the mutex, and this is done if the owner is already set to + itself. If so, we succeed and leave the function, clearing the "Pending + Owner" bit. + + If the pending owner is not current, we check to see if the current priority is + higher than the pending owner. If not, we fail the function and return. + + There's also something special about a pending owner. That is a pending owner + is never blocked on a mutex. So there is no PI chain to worry about. It also + means that if the mutex doesn't have any waiters, there's no accounting needed + to update the pending owner's pi_list, since we only worry about processes + blocked on the current mutex. + + If there are waiters on this mutex, and we just stole the ownership, we need + to take the top waiter, remove it from the pi_list of the pending owner, and + add it to the current pi_list. Note that at this moment, the pending owner + is no longer on the list of waiters. This is fine, since the pending owner + would add itself back when it realizes that it had the ownership stolen + from itself. When the pending owner tries to grab the mutex, it will fail + in try_to_take_rt_mutex if the owner field points to another process. + + 2) No owner + ----------- + + If there is no owner (or we successfully stole the lock), we set the owner + of the mutex to current, and set the flag of "Has Waiters" if the current + mutex actually has waiters, or we clear the flag if it doesn't. See, it was + OK that we set that flag early, since now it is cleared. + + 3) Failed to grab ownership + --------------------------- + + The most interesting case is when we fail to take ownership. This means that + there exists an owner, or there's a pending owner with equal or higher + priority than the current task. + +We'll continue on the failed case. + +If the mutex has a timeout, we set up a timer to go off to break us out +of this mutex if we failed to get it after a specified amount of time. + +Now we enter a loop that will continue to try to take ownership of the mutex, or +fail from a timeout or signal. + +Once again we try to take the mutex. This will usually fail the first time +in the loop, since it had just failed to get the mutex. But the second time +in the loop, this would likely succeed, since the task would likely be +the pending owner. + +If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done +here. + +The waiter structure has a "task" field that points to the task that is blocked +on the mutex. This field can be NULL the first time it goes through the loop +or if the task is a pending owner and had its mutex stolen. If the "task" +field is NULL then we need to set up the accounting for it. + +Task blocks on mutex +-------------------- + +The accounting of a mutex and process is done with the waiter structure of +the process. The "task" field is set to the process, and the "lock" field +to the mutex. The plist nodes are initialized to the processes current +priority. + +Since the wait_lock was taken at the entry of the slow lock, we can safely +add the waiter to the wait_list. If the current process is the highest +priority process currently waiting on this mutex, then we remove the +previous top waiter process (if it exists) from the pi_list of the owner, +and add the current process to that list. Since the pi_list of the owner +has changed, we call rt_mutex_adjust_prio on the owner to see if the owner +should adjust its priority accordingly. + +If the owner is also blocked on a lock, and had its pi_list changed +(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead +and run rt_mutex_adjust_prio_chain on the owner, as described earlier. + +Now all locks are released, and if the current process is still blocked on a +mutex (waiter "task" field is not NULL), then we go to sleep (call schedule). + +Waking up in the loop +--------------------- + +The schedule can then wake up for a few reasons. + 1) we were given pending ownership of the mutex. + 2) we received a signal and was TASK_INTERRUPTIBLE + 3) we had a timeout and was TASK_INTERRUPTIBLE + +In any of these cases, we continue the loop and once again try to grab the +ownership of the mutex. If we succeed, we exit the loop, otherwise we continue +and on signal and timeout, will exit the loop, or if we had the mutex stolen +we just simply add ourselves back on the lists and go back to sleep. + +Note: For various reasons, because of timeout and signals, the steal mutex + algorithm needs to be careful. This is because the current process is + still on the wait_list. And because of dynamic changing of priorities, + especially on SCHED_OTHER tasks, the current process can be the + highest priority task on the wait_list. + +Failed to get mutex on Timeout or Signal +---------------------------------------- + +If a timeout or signal occurred, the waiter's "task" field would not be +NULL and the task needs to be taken off the wait_list of the mutex and perhaps +pi_list of the owner. If this process was a high priority process, then +the rt_mutex_adjust_prio_chain needs to be executed again on the owner, +but this time it will be lowering the priorities. + + +Unlocking the Mutex +------------------- + +The unlocking of a mutex also has a fast path for those architectures with +CMPXCHG. Since the taking of a mutex on contention always sets the +"Has Waiters" flag of the mutex's owner, we use this to know if we need to +take the slow path when unlocking the mutex. If the mutex doesn't have any +waiters, the owner field of the mutex would equal the current process and +the mutex can be unlocked by just replacing the owner field with NULL. + +If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available), +the slow unlock path is taken. + +The first thing done in the slow unlock path is to take the wait_lock of the +mutex. This synchronizes the locking and unlocking of the mutex. + +A check is made to see if the mutex has waiters or not. On architectures that +do not have CMPXCHG, this is the location that the owner of the mutex will +determine if a waiter needs to be awoken or not. On architectures that +do have CMPXCHG, that check is done in the fast path, but it is still needed +in the slow path too. If a waiter of a mutex woke up because of a signal +or timeout between the time the owner failed the fast path CMPXCHG check and +the grabbing of the wait_lock, the mutex may not have any waiters, thus the +owner still needs to make this check. If there are no waiters then the mutex +owner field is set to NULL, the wait_lock is released and nothing more is +needed. + +If there are waiters, then we need to wake one up and give that waiter +pending ownership. + +On the wake up code, the pi_lock of the current owner is taken. The top +waiter of the lock is found and removed from the wait_list of the mutex +as well as the pi_list of the current owner. The task field of the new +pending owner's waiter structure is set to NULL, and the owner field of the +mutex is set to the new owner with the "Pending Owner" bit set, as well +as the "Has Waiters" bit if there still are other processes blocked on the +mutex. + +The pi_lock of the previous owner is released, and the new pending owner's +pi_lock is taken. Remember that this is the trick to prevent the race +condition in rt_mutex_adjust_prio_chain from adding itself as a waiter +on the mutex. + +We now clear the "pi_blocked_on" field of the new pending owner, and if +the mutex still has waiters pending, we add the new top waiter to the pi_list +of the pending owner. + +Finally we unlock the pi_lock of the pending owner and wake it up. + + +Contact +------- + +For updates on this document, please email Steven Rostedt + + +Credits +------- + +Author: Steven Rostedt + +Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap + +Updates +------- + +This document was originally written for 2.6.17-rc3-mm1 diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt new file mode 100644 index 000000000000..243393d882ee --- /dev/null +++ b/Documentation/locking/rt-mutex.txt @@ -0,0 +1,79 @@ +RT-mutex subsystem with PI support +---------------------------------- + +RT-mutexes with priority inheritance are used to support PI-futexes, +which enable pthread_mutex_t priority inheritance attributes +(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details +about PI-futexes.] + +This technology was developed in the -rt tree and streamlined for +pthread_mutex support. + +Basic principles: +----------------- + +RT-mutexes extend the semantics of simple mutexes by the priority +inheritance protocol. + +A low priority owner of a rt-mutex inherits the priority of a higher +priority waiter until the rt-mutex is released. If the temporarily +boosted owner blocks on a rt-mutex itself it propagates the priority +boosting to the owner of the other rt_mutex it gets blocked on. The +priority boosting is immediately removed once the rt_mutex has been +unlocked. + +This approach allows us to shorten the block of high-prio tasks on +mutexes which protect shared resources. Priority inheritance is not a +magic bullet for poorly designed applications, but it allows +well-designed applications to use userspace locks in critical parts of +an high priority thread, without losing determinism. + +The enqueueing of the waiters into the rtmutex waiter list is done in +priority order. For same priorities FIFO order is chosen. For each +rtmutex, only the top priority waiter is enqueued into the owner's +priority waiters list. This list too queues in priority order. Whenever +the top priority waiter of a task changes (for example it timed out or +got a signal), the priority of the owner task is readjusted. [The +priority enqueueing is handled by "plists", see include/linux/plist.h +for more details.] + +RT-mutexes are optimized for fastpath operations and have no internal +locking overhead when locking an uncontended mutex or unlocking a mutex +without waiters. The optimized fastpath operations require cmpxchg +support. [If that is not available then the rt-mutex internal spinlock +is used] + +The state of the rt-mutex is tracked via the owner field of the rt-mutex +structure: + +rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1 +are used to keep track of the "owner is pending" and "rtmutex has +waiters" state. + + owner bit1 bit0 + NULL 0 0 mutex is free (fast acquire possible) + NULL 0 1 invalid state + NULL 1 0 Transitional state* + NULL 1 1 invalid state + taskpointer 0 0 mutex is held (fast release possible) + taskpointer 0 1 task is pending owner + taskpointer 1 0 mutex is held and has waiters + taskpointer 1 1 task is pending owner and mutex has waiters + +Pending-ownership handling is a performance optimization: +pending-ownership is assigned to the first (highest priority) waiter of +the mutex, when the mutex is released. The thread is woken up and once +it starts executing it can acquire the mutex. Until the mutex is taken +by it (bit 0 is cleared) a competing higher priority thread can "steal" +the mutex which puts the woken up thread back on the waiters list. + +The pending-ownership optimization is especially important for the +uninterrupted workflow of high-prio tasks which repeatedly +takes/releases locks that have lower-prio waiters. Without this +optimization the higher-prio thread would ping-pong to the lower-prio +task [because at unlock time we always assign a new owner]. + +(*) The "mutex has waiters" bit gets set to take the lock. If the lock +doesn't already have an owner, this bit is quickly cleared if there are +no waiters. So this is a transitional state to synchronize with looking +at the owner field of the mutex and the mutex owner releasing the lock. diff --git a/Documentation/locking/spinlocks.txt b/Documentation/locking/spinlocks.txt new file mode 100644 index 000000000000..ff35e40bdf5b --- /dev/null +++ b/Documentation/locking/spinlocks.txt @@ -0,0 +1,167 @@ +Lesson 1: Spin locks + +The most basic primitive for locking is spinlock. + +static DEFINE_SPINLOCK(xxx_lock); + + unsigned long flags; + + spin_lock_irqsave(&xxx_lock, flags); + ... critical section here .. + spin_unlock_irqrestore(&xxx_lock, flags); + +The above is always safe. It will disable interrupts _locally_, but the +spinlock itself will guarantee the global lock, so it will guarantee that +there is only one thread-of-control within the region(s) protected by that +lock. This works well even under UP also, so the code does _not_ need to +worry about UP vs SMP issues: the spinlocks work correctly under both. + + NOTE! Implications of spin_locks for memory are further described in: + + Documentation/memory-barriers.txt + (5) LOCK operations. + (6) UNLOCK operations. + +The above is usually pretty simple (you usually need and want only one +spinlock for most things - using more than one spinlock can make things a +lot more complex and even slower and is usually worth it only for +sequences that you _know_ need to be split up: avoid it at all cost if you +aren't sure). + +This is really the only really hard part about spinlocks: once you start +using spinlocks they tend to expand to areas you might not have noticed +before, because you have to make sure the spinlocks correctly protect the +shared data structures _everywhere_ they are used. The spinlocks are most +easily added to places that are completely independent of other code (for +example, internal driver data structures that nobody else ever touches). + + NOTE! The spin-lock is safe only when you _also_ use the lock itself + to do locking across CPU's, which implies that EVERYTHING that + touches a shared variable has to agree about the spinlock they want + to use. + +---- + +Lesson 2: reader-writer spinlocks. + +If your data accesses have a very natural pattern where you usually tend +to mostly read from the shared variables, the reader-writer locks +(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple +readers to be in the same critical region at once, but if somebody wants +to change the variables it has to get an exclusive write lock. + + NOTE! reader-writer locks require more atomic memory operations than + simple spinlocks. Unless the reader critical section is long, you + are better off just using spinlocks. + +The routines look the same as above: + + rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock); + + unsigned long flags; + + read_lock_irqsave(&xxx_lock, flags); + .. critical section that only reads the info ... + read_unlock_irqrestore(&xxx_lock, flags); + + write_lock_irqsave(&xxx_lock, flags); + .. read and write exclusive access to the info ... + write_unlock_irqrestore(&xxx_lock, flags); + +The above kind of lock may be useful for complex data structures like +linked lists, especially searching for entries without changing the list +itself. The read lock allows many concurrent readers. Anything that +_changes_ the list will have to get the write lock. + + NOTE! RCU is better for list traversal, but requires careful + attention to design detail (see Documentation/RCU/listRCU.txt). + +Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_ +time need to do any changes (even if you don't do it every time), you have +to get the write-lock at the very beginning. + + NOTE! We are working hard to remove reader-writer spinlocks in most + cases, so please don't add a new one without consensus. (Instead, see + Documentation/RCU/rcu.txt for complete information.) + +---- + +Lesson 3: spinlocks revisited. + +The single spin-lock primitives above are by no means the only ones. They +are the most safe ones, and the ones that work under all circumstances, +but partly _because_ they are safe they are also fairly slow. They are slower +than they'd need to be, because they do have to disable interrupts +(which is just a single instruction on a x86, but it's an expensive one - +and on other architectures it can be worse). + +If you have a case where you have to protect a data structure across +several CPU's and you want to use spinlocks you can potentially use +cheaper versions of the spinlocks. IFF you know that the spinlocks are +never used in interrupt handlers, you can use the non-irq versions: + + spin_lock(&lock); + ... + spin_unlock(&lock); + +(and the equivalent read-write versions too, of course). The spinlock will +guarantee the same kind of exclusive access, and it will be much faster. +This is useful if you know that the data in question is only ever +manipulated from a "process context", ie no interrupts involved. + +The reasons you mustn't use these versions if you have interrupts that +play with the spinlock is that you can get deadlocks: + + spin_lock(&lock); + ... + <- interrupt comes in: + spin_lock(&lock); + +where an interrupt tries to lock an already locked variable. This is ok if +the other interrupt happens on another CPU, but it is _not_ ok if the +interrupt happens on the same CPU that already holds the lock, because the +lock will obviously never be released (because the interrupt is waiting +for the lock, and the lock-holder is interrupted by the interrupt and will +not continue until the interrupt has been processed). + +(This is also the reason why the irq-versions of the spinlocks only need +to disable the _local_ interrupts - it's ok to use spinlocks in interrupts +on other CPU's, because an interrupt on another CPU doesn't interrupt the +CPU that holds the lock, so the lock-holder can continue and eventually +releases the lock). + +Note that you can be clever with read-write locks and interrupts. For +example, if you know that the interrupt only ever gets a read-lock, then +you can use a non-irq version of read locks everywhere - because they +don't block on each other (and thus there is no dead-lock wrt interrupts. +But when you do the write-lock, you have to use the irq-safe version. + +For an example of being clever with rw-locks, see the "waitqueue_lock" +handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from +within an interrupt, they only read the queue in order to know whom to +wake up. So read-locks are safe (which is good: they are very common +indeed), while write-locks need to protect themselves against interrupts. + + Linus + +---- + +Reference information: + +For dynamic initialization, use spin_lock_init() or rwlock_init() as +appropriate: + + spinlock_t xxx_lock; + rwlock_t xxx_rw_lock; + + static int __init xxx_init(void) + { + spin_lock_init(&xxx_lock); + rwlock_init(&xxx_rw_lock); + ... + } + + module_init(xxx_init); + +For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or +__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt new file mode 100644 index 000000000000..8a112dc304c3 --- /dev/null +++ b/Documentation/locking/ww-mutex-design.txt @@ -0,0 +1,344 @@ +Wait/Wound Deadlock-Proof Mutex Design +====================================== + +Please read mutex-design.txt first, as it applies to wait/wound mutexes too. + +Motivation for WW-Mutexes +------------------------- + +GPU's do operations that commonly involve many buffers. Those buffers +can be shared across contexts/processes, exist in different memory +domains (for example VRAM vs system memory), and so on. And with +PRIME / dmabuf, they can even be shared across devices. So there are +a handful of situations where the driver needs to wait for buffers to +become ready. If you think about this in terms of waiting on a buffer +mutex for it to become available, this presents a problem because +there is no way to guarantee that buffers appear in a execbuf/batch in +the same order in all contexts. That is directly under control of +userspace, and a result of the sequence of GL calls that an application +makes. Which results in the potential for deadlock. The problem gets +more complex when you consider that the kernel may need to migrate the +buffer(s) into VRAM before the GPU operates on the buffer(s), which +may in turn require evicting some other buffers (and you don't want to +evict other buffers which are already queued up to the GPU), but for a +simplified understanding of the problem you can ignore this. + +The algorithm that the TTM graphics subsystem came up with for dealing with +this problem is quite simple. For each group of buffers (execbuf) that need +to be locked, the caller would be assigned a unique reservation id/ticket, +from a global counter. In case of deadlock while locking all the buffers +associated with a execbuf, the one with the lowest reservation ticket (i.e. +the oldest task) wins, and the one with the higher reservation id (i.e. the +younger task) unlocks all of the buffers that it has already locked, and then +tries again. + +In the RDBMS literature this deadlock handling approach is called wait/wound: +The older tasks waits until it can acquire the contended lock. The younger tasks +needs to back off and drop all the locks it is currently holding, i.e. the +younger task is wounded. + +Concepts +-------- + +Compared to normal mutexes two additional concepts/objects show up in the lock +interface for w/w mutexes: + +Acquire context: To ensure eventual forward progress it is important the a task +trying to acquire locks doesn't grab a new reservation id, but keeps the one it +acquired when starting the lock acquisition. This ticket is stored in the +acquire context. Furthermore the acquire context keeps track of debugging state +to catch w/w mutex interface abuse. + +W/w class: In contrast to normal mutexes the lock class needs to be explicit for +w/w mutexes, since it is required to initialize the acquire context. + +Furthermore there are three different class of w/w lock acquire functions: + +* Normal lock acquisition with a context, using ww_mutex_lock. + +* Slowpath lock acquisition on the contending lock, used by the wounded task + after having dropped all already acquired locks. These functions have the + _slow postfix. + + From a simple semantics point-of-view the _slow functions are not strictly + required, since simply calling the normal ww_mutex_lock functions on the + contending lock (after having dropped all other already acquired locks) will + work correctly. After all if no other ww mutex has been acquired yet there's + no deadlock potential and hence the ww_mutex_lock call will block and not + prematurely return -EDEADLK. The advantage of the _slow functions is in + interface safety: + - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow + has a void return type. Note that since ww mutex code needs loops/retries + anyway the __must_check doesn't result in spurious warnings, even though the + very first lock operation can never fail. + - When full debugging is enabled ww_mutex_lock_slow checks that all acquired + ww mutex have been released (preventing deadlocks) and makes sure that we + block on the contending lock (preventing spinning through the -EDEADLK + slowpath until the contended lock can be acquired). + +* Functions to only acquire a single w/w mutex, which results in the exact same + semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL + context. + + Again this is not strictly required. But often you only want to acquire a + single lock in which case it's pointless to set up an acquire context (and so + better to avoid grabbing a deadlock avoidance ticket). + +Of course, all the usual variants for handling wake-ups due to signals are also +provided. + +Usage +----- + +Three different ways to acquire locks within the same w/w class. Common +definitions for methods #1 and #2: + +static DEFINE_WW_CLASS(ww_class); + +struct obj { + struct ww_mutex lock; + /* obj data */ +}; + +struct obj_entry { + struct list_head head; + struct obj *obj; +}; + +Method 1, using a list in execbuf->buffers that's not allowed to be reordered. +This is useful if a list of required objects is already tracked somewhere. +Furthermore the lock helper can use propagate the -EALREADY return code back to +the caller as a signal that an object is twice on the list. This is useful if +the list is constructed from userspace input and the ABI requires userspace to +not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl). + +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj *res_obj = NULL; + struct obj_entry *contended_entry = NULL; + struct obj_entry *entry; + + ww_acquire_init(ctx, &ww_class); + +retry: + list_for_each_entry (entry, list, head) { + if (entry->obj == res_obj) { + res_obj = NULL; + continue; + } + ret = ww_mutex_lock(&entry->obj->lock, ctx); + if (ret < 0) { + contended_entry = entry; + goto err; + } + } + + ww_acquire_done(ctx); + return 0; + +err: + list_for_each_entry_continue_reverse (entry, list, head) + ww_mutex_unlock(&entry->obj->lock); + + if (res_obj) + ww_mutex_unlock(&res_obj->lock); + + if (ret == -EDEADLK) { + /* we lost out in a seqno race, lock and retry.. */ + ww_mutex_lock_slow(&contended_entry->obj->lock, ctx); + res_obj = contended_entry->obj; + goto retry; + } + ww_acquire_fini(ctx); + + return ret; +} + +Method 2, using a list in execbuf->buffers that can be reordered. Same semantics +of duplicate entry detection using -EALREADY as method 1 above. But the +list-reordering allows for a bit more idiomatic code. + +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj_entry *entry, *entry2; + + ww_acquire_init(ctx, &ww_class); + + list_for_each_entry (entry, list, head) { + ret = ww_mutex_lock(&entry->obj->lock, ctx); + if (ret < 0) { + entry2 = entry; + + list_for_each_entry_continue_reverse (entry2, list, head) + ww_mutex_unlock(&entry2->obj->lock); + + if (ret != -EDEADLK) { + ww_acquire_fini(ctx); + return ret; + } + + /* we lost out in a seqno race, lock and retry.. */ + ww_mutex_lock_slow(&entry->obj->lock, ctx); + + /* + * Move buf to head of the list, this will point + * buf->next to the first unlocked entry, + * restarting the for loop. + */ + list_del(&entry->head); + list_add(&entry->head, list); + } + } + + ww_acquire_done(ctx); + return 0; +} + +Unlocking works the same way for both methods #1 and #2: + +void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj_entry *entry; + + list_for_each_entry (entry, list, head) + ww_mutex_unlock(&entry->obj->lock); + + ww_acquire_fini(ctx); +} + +Method 3 is useful if the list of objects is constructed ad-hoc and not upfront, +e.g. when adjusting edges in a graph where each node has its own ww_mutex lock, +and edges can only be changed when holding the locks of all involved nodes. w/w +mutexes are a natural fit for such a case for two reasons: +- They can handle lock-acquisition in any order which allows us to start walking + a graph from a starting point and then iteratively discovering new edges and + locking down the nodes those edges connect to. +- Due to the -EALREADY return code signalling that a given objects is already + held there's no need for additional book-keeping to break cycles in the graph + or keep track off which looks are already held (when using more than one node + as a starting point). + +Note that this approach differs in two important ways from the above methods: +- Since the list of objects is dynamically constructed (and might very well be + different when retrying due to hitting the -EDEADLK wound condition) there's + no need to keep any object on a persistent list when it's not locked. We can + therefore move the list_head into the object itself. +- On the other hand the dynamic object list construction also means that the -EALREADY return + code can't be propagated. + +Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a +list of starting nodes (passed in from userspace) using one of the above +methods. And then lock any additional objects affected by the operations using +method #3 below. The backoff/retry procedure will be a bit more involved, since +when the dynamic locking step hits -EDEADLK we also need to unlock all the +objects acquired with the fixed list. But the w/w mutex debug checks will catch +any interface misuse for these cases. + +Also, method 3 can't fail the lock acquisition step since it doesn't return +-EALREADY. Of course this would be different when using the _interruptible +variants, but that's outside of the scope of these examples here. + +struct obj { + struct ww_mutex ww_mutex; + struct list_head locked_list; +}; + +static DEFINE_WW_CLASS(ww_class); + +void __unlock_objs(struct list_head *list) +{ + struct obj *entry, *temp; + + list_for_each_entry_safe (entry, temp, list, locked_list) { + /* need to do that before unlocking, since only the current lock holder is + allowed to use object */ + list_del(&entry->locked_list); + ww_mutex_unlock(entry->ww_mutex) + } +} + +void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj *obj; + + ww_acquire_init(ctx, &ww_class); + +retry: + /* re-init loop start state */ + loop { + /* magic code which walks over a graph and decides which objects + * to lock */ + + ret = ww_mutex_lock(obj->ww_mutex, ctx); + if (ret == -EALREADY) { + /* we have that one already, get to the next object */ + continue; + } + if (ret == -EDEADLK) { + __unlock_objs(list); + + ww_mutex_lock_slow(obj, ctx); + list_add(&entry->locked_list, list); + goto retry; + } + + /* locked a new object, add it to the list */ + list_add_tail(&entry->locked_list, list); + } + + ww_acquire_done(ctx); + return 0; +} + +void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + __unlock_objs(list); + ww_acquire_fini(ctx); +} + +Method 4: Only lock one single objects. In that case deadlock detection and +prevention is obviously overkill, since with grabbing just one lock you can't +produce a deadlock within just one class. To simplify this case the w/w mutex +api can be used with a NULL context. + +Implementation Details +---------------------- + +Design: + ww_mutex currently encapsulates a struct mutex, this means no extra overhead for + normal mutex locks, which are far more common. As such there is only a small + increase in code size if wait/wound mutexes are not used. + + In general, not much contention is expected. The locks are typically used to + serialize access to resources for devices. The only way to make wakeups + smarter would be at the cost of adding a field to struct mutex_waiter. This + would add overhead to all cases where normal mutexes are used, and + ww_mutexes are generally less performance sensitive. + +Lockdep: + Special care has been taken to warn for as many cases of api abuse + as possible. Some common api abuses will be caught with + CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended. + + Some of the errors which will be warned about: + - Forgetting to call ww_acquire_fini or ww_acquire_init. + - Attempting to lock more mutexes after ww_acquire_done. + - Attempting to lock the wrong mutex after -EDEADLK and + unlocking all mutexes. + - Attempting to lock the right mutex after -EDEADLK, + before unlocking all mutexes. + + - Calling ww_mutex_lock_slow before -EDEADLK was returned. + + - Unlocking mutexes with the wrong unlock function. + - Calling one of the ww_acquire_* twice on the same context. + - Using a different ww_class for the mutex than for the ww_acquire_ctx. + - Normal lockdep errors that can result in deadlocks. + + Some of the lockdep errors that can result in deadlocks: + - Calling ww_acquire_init to initialize a second ww_acquire_ctx before + having called ww_acquire_fini on the first. + - 'normal' deadlocks that can occur. + +FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic +implemented. diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt deleted file mode 100644 index 72d010689751..000000000000 --- a/Documentation/lockstat.txt +++ /dev/null @@ -1,178 +0,0 @@ - -LOCK STATISTICS - -- WHAT - -As the name suggests, it provides statistics on locks. - -- WHY - -Because things like lock contention can severely impact performance. - -- HOW - -Lockdep already has hooks in the lock functions and maps lock instances to -lock classes. We build on that (see Documentation/lockdep-design.txt). -The graph below shows the relation between the lock functions and the various -hooks therein. - - __acquire - | - lock _____ - | \ - | __contended - | | - | - | _______/ - |/ - | - __acquired - | - . - - . - | - __release - | - unlock - -lock, unlock - the regular lock functions -__* - the hooks -<> - states - -With these hooks we provide the following statistics: - - con-bounces - number of lock contention that involved x-cpu data - contentions - number of lock acquisitions that had to wait - wait time min - shortest (non-0) time we ever had to wait for a lock - max - longest time we ever had to wait for a lock - total - total time we spend waiting on this lock - avg - average time spent waiting on this lock - acq-bounces - number of lock acquisitions that involved x-cpu data - acquisitions - number of times we took the lock - hold time min - shortest (non-0) time we ever held the lock - max - longest time we ever held the lock - total - total time this lock was held - avg - average time this lock was held - -These numbers are gathered per lock class, per read/write state (when -applicable). - -It also tracks 4 contention points per class. A contention point is a call site -that had to wait on lock acquisition. - - - CONFIGURATION - -Lock statistics are enabled via CONFIG_LOCK_STAT. - - - USAGE - -Enable collection of statistics: - -# echo 1 >/proc/sys/kernel/lock_stat - -Disable collection of statistics: - -# echo 0 >/proc/sys/kernel/lock_stat - -Look at the current lock statistics: - -( line numbers not part of actual output, done for clarity in the explanation - below ) - -# less /proc/lock_stat - -01 lock_stat version 0.4 -02----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg -04----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -05 -06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99 -07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77 -08 --------------- -09 &mm->mmap_sem 1 [] khugepaged_scan_mm_slot+0x57/0x280 -19 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 -11 &mm->mmap_sem 34 [] vm_mmap_pgoff+0x87/0xd0 -12 &mm->mmap_sem 17 [] vm_munmap+0x41/0x80 -13 --------------- -14 &mm->mmap_sem 1 [] dup_mmap+0x2a/0x3f0 -15 &mm->mmap_sem 60 [] SyS_mprotect+0xe9/0x250 -16 &mm->mmap_sem 41 [] __do_page_fault+0x1d4/0x510 -17 &mm->mmap_sem 68 [] vm_mmap_pgoff+0x87/0xd0 -18 -19............................................................................................................................................................................................................................. -20 -21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48 -22 --------------- -23 unix_table_lock 45 [] unix_create1+0x16e/0x1b0 -24 unix_table_lock 47 [] unix_release_sock+0x31/0x250 -25 unix_table_lock 15 [] unix_find_other+0x117/0x230 -26 unix_table_lock 5 [] unix_autobind+0x11f/0x1b0 -27 --------------- -28 unix_table_lock 39 [] unix_release_sock+0x31/0x250 -29 unix_table_lock 49 [] unix_create1+0x16e/0x1b0 -30 unix_table_lock 20 [] unix_find_other+0x117/0x230 -31 unix_table_lock 4 [] unix_autobind+0x11f/0x1b0 - - -This excerpt shows the first two lock class statistics. Line 01 shows the -output version - each time the format changes this will be updated. Line 02-04 -show the header with column descriptions. Lines 05-18 and 20-31 show the actual -statistics. These statistics come in two parts; the actual stats separated by a -short separator (line 08, 13) from the contention points. - -The first lock (05-18) is a read/write lock, and shows two lines above the -short separator. The contention points don't match the column descriptors, -they have two: contentions and [] symbol. The second set of contention -points are the points we're contending with. - -The integer part of the time values is in us. - -Dealing with nested locks, subclasses may appear: - -32........................................................................................................................................................................................................................... -33 -34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82 -35 --------- -36 &rq->lock 645 [] task_rq_lock+0x43/0x75 -37 &rq->lock 297 [] try_to_wake_up+0x127/0x25a -38 &rq->lock 360 [] select_task_rq_fair+0x1f0/0x74a -39 &rq->lock 428 [] scheduler_tick+0x46/0x1fb -40 --------- -41 &rq->lock 77 [] task_rq_lock+0x43/0x75 -42 &rq->lock 174 [] try_to_wake_up+0x127/0x25a -43 &rq->lock 4715 [] double_rq_lock+0x42/0x54 -44 &rq->lock 893 [] schedule+0x157/0x7b8 -45 -46........................................................................................................................................................................................................................... -47 -48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84 -49 ----------- -50 &rq->lock/1 11526 [] double_rq_lock+0x4f/0x54 -51 ----------- -52 &rq->lock/1 5645 [] double_rq_lock+0x42/0x54 -53 &rq->lock/1 1224 [] schedule+0x157/0x7b8 -54 &rq->lock/1 4336 [] double_rq_lock+0x4f/0x54 -55 &rq->lock/1 181 [] try_to_wake_up+0x127/0x25a - -Line 48 shows statistics for the second subclass (/1) of &rq->lock class -(subclass starts from 0), since in this case, as line 50 suggests, -double_rq_lock actually acquires a nested lock of two spinlocks. - -View the top contending locks: - -# grep : /proc/lock_stat | head - clockevents_lock: 2926159 2947636 0.15 46882.81 1784540466.34 605.41 3381345 3879161 0.00 2260.97 53178395.68 13.71 - tick_broadcast_lock: 346460 346717 0.18 2257.43 39364622.71 113.54 3642919 4242696 0.00 2263.79 49173646.60 11.59 - &mapping->i_mmap_mutex: 203896 203899 3.36 645530.05 31767507988.39 155800.21 3361776 8893984 0.17 2254.15 14110121.02 1.59 - &rq->lock: 135014 136909 0.18 606.09 842160.68 6.15 1540728 10436146 0.00 728.72 17606683.41 1.69 - &(&zone->lru_lock)->rlock: 93000 94934 0.16 59.18 188253.78 1.98 1199912 3809894 0.15 391.40 3559518.81 0.93 - tasklist_lock-W: 40667 41130 0.23 1189.42 428980.51 10.43 270278 510106 0.16 653.51 3939674.91 7.72 - tasklist_lock-R: 21298 21305 0.20 1310.05 215511.12 10.12 186204 241258 0.14 1162.33 1179779.23 4.89 - rcu_node_1: 47656 49022 0.16 635.41 193616.41 3.95 844888 1865423 0.00 764.26 1656226.96 0.89 - &(&dentry->d_lockref.lock)->rlock: 39791 40179 0.15 1302.08 88851.96 2.21 2790851 12527025 0.10 1910.75 3379714.27 0.27 - rcu_node_0: 29203 30064 0.16 786.55 1555573.00 51.74 88963 244254 0.00 398.87 428872.51 1.76 - -Clear the statistics: - -# echo 0 > /proc/lock_stat diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt deleted file mode 100644 index ee231ed09ec6..000000000000 --- a/Documentation/mutex-design.txt +++ /dev/null @@ -1,157 +0,0 @@ -Generic Mutex Subsystem - -started by Ingo Molnar -updated by Davidlohr Bueso - -What are mutexes? ------------------ - -In the Linux kernel, mutexes refer to a particular locking primitive -that enforces serialization on shared memory systems, and not only to -the generic term referring to 'mutual exclusion' found in academia -or similar theoretical text books. Mutexes are sleeping locks which -behave similarly to binary semaphores, and were introduced in 2006[1] -as an alternative to these. This new data structure provided a number -of advantages, including simpler interfaces, and at that time smaller -code (see Disadvantages). - -[1] http://lwn.net/Articles/164802/ - -Implementation --------------- - -Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h -and implemented in kernel/locking/mutex.c. These locks use a three -state atomic counter (->count) to represent the different possible -transitions that can occur during the lifetime of a lock: - - 1: unlocked - 0: locked, no waiters - negative: locked, with potential waiters - -In its most basic form it also includes a wait-queue and a spinlock -that serializes access to it. CONFIG_SMP systems can also include -a pointer to the lock task owner (->owner) as well as a spinner MCS -lock (->osq), both described below in (ii). - -When acquiring a mutex, there are three possible paths that can be -taken, depending on the state of the lock: - -(i) fastpath: tries to atomically acquire the lock by decrementing the - counter. If it was already taken by another task it goes to the next - possible path. This logic is architecture specific. On x86-64, the - locking fastpath is 2 instructions: - - 0000000000000e10 : - e21: f0 ff 0b lock decl (%rbx) - e24: 79 08 jns e2e - - the unlocking fastpath is equally tight: - - 0000000000000bc0 : - bc8: f0 ff 07 lock incl (%rdi) - bcb: 7f 0a jg bd7 - - -(ii) midpath: aka optimistic spinning, tries to spin for acquisition - while the lock owner is running and there are no other tasks ready - to run that have higher priority (need_resched). The rationale is - that if the lock owner is running, it is likely to release the lock - soon. The mutex spinners are queued up using MCS lock so that only - one spinner can compete for the mutex. - - The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock - with the desirable properties of being fair and with each cpu trying - to acquire the lock spinning on a local variable. It avoids expensive - cacheline bouncing that common test-and-set spinlock implementations - incur. An MCS-like lock is specially tailored for optimistic spinning - for sleeping lock implementation. An important feature of the customized - MCS lock is that it has the extra property that spinners are able to exit - the MCS spinlock queue when they need to reschedule. This further helps - avoid situations where MCS spinners that need to reschedule would continue - waiting to spin on mutex owner, only to go directly to slowpath upon - obtaining the MCS lock. - - -(iii) slowpath: last resort, if the lock is still unable to be acquired, - the task is added to the wait-queue and sleeps until woken up by the - unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE. - -While formally kernel mutexes are sleepable locks, it is path (ii) that -makes them more practically a hybrid type. By simply not interrupting a -task and busy-waiting for a few cycles instead of immediately sleeping, -the performance of this lock has been seen to significantly improve a -number of workloads. Note that this technique is also used for rw-semaphores. - -Semantics ---------- - -The mutex subsystem checks and enforces the following rules: - - - Only one task can hold the mutex at a time. - - Only the owner can unlock the mutex. - - Multiple unlocks are not permitted. - - Recursive locking/unlocking is not permitted. - - A mutex must only be initialized via the API (see below). - - A task may not exit with a mutex held. - - Memory areas where held locks reside must not be freed. - - Held mutexes must not be reinitialized. - - Mutexes may not be used in hardware or software interrupt - contexts such as tasklets and timers. - -These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled. -In addition, the mutex debugging code also implements a number of other -features that make lock debugging easier and faster: - - - Uses symbolic names of mutexes, whenever they are printed - in debug output. - - Point-of-acquire tracking, symbolic lookup of function names, - list of all locks held in the system, printout of them. - - Owner tracking. - - Detects self-recursing locks and prints out all relevant info. - - Detects multi-task circular deadlocks and prints out all affected - locks and tasks (and only those tasks). - - -Interfaces ----------- -Statically define the mutex: - DEFINE_MUTEX(name); - -Dynamically initialize the mutex: - mutex_init(mutex); - -Acquire the mutex, uninterruptible: - void mutex_lock(struct mutex *lock); - void mutex_lock_nested(struct mutex *lock, unsigned int subclass); - int mutex_trylock(struct mutex *lock); - -Acquire the mutex, interruptible: - int mutex_lock_interruptible_nested(struct mutex *lock, - unsigned int subclass); - int mutex_lock_interruptible(struct mutex *lock); - -Acquire the mutex, interruptible, if dec to 0: - int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); - -Unlock the mutex: - void mutex_unlock(struct mutex *lock); - -Test if the mutex is taken: - int mutex_is_locked(struct mutex *lock); - -Disadvantages -------------- - -Unlike its original design and purpose, 'struct mutex' is larger than -most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice -as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the -'struct rw_semaphore' variant. Larger structure sizes mean more CPU -cache and memory footprint. - -When to use mutexes -------------------- - -Unless the strict semantics of mutexes are unsuitable and/or the critical -region prevents the lock from being shared, always prefer them to any other -locking primitive. diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt deleted file mode 100644 index 8666070d3189..000000000000 --- a/Documentation/rt-mutex-design.txt +++ /dev/null @@ -1,781 +0,0 @@ -# -# Copyright (c) 2006 Steven Rostedt -# Licensed under the GNU Free Documentation License, Version 1.2 -# - -RT-mutex implementation design ------------------------------- - -This document tries to describe the design of the rtmutex.c implementation. -It doesn't describe the reasons why rtmutex.c exists. For that please see -Documentation/rt-mutex.txt. Although this document does explain problems -that happen without this code, but that is in the concept to understand -what the code actually is doing. - -The goal of this document is to help others understand the priority -inheritance (PI) algorithm that is used, as well as reasons for the -decisions that were made to implement PI in the manner that was done. - - -Unbounded Priority Inversion ----------------------------- - -Priority inversion is when a lower priority process executes while a higher -priority process wants to run. This happens for several reasons, and -most of the time it can't be helped. Anytime a high priority process wants -to use a resource that a lower priority process has (a mutex for example), -the high priority process must wait until the lower priority process is done -with the resource. This is a priority inversion. What we want to prevent -is something called unbounded priority inversion. That is when the high -priority process is prevented from running by a lower priority process for -an undetermined amount of time. - -The classic example of unbounded priority inversion is where you have three -processes, let's call them processes A, B, and C, where A is the highest -priority process, C is the lowest, and B is in between. A tries to grab a lock -that C owns and must wait and lets C run to release the lock. But in the -meantime, B executes, and since B is of a higher priority than C, it preempts C, -but by doing so, it is in fact preempting A which is a higher priority process. -Now there's no way of knowing how long A will be sleeping waiting for C -to release the lock, because for all we know, B is a CPU hog and will -never give C a chance to release the lock. This is called unbounded priority -inversion. - -Here's a little ASCII art to show the problem. - - grab lock L1 (owned by C) - | -A ---+ - C preempted by B - | -C +----+ - -B +--------> - B now keeps A from running. - - -Priority Inheritance (PI) -------------------------- - -There are several ways to solve this issue, but other ways are out of scope -for this document. Here we only discuss PI. - -PI is where a process inherits the priority of another process if the other -process blocks on a lock owned by the current process. To make this easier -to understand, let's use the previous example, with processes A, B, and C again. - -This time, when A blocks on the lock owned by C, C would inherit the priority -of A. So now if B becomes runnable, it would not preempt C, since C now has -the high priority of A. As soon as C releases the lock, it loses its -inherited priority, and A then can continue with the resource that C had. - -Terminology ------------ - -Here I explain some terminology that is used in this document to help describe -the design that is used to implement PI. - -PI chain - The PI chain is an ordered series of locks and processes that cause - processes to inherit priorities from a previous process that is - blocked on one of its locks. This is described in more detail - later in this document. - -mutex - In this document, to differentiate from locks that implement - PI and spin locks that are used in the PI code, from now on - the PI locks will be called a mutex. - -lock - In this document from now on, I will use the term lock when - referring to spin locks that are used to protect parts of the PI - algorithm. These locks disable preemption for UP (when - CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from - entering critical sections simultaneously. - -spin lock - Same as lock above. - -waiter - A waiter is a struct that is stored on the stack of a blocked - process. Since the scope of the waiter is within the code for - a process being blocked on the mutex, it is fine to allocate - the waiter on the process's stack (local variable). This - structure holds a pointer to the task, as well as the mutex that - the task is blocked on. It also has the plist node structures to - place the task in the waiter_list of a mutex as well as the - pi_list of a mutex owner task (described below). - - waiter is sometimes used in reference to the task that is waiting - on a mutex. This is the same as waiter->task. - -waiters - A list of processes that are blocked on a mutex. - -top waiter - The highest priority process waiting on a specific mutex. - -top pi waiter - The highest priority process waiting on one of the mutexes - that a specific process owns. - -Note: task and process are used interchangeably in this document, mostly to - differentiate between two processes that are being described together. - - -PI chain --------- - -The PI chain is a list of processes and mutexes that may cause priority -inheritance to take place. Multiple chains may converge, but a chain -would never diverge, since a process can't be blocked on more than one -mutex at a time. - -Example: - - Process: A, B, C, D, E - Mutexes: L1, L2, L3, L4 - - A owns: L1 - B blocked on L1 - B owns L2 - C blocked on L2 - C owns L3 - D blocked on L3 - D owns L4 - E blocked on L4 - -The chain would be: - - E->L4->D->L3->C->L2->B->L1->A - -To show where two chains merge, we could add another process F and -another mutex L5 where B owns L5 and F is blocked on mutex L5. - -The chain for F would be: - - F->L5->B->L1->A - -Since a process may own more than one mutex, but never be blocked on more than -one, the chains merge. - -Here we show both chains: - - E->L4->D->L3->C->L2-+ - | - +->B->L1->A - | - F->L5-+ - -For PI to work, the processes at the right end of these chains (or we may -also call it the Top of the chain) must be equal to or higher in priority -than the processes to the left or below in the chain. - -Also since a mutex may have more than one process blocked on it, we can -have multiple chains merge at mutexes. If we add another process G that is -blocked on mutex L2: - - G->L2->B->L1->A - -And once again, to show how this can grow I will show the merging chains -again. - - E->L4->D->L3->C-+ - +->L2-+ - | | - G-+ +->B->L1->A - | - F->L5-+ - - -Plist ------ - -Before I go further and talk about how the PI chain is stored through lists -on both mutexes and processes, I'll explain the plist. This is similar to -the struct list_head functionality that is already in the kernel. -The implementation of plist is out of scope for this document, but it is -very important to understand what it does. - -There are a few differences between plist and list, the most important one -being that plist is a priority sorted linked list. This means that the -priorities of the plist are sorted, such that it takes O(1) to retrieve the -highest priority item in the list. Obviously this is useful to store processes -based on their priorities. - -Another difference, which is important for implementation, is that, unlike -list, the head of the list is a different element than the nodes of a list. -So the head of the list is declared as struct plist_head and nodes that will -be added to the list are declared as struct plist_node. - - -Mutex Waiter List ------------------ - -Every mutex keeps track of all the waiters that are blocked on itself. The mutex -has a plist to store these waiters by priority. This list is protected by -a spin lock that is located in the struct of the mutex. This lock is called -wait_lock. Since the modification of the waiter list is never done in -interrupt context, the wait_lock can be taken without disabling interrupts. - - -Task PI List ------------- - -To keep track of the PI chains, each process has its own PI list. This is -a list of all top waiters of the mutexes that are owned by the process. -Note that this list only holds the top waiters and not all waiters that are -blocked on mutexes owned by the process. - -The top of the task's PI list is always the highest priority task that -is waiting on a mutex that is owned by the task. So if the task has -inherited a priority, it will always be the priority of the task that is -at the top of this list. - -This list is stored in the task structure of a process as a plist called -pi_list. This list is protected by a spin lock also in the task structure, -called pi_lock. This lock may also be taken in interrupt context, so when -locking the pi_lock, interrupts must be disabled. - - -Depth of the PI Chain ---------------------- - -The maximum depth of the PI chain is not dynamic, and could actually be -defined. But is very complex to figure it out, since it depends on all -the nesting of mutexes. Let's look at the example where we have 3 mutexes, -L1, L2, and L3, and four separate functions func1, func2, func3 and func4. -The following shows a locking order of L1->L2->L3, but may not actually -be directly nested that way. - -void func1(void) -{ - mutex_lock(L1); - - /* do anything */ - - mutex_unlock(L1); -} - -void func2(void) -{ - mutex_lock(L1); - mutex_lock(L2); - - /* do something */ - - mutex_unlock(L2); - mutex_unlock(L1); -} - -void func3(void) -{ - mutex_lock(L2); - mutex_lock(L3); - - /* do something else */ - - mutex_unlock(L3); - mutex_unlock(L2); -} - -void func4(void) -{ - mutex_lock(L3); - - /* do something again */ - - mutex_unlock(L3); -} - -Now we add 4 processes that run each of these functions separately. -Processes A, B, C, and D which run functions func1, func2, func3 and func4 -respectively, and such that D runs first and A last. With D being preempted -in func4 in the "do something again" area, we have a locking that follows: - -D owns L3 - C blocked on L3 - C owns L2 - B blocked on L2 - B owns L1 - A blocked on L1 - -And thus we have the chain A->L1->B->L2->C->L3->D. - -This gives us a PI depth of 4 (four processes), but looking at any of the -functions individually, it seems as though they only have at most a locking -depth of two. So, although the locking depth is defined at compile time, -it still is very difficult to find the possibilities of that depth. - -Now since mutexes can be defined by user-land applications, we don't want a DOS -type of application that nests large amounts of mutexes to create a large -PI chain, and have the code holding spin locks while looking at a large -amount of data. So to prevent this, the implementation not only implements -a maximum lock depth, but also only holds at most two different locks at a -time, as it walks the PI chain. More about this below. - - -Mutex owner and flags ---------------------- - -The mutex structure contains a pointer to the owner of the mutex. If the -mutex is not owned, this owner is set to NULL. Since all architectures -have the task structure on at least a four byte alignment (and if this is -not true, the rtmutex.c code will be broken!), this allows for the two -least significant bits to be used as flags. This part is also described -in Documentation/rt-mutex.txt, but will also be briefly described here. - -Bit 0 is used as the "Pending Owner" flag. This is described later. -Bit 1 is used as the "Has Waiters" flags. This is also described later - in more detail, but is set whenever there are waiters on a mutex. - - -cmpxchg Tricks --------------- - -Some architectures implement an atomic cmpxchg (Compare and Exchange). This -is used (when applicable) to keep the fast path of grabbing and releasing -mutexes short. - -cmpxchg is basically the following function performed atomically: - -unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C) -{ - unsigned long T = *A; - if (*A == *B) { - *A = *C; - } - return T; -} -#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c) - -This is really nice to have, since it allows you to only update a variable -if the variable is what you expect it to be. You know if it succeeded if -the return value (the old value of A) is equal to B. - -The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If -the architecture does not support CMPXCHG, then this macro is simply set -to fail every time. But if CMPXCHG is supported, then this will -help out extremely to keep the fast path short. - -The use of rt_mutex_cmpxchg with the flags in the owner field help optimize -the system for architectures that support it. This will also be explained -later in this document. - - -Priority adjustments --------------------- - -The implementation of the PI code in rtmutex.c has several places that a -process must adjust its priority. With the help of the pi_list of a -process this is rather easy to know what needs to be adjusted. - -The functions implementing the task adjustments are rt_mutex_adjust_prio, -__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock -to already be taken), rt_mutex_getprio, and rt_mutex_setprio. - -rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio. - -rt_mutex_getprio returns the priority that the task should have. Either the -task's own normal priority, or if a process of a higher priority is waiting on -a mutex owned by the task, then that higher priority should be returned. -Since the pi_list of a task holds an order by priority list of all the top -waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs -to compare the top pi waiter to its own normal priority, and return the higher -priority back. - -(Note: if looking at the code, you will notice that the lower number of - prio is returned. This is because the prio field in the task structure - is an inverse order of the actual priority. So a "prio" of 5 is - of higher priority than a "prio" of 10.) - -__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the -result does not equal the task's current priority, then rt_mutex_setprio -is called to adjust the priority of the task to the new priority. -Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the -actual change in priority. - -It is interesting to note that __rt_mutex_adjust_prio can either increase -or decrease the priority of the task. In the case that a higher priority -process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio -would increase/boost the task's priority. But if a higher priority task -were for some reason to leave the mutex (timeout or signal), this same function -would decrease/unboost the priority of the task. That is because the pi_list -always contains the highest priority task that is waiting on a mutex owned -by the task, so we only need to compare the priority of that top pi waiter -to the normal priority of the given task. - - -High level overview of the PI chain walk ----------------------------------------- - -The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain. - -The implementation has gone through several iterations, and has ended up -with what we believe is the best. It walks the PI chain by only grabbing -at most two locks at a time, and is very efficient. - -The rt_mutex_adjust_prio_chain can be used either to boost or lower process -priorities. - -rt_mutex_adjust_prio_chain is called with a task to be checked for PI -(de)boosting (the owner of a mutex that a process is blocking on), a flag to -check for deadlocking, the mutex that the task owns, and a pointer to a waiter -that is the process's waiter struct that is blocked on the mutex (although this -parameter may be NULL for deboosting). - -For this explanation, I will not mention deadlock detection. This explanation -will try to stay at a high level. - -When this function is called, there are no locks held. That also means -that the state of the owner and lock can change when entered into this function. - -Before this function is called, the task has already had rt_mutex_adjust_prio -performed on it. This means that the task is set to the priority that it -should be at, but the plist nodes of the task's waiter have not been updated -with the new priorities, and that this task may not be in the proper locations -in the pi_lists and wait_lists that the task is blocked on. This function -solves all that. - -A loop is entered, where task is the owner to be checked for PI changes that -was passed by parameter (for the first iteration). The pi_lock of this task is -taken to prevent any more changes to the pi_list of the task. This also -prevents new tasks from completing the blocking on a mutex that is owned by this -task. - -If the task is not blocked on a mutex then the loop is exited. We are at -the top of the PI chain. - -A check is now done to see if the original waiter (the process that is blocked -on the current mutex) is the top pi waiter of the task. That is, is this -waiter on the top of the task's pi_list. If it is not, it either means that -there is another process higher in priority that is blocked on one of the -mutexes that the task owns, or that the waiter has just woken up via a signal -or timeout and has left the PI chain. In either case, the loop is exited, since -we don't need to do any more changes to the priority of the current task, or any -task that owns a mutex that this current task is waiting on. A priority chain -walk is only needed when a new top pi waiter is made to a task. - -The next check sees if the task's waiter plist node has the priority equal to -the priority the task is set at. If they are equal, then we are done with -the loop. Remember that the function started with the priority of the -task adjusted, but the plist nodes that hold the task in other processes -pi_lists have not been adjusted. - -Next, we look at the mutex that the task is blocked on. The mutex's wait_lock -is taken. This is done by a spin_trylock, because the locking order of the -pi_lock and wait_lock goes in the opposite direction. If we fail to grab the -lock, the pi_lock is released, and we restart the loop. - -Now that we have both the pi_lock of the task as well as the wait_lock of -the mutex the task is blocked on, we update the task's waiter's plist node -that is located on the mutex's wait_list. - -Now we release the pi_lock of the task. - -Next the owner of the mutex has its pi_lock taken, so we can update the -task's entry in the owner's pi_list. If the task is the highest priority -process on the mutex's wait_list, then we remove the previous top waiter -from the owner's pi_list, and replace it with the task. - -Note: It is possible that the task was the current top waiter on the mutex, - in which case the task is not yet on the pi_list of the waiter. This - is OK, since plist_del does nothing if the plist node is not on any - list. - -If the task was not the top waiter of the mutex, but it was before we -did the priority updates, that means we are deboosting/lowering the -task. In this case, the task is removed from the pi_list of the owner, -and the new top waiter is added. - -Lastly, we unlock both the pi_lock of the task, as well as the mutex's -wait_lock, and continue the loop again. On the next iteration of the -loop, the previous owner of the mutex will be the task that will be -processed. - -Note: One might think that the owner of this mutex might have changed - since we just grab the mutex's wait_lock. And one could be right. - The important thing to remember is that the owner could not have - become the task that is being processed in the PI chain, since - we have taken that task's pi_lock at the beginning of the loop. - So as long as there is an owner of this mutex that is not the same - process as the tasked being worked on, we are OK. - - Looking closely at the code, one might be confused. The check for the - end of the PI chain is when the task isn't blocked on anything or the - task's waiter structure "task" element is NULL. This check is - protected only by the task's pi_lock. But the code to unlock the mutex - sets the task's waiter structure "task" element to NULL with only - the protection of the mutex's wait_lock, which was not taken yet. - Isn't this a race condition if the task becomes the new owner? - - The answer is No! The trick is the spin_trylock of the mutex's - wait_lock. If we fail that lock, we release the pi_lock of the - task and continue the loop, doing the end of PI chain check again. - - In the code to release the lock, the wait_lock of the mutex is held - the entire time, and it is not let go when we grab the pi_lock of the - new owner of the mutex. So if the switch of a new owner were to happen - after the check for end of the PI chain and the grabbing of the - wait_lock, the unlocking code would spin on the new owner's pi_lock - but never give up the wait_lock. So the PI chain loop is guaranteed to - fail the spin_trylock on the wait_lock, release the pi_lock, and - try again. - - If you don't quite understand the above, that's OK. You don't have to, - unless you really want to make a proof out of it ;) - - -Pending Owners and Lock stealing --------------------------------- - -One of the flags in the owner field of the mutex structure is "Pending Owner". -What this means is that an owner was chosen by the process releasing the -mutex, but that owner has yet to wake up and actually take the mutex. - -Why is this important? Why can't we just give the mutex to another process -and be done with it? - -The PI code is to help with real-time processes, and to let the highest -priority process run as long as possible with little latencies and delays. -If a high priority process owns a mutex that a lower priority process is -blocked on, when the mutex is released it would be given to the lower priority -process. What if the higher priority process wants to take that mutex again. -The high priority process would fail to take that mutex that it just gave up -and it would need to boost the lower priority process to run with full -latency of that critical section (since the low priority process just entered -it). - -There's no reason a high priority process that gives up a mutex should be -penalized if it tries to take that mutex again. If the new owner of the -mutex has not woken up yet, there's no reason that the higher priority process -could not take that mutex away. - -To solve this, we introduced Pending Ownership and Lock Stealing. When a -new process is given a mutex that it was blocked on, it is only given -pending ownership. This means that it's the new owner, unless a higher -priority process comes in and tries to grab that mutex. If a higher priority -process does come along and wants that mutex, we let the higher priority -process "steal" the mutex from the pending owner (only if it is still pending) -and continue with the mutex. - - -Taking of a mutex (The walk through) ------------------------------------- - -OK, now let's take a look at the detailed walk through of what happens when -taking a mutex. - -The first thing that is tried is the fast taking of the mutex. This is -done when we have CMPXCHG enabled (otherwise the fast taking automatically -fails). Only when the owner field of the mutex is NULL can the lock be -taken with the CMPXCHG and nothing else needs to be done. - -If there is contention on the lock, whether it is owned or pending owner -we go about the slow path (rt_mutex_slowlock). - -The slow path function is where the task's waiter structure is created on -the stack. This is because the waiter structure is only needed for the -scope of this function. The waiter structure holds the nodes to store -the task on the wait_list of the mutex, and if need be, the pi_list of -the owner. - -The wait_lock of the mutex is taken since the slow path of unlocking the -mutex also takes this lock. - -We then call try_to_take_rt_mutex. This is where the architecture that -does not implement CMPXCHG would always grab the lock (if there's no -contention). - -try_to_take_rt_mutex is used every time the task tries to grab a mutex in the -slow path. The first thing that is done here is an atomic setting of -the "Has Waiters" flag of the mutex's owner field. Yes, this could really -be false, because if the mutex has no owner, there are no waiters and -the current task also won't have any waiters. But we don't have the lock -yet, so we assume we are going to be a waiter. The reason for this is to -play nice for those architectures that do have CMPXCHG. By setting this flag -now, the owner of the mutex can't release the mutex without going into the -slow unlock path, and it would then need to grab the wait_lock, which this -code currently holds. So setting the "Has Waiters" flag forces the owner -to synchronize with this code. - -Now that we know that we can't have any races with the owner releasing the -mutex, we check to see if we can take the ownership. This is done if the -mutex doesn't have a owner, or if we can steal the mutex from a pending -owner. Let's look at the situations we have here. - - 1) Has owner that is pending - ---------------------------- - - The mutex has a owner, but it hasn't woken up and the mutex flag - "Pending Owner" is set. The first check is to see if the owner isn't the - current task. This is because this function is also used for the pending - owner to grab the mutex. When a pending owner wakes up, it checks to see - if it can take the mutex, and this is done if the owner is already set to - itself. If so, we succeed and leave the function, clearing the "Pending - Owner" bit. - - If the pending owner is not current, we check to see if the current priority is - higher than the pending owner. If not, we fail the function and return. - - There's also something special about a pending owner. That is a pending owner - is never blocked on a mutex. So there is no PI chain to worry about. It also - means that if the mutex doesn't have any waiters, there's no accounting needed - to update the pending owner's pi_list, since we only worry about processes - blocked on the current mutex. - - If there are waiters on this mutex, and we just stole the ownership, we need - to take the top waiter, remove it from the pi_list of the pending owner, and - add it to the current pi_list. Note that at this moment, the pending owner - is no longer on the list of waiters. This is fine, since the pending owner - would add itself back when it realizes that it had the ownership stolen - from itself. When the pending owner tries to grab the mutex, it will fail - in try_to_take_rt_mutex if the owner field points to another process. - - 2) No owner - ----------- - - If there is no owner (or we successfully stole the lock), we set the owner - of the mutex to current, and set the flag of "Has Waiters" if the current - mutex actually has waiters, or we clear the flag if it doesn't. See, it was - OK that we set that flag early, since now it is cleared. - - 3) Failed to grab ownership - --------------------------- - - The most interesting case is when we fail to take ownership. This means that - there exists an owner, or there's a pending owner with equal or higher - priority than the current task. - -We'll continue on the failed case. - -If the mutex has a timeout, we set up a timer to go off to break us out -of this mutex if we failed to get it after a specified amount of time. - -Now we enter a loop that will continue to try to take ownership of the mutex, or -fail from a timeout or signal. - -Once again we try to take the mutex. This will usually fail the first time -in the loop, since it had just failed to get the mutex. But the second time -in the loop, this would likely succeed, since the task would likely be -the pending owner. - -If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done -here. - -The waiter structure has a "task" field that points to the task that is blocked -on the mutex. This field can be NULL the first time it goes through the loop -or if the task is a pending owner and had its mutex stolen. If the "task" -field is NULL then we need to set up the accounting for it. - -Task blocks on mutex --------------------- - -The accounting of a mutex and process is done with the waiter structure of -the process. The "task" field is set to the process, and the "lock" field -to the mutex. The plist nodes are initialized to the processes current -priority. - -Since the wait_lock was taken at the entry of the slow lock, we can safely -add the waiter to the wait_list. If the current process is the highest -priority process currently waiting on this mutex, then we remove the -previous top waiter process (if it exists) from the pi_list of the owner, -and add the current process to that list. Since the pi_list of the owner -has changed, we call rt_mutex_adjust_prio on the owner to see if the owner -should adjust its priority accordingly. - -If the owner is also blocked on a lock, and had its pi_list changed -(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead -and run rt_mutex_adjust_prio_chain on the owner, as described earlier. - -Now all locks are released, and if the current process is still blocked on a -mutex (waiter "task" field is not NULL), then we go to sleep (call schedule). - -Waking up in the loop ---------------------- - -The schedule can then wake up for a few reasons. - 1) we were given pending ownership of the mutex. - 2) we received a signal and was TASK_INTERRUPTIBLE - 3) we had a timeout and was TASK_INTERRUPTIBLE - -In any of these cases, we continue the loop and once again try to grab the -ownership of the mutex. If we succeed, we exit the loop, otherwise we continue -and on signal and timeout, will exit the loop, or if we had the mutex stolen -we just simply add ourselves back on the lists and go back to sleep. - -Note: For various reasons, because of timeout and signals, the steal mutex - algorithm needs to be careful. This is because the current process is - still on the wait_list. And because of dynamic changing of priorities, - especially on SCHED_OTHER tasks, the current process can be the - highest priority task on the wait_list. - -Failed to get mutex on Timeout or Signal ----------------------------------------- - -If a timeout or signal occurred, the waiter's "task" field would not be -NULL and the task needs to be taken off the wait_list of the mutex and perhaps -pi_list of the owner. If this process was a high priority process, then -the rt_mutex_adjust_prio_chain needs to be executed again on the owner, -but this time it will be lowering the priorities. - - -Unlocking the Mutex -------------------- - -The unlocking of a mutex also has a fast path for those architectures with -CMPXCHG. Since the taking of a mutex on contention always sets the -"Has Waiters" flag of the mutex's owner, we use this to know if we need to -take the slow path when unlocking the mutex. If the mutex doesn't have any -waiters, the owner field of the mutex would equal the current process and -the mutex can be unlocked by just replacing the owner field with NULL. - -If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available), -the slow unlock path is taken. - -The first thing done in the slow unlock path is to take the wait_lock of the -mutex. This synchronizes the locking and unlocking of the mutex. - -A check is made to see if the mutex has waiters or not. On architectures that -do not have CMPXCHG, this is the location that the owner of the mutex will -determine if a waiter needs to be awoken or not. On architectures that -do have CMPXCHG, that check is done in the fast path, but it is still needed -in the slow path too. If a waiter of a mutex woke up because of a signal -or timeout between the time the owner failed the fast path CMPXCHG check and -the grabbing of the wait_lock, the mutex may not have any waiters, thus the -owner still needs to make this check. If there are no waiters then the mutex -owner field is set to NULL, the wait_lock is released and nothing more is -needed. - -If there are waiters, then we need to wake one up and give that waiter -pending ownership. - -On the wake up code, the pi_lock of the current owner is taken. The top -waiter of the lock is found and removed from the wait_list of the mutex -as well as the pi_list of the current owner. The task field of the new -pending owner's waiter structure is set to NULL, and the owner field of the -mutex is set to the new owner with the "Pending Owner" bit set, as well -as the "Has Waiters" bit if there still are other processes blocked on the -mutex. - -The pi_lock of the previous owner is released, and the new pending owner's -pi_lock is taken. Remember that this is the trick to prevent the race -condition in rt_mutex_adjust_prio_chain from adding itself as a waiter -on the mutex. - -We now clear the "pi_blocked_on" field of the new pending owner, and if -the mutex still has waiters pending, we add the new top waiter to the pi_list -of the pending owner. - -Finally we unlock the pi_lock of the pending owner and wake it up. - - -Contact -------- - -For updates on this document, please email Steven Rostedt - - -Credits -------- - -Author: Steven Rostedt - -Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap - -Updates -------- - -This document was originally written for 2.6.17-rc3-mm1 diff --git a/Documentation/rt-mutex.txt b/Documentation/rt-mutex.txt deleted file mode 100644 index 243393d882ee..000000000000 --- a/Documentation/rt-mutex.txt +++ /dev/null @@ -1,79 +0,0 @@ -RT-mutex subsystem with PI support ----------------------------------- - -RT-mutexes with priority inheritance are used to support PI-futexes, -which enable pthread_mutex_t priority inheritance attributes -(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details -about PI-futexes.] - -This technology was developed in the -rt tree and streamlined for -pthread_mutex support. - -Basic principles: ------------------ - -RT-mutexes extend the semantics of simple mutexes by the priority -inheritance protocol. - -A low priority owner of a rt-mutex inherits the priority of a higher -priority waiter until the rt-mutex is released. If the temporarily -boosted owner blocks on a rt-mutex itself it propagates the priority -boosting to the owner of the other rt_mutex it gets blocked on. The -priority boosting is immediately removed once the rt_mutex has been -unlocked. - -This approach allows us to shorten the block of high-prio tasks on -mutexes which protect shared resources. Priority inheritance is not a -magic bullet for poorly designed applications, but it allows -well-designed applications to use userspace locks in critical parts of -an high priority thread, without losing determinism. - -The enqueueing of the waiters into the rtmutex waiter list is done in -priority order. For same priorities FIFO order is chosen. For each -rtmutex, only the top priority waiter is enqueued into the owner's -priority waiters list. This list too queues in priority order. Whenever -the top priority waiter of a task changes (for example it timed out or -got a signal), the priority of the owner task is readjusted. [The -priority enqueueing is handled by "plists", see include/linux/plist.h -for more details.] - -RT-mutexes are optimized for fastpath operations and have no internal -locking overhead when locking an uncontended mutex or unlocking a mutex -without waiters. The optimized fastpath operations require cmpxchg -support. [If that is not available then the rt-mutex internal spinlock -is used] - -The state of the rt-mutex is tracked via the owner field of the rt-mutex -structure: - -rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1 -are used to keep track of the "owner is pending" and "rtmutex has -waiters" state. - - owner bit1 bit0 - NULL 0 0 mutex is free (fast acquire possible) - NULL 0 1 invalid state - NULL 1 0 Transitional state* - NULL 1 1 invalid state - taskpointer 0 0 mutex is held (fast release possible) - taskpointer 0 1 task is pending owner - taskpointer 1 0 mutex is held and has waiters - taskpointer 1 1 task is pending owner and mutex has waiters - -Pending-ownership handling is a performance optimization: -pending-ownership is assigned to the first (highest priority) waiter of -the mutex, when the mutex is released. The thread is woken up and once -it starts executing it can acquire the mutex. Until the mutex is taken -by it (bit 0 is cleared) a competing higher priority thread can "steal" -the mutex which puts the woken up thread back on the waiters list. - -The pending-ownership optimization is especially important for the -uninterrupted workflow of high-prio tasks which repeatedly -takes/releases locks that have lower-prio waiters. Without this -optimization the higher-prio thread would ping-pong to the lower-prio -task [because at unlock time we always assign a new owner]. - -(*) The "mutex has waiters" bit gets set to take the lock. If the lock -doesn't already have an owner, this bit is quickly cleared if there are -no waiters. So this is a transitional state to synchronize with looking -at the owner field of the mutex and the mutex owner releasing the lock. diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt deleted file mode 100644 index 97eaf5727178..000000000000 --- a/Documentation/spinlocks.txt +++ /dev/null @@ -1,167 +0,0 @@ -Lesson 1: Spin locks - -The most basic primitive for locking is spinlock. - -static DEFINE_SPINLOCK(xxx_lock); - - unsigned long flags; - - spin_lock_irqsave(&xxx_lock, flags); - ... critical section here .. - spin_unlock_irqrestore(&xxx_lock, flags); - -The above is always safe. It will disable interrupts _locally_, but the -spinlock itself will guarantee the global lock, so it will guarantee that -there is only one thread-of-control within the region(s) protected by that -lock. This works well even under UP also, so the code does _not_ need to -worry about UP vs SMP issues: the spinlocks work correctly under both. - - NOTE! Implications of spin_locks for memory are further described in: - - Documentation/memory-barriers.txt - (5) LOCK operations. - (6) UNLOCK operations. - -The above is usually pretty simple (you usually need and want only one -spinlock for most things - using more than one spinlock can make things a -lot more complex and even slower and is usually worth it only for -sequences that you _know_ need to be split up: avoid it at all cost if you -aren't sure). - -This is really the only really hard part about spinlocks: once you start -using spinlocks they tend to expand to areas you might not have noticed -before, because you have to make sure the spinlocks correctly protect the -shared data structures _everywhere_ they are used. The spinlocks are most -easily added to places that are completely independent of other code (for -example, internal driver data structures that nobody else ever touches). - - NOTE! The spin-lock is safe only when you _also_ use the lock itself - to do locking across CPU's, which implies that EVERYTHING that - touches a shared variable has to agree about the spinlock they want - to use. - ----- - -Lesson 2: reader-writer spinlocks. - -If your data accesses have a very natural pattern where you usually tend -to mostly read from the shared variables, the reader-writer locks -(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple -readers to be in the same critical region at once, but if somebody wants -to change the variables it has to get an exclusive write lock. - - NOTE! reader-writer locks require more atomic memory operations than - simple spinlocks. Unless the reader critical section is long, you - are better off just using spinlocks. - -The routines look the same as above: - - rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock); - - unsigned long flags; - - read_lock_irqsave(&xxx_lock, flags); - .. critical section that only reads the info ... - read_unlock_irqrestore(&xxx_lock, flags); - - write_lock_irqsave(&xxx_lock, flags); - .. read and write exclusive access to the info ... - write_unlock_irqrestore(&xxx_lock, flags); - -The above kind of lock may be useful for complex data structures like -linked lists, especially searching for entries without changing the list -itself. The read lock allows many concurrent readers. Anything that -_changes_ the list will have to get the write lock. - - NOTE! RCU is better for list traversal, but requires careful - attention to design detail (see Documentation/RCU/listRCU.txt). - -Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_ -time need to do any changes (even if you don't do it every time), you have -to get the write-lock at the very beginning. - - NOTE! We are working hard to remove reader-writer spinlocks in most - cases, so please don't add a new one without consensus. (Instead, see - Documentation/RCU/rcu.txt for complete information.) - ----- - -Lesson 3: spinlocks revisited. - -The single spin-lock primitives above are by no means the only ones. They -are the most safe ones, and the ones that work under all circumstances, -but partly _because_ they are safe they are also fairly slow. They are slower -than they'd need to be, because they do have to disable interrupts -(which is just a single instruction on a x86, but it's an expensive one - -and on other architectures it can be worse). - -If you have a case where you have to protect a data structure across -several CPU's and you want to use spinlocks you can potentially use -cheaper versions of the spinlocks. IFF you know that the spinlocks are -never used in interrupt handlers, you can use the non-irq versions: - - spin_lock(&lock); - ... - spin_unlock(&lock); - -(and the equivalent read-write versions too, of course). The spinlock will -guarantee the same kind of exclusive access, and it will be much faster. -This is useful if you know that the data in question is only ever -manipulated from a "process context", ie no interrupts involved. - -The reasons you mustn't use these versions if you have interrupts that -play with the spinlock is that you can get deadlocks: - - spin_lock(&lock); - ... - <- interrupt comes in: - spin_lock(&lock); - -where an interrupt tries to lock an already locked variable. This is ok if -the other interrupt happens on another CPU, but it is _not_ ok if the -interrupt happens on the same CPU that already holds the lock, because the -lock will obviously never be released (because the interrupt is waiting -for the lock, and the lock-holder is interrupted by the interrupt and will -not continue until the interrupt has been processed). - -(This is also the reason why the irq-versions of the spinlocks only need -to disable the _local_ interrupts - it's ok to use spinlocks in interrupts -on other CPU's, because an interrupt on another CPU doesn't interrupt the -CPU that holds the lock, so the lock-holder can continue and eventually -releases the lock). - -Note that you can be clever with read-write locks and interrupts. For -example, if you know that the interrupt only ever gets a read-lock, then -you can use a non-irq version of read locks everywhere - because they -don't block on each other (and thus there is no dead-lock wrt interrupts. -But when you do the write-lock, you have to use the irq-safe version. - -For an example of being clever with rw-locks, see the "waitqueue_lock" -handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from -within an interrupt, they only read the queue in order to know whom to -wake up. So read-locks are safe (which is good: they are very common -indeed), while write-locks need to protect themselves against interrupts. - - Linus - ----- - -Reference information: - -For dynamic initialization, use spin_lock_init() or rwlock_init() as -appropriate: - - spinlock_t xxx_lock; - rwlock_t xxx_rw_lock; - - static int __init xxx_init(void) - { - spin_lock_init(&xxx_lock); - rwlock_init(&xxx_rw_lock); - ... - } - - module_init(xxx_init); - -For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or -__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt deleted file mode 100644 index 8a112dc304c3..000000000000 --- a/Documentation/ww-mutex-design.txt +++ /dev/null @@ -1,344 +0,0 @@ -Wait/Wound Deadlock-Proof Mutex Design -====================================== - -Please read mutex-design.txt first, as it applies to wait/wound mutexes too. - -Motivation for WW-Mutexes -------------------------- - -GPU's do operations that commonly involve many buffers. Those buffers -can be shared across contexts/processes, exist in different memory -domains (for example VRAM vs system memory), and so on. And with -PRIME / dmabuf, they can even be shared across devices. So there are -a handful of situations where the driver needs to wait for buffers to -become ready. If you think about this in terms of waiting on a buffer -mutex for it to become available, this presents a problem because -there is no way to guarantee that buffers appear in a execbuf/batch in -the same order in all contexts. That is directly under control of -userspace, and a result of the sequence of GL calls that an application -makes. Which results in the potential for deadlock. The problem gets -more complex when you consider that the kernel may need to migrate the -buffer(s) into VRAM before the GPU operates on the buffer(s), which -may in turn require evicting some other buffers (and you don't want to -evict other buffers which are already queued up to the GPU), but for a -simplified understanding of the problem you can ignore this. - -The algorithm that the TTM graphics subsystem came up with for dealing with -this problem is quite simple. For each group of buffers (execbuf) that need -to be locked, the caller would be assigned a unique reservation id/ticket, -from a global counter. In case of deadlock while locking all the buffers -associated with a execbuf, the one with the lowest reservation ticket (i.e. -the oldest task) wins, and the one with the higher reservation id (i.e. the -younger task) unlocks all of the buffers that it has already locked, and then -tries again. - -In the RDBMS literature this deadlock handling approach is called wait/wound: -The older tasks waits until it can acquire the contended lock. The younger tasks -needs to back off and drop all the locks it is currently holding, i.e. the -younger task is wounded. - -Concepts --------- - -Compared to normal mutexes two additional concepts/objects show up in the lock -interface for w/w mutexes: - -Acquire context: To ensure eventual forward progress it is important the a task -trying to acquire locks doesn't grab a new reservation id, but keeps the one it -acquired when starting the lock acquisition. This ticket is stored in the -acquire context. Furthermore the acquire context keeps track of debugging state -to catch w/w mutex interface abuse. - -W/w class: In contrast to normal mutexes the lock class needs to be explicit for -w/w mutexes, since it is required to initialize the acquire context. - -Furthermore there are three different class of w/w lock acquire functions: - -* Normal lock acquisition with a context, using ww_mutex_lock. - -* Slowpath lock acquisition on the contending lock, used by the wounded task - after having dropped all already acquired locks. These functions have the - _slow postfix. - - From a simple semantics point-of-view the _slow functions are not strictly - required, since simply calling the normal ww_mutex_lock functions on the - contending lock (after having dropped all other already acquired locks) will - work correctly. After all if no other ww mutex has been acquired yet there's - no deadlock potential and hence the ww_mutex_lock call will block and not - prematurely return -EDEADLK. The advantage of the _slow functions is in - interface safety: - - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow - has a void return type. Note that since ww mutex code needs loops/retries - anyway the __must_check doesn't result in spurious warnings, even though the - very first lock operation can never fail. - - When full debugging is enabled ww_mutex_lock_slow checks that all acquired - ww mutex have been released (preventing deadlocks) and makes sure that we - block on the contending lock (preventing spinning through the -EDEADLK - slowpath until the contended lock can be acquired). - -* Functions to only acquire a single w/w mutex, which results in the exact same - semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL - context. - - Again this is not strictly required. But often you only want to acquire a - single lock in which case it's pointless to set up an acquire context (and so - better to avoid grabbing a deadlock avoidance ticket). - -Of course, all the usual variants for handling wake-ups due to signals are also -provided. - -Usage ------ - -Three different ways to acquire locks within the same w/w class. Common -definitions for methods #1 and #2: - -static DEFINE_WW_CLASS(ww_class); - -struct obj { - struct ww_mutex lock; - /* obj data */ -}; - -struct obj_entry { - struct list_head head; - struct obj *obj; -}; - -Method 1, using a list in execbuf->buffers that's not allowed to be reordered. -This is useful if a list of required objects is already tracked somewhere. -Furthermore the lock helper can use propagate the -EALREADY return code back to -the caller as a signal that an object is twice on the list. This is useful if -the list is constructed from userspace input and the ABI requires userspace to -not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl). - -int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj *res_obj = NULL; - struct obj_entry *contended_entry = NULL; - struct obj_entry *entry; - - ww_acquire_init(ctx, &ww_class); - -retry: - list_for_each_entry (entry, list, head) { - if (entry->obj == res_obj) { - res_obj = NULL; - continue; - } - ret = ww_mutex_lock(&entry->obj->lock, ctx); - if (ret < 0) { - contended_entry = entry; - goto err; - } - } - - ww_acquire_done(ctx); - return 0; - -err: - list_for_each_entry_continue_reverse (entry, list, head) - ww_mutex_unlock(&entry->obj->lock); - - if (res_obj) - ww_mutex_unlock(&res_obj->lock); - - if (ret == -EDEADLK) { - /* we lost out in a seqno race, lock and retry.. */ - ww_mutex_lock_slow(&contended_entry->obj->lock, ctx); - res_obj = contended_entry->obj; - goto retry; - } - ww_acquire_fini(ctx); - - return ret; -} - -Method 2, using a list in execbuf->buffers that can be reordered. Same semantics -of duplicate entry detection using -EALREADY as method 1 above. But the -list-reordering allows for a bit more idiomatic code. - -int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj_entry *entry, *entry2; - - ww_acquire_init(ctx, &ww_class); - - list_for_each_entry (entry, list, head) { - ret = ww_mutex_lock(&entry->obj->lock, ctx); - if (ret < 0) { - entry2 = entry; - - list_for_each_entry_continue_reverse (entry2, list, head) - ww_mutex_unlock(&entry2->obj->lock); - - if (ret != -EDEADLK) { - ww_acquire_fini(ctx); - return ret; - } - - /* we lost out in a seqno race, lock and retry.. */ - ww_mutex_lock_slow(&entry->obj->lock, ctx); - - /* - * Move buf to head of the list, this will point - * buf->next to the first unlocked entry, - * restarting the for loop. - */ - list_del(&entry->head); - list_add(&entry->head, list); - } - } - - ww_acquire_done(ctx); - return 0; -} - -Unlocking works the same way for both methods #1 and #2: - -void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj_entry *entry; - - list_for_each_entry (entry, list, head) - ww_mutex_unlock(&entry->obj->lock); - - ww_acquire_fini(ctx); -} - -Method 3 is useful if the list of objects is constructed ad-hoc and not upfront, -e.g. when adjusting edges in a graph where each node has its own ww_mutex lock, -and edges can only be changed when holding the locks of all involved nodes. w/w -mutexes are a natural fit for such a case for two reasons: -- They can handle lock-acquisition in any order which allows us to start walking - a graph from a starting point and then iteratively discovering new edges and - locking down the nodes those edges connect to. -- Due to the -EALREADY return code signalling that a given objects is already - held there's no need for additional book-keeping to break cycles in the graph - or keep track off which looks are already held (when using more than one node - as a starting point). - -Note that this approach differs in two important ways from the above methods: -- Since the list of objects is dynamically constructed (and might very well be - different when retrying due to hitting the -EDEADLK wound condition) there's - no need to keep any object on a persistent list when it's not locked. We can - therefore move the list_head into the object itself. -- On the other hand the dynamic object list construction also means that the -EALREADY return - code can't be propagated. - -Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a -list of starting nodes (passed in from userspace) using one of the above -methods. And then lock any additional objects affected by the operations using -method #3 below. The backoff/retry procedure will be a bit more involved, since -when the dynamic locking step hits -EDEADLK we also need to unlock all the -objects acquired with the fixed list. But the w/w mutex debug checks will catch -any interface misuse for these cases. - -Also, method 3 can't fail the lock acquisition step since it doesn't return --EALREADY. Of course this would be different when using the _interruptible -variants, but that's outside of the scope of these examples here. - -struct obj { - struct ww_mutex ww_mutex; - struct list_head locked_list; -}; - -static DEFINE_WW_CLASS(ww_class); - -void __unlock_objs(struct list_head *list) -{ - struct obj *entry, *temp; - - list_for_each_entry_safe (entry, temp, list, locked_list) { - /* need to do that before unlocking, since only the current lock holder is - allowed to use object */ - list_del(&entry->locked_list); - ww_mutex_unlock(entry->ww_mutex) - } -} - -void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj *obj; - - ww_acquire_init(ctx, &ww_class); - -retry: - /* re-init loop start state */ - loop { - /* magic code which walks over a graph and decides which objects - * to lock */ - - ret = ww_mutex_lock(obj->ww_mutex, ctx); - if (ret == -EALREADY) { - /* we have that one already, get to the next object */ - continue; - } - if (ret == -EDEADLK) { - __unlock_objs(list); - - ww_mutex_lock_slow(obj, ctx); - list_add(&entry->locked_list, list); - goto retry; - } - - /* locked a new object, add it to the list */ - list_add_tail(&entry->locked_list, list); - } - - ww_acquire_done(ctx); - return 0; -} - -void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - __unlock_objs(list); - ww_acquire_fini(ctx); -} - -Method 4: Only lock one single objects. In that case deadlock detection and -prevention is obviously overkill, since with grabbing just one lock you can't -produce a deadlock within just one class. To simplify this case the w/w mutex -api can be used with a NULL context. - -Implementation Details ----------------------- - -Design: - ww_mutex currently encapsulates a struct mutex, this means no extra overhead for - normal mutex locks, which are far more common. As such there is only a small - increase in code size if wait/wound mutexes are not used. - - In general, not much contention is expected. The locks are typically used to - serialize access to resources for devices. The only way to make wakeups - smarter would be at the cost of adding a field to struct mutex_waiter. This - would add overhead to all cases where normal mutexes are used, and - ww_mutexes are generally less performance sensitive. - -Lockdep: - Special care has been taken to warn for as many cases of api abuse - as possible. Some common api abuses will be caught with - CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended. - - Some of the errors which will be warned about: - - Forgetting to call ww_acquire_fini or ww_acquire_init. - - Attempting to lock more mutexes after ww_acquire_done. - - Attempting to lock the wrong mutex after -EDEADLK and - unlocking all mutexes. - - Attempting to lock the right mutex after -EDEADLK, - before unlocking all mutexes. - - - Calling ww_mutex_lock_slow before -EDEADLK was returned. - - - Unlocking mutexes with the wrong unlock function. - - Calling one of the ww_acquire_* twice on the same context. - - Using a different ww_class for the mutex than for the ww_acquire_ctx. - - Normal lockdep errors that can result in deadlocks. - - Some of the lockdep errors that can result in deadlocks: - - Calling ww_acquire_init to initialize a second ww_acquire_ctx before - having called ww_acquire_fini on the first. - - 'normal' deadlocks that can occur. - -FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic -implemented. diff --git a/MAINTAINERS b/MAINTAINERS index 1acc624ecfd7..aac481fcbf5c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5523,8 +5523,8 @@ M: Ingo Molnar L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/locking S: Maintained -F: Documentation/lockdep*.txt -F: Documentation/lockstat.txt +F: Documentation/locking/lockdep*.txt +F: Documentation/locking/lockstat.txt F: include/linux/lockdep.h F: kernel/locking/ diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index 0dc57d5ecd10..3a02e5e3e9f3 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -35,7 +35,7 @@ * of extra utility/tracking out of our acquire-ctx. This is provided * by drm_modeset_lock / drm_modeset_acquire_ctx. * - * For basic principles of ww_mutex, see: Documentation/ww-mutex-design.txt + * For basic principles of ww_mutex, see: Documentation/locking/ww-mutex-design.txt * * The basic usage pattern is to: * diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 008388f920d7..f388481201cd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -4,7 +4,7 @@ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * - * see Documentation/lockdep-design.txt for more details. + * see Documentation/locking/lockdep-design.txt for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H diff --git a/include/linux/mutex.h b/include/linux/mutex.h index e4c29418f407..cc31498fc526 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -133,7 +133,7 @@ static inline int mutex_is_locked(struct mutex *lock) /* * See kernel/locking/mutex.c for detailed documentation of these APIs. - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 035d3c57fc8a..8f498cdde280 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -149,7 +149,7 @@ extern void downgrade_write(struct rw_semaphore *sem); * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. - * See Documentation/lockdep-design.txt for more details.) + * See Documentation/locking/lockdep-design.txt for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0d8b6ed93874..dadbf88c22c4 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -15,7 +15,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. */ #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a0ea2a141b3b..7c98873a3077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,7 +8,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.txt for details. */ #include #include diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 901096d31c66..9b94a063e26c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -924,7 +924,7 @@ config PROVE_LOCKING the proof of observed correctness is also maintained for an arbitrary combination of these separate locking variants. - For more details, see Documentation/lockdep-design.txt. + For more details, see Documentation/locking/lockdep-design.txt. config LOCKDEP bool @@ -945,7 +945,7 @@ config LOCK_STAT help This feature enables tracking lock contention points - For more details, see Documentation/lockstat.txt + For more details, see Documentation/locking/lockstat.txt This also enables lock events required by "perf lock", subcommand of perf. -- cgit v1.2.3 From 4999201a59ef555f9105d2bb2459ed895627f7aa Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 8 Aug 2014 12:35:36 +0200 Subject: locking/spinlocks: Always evaluate the second argument of spin_lock_nested() Evaluating a macro argument only if certain configuration options have been selected is confusing and error-prone. Hence always evaluate the second argument of spin_lock_nested(). An intentional side effect of this patch is that it avoids that the following warning is reported for netif_addr_lock_nested() when building with CONFIG_DEBUG_LOCK_ALLOC=n and with W=1: include/linux/netdevice.h: In function 'netif_addr_lock_nested': include/linux/netdevice.h:2865:6: warning: variable 'subclass' set but not used [-Wunused-but-set-variable] int subclass = SINGLE_DEPTH_NESTING; ^ Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra Cc: David Rientjes Cc: David S. Miller Cc: Andrew Morton Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/53E4A7F8.1040700@acm.org Signed-off-by: Ingo Molnar --- include/linux/spinlock.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 3f2867ff0ced..262ba4ef9a8e 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -197,7 +197,13 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else -# define raw_spin_lock_nested(lock, subclass) _raw_spin_lock(lock) +/* + * Always evaluate the 'subclass' argument to avoid that the compiler + * warns about set-but-not-used variables when building with + * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1. + */ +# define raw_spin_lock_nested(lock, subclass) \ + _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) #endif -- cgit v1.2.3 From f0bab73cb539fb803c4d419951e8d28aa4964f8f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 13:22:01 -0400 Subject: locking/lockdep: Restrict the use of recursive read_lock() with qrwlock Unlike the original unfair rwlock implementation, queued rwlock will grant lock according to the chronological sequence of the lock requests except when the lock requester is in the interrupt context. Consequently, recursive read_lock calls will now hang the process if there is a write_lock call somewhere in between the read_lock calls. This patch updates the lockdep implementation to look for recursive read_lock calls. A new read state (3) is used to mark those read_lock call that cannot be recursively called except in the interrupt context. The new read state does exhaust the 2 bits available in held_lock:read bit field. The addition of any new read state in the future may require a redesign of how all those bits are squeezed together in the held_lock structure. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra Cc: Maarten Lankhorst Cc: Rik van Riel Cc: Scott J Norton Cc: Fengguang Wu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407345722-61615-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 10 +++++++++- kernel/locking/lockdep.c | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f388481201cd..b5a84b62fb84 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -478,16 +478,24 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ +/* + * Read states in the 2-bit held_lock:read field: + * 0: Exclusive lock + * 1: Shareable lock, cannot be recursively called + * 2: Shareable lock, can be recursively called + * 3: Shareable lock, cannot be recursively called except in interrupt context + */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) +#define lock_acquire_shared_irecursive(l, s, t, n, i) lock_acquire(l, s, t, 3, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, n, i) lock_release(l, n, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) -#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_irecursive(l, s, t, NULL, i) #define rwlock_release(l, n, i) lock_release(l, n, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..420ba685c4e5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3597,6 +3597,12 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); + /* + * An interrupt recursive read in interrupt context can be considered + * to be the same as a recursive read from checking perspective. + */ + if ((read == 3) && in_interrupt()) + read = 2; current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, -- cgit v1.2.3 From 515d9b2c03943ca904cd135e1b1d9ddd168c1b27 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 12 Aug 2014 18:22:27 +0200 Subject: ata: remove deprecated struct ahci_platform_data The last user of the deprecated struct ahci_platform_data has been cleaned up recently (SPEAr1340 got a proper PHY driver). Signed-off-by: Bartlomiej Zolnierkiewicz Acked-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci_platform.c | 18 +----------------- drivers/ata/libahci_platform.c | 23 ----------------------- include/linux/ahci_platform.h | 13 ------------- 3 files changed, 1 insertion(+), 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index f61ddb9146d6..06f1d59fa678 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -32,7 +32,6 @@ static const struct ata_port_info ahci_port_info = { static int ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct ahci_platform_data *pdata = dev_get_platdata(dev); struct ahci_host_priv *hpriv; int rc; @@ -44,29 +43,14 @@ static int ahci_probe(struct platform_device *pdev) if (rc) return rc; - /* - * Some platforms might need to prepare for mmio region access, - * which could be done in the following init call. So, the mmio - * region shouldn't be accessed before init (if provided) has - * returned successfully. - */ - if (pdata && pdata->init) { - rc = pdata->init(dev, hpriv->mmio); - if (rc) - goto disable_resources; - } - if (of_device_is_compatible(dev->of_node, "hisilicon,hisi-ahci")) hpriv->flags |= AHCI_HFLAG_NO_FBS | AHCI_HFLAG_NO_NCQ; rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info); if (rc) - goto pdata_exit; + goto disable_resources; return 0; -pdata_exit: - if (pdata && pdata->exit) - pdata->exit(dev); disable_resources: ahci_platform_disable_resources(hpriv); return rc; diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 5b92c290e6c6..c0510de8a4c9 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -502,13 +502,8 @@ EXPORT_SYMBOL_GPL(ahci_platform_init_host); static void ahci_host_stop(struct ata_host *host) { - struct device *dev = host->dev; - struct ahci_platform_data *pdata = dev_get_platdata(dev); struct ahci_host_priv *hpriv = host->private_data; - if (pdata && pdata->exit) - pdata->exit(dev); - ahci_platform_disable_resources(hpriv); } @@ -592,7 +587,6 @@ EXPORT_SYMBOL_GPL(ahci_platform_resume_host); */ int ahci_platform_suspend(struct device *dev) { - struct ahci_platform_data *pdata = dev_get_platdata(dev); struct ata_host *host = dev_get_drvdata(dev); struct ahci_host_priv *hpriv = host->private_data; int rc; @@ -601,19 +595,9 @@ int ahci_platform_suspend(struct device *dev) if (rc) return rc; - if (pdata && pdata->suspend) { - rc = pdata->suspend(dev); - if (rc) - goto resume_host; - } - ahci_platform_disable_resources(hpriv); return 0; - -resume_host: - ahci_platform_resume_host(dev); - return rc; } EXPORT_SYMBOL_GPL(ahci_platform_suspend); @@ -629,7 +613,6 @@ EXPORT_SYMBOL_GPL(ahci_platform_suspend); */ int ahci_platform_resume(struct device *dev) { - struct ahci_platform_data *pdata = dev_get_platdata(dev); struct ata_host *host = dev_get_drvdata(dev); struct ahci_host_priv *hpriv = host->private_data; int rc; @@ -638,12 +621,6 @@ int ahci_platform_resume(struct device *dev) if (rc) return rc; - if (pdata && pdata->resume) { - rc = pdata->resume(dev); - if (rc) - goto disable_resources; - } - rc = ahci_platform_resume_host(dev); if (rc) goto disable_resources; diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 09a947e8bc87..642d6ae4030c 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -22,19 +22,6 @@ struct ata_port_info; struct ahci_host_priv; struct platform_device; -/* - * Note ahci_platform_data is deprecated, it is only kept around for use - * by the old da850 and spear13xx ahci code. - * New drivers should instead declare their own platform_driver struct, and - * use ahci_platform* functions in their own probe, suspend and resume methods. - */ -struct ahci_platform_data { - int (*init)(struct device *dev, void __iomem *addr); - void (*exit)(struct device *dev); - int (*suspend)(struct device *dev); - int (*resume)(struct device *dev); -}; - int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); -- cgit v1.2.3 From e5f81539f657af7e9f54ea37986fde8f92acef22 Mon Sep 17 00:00:00 2001 From: Feng Kan Date: Wed, 30 Jul 2014 14:56:58 -0700 Subject: irqchip: gic: Replace hex numbers with defines. This is to cleanup some hex numbers used in the code and replace them with defines to make the code cleaner. Signed-off-by: Feng Kan Reviewed-by: Anup Patel Link: https://lkml.kernel.org/r/1406757419-18729-2-git-send-email-fkan@apm.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-gic-common.c | 15 +++++++++------ drivers/irqchip/irq-gic.c | 25 +++++++++++++------------ include/linux/irqchip/arm-gic.h | 15 +++++++++++++++ 3 files changed, 37 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c index 60ac704d2090..61541ff24397 100644 --- a/drivers/irqchip/irq-gic-common.c +++ b/drivers/irqchip/irq-gic-common.c @@ -74,20 +74,22 @@ void __init gic_dist_config(void __iomem *base, int gic_irqs, * Set all global interrupts to be level triggered, active low. */ for (i = 32; i < gic_irqs; i += 16) - writel_relaxed(0, base + GIC_DIST_CONFIG + i / 4); + writel_relaxed(GICD_INT_ACTLOW_LVLTRIG, + base + GIC_DIST_CONFIG + i / 4); /* * Set priority on all global interrupts. */ for (i = 32; i < gic_irqs; i += 4) - writel_relaxed(0xa0a0a0a0, base + GIC_DIST_PRI + i); + writel_relaxed(GICD_INT_DEF_PRI_X4, base + GIC_DIST_PRI + i); /* * Disable all interrupts. Leave the PPI and SGIs alone * as they are enabled by redistributor registers. */ for (i = 32; i < gic_irqs; i += 32) - writel_relaxed(0xffffffff, base + GIC_DIST_ENABLE_CLEAR + i / 8); + writel_relaxed(GICD_INT_EN_CLR_X32, + base + GIC_DIST_ENABLE_CLEAR + i / 8); if (sync_access) sync_access(); @@ -101,14 +103,15 @@ void gic_cpu_config(void __iomem *base, void (*sync_access)(void)) * Deal with the banked PPI and SGI interrupts - disable all * PPI interrupts, ensure all SGI interrupts are enabled. */ - writel_relaxed(0xffff0000, base + GIC_DIST_ENABLE_CLEAR); - writel_relaxed(0x0000ffff, base + GIC_DIST_ENABLE_SET); + writel_relaxed(GICD_INT_EN_CLR_PPI, base + GIC_DIST_ENABLE_CLEAR); + writel_relaxed(GICD_INT_EN_SET_SGI, base + GIC_DIST_ENABLE_SET); /* * Set priority on PPI and SGI interrupts */ for (i = 0; i < 32; i += 4) - writel_relaxed(0xa0a0a0a0, base + GIC_DIST_PRI + i * 4 / 4); + writel_relaxed(GICD_INT_DEF_PRI_X4, + base + GIC_DIST_PRI + i * 4 / 4); if (sync_access) sync_access(); diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 4b959e606fe8..35847453cecb 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -298,8 +298,8 @@ static void gic_handle_cascade_irq(unsigned int irq, struct irq_desc *desc) status = readl_relaxed(gic_data_cpu_base(chip_data) + GIC_CPU_INTACK); raw_spin_unlock(&irq_controller_lock); - gic_irq = (status & 0x3ff); - if (gic_irq == 1023) + gic_irq = (status & GICC_IAR_INT_ID_MASK); + if (gic_irq == GICC_INT_SPURIOUS) goto out; cascade_irq = irq_find_mapping(chip_data->domain, gic_irq); @@ -360,7 +360,7 @@ static void __init gic_dist_init(struct gic_chip_data *gic) unsigned int gic_irqs = gic->gic_irqs; void __iomem *base = gic_data_dist_base(gic); - writel_relaxed(0, base + GIC_DIST_CTRL); + writel_relaxed(GICD_DISABLE, base + GIC_DIST_CTRL); /* * Set all global interrupts to this CPU only. @@ -373,7 +373,7 @@ static void __init gic_dist_init(struct gic_chip_data *gic) gic_dist_config(base, gic_irqs, NULL); - writel_relaxed(1, base + GIC_DIST_CTRL); + writel_relaxed(GICD_ENABLE, base + GIC_DIST_CTRL); } static void gic_cpu_init(struct gic_chip_data *gic) @@ -400,8 +400,8 @@ static void gic_cpu_init(struct gic_chip_data *gic) gic_cpu_config(dist_base, NULL); - writel_relaxed(0xf0, base + GIC_CPU_PRIMASK); - writel_relaxed(1, base + GIC_CPU_CTRL); + writel_relaxed(GICC_INT_PRI_THRESHOLD, base + GIC_CPU_PRIMASK); + writel_relaxed(GICC_ENABLE, base + GIC_CPU_CTRL); } void gic_cpu_if_down(void) @@ -467,14 +467,14 @@ static void gic_dist_restore(unsigned int gic_nr) if (!dist_base) return; - writel_relaxed(0, dist_base + GIC_DIST_CTRL); + writel_relaxed(GICD_DISABLE, dist_base + GIC_DIST_CTRL); for (i = 0; i < DIV_ROUND_UP(gic_irqs, 16); i++) writel_relaxed(gic_data[gic_nr].saved_spi_conf[i], dist_base + GIC_DIST_CONFIG + i * 4); for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++) - writel_relaxed(0xa0a0a0a0, + writel_relaxed(GICD_INT_DEF_PRI_X4, dist_base + GIC_DIST_PRI + i * 4); for (i = 0; i < DIV_ROUND_UP(gic_irqs, 4); i++) @@ -485,7 +485,7 @@ static void gic_dist_restore(unsigned int gic_nr) writel_relaxed(gic_data[gic_nr].saved_spi_enable[i], dist_base + GIC_DIST_ENABLE_SET + i * 4); - writel_relaxed(1, dist_base + GIC_DIST_CTRL); + writel_relaxed(GICD_ENABLE, dist_base + GIC_DIST_CTRL); } static void gic_cpu_save(unsigned int gic_nr) @@ -539,10 +539,11 @@ static void gic_cpu_restore(unsigned int gic_nr) writel_relaxed(ptr[i], dist_base + GIC_DIST_CONFIG + i * 4); for (i = 0; i < DIV_ROUND_UP(32, 4); i++) - writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4); + writel_relaxed(GICD_INT_DEF_PRI_X4, + dist_base + GIC_DIST_PRI + i * 4); - writel_relaxed(0xf0, cpu_base + GIC_CPU_PRIMASK); - writel_relaxed(1, cpu_base + GIC_CPU_CTRL); + writel_relaxed(GICC_INT_PRI_THRESHOLD, cpu_base + GIC_CPU_PRIMASK); + writel_relaxed(GICC_ENABLE, cpu_base + GIC_CPU_CTRL); } static int gic_notifier(struct notifier_block *self, unsigned long cmd, void *v) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 45e2d8c15bd2..5cb9d41af5be 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -21,7 +21,10 @@ #define GIC_CPU_ACTIVEPRIO 0xd0 #define GIC_CPU_IDENT 0xfc +#define GICC_ENABLE 0x1 +#define GICC_INT_PRI_THRESHOLD 0xf0 #define GICC_IAR_INT_ID_MASK 0x3ff +#define GICC_INT_SPURIOUS 1023 #define GIC_DIST_CTRL 0x000 #define GIC_DIST_CTR 0x004 @@ -39,6 +42,18 @@ #define GIC_DIST_SGI_PENDING_CLEAR 0xf10 #define GIC_DIST_SGI_PENDING_SET 0xf20 +#define GICD_ENABLE 0x1 +#define GICD_DISABLE 0x0 +#define GICD_INT_ACTLOW_LVLTRIG 0x0 +#define GICD_INT_EN_CLR_X32 0xffffffff +#define GICD_INT_EN_SET_SGI 0x0000ffff +#define GICD_INT_EN_CLR_PPI 0xffff0000 +#define GICD_INT_DEF_PRI 0xa0 +#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ + (GICD_INT_DEF_PRI << 16) |\ + (GICD_INT_DEF_PRI << 8) |\ + GICD_INT_DEF_PRI) + #define GICH_HCR 0x0 #define GICH_VTR 0x4 #define GICH_VMCR 0x8 -- cgit v1.2.3 From 3228950621d92f0f212378f95a6998ef3a1be0bb Mon Sep 17 00:00:00 2001 From: Feng Kan Date: Wed, 30 Jul 2014 14:56:59 -0700 Subject: irqchip: gic: Preserve gic V2 bypass bits in cpu ctrl register This change is made to preserve the GIC v2 bypass bits in the GIC_CPU_CTRL register (also known as the GICC_CTLR register in spec). This code will preserve all bits configured by the bootloader regarding v2 bypass group bits. In the X-Gene platform, the bypass functionality is not used and bypass bits should not be changed by the kernel gic code as it could lead to incorrect behavior. Signed-off-by: Feng Kan Reviewed-by: Vinayak Kale Reviewed-by: Anup Patel Link: https://lkml.kernel.org/r/1406757419-18729-3-git-send-email-fkan@apm.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-gic.c | 25 ++++++++++++++++++++++--- include/linux/irqchip/arm-gic.h | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 35847453cecb..2500f6ba29e1 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -353,6 +353,21 @@ static u8 gic_get_cpumask(struct gic_chip_data *gic) return mask; } +static void gic_cpu_if_up(void) +{ + void __iomem *cpu_base = gic_data_cpu_base(&gic_data[0]); + u32 bypass = 0; + + /* + * Preserve bypass disable bits to be written back later + */ + bypass = readl(cpu_base + GIC_CPU_CTRL); + bypass &= GICC_DIS_BYPASS_MASK; + + writel_relaxed(bypass | GICC_ENABLE, cpu_base + GIC_CPU_CTRL); +} + + static void __init gic_dist_init(struct gic_chip_data *gic) { unsigned int i; @@ -401,13 +416,17 @@ static void gic_cpu_init(struct gic_chip_data *gic) gic_cpu_config(dist_base, NULL); writel_relaxed(GICC_INT_PRI_THRESHOLD, base + GIC_CPU_PRIMASK); - writel_relaxed(GICC_ENABLE, base + GIC_CPU_CTRL); + gic_cpu_if_up(); } void gic_cpu_if_down(void) { void __iomem *cpu_base = gic_data_cpu_base(&gic_data[0]); - writel_relaxed(0, cpu_base + GIC_CPU_CTRL); + u32 val = 0; + + val = readl(cpu_base + GIC_CPU_CTRL); + val &= ~GICC_ENABLE; + writel_relaxed(val, cpu_base + GIC_CPU_CTRL); } #ifdef CONFIG_CPU_PM @@ -543,7 +562,7 @@ static void gic_cpu_restore(unsigned int gic_nr) dist_base + GIC_DIST_PRI + i * 4); writel_relaxed(GICC_INT_PRI_THRESHOLD, cpu_base + GIC_CPU_PRIMASK); - writel_relaxed(GICC_ENABLE, cpu_base + GIC_CPU_CTRL); + gic_cpu_if_up(); } static int gic_notifier(struct notifier_block *self, unsigned long cmd, void *v) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 5cb9d41af5be..13eed92c7d24 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -25,6 +25,7 @@ #define GICC_INT_PRI_THRESHOLD 0xf0 #define GICC_IAR_INT_ID_MASK 0x3ff #define GICC_INT_SPURIOUS 1023 +#define GICC_DIS_BYPASS_MASK 0x1e0 #define GIC_DIST_CTRL 0x000 #define GIC_DIST_CTR 0x004 -- cgit v1.2.3 From 7b7d8982f0169d5ac67c6c2877449fb7f6968cac Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 27 Jul 2014 14:31:53 -0700 Subject: mtd: fix linux/mtd/nand.h kernel-doc warning Fix kernel-doc warning in : Warning(..//include/linux/mtd/nand.h:795): No description found for parameter 'ecc' Signed-off-by: Randy Dunlap Cc: David Woodhouse Cc: Brian Norris Cc: linux-mtd@lists.infradead.org Signed-off-by: Brian Norris --- include/linux/mtd/nand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 3083c53e0270..b7c11991cb09 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -766,6 +766,7 @@ struct nand_chip { * @options: stores various chip bit options * @id_len: The valid length of the @id. * @oobsize: OOB size + * @ecc: ECC correctability and step information from the datasheet. * @ecc.strength_ds: The ECC correctability from the datasheet, same as the * @ecc_strength_ds in nand_chip{}. * @ecc.step_ds: The ECC step required by the @ecc.strength_ds, same as the -- cgit v1.2.3 From 31f754628cbb12c983600f22d9f0fed50dfe2134 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 21 Jul 2014 19:07:22 -0700 Subject: mtd: use __packed shorthand Signed-off-by: Brian Norris --- drivers/mtd/mtdswap.c | 2 +- drivers/mtd/nand/sm_common.h | 2 +- include/linux/mtd/cfi.h | 22 +++++++++++----------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index 48cf6f98df44..fc8b3d16cce7 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -145,7 +145,7 @@ struct mtdswap_dev { struct mtdswap_oobdata { __le16 magic; __le32 count; -} __attribute__((packed)); +} __packed; #define MTDSWAP_MAGIC_CLEAN 0x2095 #define MTDSWAP_MAGIC_DIRTY (MTDSWAP_MAGIC_CLEAN + 1) diff --git a/drivers/mtd/nand/sm_common.h b/drivers/mtd/nand/sm_common.h index 00f4a83359b2..d3e028e58b0f 100644 --- a/drivers/mtd/nand/sm_common.h +++ b/drivers/mtd/nand/sm_common.h @@ -18,7 +18,7 @@ struct sm_oob { uint8_t ecc2[3]; uint8_t lba_copy2[2]; uint8_t ecc1[3]; -} __attribute__((packed)); +} __packed; /* one sector is always 512 bytes, but it can consist of two nand pages */ diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 37ef6b194089..299d7d31fe53 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -153,7 +153,7 @@ struct cfi_ident { uint16_t MaxBufWriteSize; uint8_t NumEraseRegions; uint32_t EraseRegionInfo[0]; /* Not host ordered */ -} __attribute__((packed)); +} __packed; /* Extended Query Structure for both PRI and ALT */ @@ -161,7 +161,7 @@ struct cfi_extquery { uint8_t pri[3]; uint8_t MajorVersion; uint8_t MinorVersion; -} __attribute__((packed)); +} __packed; /* Vendor-Specific PRI for Intel/Sharp Extended Command Set (0x0001) */ @@ -180,7 +180,7 @@ struct cfi_pri_intelext { uint8_t FactProtRegSize; uint8_t UserProtRegSize; uint8_t extra[0]; -} __attribute__((packed)); +} __packed; struct cfi_intelext_otpinfo { uint32_t ProtRegAddr; @@ -188,7 +188,7 @@ struct cfi_intelext_otpinfo { uint8_t FactProtRegSize; uint16_t UserGroups; uint8_t UserProtRegSize; -} __attribute__((packed)); +} __packed; struct cfi_intelext_blockinfo { uint16_t NumIdentBlocks; @@ -196,7 +196,7 @@ struct cfi_intelext_blockinfo { uint16_t MinBlockEraseCycles; uint8_t BitsPerCell; uint8_t BlockCap; -} __attribute__((packed)); +} __packed; struct cfi_intelext_regioninfo { uint16_t NumIdentPartitions; @@ -205,7 +205,7 @@ struct cfi_intelext_regioninfo { uint8_t NumOpAllowedSimEraMode; uint8_t NumBlockTypes; struct cfi_intelext_blockinfo BlockTypes[1]; -} __attribute__((packed)); +} __packed; struct cfi_intelext_programming_regioninfo { uint8_t ProgRegShift; @@ -214,7 +214,7 @@ struct cfi_intelext_programming_regioninfo { uint8_t Reserved2; uint8_t ControlInvalid; uint8_t Reserved3; -} __attribute__((packed)); +} __packed; /* Vendor-Specific PRI for AMD/Fujitsu Extended Command Set (0x0002) */ @@ -233,7 +233,7 @@ struct cfi_pri_amdstd { uint8_t VppMin; uint8_t VppMax; uint8_t TopBottom; -} __attribute__((packed)); +} __packed; /* Vendor-Specific PRI for Atmel chips (command set 0x0002) */ @@ -245,18 +245,18 @@ struct cfi_pri_atmel { uint8_t BottomBoot; uint8_t BurstMode; uint8_t PageMode; -} __attribute__((packed)); +} __packed; struct cfi_pri_query { uint8_t NumFields; uint32_t ProtField[1]; /* Not host ordered */ -} __attribute__((packed)); +} __packed; struct cfi_bri_query { uint8_t PageModeReadCap; uint8_t NumFields; uint32_t ConfField[1]; /* Not host ordered */ -} __attribute__((packed)); +} __packed; #define P_ID_NONE 0x0000 #define P_ID_INTEL_EXT 0x0001 -- cgit v1.2.3 From 53f3cc46336b9e514c98556b4a009a69ed808d3b Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Sat, 23 Aug 2014 14:45:47 +0400 Subject: pata_platform: Remove useless irq_flags field IRQ flags can be obtained from resource structure, there are no need to use additional field in the platform_data to store these values. This patch removes this field and convert existing users of this driver to use IRQ flags from the resources. Signed-off-by: Alexander Shiyan Signed-off-by: Tejun Heo --- arch/blackfin/mach-bf537/boards/cm_bf537e.c | 3 +-- arch/blackfin/mach-bf537/boards/cm_bf537u.c | 3 +-- arch/blackfin/mach-bf537/boards/stamp.c | 3 +-- arch/blackfin/mach-bf537/boards/tcm_bf537.c | 3 +-- arch/blackfin/mach-bf561/boards/cm_bf561.c | 3 +-- drivers/ata/pata_of_platform.c | 2 -- drivers/ata/pata_platform.c | 4 +--- include/linux/ata_platform.h | 5 ----- 8 files changed, 6 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/blackfin/mach-bf537/boards/cm_bf537e.c b/arch/blackfin/mach-bf537/boards/cm_bf537e.c index 1e7290ef3525..1e1014df5e9e 100644 --- a/arch/blackfin/mach-bf537/boards/cm_bf537e.c +++ b/arch/blackfin/mach-bf537/boards/cm_bf537e.c @@ -733,7 +733,6 @@ static struct platform_device bfin_mac_device = { static struct pata_platform_info bfin_pata_platform_data = { .ioport_shift = 2, - .irq_type = IRQF_TRIGGER_HIGH, }; static struct resource bfin_pata_resources[] = { @@ -750,7 +749,7 @@ static struct resource bfin_pata_resources[] = { { .start = PATA_INT, .end = PATA_INT, - .flags = IORESOURCE_IRQ, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, }, }; diff --git a/arch/blackfin/mach-bf537/boards/cm_bf537u.c b/arch/blackfin/mach-bf537/boards/cm_bf537u.c index c7495dc74690..d056db9e5592 100644 --- a/arch/blackfin/mach-bf537/boards/cm_bf537u.c +++ b/arch/blackfin/mach-bf537/boards/cm_bf537u.c @@ -587,7 +587,6 @@ static struct platform_device bfin_mac_device = { static struct pata_platform_info bfin_pata_platform_data = { .ioport_shift = 2, - .irq_type = IRQF_TRIGGER_HIGH, }; static struct resource bfin_pata_resources[] = { @@ -604,7 +603,7 @@ static struct resource bfin_pata_resources[] = { { .start = PATA_INT, .end = PATA_INT, - .flags = IORESOURCE_IRQ, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, }, }; diff --git a/arch/blackfin/mach-bf537/boards/stamp.c b/arch/blackfin/mach-bf537/boards/stamp.c index de19b8a56007..88a19fc9844d 100644 --- a/arch/blackfin/mach-bf537/boards/stamp.c +++ b/arch/blackfin/mach-bf537/boards/stamp.c @@ -2462,7 +2462,6 @@ static struct platform_device bfin_sport0_device = { #define PATA_INT IRQ_PF5 static struct pata_platform_info bfin_pata_platform_data = { .ioport_shift = 1, - .irq_flags = IRQF_TRIGGER_HIGH, }; static struct resource bfin_pata_resources[] = { @@ -2479,7 +2478,7 @@ static struct resource bfin_pata_resources[] = { { .start = PATA_INT, .end = PATA_INT, - .flags = IORESOURCE_IRQ, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, }, }; #elif defined(CF_IDE_NAND_CARD_USE_CF_IN_COMMON_MEMORY_MODE) diff --git a/arch/blackfin/mach-bf537/boards/tcm_bf537.c b/arch/blackfin/mach-bf537/boards/tcm_bf537.c index 6b988ad653d8..ed309c9a62b6 100644 --- a/arch/blackfin/mach-bf537/boards/tcm_bf537.c +++ b/arch/blackfin/mach-bf537/boards/tcm_bf537.c @@ -589,7 +589,6 @@ static struct platform_device bfin_mac_device = { static struct pata_platform_info bfin_pata_platform_data = { .ioport_shift = 2, - .irq_type = IRQF_TRIGGER_HIGH, }; static struct resource bfin_pata_resources[] = { @@ -606,7 +605,7 @@ static struct resource bfin_pata_resources[] = { { .start = PATA_INT, .end = PATA_INT, - .flags = IORESOURCE_IRQ, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, }, }; diff --git a/arch/blackfin/mach-bf561/boards/cm_bf561.c b/arch/blackfin/mach-bf561/boards/cm_bf561.c index e862f7823e68..c6db52ba3a06 100644 --- a/arch/blackfin/mach-bf561/boards/cm_bf561.c +++ b/arch/blackfin/mach-bf561/boards/cm_bf561.c @@ -354,7 +354,6 @@ static struct platform_device bfin_sir0_device = { static struct pata_platform_info bfin_pata_platform_data = { .ioport_shift = 2, - .irq_type = IRQF_TRIGGER_HIGH, }; static struct resource bfin_pata_resources[] = { @@ -371,7 +370,7 @@ static struct resource bfin_pata_resources[] = { { .start = PATA_INT, .end = PATA_INT, - .flags = IORESOURCE_IRQ, + .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, }, }; diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c index 6af1c9b9a464..64965398914a 100644 --- a/drivers/ata/pata_of_platform.c +++ b/drivers/ata/pata_of_platform.c @@ -43,8 +43,6 @@ static int pata_of_platform_probe(struct platform_device *ofdev) } irq_res = platform_get_resource(ofdev, IORESOURCE_IRQ, 0); - if (irq_res) - irq_res->flags = 0; prop = of_get_property(dn, "reg-shift", NULL); if (prop) diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c index a5579b55e332..f8cff3e247c5 100644 --- a/drivers/ata/pata_platform.c +++ b/drivers/ata/pata_platform.c @@ -118,7 +118,7 @@ int __pata_platform_probe(struct device *dev, struct resource *io_res, */ if (irq_res && irq_res->start > 0) { irq = irq_res->start; - irq_flags = irq_res->flags; + irq_flags = irq_res->flags & IRQF_TRIGGER_MASK; } /* @@ -213,8 +213,6 @@ static int pata_platform_probe(struct platform_device *pdev) * And the IRQ */ irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (irq_res) - irq_res->flags = pp_info ? pp_info->irq_flags : 0; return __pata_platform_probe(&pdev->dev, io_res, ctl_res, irq_res, pp_info ? pp_info->ioport_shift : 0, diff --git a/include/linux/ata_platform.h b/include/linux/ata_platform.h index b9fde17f767c..5c618a084225 100644 --- a/include/linux/ata_platform.h +++ b/include/linux/ata_platform.h @@ -8,11 +8,6 @@ struct pata_platform_info { * spacing used by ata_std_ports(). */ unsigned int ioport_shift; - /* - * Indicate platform specific irq types and initial - * IRQ flags when call request_irq() - */ - unsigned int irq_flags; }; extern int __pata_platform_probe(struct device *dev, -- cgit v1.2.3 From 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Aug 2014 11:48:26 -0400 Subject: perf: Add PERF_EVENT_STATE_EXIT state for events with exited task Adding new perf event state to indicate that the monitored task has exited. In this case the event stays alive until the owner task exits or close the event fd while providing the last data through the read syscall and ring buffer. Instead it needs to propagate the error info (monitored task has died) via poll and read syscalls by returning POLLHUP and 0 respectively. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140811120102.GY9918@twins.programming.kicks-ass.net Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jean Pihet Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-t5y3w8jjx6tfo5w8y6oajsjq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 1 + kernel/events/core.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f0a1036b1911..893a0d07986f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -269,6 +269,7 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { + PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, diff --git a/kernel/events/core.c b/kernel/events/core.c index 4575dd6e59ea..d8cb4d21a346 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3600,7 +3600,8 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ - if (event->state == PERF_EVENT_STATE_ERROR) + if ((event->state == PERF_EVENT_STATE_ERROR) || + (event->state == PERF_EVENT_STATE_EXIT)) return 0; if (count < event->read_size) @@ -3630,6 +3631,10 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLLHUP; poll_wait(file, &event->waitq, wait); + + if (event->state == PERF_EVENT_STATE_EXIT) + return events; + /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. @@ -7588,6 +7593,9 @@ __perf_event_exit_task(struct perf_event *child_event, if (child_event->parent) { sync_child_event(child_event, child); free_event(child_event); + } else { + child_event->state = PERF_EVENT_STATE_EXIT; + perf_event_wakeup(child_event); } } -- cgit v1.2.3 From b4bbb107d73bbc0d92c9ae7fd8e69580aa9381e7 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 27 Jun 2014 11:56:58 +0200 Subject: dma-mapping: Provide write-combine allocations Provide an implementation for dma_{alloc,free,mmap}_writecombine() when the architecture supports DMA attributes. Signed-off-by: Thierry Reding Acked-by: Arnd Bergmann Signed-off-by: Marek Szyprowski --- arch/arm/include/asm/dma-mapping.h | 16 ---------------- include/asm-generic/dma-mapping-common.h | 8 -------- include/linux/dma-mapping.h | 26 ++++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index c45b61a4b4a5..85738b200023 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -265,22 +265,6 @@ extern int arm_dma_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs); -static inline void *dma_alloc_writecombine(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ - DEFINE_DMA_ATTRS(attrs); - dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); - return dma_alloc_attrs(dev, size, dma_handle, flag, &attrs); -} - -static inline void dma_free_writecombine(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - DEFINE_DMA_ATTRS(attrs); - dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); - return dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs); -} - /* * This can be called during early boot to increase the size of the atomic * coherent DMA pool above the default value of 256KiB. It must be called diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h index de8bf89940f8..d137431bf26f 100644 --- a/include/asm-generic/dma-mapping-common.h +++ b/include/asm-generic/dma-mapping-common.h @@ -205,14 +205,6 @@ dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, NULL) -static inline int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) -{ - DEFINE_DMA_ATTRS(attrs); - dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); - return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs); -} - int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 931b70986272..d5d388160f42 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -263,6 +263,32 @@ struct dma_attrs; #define dma_unmap_sg_attrs(dev, sgl, nents, dir, attrs) \ dma_unmap_sg(dev, sgl, nents, dir) +#else +static inline void *dma_alloc_writecombine(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t gfp) +{ + DEFINE_DMA_ATTRS(attrs); + dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); + return dma_alloc_attrs(dev, size, dma_addr, gfp, &attrs); +} + +static inline void dma_free_writecombine(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr) +{ + DEFINE_DMA_ATTRS(attrs); + dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); + return dma_free_attrs(dev, size, cpu_addr, dma_addr, &attrs); +} + +static inline int dma_mmap_writecombine(struct device *dev, + struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, + size_t size) +{ + DEFINE_DMA_ATTRS(attrs); + dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); + return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs); +} #endif /* CONFIG_HAVE_DMA_ATTRS */ #ifdef CONFIG_NEED_DMA_MAP_STATE -- cgit v1.2.3 From 4a32fea9d78f2d2315c0072757b197d5a304dc8b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:27 -0500 Subject: scheduler: Replace __get_cpu_var with this_cpu_ptr Convert all uses of __get_cpu_var for address calculation to use this_cpu_ptr instead. [Uses of __get_cpu_var with cpumask_var_t are no longer handled by this patch] Cc: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/kernel_stat.h | 4 ++-- kernel/events/callchain.c | 4 ++-- kernel/events/core.c | 24 ++++++++++++------------ kernel/sched/sched.h | 4 ++-- kernel/taskstats.c | 2 +- kernel/time/tick-sched.c | 4 ++-- kernel/user-return-notifier.c | 4 ++-- 7 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index ecbc52f9ff77..8422b4ed6882 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -44,8 +44,8 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); /* Must have preemption disabled for this to be meaningful. */ -#define kstat_this_cpu (&__get_cpu_var(kstat)) -#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat)) +#define kstat_this_cpu this_cpu_ptr(&kstat) +#define kcpustat_this_cpu this_cpu_ptr(&kernel_cpustat) #define kstat_cpu(cpu) per_cpu(kstat, cpu) #define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 97b67df8fbfe..c4f63e68a35c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) int cpu; struct callchain_cpus_entries *entries; - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; @@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) static void put_callchain_entry(int rctx) { - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); + put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..4d44e40a0483 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -239,7 +239,7 @@ static void perf_duration_warn(struct irq_work *w) u64 avg_local_sample_len; u64 local_samples_len; - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; printk_ratelimited(KERN_WARNING @@ -261,10 +261,10 @@ void perf_sample_event_took(u64 sample_len_ns) return; /* decay the counter by 1 average sample */ - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; local_samples_len += sample_len_ns; - __get_cpu_var(running_sample_length) = local_samples_len; + __this_cpu_write(running_sample_length, local_samples_len); /* * note: this will be biased artifically low until we have @@ -877,7 +877,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list); static void perf_pmu_rotate_start(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = &__get_cpu_var(rotation_list); + struct list_head *head = this_cpu_ptr(&rotation_list); WARN_ON(!irqs_disabled()); @@ -2389,7 +2389,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_out(task, next); } @@ -2632,11 +2632,11 @@ void __perf_event_task_sched_in(struct task_struct *prev, * to check if we have to switch in PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); /* check for system-wide branch_stack events */ - if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) perf_branch_stack_sched_in(prev, task); } @@ -2891,7 +2891,7 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = &__get_cpu_var(rotation_list); + struct list_head *head = this_cpu_ptr(&rotation_list); struct perf_cpu_context *cpuctx, *tmp; struct perf_event_context *ctx; int throttled; @@ -5671,7 +5671,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; @@ -5690,7 +5690,7 @@ end: int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); return get_recursion_context(swhash->recursion); } @@ -5698,7 +5698,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); inline void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } @@ -5727,7 +5727,7 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_add(struct perf_event *event, int flags) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..77d92f8130e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -650,10 +650,10 @@ static inline int cpu_of(struct rq *rq) DECLARE_PER_CPU(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) +#define this_rq() this_cpu_ptr(&runqueues) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) +#define raw_rq() raw_cpu_ptr(&runqueues) static inline u64 rq_clock(struct rq *rq) { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13d2f7cd65db..b312fcc73024 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) fill_tgid_exit(tsk); } - listeners = __this_cpu_ptr(&listener_array); + listeners = raw_cpu_ptr(&listener_array); if (list_empty(&listeners->list)) return; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 73f90932282b..3cadc112519f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -924,7 +924,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void tick_nohz_idle_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -1041,7 +1041,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) static inline void tick_nohz_irq_enter(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 394f70b17162..9586b670a5b2 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); void user_return_notifier_register(struct user_return_notifier *urn) { set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); - hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); + hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list)); } EXPORT_SYMBOL_GPL(user_return_notifier_register); @@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); void user_return_notifier_unregister(struct user_return_notifier *urn) { hlist_del(&urn->link); - if (hlist_empty(&__get_cpu_var(return_notifier_list))) + if (hlist_empty(this_cpu_ptr(&return_notifier_list))) clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); } EXPORT_SYMBOL_GPL(user_return_notifier_unregister); -- cgit v1.2.3 From 47405a253da4d8ca4b18ad537423083fdd790440 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:56 -0500 Subject: percpu: Remove __this_cpu_ptr The __this_cpu_ptr macro is no longer in use so drop it. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/percpu-defs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index cfd56046ecec..420032d41d27 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -257,9 +257,6 @@ do { \ #define __raw_get_cpu_var(var) (*raw_cpu_ptr(&(var))) #define __get_cpu_var(var) (*this_cpu_ptr(&(var))) -/* keep until we have removed all uses of __this_cpu_ptr */ -#define __this_cpu_ptr(ptr) raw_cpu_ptr(ptr) - /* * Must be an lvalue. Since @var must be a simple identifier, * we force a syntax error here if it isn't. -- cgit v1.2.3 From 4ba2968420fa9d0604b6a6a5c61bfa8d0fa84ae0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 26 Aug 2014 19:12:21 -0500 Subject: percpu: Resolve ambiguities in __get_cpu_var/cpumask_var_t __get_cpu_var can paper over differences in the definitions of cpumask_var_t and either use the address of the cpumask variable directly or perform a fetch of the address of the struct cpumask allocated elsewhere. This is important particularly when using per cpu cpumask_var_t declarations because in one case we have an offset into a per cpu area to handle and in the other case we need to fetch a pointer from the offset. This patch introduces a new macro this_cpu_cpumask_var_ptr() that is defined where cpumask_var_t is defined and performs the proper actions. All use cases where __get_cpu_var is used with cpumask_var_t are converted to the use of this_cpu_cpumask_var_ptr(). Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/perf_event_p4.h | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 3 +-- arch/x86/oprofile/op_model_p4.c | 2 +- include/linux/cpumask.h | 11 +++++++++++ kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- 7 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 85e13ccf15c4..d725382c2ae0 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -189,7 +189,7 @@ static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP if (smp_num_siblings == 2) - return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6ce600f9bc78..1f5d5f2ffae6 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -42,8 +42,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) * We are to modify mask, so we need an own copy * and be sure it's manipulated with irq off. */ - ipi_mask_ptr = __raw_get_cpu_var(ipi_mask); - cpumask_copy(ipi_mask_ptr, mask); + ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask); /* * The idea is to send one IPI per cluster. diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 98ab13058f89..ad1d91f475ab 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -372,7 +372,7 @@ static unsigned int get_stagger(void) { #ifdef CONFIG_SMP int cpu = smp_processor_id(); - return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; } diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2997af6d2ccd..0a9a6da21e74 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -666,10 +666,19 @@ static inline size_t cpumask_size(void) * * This code makes NR_CPUS length memcopy and brings to a memory corruption. * cpumask_copy() provide safe copy functionality. + * + * Note that there is another evil here: If you define a cpumask_var_t + * as a percpu variable then the way to obtain the address of the cpumask + * structure differently influences what this_cpu_* operation needs to be + * used. Please use this_cpu_cpumask_var_t in those cases. The direct use + * of this_cpu_ptr() or this_cpu_read() will lead to failures when the + * other type of cpumask_var_t implementation is configured. */ #ifdef CONFIG_CPUMASK_OFFSTACK typedef struct cpumask *cpumask_var_t; +#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) + bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); @@ -681,6 +690,8 @@ void free_bootmem_cpumask_var(cpumask_var_t mask); #else typedef struct cpumask cpumask_var_t[1]; +#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) + static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..4a608cfaecbd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1158,7 +1158,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); int best_cpu, cpu = task_cpu(task); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..197d659c144c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6539,7 +6539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_mask); + struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { .sd = sd, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..a4c50fce9b90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1526,7 +1526,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); -- cgit v1.2.3 From abdc08a3a263a20e49534a36291d657bf53dda5b Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Tue, 19 Aug 2014 10:06:09 -0700 Subject: gpio: change gpiochip_request_own_desc() prototype The current prototype of gpiochip_request_own_desc() requires to obtain a pointer to a descriptor. This is in contradiction to all other GPIO request schemes, and imposes an extra step of obtaining a descriptor to drivers. Most drivers actually cannot even perform that step since the function that does it (gpichip_get_desc()) is gpiolib-private. Change gpiochip_request_own_desc() to return a descriptor from a (chip, hwnum) tuple and update users of this function (currently gpiolib-acpi only). Signed-off-by: Alexandre Courbot Tested-by: Mika Westerberg Signed-off-by: Linus Walleij --- Documentation/gpio/driver.txt | 3 ++- drivers/gpio/gpiolib-acpi.c | 20 +++----------------- drivers/gpio/gpiolib.c | 18 ++++++++++++++---- include/linux/gpio/driver.h | 3 ++- 4 files changed, 21 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/gpio/driver.txt b/Documentation/gpio/driver.txt index 18790c237977..23b751a10d7b 100644 --- a/Documentation/gpio/driver.txt +++ b/Documentation/gpio/driver.txt @@ -178,7 +178,8 @@ does not help since it pins the module to the kernel forever (it calls try_module_get()). A GPIO driver can use the following functions instead to request and free descriptors without being pinned to the kernel forever. - int gpiochip_request_own_desc(struct gpio_desc *desc, const char *label) + struct gpio_desc *gpiochip_request_own_desc(struct gpio_desc *desc, + const char *label) void gpiochip_free_own_desc(struct gpio_desc *desc) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 84540025aa08..f9103e72e2a4 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -145,14 +145,8 @@ static acpi_status acpi_gpiochip_request_interrupt(struct acpi_resource *ares, if (!handler) return AE_BAD_PARAMETER; - desc = gpiochip_get_desc(chip, pin); + desc = gpiochip_request_own_desc(chip, pin, "ACPI:Event"); if (IS_ERR(desc)) { - dev_err(chip->dev, "Failed to get GPIO descriptor\n"); - return AE_ERROR; - } - - ret = gpiochip_request_own_desc(desc, "ACPI:Event"); - if (ret) { dev_err(chip->dev, "Failed to request GPIO\n"); return AE_ERROR; } @@ -420,22 +414,14 @@ acpi_gpio_adr_space_handler(u32 function, acpi_physical_address address, } } if (!found) { - int ret; - - desc = gpiochip_get_desc(chip, pin); + desc = gpiochip_request_own_desc(chip, pin, + "ACPI:OpRegion"); if (IS_ERR(desc)) { status = AE_ERROR; mutex_unlock(&achip->conn_lock); goto out; } - ret = gpiochip_request_own_desc(desc, "ACPI:OpRegion"); - if (ret) { - status = AE_ERROR; - mutex_unlock(&achip->conn_lock); - goto out; - } - switch (agpio->io_restriction) { case ACPI_IO_RESTRICT_INPUT: gpiod_direction_input(desc); diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 15cc0bb65dda..a5831d6a9b91 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -895,12 +895,22 @@ EXPORT_SYMBOL_GPL(gpiochip_is_requested); * allows the GPIO chip module to be unloaded as needed (we assume that the * GPIO chip driver handles freeing the GPIOs it has requested). */ -int gpiochip_request_own_desc(struct gpio_desc *desc, const char *label) +struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *chip, u16 hwnum, + const char *label) { - if (!desc || !desc->chip) - return -EINVAL; + struct gpio_desc *desc = gpiochip_get_desc(chip, hwnum); + int err; + + if (IS_ERR(desc)) { + chip_err(chip, "failed to get GPIO descriptor\n"); + return desc; + } + + err = __gpiod_request(desc, label); + if (err < 0) + return ERR_PTR(err); - return __gpiod_request(desc, label); + return desc; } EXPORT_SYMBOL_GPL(gpiochip_request_own_desc); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e78a2373e374..a2de58fffd19 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -166,7 +166,8 @@ int gpiochip_irqchip_add(struct gpio_chip *gpiochip, #endif /* CONFIG_GPIO_IRQCHIP */ -int gpiochip_request_own_desc(struct gpio_desc *desc, const char *label); +struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *chip, u16 hwnum, + const char *label); void gpiochip_free_own_desc(struct gpio_desc *desc); #else /* CONFIG_GPIOLIB */ -- cgit v1.2.3 From 068765ba7987e73d4381edfe47b70aa121c7155c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 1 Sep 2014 13:47:49 +0200 Subject: PM / sleep: Mechanism for aborting system suspends unconditionally It sometimes may be necessary to abort a system suspend in progress or wake up the system from suspend-to-idle even if the pm_wakeup_event()/pm_stay_awake() mechanism is not enabled. For this purpose, introduce a new global variable pm_abort_suspend and make pm_wakeup_pending() check its value. Also add routines for manipulating that variable. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 16 +++++++++++++++- include/linux/suspend.h | 4 ++++ kernel/power/process.c | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index eb1bd2ecad8b..c2744b30d5d9 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -24,6 +24,9 @@ */ bool events_check_enabled __read_mostly; +/* If set and the system is suspending, terminate the suspend. */ +static bool pm_abort_suspend __read_mostly; + /* * Combined counters of registered wakeup events and wakeup events in progress. * They need to be modified together atomically, so it's better to use one @@ -719,7 +722,18 @@ bool pm_wakeup_pending(void) pm_print_active_wakeup_sources(); } - return ret; + return ret || pm_abort_suspend; +} + +void pm_system_wakeup(void) +{ + pm_abort_suspend = true; + freeze_wake(); +} + +void pm_wakeup_clear(void) +{ + pm_abort_suspend = false; } /** diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 519064e0c943..06a9910827c2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -371,6 +371,8 @@ extern int unregister_pm_notifier(struct notifier_block *nb); extern bool events_check_enabled; extern bool pm_wakeup_pending(void); +extern void pm_system_wakeup(void); +extern void pm_wakeup_clear(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); @@ -418,6 +420,8 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_wakeup_pending(void) { return false; } +static inline void pm_system_wakeup(void) {} +static inline void pm_wakeup_clear(void) {} static inline void lock_system_sleep(void) {} static inline void unlock_system_sleep(void) {} diff --git a/kernel/power/process.c b/kernel/power/process.c index 4ee194eb524b..7b323221b9ee 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -129,6 +129,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); + pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); -- cgit v1.2.3 From cab303be91dc47942bc25de33dc1140123540800 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Aug 2014 11:44:31 +0200 Subject: genirq: Add sanity checks for PM options on shared interrupt lines Account the IRQF_NO_SUSPEND and IRQF_RESUME_EARLY actions on shared interrupt lines and yell loudly if there is a mismatch. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- include/linux/irqdesc.h | 10 ++++++++++ kernel/irq/internals.h | 10 ++++++++++ kernel/irq/manage.c | 4 ++++ kernel/irq/pm.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 472c021a2d4f..cb1a31e448ae 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -36,6 +36,11 @@ struct irq_desc; * @threads_oneshot: bitfield to handle shared oneshot threads * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers + * @nr_actions: number of installed actions on this descriptor + * @no_suspend_depth: number of irqactions on a irq descriptor with + * IRQF_NO_SUSPEND set + * @force_resume_depth: number of irqactions on a irq descriptor with + * IRQF_FORCE_RESUME set * @dir: /proc/irq/ procfs entry * @name: flow handler name for /proc/interrupts output */ @@ -68,6 +73,11 @@ struct irq_desc { unsigned long threads_oneshot; atomic_t threads_active; wait_queue_head_t wait_for_threads; +#ifdef CONFIG_PM_SLEEP + unsigned int nr_actions; + unsigned int no_suspend_depth; + unsigned int force_resume_depth; +#endif #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index af2821178900..c402502a5111 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -194,3 +194,13 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } + +#ifdef CONFIG_PM_SLEEP +void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); +#else +static inline void +irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } +static inline void +irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } +#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa564e8db996..0a9104b4608b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1200,6 +1200,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; *old_ptr = new; + irq_pm_install_action(desc, new); + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -1318,6 +1320,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; + irq_pm_remove_action(desc, action); + /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_shutdown(desc); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index b84141dcee5e..1b1b67a73218 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -13,6 +13,42 @@ #include "internals.h" +/* + * Called from __setup_irq() with desc->lock held after @action has + * been installed in the action chain. + */ +void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) +{ + desc->nr_actions++; + + if (action->flags & IRQF_FORCE_RESUME) + desc->force_resume_depth++; + + WARN_ON_ONCE(desc->force_resume_depth && + desc->force_resume_depth != desc->nr_actions); + + if (action->flags & IRQF_NO_SUSPEND) + desc->no_suspend_depth++; + + WARN_ON_ONCE(desc->no_suspend_depth && + desc->no_suspend_depth != desc->nr_actions); +} + +/* + * Called from __free_irq() with desc->lock held after @action has + * been removed from the action chain. + */ +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) +{ + desc->nr_actions--; + + if (action->flags & IRQF_FORCE_RESUME) + desc->force_resume_depth--; + + if (action->flags & IRQF_NO_SUSPEND) + desc->no_suspend_depth--; +} + static void suspend_device_irq(struct irq_desc *desc, int irq) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) -- cgit v1.2.3 From b76f16748fa61801b1a1fd3ffb6f25ee228a35e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Aug 2014 13:54:09 +0200 Subject: genirq: Mark wakeup sources as armed on suspend This allows us to utilize this information in the irq_may_run() check without adding another conditional to the fast path. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- include/linux/irq.h | 8 ++++++++ kernel/irq/pm.c | 5 +++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 62af59242ddc..03f48d936f66 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -173,6 +173,7 @@ struct irq_data { * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt + * IRQD_WAKEUP_ARMED - Wakeup mode armed */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -186,6 +187,7 @@ enum { IRQD_IRQ_DISABLED = (1 << 16), IRQD_IRQ_MASKED = (1 << 17), IRQD_IRQ_INPROGRESS = (1 << 18), + IRQD_WAKEUP_ARMED = (1 << 19), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -257,6 +259,12 @@ static inline bool irqd_irq_inprogress(struct irq_data *d) return d->state_use_accessors & IRQD_IRQ_INPROGRESS; } +static inline bool irqd_is_wakeup_armed(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_WAKEUP_ARMED; +} + + /* * Functions for chained handlers which can be enabled/disabled by the * standard disable_irq/enable_irq calls. Must be called with diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cf0ce0163db9..766930eaeed9 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -54,6 +54,9 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) if (!desc->action || desc->no_suspend_depth) return false; + if (irqd_is_wakeup_set(&desc->irq_data)) + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED; __disable_irq(desc, irq); @@ -101,6 +104,8 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); static void resume_irq(struct irq_desc *desc, int irq) { + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + if (desc->istate & IRQS_SUSPENDED) goto resume; -- cgit v1.2.3 From 9ce7a25849e80cfb264f4995f832b932c1987e1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Aug 2014 14:00:16 +0200 Subject: genirq: Simplify wakeup mechanism Currently we suspend wakeup interrupts by lazy disabling them and check later whether the interrupt has fired, but that's not sufficient for suspend to idle as there is no way to check that once we transitioned into the CPU idle state. So we change the mechanism in the following way: 1) Leave the wakeup interrupts enabled across suspend 2) Add a check to irq_may_run() which is called at the beginning of each flow handler whether the interrupt is an armed wakeup source. This check is basically free as it just extends the existing check for IRQD_IRQ_INPROGRESS. So no new conditional in the hot path. If the IRQD_WAKEUP_ARMED flag is set, then the interrupt is disabled, marked as pending/suspended and the pm core is notified about the wakeup event. Signed-off-by: Thomas Gleixner [ rjw: syscore.c and put irq_pm_check_wakeup() into pm.c ] Signed-off-by: Rafael J. Wysocki --- drivers/base/syscore.c | 7 +++--- include/linux/interrupt.h | 5 ----- kernel/irq/chip.c | 20 ++++++++++++++++- kernel/irq/internals.h | 2 ++ kernel/irq/pm.c | 55 +++++++++++++++++++++++++---------------------- 5 files changed, 53 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index dbb8350ea8dc..8d98a329f6ea 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include static LIST_HEAD(syscore_ops_list); @@ -54,9 +54,8 @@ int syscore_suspend(void) pr_debug("Checking wakeup interrupts\n"); /* Return error code if there are any wakeup interrupts pending. */ - ret = check_wakeup_irqs(); - if (ret) - return ret; + if (pm_wakeup_pending()) + return -EBUSY; WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core suspend.\n"); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 698ad053d064..69517a24bc50 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -193,11 +193,6 @@ extern void irq_wake_thread(unsigned int irq, void *dev_id); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); -#ifdef CONFIG_PM_SLEEP -extern int check_wakeup_irqs(void); -#else -static inline int check_wakeup_irqs(void) { return 0; } -#endif /** * struct irq_affinity_notify - context for notification of IRQ affinity changes diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6baf86085571..e7917ff8a486 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -344,8 +344,26 @@ static bool irq_check_poll(struct irq_desc *desc) static bool irq_may_run(struct irq_desc *desc) { - if (!irqd_irq_inprogress(&desc->irq_data)) + unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; + + /* + * If the interrupt is not in progress and is not an armed + * wakeup interrupt, proceed. + */ + if (!irqd_has_set(&desc->irq_data, mask)) return true; + + /* + * If the interrupt is an armed wakeup source, mark it pending + * and suspended, disable it and notify the pm core about the + * event. + */ + if (irq_pm_check_wakeup(desc)) + return false; + + /* + * Handle a potential concurrent poll on a different core. + */ return irq_check_poll(desc); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c402502a5111..4332d766619d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -196,9 +196,11 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d } #ifdef CONFIG_PM_SLEEP +bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); #else +static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } static inline void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 766930eaeed9..3ca532592704 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -9,10 +9,24 @@ #include #include #include +#include #include #include "internals.h" +bool irq_pm_check_wakeup(struct irq_desc *desc) +{ + if (irqd_is_wakeup_armed(&desc->irq_data)) { + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; + desc->depth++; + irq_disable(desc); + pm_system_wakeup(); + return true; + } + return false; +} + /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. @@ -54,8 +68,16 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) if (!desc->action || desc->no_suspend_depth) return false; - if (irqd_is_wakeup_set(&desc->irq_data)) + if (irqd_is_wakeup_set(&desc->irq_data)) { irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + /* + * We return true here to force the caller to issue + * synchronize_irq(). We need to make sure that the + * IRQD_WAKEUP_ARMED is visible before we return from + * suspend_device_irqs(). + */ + return true; + } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc, irq); @@ -79,9 +101,13 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except - * for those which are unused and those which are marked as not + * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND - * set. + * set and those which are marked as active wakeup sources. + * + * The active wakeup sources are handled by the flow handler entry + * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the + * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { @@ -173,26 +199,3 @@ void resume_device_irqs(void) resume_irqs(false); } EXPORT_SYMBOL_GPL(resume_device_irqs); - -/** - * check_wakeup_irqs - check if any wake-up interrupts are pending - */ -int check_wakeup_irqs(void) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - /* - * Only interrupts which are marked as wakeup source - * and have not been disabled before the suspend check - * can abort suspend. - */ - if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->depth == 1 && desc->istate & IRQS_PENDING) - return -EBUSY; - } - } - - return 0; -} -- cgit v1.2.3 From fbff66108352d19b5cffa7dce26d7638c9dd4d70 Mon Sep 17 00:00:00 2001 From: Mark Rustad Date: Thu, 28 Aug 2014 04:43:09 -0700 Subject: security: Silence shadow warning Renaming an unused formal parameter in the static inline function security_inode_init_security eliminates many W=2 warnings. Signed-off-by: Mark Rustad Signed-off-by: Jeff Kirsher Signed-off-by: James Morris --- include/linux/security.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 623f90e5f38d..3b3aeb1b74cb 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2108,7 +2108,7 @@ static inline int security_dentry_init_security(struct dentry *dentry, static inline int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, - const initxattrs initxattrs, + const initxattrs xattrs, void *fs_data) { return 0; -- cgit v1.2.3 From 5835d96e9ce4efdba8c6cefffc2f1575925456de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:04 -0400 Subject: percpu: implement [__]alloc_percpu_gfp() Now that pcpu_alloc_area() can allocate only from populated areas, it's easy to add atomic allocation support to [__]alloc_percpu(). Update pcpu_alloc() so that it accepts @gfp and skips all the blocking operations and allocates only from the populated areas if @gfp doesn't contain GFP_KERNEL. New interface functions [__]alloc_percpu_gfp() are added. While this means that atomic allocations are possible, this isn't complete yet as there's no mechanism to ensure that certain amount of populated areas is kept available and atomic allocations may keep failing under certain conditions. Signed-off-by: Tejun Heo --- include/linux/percpu.h | 9 +++++-- mm/percpu.c | 64 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 6f61b61b7996..d1b416da25ed 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -122,11 +122,16 @@ extern void __init setup_per_cpu_areas(void); #endif extern void __init percpu_init_late(void); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); extern void __percpu *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -#define alloc_percpu(type) \ - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) +#define alloc_percpu_gfp(type, gfp) \ + (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ + __alignof__(type), gfp) +#define alloc_percpu(type) \ + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) #endif /* __LINUX_PERCPU_H */ diff --git a/mm/percpu.c b/mm/percpu.c index 577d84fb3002..c52b93117dc2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -151,11 +151,6 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; -/* - * Free path accesses and alters only the index data structures and can be - * safely called from atomic context. When memory needs to be returned to - * the system, free path schedules reclaim_work. - */ static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ @@ -727,20 +722,21 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; + bool is_atomic = !(gfp & GFP_KERNEL); int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -773,14 +769,15 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) while ((new_alloc = pcpu_need_to_extend(chunk))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; @@ -797,6 +794,8 @@ restart: new_alloc = pcpu_need_to_extend(chunk); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { @@ -811,7 +810,7 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; } @@ -824,6 +823,9 @@ restart: * tasks to create chunks simultaneously. Serialize and create iff * there's still no empty chunk after grabbing the mutex. */ + if (is_atomic) + goto fail; + mutex_lock(&pcpu_alloc_mutex); if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { @@ -846,7 +848,7 @@ area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ - if (true) { + if (!is_atomic) { int page_start, page_end, rs, re; mutex_lock(&pcpu_alloc_mutex); @@ -884,9 +886,9 @@ area_found: fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); @@ -895,22 +897,34 @@ fail: } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -932,7 +946,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** -- cgit v1.2.3 From 1a4d76076cda69b0abf15463a8cebc172406da25 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: implement asynchronous chunk population The percpu allocator now supports atomic allocations by only allocating from already populated areas but the mechanism to ensure that there's adequate amount of populated areas was missing. This patch expands pcpu_balance_work so that in addition to freeing excess free chunks it also populates chunks to maintain an adequate level of populated areas. pcpu_alloc() schedules pcpu_balance_work if the amount of free populated areas is too low or after an atomic allocation failure. * PERPCU_DYNAMIC_RESERVE is increased by two pages to account for PCPU_EMPTY_POP_PAGES_LOW. * pcpu_async_enabled is added to gate both async jobs - chunk->map_extend_work and pcpu_balance_work - so that we don't end up scheduling them while the needed subsystems aren't up yet. Signed-off-by: Tejun Heo --- include/linux/percpu.h | 4 +- mm/percpu.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 115 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d1b416da25ed..a3aa63e47637 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,9 +48,9 @@ * intelligent way to determine this would be nice. */ #if BITS_PER_LONG > 32 -#define PERCPU_DYNAMIC_RESERVE (20 << 10) +#define PERCPU_DYNAMIC_RESERVE (28 << 10) #else -#define PERCPU_DYNAMIC_RESERVE (12 << 10) +#define PERCPU_DYNAMIC_RESERVE (20 << 10) #endif extern void *pcpu_base_addr; diff --git a/mm/percpu.c b/mm/percpu.c index 28a830590b4c..867efd38d879 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -78,6 +78,8 @@ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* balance work is used to populate or destroy chunks asynchronously */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; + +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) schedule_work(&chunk->map_extend_work); } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; @@ -1005,6 +1021,9 @@ area_found: if (chunk != pcpu_reserved_chunk) pcpu_nr_empty_pop_pages -= occ_pages; + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1023,6 +1042,11 @@ fail: if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_blance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } @@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_balance_workfn - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. @@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work) LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. + */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. + */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_balance_work); + pcpu_schedule_balance_work(); break; } } @@ -2187,3 +2284,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab or + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); -- cgit v1.2.3 From 76ba59f8366f2d9282cb5bda9de75b4b68cbe55f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Aug 2014 11:03:16 +0100 Subject: genirq: Add irq_domain-aware core IRQ handler Calling irq_find_mapping from outside a irq_{enter,exit} section is unsafe and produces ugly messages if CONFIG_PROVE_RCU is enabled: If coming from the idle state, the rcu_read_lock call in irq_find_mapping will generate an unpleasant warning: =============================== [ INFO: suspicious RCU usage. ] 3.16.0-rc1+ #135 Not tainted ------------------------------- include/linux/rcupdate.h:871 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! 1 lock held by swapper/0/0: #0: (rcu_read_lock){......}, at: [] irq_find_mapping+0x4c/0x198 As this issue is fairly widespread and involves at least three different architectures, a possible solution is to add a new handle_domain_irq entry point into the generic IRQ code that the interrupt controller code can call. This new function takes an irq_domain, and calls into irq_find_domain inside the irq_{enter,exit} block. An additional "lookup" parameter is used to allow non-domain architecture code to be replaced by this as well. Interrupt controllers can then be updated to use the new mechanism. This code is sitting behind a new CONFIG_HANDLE_DOMAIN_IRQ, as not all architectures implement set_irq_regs (yes, mn10300, I'm looking at you...). Reported-by: Vladimir Murzin Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/1409047421-27649-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- include/linux/irqdesc.h | 19 +++++++++++++++++++ kernel/irq/Kconfig | 3 +++ kernel/irq/irqdesc.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 472c021a2d4f..ff24667cd86c 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -12,6 +12,8 @@ struct irq_affinity_notify; struct proc_dir_entry; struct module; struct irq_desc; +struct irq_domain; +struct pt_regs; /** * struct irq_desc - interrupt descriptor @@ -118,6 +120,23 @@ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *de int generic_handle_irq(unsigned int irq); +#ifdef CONFIG_HANDLE_DOMAIN_IRQ +/* + * Convert a HW interrupt number to a logical one using a IRQ domain, + * and handle the result interrupt number. Return -EINVAL if + * conversion failed. Providing a NULL domain indicates that the + * conversion has already been done. + */ +int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, + bool lookup, struct pt_regs *regs); + +static inline int handle_domain_irq(struct irq_domain *domain, + unsigned int hwirq, struct pt_regs *regs) +{ + return __handle_domain_irq(domain, hwirq, true, regs); +} +#endif + /* Test to see if a driver has successfully requested an irq */ static inline int irq_has_action(unsigned int irq) { diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d269cecdfbf0..225086b2652e 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -55,6 +55,9 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +config HANDLE_DOMAIN_IRQ + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1487a123db5c..a1782f88f0af 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internals.h" @@ -336,6 +337,47 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +#ifdef CONFIG_HANDLE_DOMAIN_IRQ +/** + * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * @lookup: Whether to perform the domain lookup or not + * @regs: Register file coming from the low-level handling code + * + * Returns: 0 on success, or -EINVAL if conversion has failed + */ +int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, + bool lookup, struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned int irq = hwirq; + int ret = 0; + + irq_enter(); + +#ifdef CONFIG_IRQ_DOMAIN + if (lookup) + irq = irq_find_mapping(domain, hwirq); +#endif + + /* + * Some hardware gives randomly wrong interrupts. Rather + * than crashing, do something sensible. + */ + if (unlikely(!irq || irq >= nr_irqs)) { + ack_bad_irq(irq); + ret = -EINVAL; + } else { + generic_handle_irq(irq); + } + + irq_exit(); + set_irq_regs(old_regs); + return ret; +} +#endif + /* Dynamic interrupt handling */ /** -- cgit v1.2.3 From a4412fc9486ec85686c6c7929e7e829f62ae377e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 21 Jul 2014 18:49:14 -0700 Subject: seccomp,x86,arm,mips,s390: Remove nr parameter from secure_computing The secure_computing function took a syscall number parameter, but it only paid any attention to that parameter if seccomp mode 1 was enabled. Rather than coming up with a kludge to get the parameter to work in mode 2, just remove the parameter. To avoid churn in arches that don't have seccomp filters (and may not even support syscall_get_nr right now), this leaves the parameter in secure_computing_strict, which is now a real function. For ARM, this is a bit ugly due to the fact that ARM conditionally supports seccomp filters. Fixing that would probably only be a couple of lines of code, but it should be coordinated with the audit maintainers. This will be a slight slowdown on some arches. The right fix is to pass in all of seccomp_data instead of trying to make just the syscall nr part be fast. This is a prerequisite for making two-phase seccomp work cleanly. Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-s390@vger.kernel.org Cc: x86@kernel.org Cc: Kees Cook Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- arch/arm/kernel/ptrace.c | 7 ++++- arch/mips/kernel/ptrace.c | 2 +- arch/s390/kernel/ptrace.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- include/linux/seccomp.h | 21 +++++++------- kernel/seccomp.c | 64 ++++++++++++++++++++++++++++++------------- 7 files changed, 66 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 0c27ed6f3f23..5e772a21ab97 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -933,8 +933,13 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno) current_thread_info()->syscall = scno; /* Do the secure computing check first; failures should be fast. */ - if (secure_computing(scno) == -1) +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER + if (secure_computing() == -1) return -1; +#else + /* XXX: remove this once OABI gets fixed */ + secure_computing_strict(scno); +#endif if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 645b3c4fcfba..f7aac5b57b4b 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -770,7 +770,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) long ret = 0; user_exit(); - if (secure_computing(syscall) == -1) + if (secure_computing() == -1) return -1; if (test_thread_flag(TIF_SYSCALL_TRACE) && diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 5dc7ad9e2fbf..bebacad48305 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -803,7 +803,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) long ret = 0; /* Do the secure computing check first. */ - if (secure_computing(regs->gprs[2])) { + if (secure_computing()) { /* seccomp failures shouldn't expose any additional code. */ ret = -1; goto out; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 678c0ada3b3c..93c182a00506 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1471,7 +1471,7 @@ long syscall_trace_enter(struct pt_regs *regs) regs->flags |= X86_EFLAGS_TF; /* do the secure computing check first */ - if (secure_computing(regs->orig_ax)) { + if (secure_computing()) { /* seccomp failures shouldn't expose any additional code. */ ret = -1L; goto out; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e1e1e80fc6a6..957779f4eb40 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; - tmp = secure_computing(syscall_nr); + tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 5d586a45a319..aa3c040230be 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -27,19 +27,17 @@ struct seccomp { struct seccomp_filter *filter; }; -extern int __secure_computing(int); -static inline int secure_computing(int this_syscall) +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER +extern int __secure_computing(void); +static inline int secure_computing(void) { if (unlikely(test_thread_flag(TIF_SECCOMP))) - return __secure_computing(this_syscall); + return __secure_computing(); return 0; } - -/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. */ -static inline void secure_computing_strict(int this_syscall) -{ - BUG_ON(secure_computing(this_syscall) != 0); -} +#else +extern void secure_computing_strict(int this_syscall); +#endif extern long prctl_get_seccomp(void); extern long prctl_set_seccomp(unsigned long, char __user *); @@ -56,8 +54,11 @@ static inline int seccomp_mode(struct seccomp *s) struct seccomp { }; struct seccomp_filter { }; -static inline int secure_computing(int this_syscall) { return 0; } +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER +static inline int secure_computing(void) { return 0; } +#else static inline void secure_computing_strict(int this_syscall) { return; } +#endif static inline long prctl_get_seccomp(void) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..5e738e0dd2e9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -23,8 +23,11 @@ /* #define SECCOMP_DEBUG 1 */ -#ifdef CONFIG_SECCOMP_FILTER +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include +#endif + +#ifdef CONFIG_SECCOMP_FILTER #include #include #include @@ -172,7 +175,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(int syscall) +static u32 seccomp_run_filters(void) { struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd; @@ -564,10 +567,43 @@ static int mode1_syscalls_32[] = { }; #endif -int __secure_computing(int this_syscall) +static void __secure_computing_strict(int this_syscall) +{ + int *syscall_whitelist = mode1_syscalls; +#ifdef CONFIG_COMPAT + if (is_compat_task()) + syscall_whitelist = mode1_syscalls_32; +#endif + do { + if (*syscall_whitelist == this_syscall) + return; + } while (*++syscall_whitelist); + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + do_exit(SIGKILL); +} + +#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER +void secure_computing_strict(int this_syscall) +{ + int mode = current->seccomp.mode; + + if (mode == 0) + return; + else if (mode == SECCOMP_MODE_STRICT) + __secure_computing_strict(this_syscall); + else + BUG(); +} +#else +int __secure_computing(void) { + struct pt_regs *regs = task_pt_regs(current); + int this_syscall = syscall_get_nr(current, regs); int exit_sig = 0; - int *syscall; u32 ret; /* @@ -578,23 +614,12 @@ int __secure_computing(int this_syscall) switch (current->seccomp.mode) { case SECCOMP_MODE_STRICT: - syscall = mode1_syscalls; -#ifdef CONFIG_COMPAT - if (is_compat_task()) - syscall = mode1_syscalls_32; -#endif - do { - if (*syscall == this_syscall) - return 0; - } while (*++syscall); - exit_sig = SIGKILL; - ret = SECCOMP_RET_KILL; - break; + __secure_computing_strict(this_syscall); + return 0; #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; - struct pt_regs *regs = task_pt_regs(current); - ret = seccomp_run_filters(this_syscall); + ret = seccomp_run_filters(); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { @@ -652,9 +677,10 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER skip: audit_seccomp(this_syscall, exit_sig, ret); -#endif return -1; +#endif } +#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) { -- cgit v1.2.3 From 13aa72f0fd0a9f98a41cefb662487269e2f1ad65 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 21 Jul 2014 18:49:15 -0700 Subject: seccomp: Refactor the filter callback and the API The reason I did this is to add a seccomp API that will be usable for an x86 fast path. The x86 entry code needs to use a rather expensive slow path for a syscall that might be visible to things like ptrace. By splitting seccomp into two phases, we can check whether we need the slow path and then use the fast path in if the filter allows the syscall or just returns some errno. As a side effect, I think the new code is much easier to understand than the old code. This has one user-visible effect: the audit record written for SECCOMP_RET_TRACE is now a simple indication that SECCOMP_RET_TRACE happened. It used to depend in a complicated way on what the tracer did. I couldn't make much sense of it. Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/linux/seccomp.h | 6 ++ kernel/seccomp.c | 190 +++++++++++++++++++++++++++++++----------------- 2 files changed, 130 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index aa3c040230be..38851085e481 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -35,6 +35,12 @@ static inline int secure_computing(void) return __secure_computing(); return 0; } + +#define SECCOMP_PHASE1_OK 0 +#define SECCOMP_PHASE1_SKIP 1 + +extern u32 seccomp_phase1(void); +int seccomp_phase2(u32 phase1_result); #else extern void secure_computing_strict(int this_syscall); #endif diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5e738e0dd2e9..6c8528ce9df9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -21,8 +21,6 @@ #include #include -/* #define SECCOMP_DEBUG 1 */ - #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include #endif @@ -601,10 +599,21 @@ void secure_computing_strict(int this_syscall) #else int __secure_computing(void) { - struct pt_regs *regs = task_pt_regs(current); - int this_syscall = syscall_get_nr(current, regs); - int exit_sig = 0; - u32 ret; + u32 phase1_result = seccomp_phase1(); + + if (likely(phase1_result == SECCOMP_PHASE1_OK)) + return 0; + else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) + return -1; + else + return seccomp_phase2(phase1_result); +} + +#ifdef CONFIG_SECCOMP_FILTER +static u32 __seccomp_phase1_filter(int this_syscall, struct pt_regs *regs) +{ + u32 filter_ret, action; + int data; /* * Make sure that any changes to mode from another thread have @@ -612,73 +621,122 @@ int __secure_computing(void) */ rmb(); - switch (current->seccomp.mode) { + filter_ret = seccomp_run_filters(); + data = filter_ret & SECCOMP_RET_DATA; + action = filter_ret & SECCOMP_RET_ACTION; + + switch (action) { + case SECCOMP_RET_ERRNO: + /* Set the low-order 16-bits as a errno. */ + syscall_set_return_value(current, regs, + -data, 0); + goto skip; + + case SECCOMP_RET_TRAP: + /* Show the handler the original registers. */ + syscall_rollback(current, regs); + /* Let the filter pass back 16 bits of data. */ + seccomp_send_sigsys(this_syscall, data); + goto skip; + + case SECCOMP_RET_TRACE: + return filter_ret; /* Save the rest for phase 2. */ + + case SECCOMP_RET_ALLOW: + return SECCOMP_PHASE1_OK; + + case SECCOMP_RET_KILL: + default: + audit_seccomp(this_syscall, SIGSYS, action); + do_exit(SIGSYS); + } + + unreachable(); + +skip: + audit_seccomp(this_syscall, 0, action); + return SECCOMP_PHASE1_SKIP; +} +#endif + +/** + * seccomp_phase1() - run fast path seccomp checks on the current syscall + * + * This only reads pt_regs via the syscall_xyz helpers. The only change + * it will make to pt_regs is via syscall_set_return_value, and it will + * only do that if it returns SECCOMP_PHASE1_SKIP. + * + * It may also call do_exit or force a signal; these actions must be + * safe. + * + * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should + * be processed normally. + * + * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be + * invoked. In this case, seccomp_phase1 will have set the return value + * using syscall_set_return_value. + * + * If it returns anything else, then the return value should be passed + * to seccomp_phase2 from a context in which ptrace hooks are safe. + */ +u32 seccomp_phase1(void) +{ + int mode = current->seccomp.mode; + struct pt_regs *regs = task_pt_regs(current); + int this_syscall = syscall_get_nr(current, regs); + + switch (mode) { case SECCOMP_MODE_STRICT: - __secure_computing_strict(this_syscall); - return 0; + __secure_computing_strict(this_syscall); /* may call do_exit */ + return SECCOMP_PHASE1_OK; #ifdef CONFIG_SECCOMP_FILTER - case SECCOMP_MODE_FILTER: { - int data; - ret = seccomp_run_filters(); - data = ret & SECCOMP_RET_DATA; - ret &= SECCOMP_RET_ACTION; - switch (ret) { - case SECCOMP_RET_ERRNO: - /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, - -data, 0); - goto skip; - case SECCOMP_RET_TRAP: - /* Show the handler the original registers. */ - syscall_rollback(current, regs); - /* Let the filter pass back 16 bits of data. */ - seccomp_send_sigsys(this_syscall, data); - goto skip; - case SECCOMP_RET_TRACE: - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - goto skip; - } - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - break; - if (syscall_get_nr(current, regs) < 0) - goto skip; /* Explicit request to skip. */ - - return 0; - case SECCOMP_RET_ALLOW: - return 0; - case SECCOMP_RET_KILL: - default: - break; - } - exit_sig = SIGSYS; - break; - } + case SECCOMP_MODE_FILTER: + return __seccomp_phase1_filter(this_syscall, regs); #endif default: BUG(); } +} -#ifdef SECCOMP_DEBUG - dump_stack(); -#endif - audit_seccomp(this_syscall, exit_sig, ret); - do_exit(exit_sig); -#ifdef CONFIG_SECCOMP_FILTER -skip: - audit_seccomp(this_syscall, exit_sig, ret); - return -1; -#endif +/** + * seccomp_phase2() - finish slow path seccomp work for the current syscall + * @phase1_result: The return value from seccomp_phase1() + * + * This must be called from a context in which ptrace hooks can be used. + * + * Returns 0 if the syscall should be processed or -1 to skip the syscall. + */ +int seccomp_phase2(u32 phase1_result) +{ + struct pt_regs *regs = task_pt_regs(current); + u32 action = phase1_result & SECCOMP_RET_ACTION; + int data = phase1_result & SECCOMP_RET_DATA; + + BUG_ON(action != SECCOMP_RET_TRACE); + + audit_seccomp(syscall_get_nr(current, regs), 0, action); + + /* Skip these calls if there is no tracer. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); + return -1; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification. + * Terminating the task now avoids executing a system + * call that may not be intended. + */ + if (fatal_signal_pending(current)) + do_exit(SIGSYS); + if (syscall_get_nr(current, regs) < 0) + return -1; /* Explicit request to skip. */ + + return 0; } #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ -- cgit v1.2.3 From d39bd00deabe57420f2a3669eb71b0e0c4997184 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 21 Jul 2014 18:49:16 -0700 Subject: seccomp: Allow arch code to provide seccomp_data populate_seccomp_data is expensive: it works by inspecting task_pt_regs and various other bits to piece together all the information, and it's does so in multiple partially redundant steps. Arch-specific code in the syscall entry path can do much better. Admittedly this adds a bit of additional room for error, but the speedup should be worth it. Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/linux/seccomp.h | 2 +- kernel/seccomp.c | 32 +++++++++++++++++++------------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 38851085e481..a19ddacdac30 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -39,7 +39,7 @@ static inline int secure_computing(void) #define SECCOMP_PHASE1_OK 0 #define SECCOMP_PHASE1_SKIP 1 -extern u32 seccomp_phase1(void); +extern u32 seccomp_phase1(struct seccomp_data *sd); int seccomp_phase2(u32 phase1_result); #else extern void secure_computing_strict(int this_syscall); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 6c8528ce9df9..1285cb205d49 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -173,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(void) +static u32 seccomp_run_filters(struct seccomp_data *sd) { struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); - struct seccomp_data sd; + struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ @@ -186,14 +186,17 @@ static u32 seccomp_run_filters(void) /* Make sure cross-thread synced filter points somewhere sane. */ smp_read_barrier_depends(); - populate_seccomp_data(&sd); + if (!sd) { + populate_seccomp_data(&sd_local); + sd = &sd_local; + } /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; @@ -599,7 +602,7 @@ void secure_computing_strict(int this_syscall) #else int __secure_computing(void) { - u32 phase1_result = seccomp_phase1(); + u32 phase1_result = seccomp_phase1(NULL); if (likely(phase1_result == SECCOMP_PHASE1_OK)) return 0; @@ -610,7 +613,7 @@ int __secure_computing(void) } #ifdef CONFIG_SECCOMP_FILTER -static u32 __seccomp_phase1_filter(int this_syscall, struct pt_regs *regs) +static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) { u32 filter_ret, action; int data; @@ -621,20 +624,20 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct pt_regs *regs) */ rmb(); - filter_ret = seccomp_run_filters(); + filter_ret = seccomp_run_filters(sd); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION; switch (action) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, + syscall_set_return_value(current, task_pt_regs(current), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, regs); + syscall_rollback(current, task_pt_regs(current)); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; @@ -661,11 +664,14 @@ skip: /** * seccomp_phase1() - run fast path seccomp checks on the current syscall + * @arg sd: The seccomp_data or NULL * * This only reads pt_regs via the syscall_xyz helpers. The only change * it will make to pt_regs is via syscall_set_return_value, and it will * only do that if it returns SECCOMP_PHASE1_SKIP. * + * If sd is provided, it will not read pt_regs at all. + * * It may also call do_exit or force a signal; these actions must be * safe. * @@ -679,11 +685,11 @@ skip: * If it returns anything else, then the return value should be passed * to seccomp_phase2 from a context in which ptrace hooks are safe. */ -u32 seccomp_phase1(void) +u32 seccomp_phase1(struct seccomp_data *sd) { int mode = current->seccomp.mode; - struct pt_regs *regs = task_pt_regs(current); - int this_syscall = syscall_get_nr(current, regs); + int this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); switch (mode) { case SECCOMP_MODE_STRICT: @@ -691,7 +697,7 @@ u32 seccomp_phase1(void) return SECCOMP_PHASE1_OK; #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: - return __seccomp_phase1_filter(this_syscall, regs); + return __seccomp_phase1_filter(this_syscall, sd); #endif default: BUG(); -- cgit v1.2.3 From 26ae4980b5e4739af93543a147facb421fb78ae8 Mon Sep 17 00:00:00 2001 From: Aaron Sierra Date: Fri, 15 Aug 2014 16:07:48 -0500 Subject: fsl_ifc: Fix csor_ext position in fsl_ifc_regs According to Freescale manuals, the IFC_CSORn_EXT register is located immediately _after_ the bank's IFC_CSORn register. This patch adjusts the csor_ext member of and reserved register arrays immediately surrounding the csor_cs structure to provide proper access to this register. Signed-off-by: Aaron Sierra Acked-by: Prabhakar Kushwaha Signed-off-by: Scott Wood --- include/linux/fsl_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl_ifc.h b/include/linux/fsl_ifc.h index f49ddb1b2273..84d60cb841b1 100644 --- a/include/linux/fsl_ifc.h +++ b/include/linux/fsl_ifc.h @@ -781,13 +781,13 @@ struct fsl_ifc_regs { __be32 amask; u32 res4[0x2]; } amask_cs[FSL_IFC_BANK_COUNT]; - u32 res5[0x17]; + u32 res5[0x18]; struct { - __be32 csor_ext; __be32 csor; + __be32 csor_ext; u32 res6; } csor_cs[FSL_IFC_BANK_COUNT]; - u32 res7[0x19]; + u32 res7[0x18]; struct { __be32 ftim[4]; u32 res8[0x8]; -- cgit v1.2.3 From 3af0dbd592fe0a92002f16e341519ba03e92adf7 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Mon, 1 Sep 2014 11:19:52 +0800 Subject: gpio: mcp23s08 to support both device tree and platform data Device tree is not enabled in some architecture where gpio driver mcp23s08 is still required. v2-changes: - Parse device tree properties into platform data other than individual variables. v3-changes: - Use of_node in gpio_chip device structure, because the struct device * always has an of_node which is NULL when OF is not used. Signed-off-by: Sonic Zhang Reviewed-by: Alexandre Courbot Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 1 - drivers/gpio/gpio-mcp23s08.c | 64 +++++++++++++++++++++++--------------------- include/linux/spi/mcp23s08.h | 18 +++++++++++++ 3 files changed, 52 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index ec27ec0be8c2..ec398bee7c60 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -806,7 +806,6 @@ config GPIO_MAX7301 config GPIO_MCP23S08 tristate "Microchip MCP23xxx I/O expander" - depends on OF_GPIO depends on (SPI_MASTER && !I2C) || I2C help SPI/I2C driver for Microchip MCP23S08/MCP23S17/MCP23008/MCP23017 diff --git a/drivers/gpio/gpio-mcp23s08.c b/drivers/gpio/gpio-mcp23s08.c index 6f183d9b487e..8488e2fd307c 100644 --- a/drivers/gpio/gpio-mcp23s08.c +++ b/drivers/gpio/gpio-mcp23s08.c @@ -479,7 +479,7 @@ static int mcp23s08_irq_setup(struct mcp23s08 *mcp) mutex_init(&mcp->irq_lock); - mcp->irq_domain = irq_domain_add_linear(chip->of_node, chip->ngpio, + mcp->irq_domain = irq_domain_add_linear(chip->dev->of_node, chip->ngpio, &irq_domain_simple_ops, mcp); if (!mcp->irq_domain) return -ENODEV; @@ -581,7 +581,7 @@ done: static int mcp23s08_probe_one(struct mcp23s08 *mcp, struct device *dev, void *data, unsigned addr, unsigned type, - unsigned base, unsigned pullups) + struct mcp23s08_platform_data *pdata, int cs) { int status; bool mirror = false; @@ -635,7 +635,7 @@ static int mcp23s08_probe_one(struct mcp23s08 *mcp, struct device *dev, return -EINVAL; } - mcp->chip.base = base; + mcp->chip.base = pdata->base; mcp->chip.can_sleep = true; mcp->chip.dev = dev; mcp->chip.owner = THIS_MODULE; @@ -648,11 +648,9 @@ static int mcp23s08_probe_one(struct mcp23s08 *mcp, struct device *dev, if (status < 0) goto fail; - mcp->irq_controller = of_property_read_bool(mcp->chip.of_node, - "interrupt-controller"); + mcp->irq_controller = pdata->irq_controller; if (mcp->irq && mcp->irq_controller && (type == MCP_TYPE_017)) - mirror = of_property_read_bool(mcp->chip.of_node, - "microchip,irq-mirror"); + mirror = pdata->mirror; if ((status & IOCON_SEQOP) || !(status & IOCON_HAEN) || mirror) { /* mcp23s17 has IOCON twice, make sure they are in sync */ @@ -668,7 +666,7 @@ static int mcp23s08_probe_one(struct mcp23s08 *mcp, struct device *dev, } /* configure ~100K pullups */ - status = mcp->ops->write(mcp, MCP_GPPU, pullups); + status = mcp->ops->write(mcp, MCP_GPPU, pdata->chip[cs].pullups); if (status < 0) goto fail; @@ -768,25 +766,29 @@ MODULE_DEVICE_TABLE(of, mcp23s08_i2c_of_match); static int mcp230xx_probe(struct i2c_client *client, const struct i2c_device_id *id) { - struct mcp23s08_platform_data *pdata; + struct mcp23s08_platform_data *pdata, local_pdata; struct mcp23s08 *mcp; - int status, base, pullups; + int status; const struct of_device_id *match; match = of_match_device(of_match_ptr(mcp23s08_i2c_of_match), &client->dev); - pdata = dev_get_platdata(&client->dev); - if (match || !pdata) { - base = -1; - pullups = 0; + if (match) { + pdata = &local_pdata; + pdata->base = -1; + pdata->chip[0].pullups = 0; + pdata->irq_controller = of_property_read_bool( + client->dev.of_node, + "interrupt-controller"); + pdata->mirror = of_property_read_bool(client->dev.of_node, + "microchip,irq-mirror"); client->irq = irq_of_parse_and_map(client->dev.of_node, 0); } else { - if (!gpio_is_valid(pdata->base)) { + pdata = dev_get_platdata(&client->dev); + if (!pdata || !gpio_is_valid(pdata->base)) { dev_dbg(&client->dev, "invalid platform data\n"); return -EINVAL; } - base = pdata->base; - pullups = pdata->chip[0].pullups; } mcp = kzalloc(sizeof(*mcp), GFP_KERNEL); @@ -795,7 +797,7 @@ static int mcp230xx_probe(struct i2c_client *client, mcp->irq = client->irq; status = mcp23s08_probe_one(mcp, &client->dev, client, client->addr, - id->driver_data, base, pullups); + id->driver_data, pdata, 0); if (status) goto fail; @@ -863,14 +865,12 @@ static void mcp23s08_i2c_exit(void) { } static int mcp23s08_probe(struct spi_device *spi) { - struct mcp23s08_platform_data *pdata; + struct mcp23s08_platform_data *pdata, local_pdata; unsigned addr; int chips = 0; struct mcp23s08_driver_data *data; int status, type; - unsigned base = -1, - ngpio = 0, - pullups[ARRAY_SIZE(pdata->chip)]; + unsigned ngpio = 0; const struct of_device_id *match; u32 spi_present_mask = 0; @@ -893,11 +893,18 @@ static int mcp23s08_probe(struct spi_device *spi) return -ENODEV; } + pdata = &local_pdata; + pdata->base = -1; for (addr = 0; addr < ARRAY_SIZE(pdata->chip); addr++) { - pullups[addr] = 0; + pdata->chip[addr].pullups = 0; if (spi_present_mask & (1 << addr)) chips++; } + pdata->irq_controller = of_property_read_bool( + spi->dev.of_node, + "interrupt-controller"); + pdata->mirror = of_property_read_bool(spi->dev.of_node, + "microchip,irq-mirror"); } else { type = spi_get_device_id(spi)->driver_data; pdata = dev_get_platdata(&spi->dev); @@ -917,10 +924,7 @@ static int mcp23s08_probe(struct spi_device *spi) return -EINVAL; } spi_present_mask |= 1 << addr; - pullups[addr] = pdata->chip[addr].pullups; } - - base = pdata->base; } if (!chips) @@ -938,13 +942,13 @@ static int mcp23s08_probe(struct spi_device *spi) chips--; data->mcp[addr] = &data->chip[chips]; status = mcp23s08_probe_one(data->mcp[addr], &spi->dev, spi, - 0x40 | (addr << 1), type, base, - pullups[addr]); + 0x40 | (addr << 1), type, pdata, + addr); if (status < 0) goto fail; - if (base != -1) - base += (type == MCP_TYPE_S17) ? 16 : 8; + if (pdata->base != -1) + pdata->base += (type == MCP_TYPE_S17) ? 16 : 8; ngpio += (type == MCP_TYPE_S17) ? 16 : 8; } data->ngpio = ngpio; diff --git a/include/linux/spi/mcp23s08.h b/include/linux/spi/mcp23s08.h index 2d676d5aaa89..aa07d7b32568 100644 --- a/include/linux/spi/mcp23s08.h +++ b/include/linux/spi/mcp23s08.h @@ -22,4 +22,22 @@ struct mcp23s08_platform_data { * base to base+15 (or base+31 for s17 variant). */ unsigned base; + /* Marks the device as a interrupt controller. + * NOTE: The interrupt functionality is only supported for i2c + * versions of the chips. The spi chips can also do the interrupts, + * but this is not supported by the linux driver yet. + */ + bool irq_controller; + + /* Sets the mirror flag in the IOCON register. Devices + * with two interrupt outputs (these are the devices ending with 17 and + * those that have 16 IOs) have two IO banks: IO 0-7 form bank 1 and + * IO 8-15 are bank 2. These chips have two different interrupt outputs: + * One for bank 1 and another for bank 2. If irq-mirror is set, both + * interrupts are generated regardless of the bank that an input change + * occurred on. If it is not set, the interrupt are only generated for + * the bank they belong to. + * On devices with only one interrupt output this property is useless. + */ + bool mirror; }; -- cgit v1.2.3 From 8d38821cbcf51292cd5a23469d03bd38932a3ba9 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 1 Aug 2014 14:15:10 +0200 Subject: resources: Add device-managed request/release_resource() Provide device-managed implementations of the request_resource() and release_resource() functions. Upon failure to request a resource, the new devm_request_resource() function will output an error message for consistent error reporting. Signed-off-by: Thierry Reding Signed-off-by: Bjorn Helgaas Acked-by: Tejun Heo --- Documentation/driver-model/devres.txt | 2 + include/linux/ioport.h | 5 +++ kernel/resource.c | 70 +++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index d14710b04439..befc3fe12ba6 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -264,8 +264,10 @@ IIO IO region devm_release_mem_region() devm_release_region() + devm_release_resource() devm_request_mem_region() devm_request_region() + devm_request_resource() IOMAP devm_ioport_map() diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 142ec544167c..2c5250222278 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -215,6 +215,11 @@ static inline int __deprecated check_region(resource_size_t s, /* Wrappers for managed devices */ struct device; + +extern int devm_request_resource(struct device *dev, struct resource *root, + struct resource *new); +extern void devm_release_resource(struct device *dev, struct resource *new); + #define devm_request_region(dev,start,n,name) \ __devm_request_region(dev, &ioport_resource, (start), (n), (name)) #define devm_request_mem_region(dev,start,n,name) \ diff --git a/kernel/resource.c b/kernel/resource.c index da14b8d09296..ca24f19f9d18 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1248,6 +1248,76 @@ int release_mem_region_adjustable(struct resource *parent, /* * Managed region resource */ +static void devm_resource_release(struct device *dev, void *ptr) +{ + struct resource **r = ptr; + + release_resource(*r); +} + +/** + * devm_request_resource() - request and reserve an I/O or memory resource + * @dev: device for which to request the resource + * @root: root of the resource tree from which to request the resource + * @new: descriptor of the resource to request + * + * This is a device-managed version of request_resource(). There is usually + * no need to release resources requested by this function explicitly since + * that will be taken care of when the device is unbound from its driver. + * If for some reason the resource needs to be released explicitly, because + * of ordering issues for example, drivers must call devm_release_resource() + * rather than the regular release_resource(). + * + * When a conflict is detected between any existing resources and the newly + * requested resource, an error message will be printed. + * + * Returns 0 on success or a negative error code on failure. + */ +int devm_request_resource(struct device *dev, struct resource *root, + struct resource *new) +{ + struct resource *conflict, **ptr; + + ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + *ptr = new; + + conflict = request_resource_conflict(root, new); + if (conflict) { + dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", + new, conflict->name, conflict); + devres_free(ptr); + return -EBUSY; + } + + devres_add(dev, ptr); + return 0; +} +EXPORT_SYMBOL(devm_request_resource); + +static int devm_resource_match(struct device *dev, void *res, void *data) +{ + struct resource **ptr = res; + + return *ptr == data; +} + +/** + * devm_release_resource() - release a previously requested resource + * @dev: device for which to release the resource + * @new: descriptor of the resource to release + * + * Releases a resource previously requested using devm_request_resource(). + */ +void devm_release_resource(struct device *dev, struct resource *new) +{ + WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, + new)); +} +EXPORT_SYMBOL(devm_release_resource); + struct region_devres { struct resource *parent; resource_size_t start; -- cgit v1.2.3 From efd01a72e7ec99ed583151fbf16b176cd2158967 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 5 Aug 2014 14:08:55 +0200 Subject: PCI/AER: Make standalone includable The header file references u16 and u32 types, but they are not defined in the header nor does the header pull in the necessary includes for them. This causes build breakage when the file is included without any of the dependencies being satisfied from somewhere else. Fix this by including linux/types.h (for u16 and u32). [bhelgaas: removed pci_dev declaration (already added by 5ccb8225abf2)] Signed-off-by: Thierry Reding Signed-off-by: Bjorn Helgaas --- include/linux/aer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index c826d1c28f9c..4fef65e57023 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -7,6 +7,8 @@ #ifndef _AER_H_ #define _AER_H_ +#include + #define AER_NONFATAL 0 #define AER_FATAL 1 #define AER_CORRECTABLE 2 -- cgit v1.2.3 From 6b44f519017b219a12b37173c7eef8dfce2c0100 Mon Sep 17 00:00:00 2001 From: Scot Doyle Date: Sun, 24 Aug 2014 17:12:27 +0000 Subject: sched/wait: Document timeout corner case The timeout may elapse without 0 being returned, such as when waiting on an unused queue. Document this possibility. Signed-off-by: Scot Doyle Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LNX.2.11.1408241710070.6462@localhost.localdomain Signed-off-by: Ingo Molnar --- include/linux/wait.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6fb1ba5f9b2f..034f6fc7c65f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -280,9 +280,11 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * - * The function returns 0 if the @timeout elapsed, or the remaining - * jiffies (at least 1) if the @condition evaluated to %true before - * the @timeout elapsed. + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq, condition, timeout) \ ({ \ @@ -363,9 +365,11 @@ do { \ * change the result of the wait condition. * * Returns: - * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by - * a signal, or the remaining jiffies (at least 1) if the @condition - * evaluated to %true before the @timeout elapsed. + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ -- cgit v1.2.3 From a8adcc9012d8502e06ba7b3f966bad8f2c58edc3 Mon Sep 17 00:00:00 2001 From: Ramakrishna Pallala Date: Wed, 27 Aug 2014 23:44:08 +0530 Subject: power_supply: Add boot and calibration attributes Usually PMIC's come with coulomb counting mechanism which can be used to implement a Fuel Gauginig solution in Software itself. One of key input to these SW Fuel Gauge solutioons is the boot up parameters like boot voltage and boot current. This patch adds the VOLTAGE_BOOT and CURRENT_BOOT power supply attributes to report bootup voltage and current. This patch also adds CALIBRATE power supply attribute which useful is for calibrating the battery/coulomb counter. Signed-off-by: Ramakrishna Pallala Signed-off-by: Sebastian Reichel --- Documentation/power/power_supply_class.txt | 6 ++++++ drivers/power/power_supply_sysfs.c | 3 +++ include/linux/power_supply.h | 5 +++++ 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt index 48cff881cb8a..82dacc06e355 100644 --- a/Documentation/power/power_supply_class.txt +++ b/Documentation/power/power_supply_class.txt @@ -101,6 +101,10 @@ VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that these ones should be used if hardware could only guess (measure and retain) the thresholds of a given power supply. +VOLTAGE_BOOT - Reports the voltage measured during boot + +CURRENT_BOOT - Reports the current measured during boot + CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when battery considered full/empty. @@ -123,6 +127,8 @@ the current drawn from a charging source. CHARGE_TERM_CURRENT - Charge termination current used to detect the end of charge condition. +CALIBRATE - battery or coulomb counter calibration status + CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger. CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the power supply object. diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c index 750a20275664..7e4726d37211 100644 --- a/drivers/power/power_supply_sysfs.c +++ b/drivers/power/power_supply_sysfs.c @@ -149,9 +149,11 @@ static struct device_attribute power_supply_attrs[] = { POWER_SUPPLY_ATTR(voltage_now), POWER_SUPPLY_ATTR(voltage_avg), POWER_SUPPLY_ATTR(voltage_ocv), + POWER_SUPPLY_ATTR(voltage_boot), POWER_SUPPLY_ATTR(current_max), POWER_SUPPLY_ATTR(current_now), POWER_SUPPLY_ATTR(current_avg), + POWER_SUPPLY_ATTR(current_boot), POWER_SUPPLY_ATTR(power_now), POWER_SUPPLY_ATTR(power_avg), POWER_SUPPLY_ATTR(charge_full_design), @@ -193,6 +195,7 @@ static struct device_attribute power_supply_attrs[] = { POWER_SUPPLY_ATTR(type), POWER_SUPPLY_ATTR(scope), POWER_SUPPLY_ATTR(charge_term_current), + POWER_SUPPLY_ATTR(calibrate), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(model_name), POWER_SUPPLY_ATTR(manufacturer), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f3dea41dbcd2..de59a28b1b5b 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -102,9 +102,11 @@ enum power_supply_property { POWER_SUPPLY_PROP_VOLTAGE_NOW, POWER_SUPPLY_PROP_VOLTAGE_AVG, POWER_SUPPLY_PROP_VOLTAGE_OCV, + POWER_SUPPLY_PROP_VOLTAGE_BOOT, POWER_SUPPLY_PROP_CURRENT_MAX, POWER_SUPPLY_PROP_CURRENT_NOW, POWER_SUPPLY_PROP_CURRENT_AVG, + POWER_SUPPLY_PROP_CURRENT_BOOT, POWER_SUPPLY_PROP_POWER_NOW, POWER_SUPPLY_PROP_POWER_AVG, POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, @@ -146,6 +148,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_TYPE, /* use power_supply.type instead */ POWER_SUPPLY_PROP_SCOPE, POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT, + POWER_SUPPLY_PROP_CALIBRATE, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, @@ -291,6 +294,7 @@ static inline bool power_supply_is_amp_property(enum power_supply_property psp) case POWER_SUPPLY_PROP_CURRENT_MAX: case POWER_SUPPLY_PROP_CURRENT_NOW: case POWER_SUPPLY_PROP_CURRENT_AVG: + case POWER_SUPPLY_PROP_CURRENT_BOOT: return 1; default: break; @@ -315,6 +319,7 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp) case POWER_SUPPLY_PROP_VOLTAGE_NOW: case POWER_SUPPLY_PROP_VOLTAGE_AVG: case POWER_SUPPLY_PROP_VOLTAGE_OCV: + case POWER_SUPPLY_PROP_VOLTAGE_BOOT: case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE: case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: case POWER_SUPPLY_PROP_POWER_NOW: -- cgit v1.2.3 From 521d24ee598bd8a8b71d7ac76ce2c0da0e548406 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:18 -0400 Subject: rcu: Return bool type in rcu_lockdep_current_cpu_online() Return true instead of 1 in rcu_lockdep_current_cpu_online() as this has bool as return type. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d231aa17b1d7..7e47e44bce03 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -349,7 +349,7 @@ bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { - return 1; + return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ -- cgit v1.2.3 From 85b39d305bfe809a11ff2770d380be3e2465beec Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 8 Jul 2014 15:17:59 -0700 Subject: rcu: Uninline rcu_read_lock_held() This commit uninlines rcu_read_lock_held(). According to "size vmlinux" this saves 28549 in .text: - 5541731 3014560 14757888 23314179 + 5513182 3026848 14757888 23297918 Note: it looks as if the data grows by 12288 bytes but this is not true, it does not actually grow. But .data starts with ALIGN(THREAD_SIZE) and since .text shrinks the padding grows, and thus .data grows too as it seen by /bin/size. diff System.map: - ffffffff81510000 D _sdata - ffffffff81510000 D init_thread_union + ffffffff81509000 D _sdata + ffffffff8150c000 D init_thread_union Perhaps we can change vmlinux.lds.S to .data itself, so that /bin/size can't "wrongly" report that .data grows if .text shinks. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 36 +----------------------------------- kernel/rcu/update.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 7e47e44bce03..321ed0d4e675 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -371,41 +371,7 @@ extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); -/** - * rcu_read_lock_held() - might we be in RCU read-side critical section? - * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU - * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, - * this assumes we are in an RCU read-side critical section unless it can - * prove otherwise. This is useful for debug checks in functions that - * require that they be called within an RCU read-side critical section. - * - * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot - * and while lockdep is disabled. - * - * Note that rcu_read_lock() and the matching rcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * rcu_read_unlock() in process context if the matching rcu_read_lock() - * was invoked from within an irq handler. - * - * Note that rcu_read_lock() is disallowed if the CPU is either idle or - * offline from an RCU perspective, so check for those as well. - */ -static inline int rcu_read_lock_held(void) -{ - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; - return lock_is_held(&rcu_lock_map); -} - -/* - * rcu_read_lock_bh_held() is defined out of line to avoid #include-file - * hell. - */ +int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); /** diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4056d7992a6c..ea8ea7b16e11 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -136,6 +136,38 @@ int notrace debug_lockdep_rcu_enabled(void) } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +/** + * rcu_read_lock_held() - might we be in RCU read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU + * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, + * this assumes we are in an RCU read-side critical section unless it can + * prove otherwise. This is useful for debug checks in functions that + * require that they be called within an RCU read-side critical section. + * + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that rcu_read_lock() and the matching rcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock() in process context if the matching rcu_read_lock() + * was invoked from within an irq handler. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. + */ +int rcu_read_lock_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + return lock_is_held(&rcu_lock_map); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_held); + /** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * -- cgit v1.2.3 From eea203fea3484598280a07fe503e025e886297fb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 14 Jul 2014 09:16:15 -0400 Subject: rcu: Use pr_alert/pr_cont for printing logs User pr_alert/pr_cont for printing the logs from rcutorture module directly instead of writing it to a buffer and then printing it. This allows us from not having to allocate such buffers. Also remove a resulting empty function. I tested this using the parse-torture.sh script as follows: $ dmesg | grep torture > log.txt $ bash parse-torture.sh log.txt test $ There were no warnings which means that parsing went fine. Signed-off-by: Joe Perches Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- kernel/rcu/rcutorture.c | 127 +++++++++++++++++++++--------------------------- kernel/torture.c | 16 +++--- 3 files changed, 64 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 5ca58fcbaf1b..fec46f8c08eb 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -51,7 +51,7 @@ /* Definitions for online/offline exerciser. */ int torture_onoff_init(long ooholdoff, long oointerval); -char *torture_onoff_stats(char *page); +void torture_onoff_stats(void); bool torture_onoff_failures(void); /* Low-rider random number generator. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e67711cbae8..ff4f0c756dee 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -242,7 +242,7 @@ struct rcu_torture_ops { void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); - void (*stats)(char *page); + void (*stats)(void); int irq_capable; int can_boost; const char *name; @@ -525,21 +525,21 @@ static void srcu_torture_barrier(void) srcu_barrier(&srcu_ctl); } -static void srcu_torture_stats(char *page) +static void srcu_torture_stats(void) { int cpu; int idx = srcu_ctl.completed & 0x1; - page += sprintf(page, "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); + pr_alert("%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { long c0, c1; c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; - page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } - sprintf(page, "\n"); + pr_cont("\n"); } static void srcu_torture_synchronize_expedited(void) @@ -1031,10 +1031,15 @@ rcu_torture_reader(void *arg) } /* - * Create an RCU-torture statistics message in the specified buffer. + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). */ static void -rcu_torture_printk(char *page) +rcu_torture_stats_print(void) { int cpu; int i; @@ -1052,55 +1057,60 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - page = torture_onoff_stats(page); - page += sprintf(page, "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror); + pr_cont("rtbf: %ld rtb: %ld nt: %ld ", + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + torture_onoff_stats(); + pr_cont("barrier: %ld/%ld:%ld\n", + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || n_rcu_torture_boost_failure != 0 || i > 1) { - page += sprintf(page, "!!! "); + pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); } - page += sprintf(page, "Reader Pipe: "); + pr_cont("Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", pipesummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Reader Batch: "); + pr_cont(" %ld", pipesummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Reader Batch: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", batchsummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Free-Block Circulation: "); + pr_cont(" %ld", batchsummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - page += sprintf(page, " %d", - atomic_read(&rcu_torture_wcount[i])); + pr_cont(" %d", atomic_read(&rcu_torture_wcount[i])); } - page += sprintf(page, "\n"); + pr_cont("\n"); + if (cur_ops->stats) - cur_ops->stats(page); + cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags; @@ -1109,40 +1119,15 @@ rcu_torture_printk(char *page) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - page += sprintf(page, - "??? Writer stall state %d g%lu c%lu f%#x\n", - rcu_torture_writer_state, - gpnum, completed, flags); + pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + rcu_torture_writer_state, + gpnum, completed, flags); show_rcu_gp_kthreads(); rcutorture_trace_dump(); } rtcv_snap = rcu_torture_current_version; } -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int size = nr_cpu_ids * 200 + 8192; - char *buf; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - pr_err("rcu-torture: Out of memory, need: %d", size); - return; - } - rcu_torture_printk(buf); - pr_alert("%s", buf); - kfree(buf); -} - /* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. diff --git a/kernel/torture.c b/kernel/torture.c index d600af21f022..ede8b25ec1ae 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup); /* * Print online/offline testing statistics. */ -char *torture_onoff_stats(char *page) +void torture_onoff_stats(void) { #ifdef CONFIG_HOTPLUG_CPU - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return page; } EXPORT_SYMBOL_GPL(torture_onoff_stats); -- cgit v1.2.3 From 8315f42295d2667a7f942f154b73a86fd7cb2227 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Jun 2014 13:42:20 -0700 Subject: rcu: Add call_rcu_tasks() This commit adds a new RCU-tasks flavor of RCU, which provides call_rcu_tasks(). This RCU flavor's quiescent states are voluntary context switch (not preemption!) and userspace execution (not the idle loop -- use some sort of schedule_on_each_cpu() if you need to handle the idle tasks. Note that unlike other RCU flavors, these quiescent states occur in tasks, not necessarily CPUs. Includes fixes from Steven Rostedt. This RCU flavor is assumed to have very infrequent latency-tolerant updaters. This assumption permits significant simplifications, including a single global callback list protected by a single global lock, along with a single task-private linked list containing all tasks that have not yet passed through a quiescent state. If experience shows this assumption to be incorrect, the required additional complexity will be added. Suggested-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 9 +++ include/linux/rcupdate.h | 36 ++++++++++ include/linux/sched.h | 23 ++++--- init/Kconfig | 10 +++ kernel/rcu/tiny.c | 2 + kernel/rcu/tree.c | 2 + kernel/rcu/update.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 242 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2bb4c4f3531a..dffd9258ee60 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -117,6 +117,14 @@ extern struct group_info init_groups; #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif +#ifdef CONFIG_TASKS_RCU +#define INIT_TASK_RCU_TASKS(tsk) \ + .rcu_tasks_holdout = false, \ + .rcu_tasks_holdout_list = \ + LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list), +#else +#define INIT_TASK_RCU_TASKS(tsk) +#endif extern struct cred init_cred; @@ -224,6 +232,7 @@ extern struct task_group root_task_group; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_TASK_RCU_TASKS(tsk) \ INIT_CPUSET_SEQ(tsk) \ INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d231aa17b1d7..3432063f4c87 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -197,6 +197,26 @@ void call_rcu_sched(struct rcu_head *head, void synchronize_sched(void); +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), entry into idle, or transition to usermode + * execution. As such, there are no read-side primitives analogous to + * rcu_read_lock() and rcu_read_unlock() because this primitive is intended + * to determine that all tasks have passed through a safe state, not so + * much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head)); + #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); @@ -294,6 +314,22 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, rcu_irq_exit(); \ } while (0) +/* + * Note a voluntary context switch for RCU-tasks benefit. This is a + * macro rather than an inline function to avoid #include hell. + */ +#ifdef CONFIG_TASKS_RCU +#define rcu_note_voluntary_context_switch(t) \ + do { \ + preempt_disable(); /* Exclude synchronize_sched(); */ \ + if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ + ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ + preempt_enable(); \ + } while (0) +#else /* #ifdef CONFIG_TASKS_RCU */ +#define rcu_note_voluntary_context_switch(t) do { } while (0) +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52b..eaacac4ae77d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1270,6 +1270,11 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + unsigned long rcu_tasks_nvcsw; + bool rcu_tasks_holdout; + struct list_head rcu_tasks_holdout_list; +#endif /* #ifdef CONFIG_TASKS_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -2000,28 +2005,24 @@ extern void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask); #ifdef CONFIG_PREEMPT_RCU - #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ static inline void rcu_copy_process(struct task_struct *p) { +#ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special = 0; -#ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ INIT_LIST_HEAD(&p->rcu_node_entry); +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + p->rcu_tasks_holdout = false; + INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); +#endif /* #ifdef CONFIG_TASKS_RCU */ } -#else - -static inline void rcu_copy_process(struct task_struct *p) -{ -} - -#endif - static inline void tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) { diff --git a/init/Kconfig b/init/Kconfig index e84c6423a2e5..c4539c4e177f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -507,6 +507,16 @@ config PREEMPT_RCU This option enables preemptible-RCU code that is common between TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU. +config TASKS_RCU + bool "Task_based RCU implementation using voluntary context switch" + default n + help + This option enables a task-based RCU implementation that uses + only voluntary context switch (not preemption!), idle, and + user-mode execution as quiescent states. + + If unsure, say N. + config RCU_STALL_COMMON def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) help diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d9efcc13008c..717f00854fc0 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -254,6 +254,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + if (user) + rcu_note_voluntary_context_switch(current); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b70cb6fbe3c..8ad91d1e317d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2410,6 +2410,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); + if (user) + rcu_note_voluntary_context_switch(current); trace_rcu_utilization(TPS("End scheduler-tick")); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4056d7992a6c..19b3dacb0753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -47,6 +47,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -347,3 +348,173 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +#ifdef CONFIG_TASKS_RCU + +/* + * Simple variant of RCU whose quiescent states are voluntary context switch, + * user-space execution, and idle. As such, grace periods can take one good + * long time. There are no read-side primitives similar to rcu_read_lock() + * and rcu_read_unlock() because this implementation is intended to get + * the system into a safe state for some of the manipulations involved in + * tracing and the like. Finally, this implementation does not support + * high call_rcu_tasks() rates from multiple CPUs. If this is required, + * per-CPU callback lists will be needed. + */ + +/* Global list of callbacks and associated lock. */ +static struct rcu_head *rcu_tasks_cbs_head; +static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; +static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); + +/* Post an RCU-tasks callback. */ +void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +{ + unsigned long flags; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + *rcu_tasks_cbs_tail = rhp; + rcu_tasks_cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/* See if the current task has stopped holding out, remove from list if so. */ +static void check_holdout_task(struct task_struct *t) +{ + if (!ACCESS_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || + !ACCESS_ONCE(t->on_rq)) { + ACCESS_ONCE(t->rcu_tasks_holdout) = false; + list_del_rcu(&t->rcu_tasks_holdout_list); + put_task_struct(t); + } +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct task_struct *g, *t; + struct rcu_head *list; + struct rcu_head *next; + LIST_HEAD(rcu_tasks_holdouts); + + /* FIXME: Add housekeeping affinity. */ + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + list = rcu_tasks_cbs_head; + rcu_tasks_cbs_head = NULL; + rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + schedule_timeout_interruptible(HZ); + WARN_ON(signal_pending(current)); + continue; + } + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw + * transitions to complete. Invoking synchronize_sched() + * suffices because all these transitions occur with + * interrupts disabled. Without this synchronize_sched(), + * a read-side critical section that started before the + * grace period might be incorrectly seen as having started + * after the grace period. + * + * This synchronize_sched() also dispenses with the + * need for a memory barrier on the first store to + * ->rcu_tasks_holdout, as it forces the store to happen + * after the beginning of the grace period. + */ + synchronize_sched(); + + /* + * There were callbacks, so we need to wait for an + * RCU-tasks grace period. Start off by scanning + * the task list for tasks that are not already + * voluntarily blocked. Mark these tasks and make + * a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && ACCESS_ONCE(t->on_rq) && + !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); + ACCESS_ONCE(t->rcu_tasks_holdout) = true; + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Each pass through the following loop scans the list + * of holdout tasks, removing any that are no longer + * holdouts. When the list is empty, we are done. + */ + while (!list_empty(&rcu_tasks_holdouts)) { + schedule_timeout_interruptible(HZ); + WARN_ON(signal_pending(current)); + rcu_read_lock(); + list_for_each_entry_rcu(t, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) + check_holdout_task(t); + rcu_read_unlock(); + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed + * to have a full memory barriers prior to them in the + * schedule() path, memory reordering on other CPUs could + * cause their RCU-tasks read-side critical sections to + * extend past the end of the grace period. However, + * because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_sched() + * to force the needed ordering on all such CPUs. + * + * This synchronize_sched() also confines all + * ->rcu_tasks_holdout accesses to be within the grace + * period, avoiding the need for memory barriers for + * ->rcu_tasks_holdout accesses. + */ + synchronize_sched(); + + /* Invoke the callbacks. */ + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + } +} + +/* Spawn rcu_tasks_kthread() at boot time. */ +static int __init rcu_spawn_tasks_kthread(void) +{ + struct task_struct __maybe_unused *t; + + t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + BUG_ON(IS_ERR(t)); + return 0; +} +early_initcall(rcu_spawn_tasks_kthread); + +#endif /* #ifdef CONFIG_TASKS_RCU */ -- cgit v1.2.3 From bde6c3aa993066acb0d6ce32ecabe03b9d5df92d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2014 11:26:57 -0700 Subject: rcu: Provide cond_resched_rcu_qs() to force quiescent states in long loops RCU-tasks requires the occasional voluntary context switch from CPU-bound in-kernel tasks. In some cases, this requires instrumenting cond_resched(). However, there is some reluctance to countenance unconditionally instrumenting cond_resched() (see http://lwn.net/Articles/603252/), so this commit creates a separate cond_resched_rcu_qs() that may be used in place of cond_resched() in locations prone to long-duration in-kernel looping. This commit currently instruments only RCU-tasks. Future possibilities include also instrumenting RCU, RCU-bh, and RCU-sched in order to reduce IPI usage. Signed-off-by: Paul E. McKenney --- fs/file.c | 2 +- include/linux/rcupdate.h | 13 +++++++++++++ kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree_plugin.h | 2 +- mm/mlock.c | 2 +- 6 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/file.c b/fs/file.c index 66923fe3176e..1cafc4c9275b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -367,7 +367,7 @@ static struct fdtable *close_files(struct files_struct * files) struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); - cond_resched(); + cond_resched_rcu_qs(); } } i++; diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3432063f4c87..473350462d04 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -330,6 +330,19 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, #define rcu_note_voluntary_context_switch(t) do { } while (0) #endif /* #else #ifdef CONFIG_TASKS_RCU */ +/** + * cond_resched_rcu_qs - Report potential quiescent states to RCU + * + * This macro resembles cond_resched(), except that it is defined to + * report potential quiescent states to RCU-tasks even if the cond_resched() + * machinery were to be shut off, as some advocate for PREEMPT kernels. + */ +#define cond_resched_rcu_qs() \ +do { \ + rcu_note_voluntary_context_switch(current); \ + cond_resched(); \ +} while (0) + #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 948a7693748e..178716713e11 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -667,7 +667,7 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -1019,7 +1019,7 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ad91d1e317d..e23dad0661e2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1647,7 +1647,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); - cond_resched(); + cond_resched_rcu_qs(); } mutex_unlock(&rsp->onoff_mutex); @@ -1736,7 +1736,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); - cond_resched(); + cond_resched_rcu_qs(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1785,7 +1785,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched(); + cond_resched_rcu_qs(); flush_signals(current); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1828,10 +1828,10 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched(); + cond_resched_rcu_qs(); } else { /* Deal with stray signal. */ - cond_resched(); + cond_resched_rcu_qs(); flush_signals(current); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -2434,7 +2434,7 @@ static void force_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched(); + cond_resched_rcu_qs(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a7997e272564..7672586d3920 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1848,7 +1848,7 @@ static int rcu_oom_notify(struct notifier_block *self, get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched(); + cond_resched_rcu_qs(); } put_online_cpus(); diff --git a/mm/mlock.c b/mm/mlock.c index ce84cb0b83ef..ab3150c26711 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -789,7 +789,7 @@ static int do_mlockall(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - cond_resched(); + cond_resched_rcu_qs(); } out: return 0; -- cgit v1.2.3 From 53c6d4edf874d3cbc031a53738c6cba9277faea5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2014 12:22:23 -0700 Subject: rcu: Add synchronous grace-period waiting for RCU-tasks It turns out to be easier to add the synchronous grace-period waiting functions to RCU-tasks than to work around their absense in rcutorture, so this commit adds them. The key point is that the existence of call_rcu_tasks() means that rcutorture needs an rcu_barrier_tasks(). Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 ++ kernel/rcu/update.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 473350462d04..640152fedcde 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -216,6 +216,8 @@ void synchronize_sched(void); * memory ordering guarantees. */ void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head)); +void synchronize_rcu_tasks(void); +void rcu_barrier_tasks(void); #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 19b3dacb0753..5fd1ddbfcc55 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -381,6 +381,61 @@ void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) } EXPORT_SYMBOL_GPL(call_rcu_tasks); +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu_tasks() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-tasks read-side critical section whose beginning + * preceded the call to synchronize_rcu_tasks(). In addition, each CPU + * having an RCU-tasks read-side critical section that extends beyond + * the return from synchronize_rcu_tasks() is guaranteed to have executed + * a full memory barrier after the beginning of synchronize_rcu_tasks() + * and before the beginning of that RCU-tasks read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU + * (but again only if the system has more than one CPU). + */ +void synchronize_rcu_tasks(void) +{ + /* Complain if the scheduler has not started. */ + rcu_lockdep_assert(!rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(call_rcu_tasks); +} + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} + /* See if the current task has stopped holding out, remove from list if so. */ static void check_holdout_task(struct task_struct *t) { -- cgit v1.2.3 From 3f95aa81d265223fdb13ea2b59883766a05adbdf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Aug 2014 06:10:23 -0700 Subject: rcu: Make TASKS_RCU handle tasks that are almost done exiting Once a task has passed exit_notify() in the do_exit() code path, it is no longer on the task lists, and is therefore no longer visible to rcu_tasks_kthread(). This means that an almost-exited task might be preempted while within a trampoline, and this task won't be waited on by rcu_tasks_kthread(). This commit fixes this bug by adding an srcu_struct. An exiting task does srcu_read_lock() just before calling exit_notify(), and does the corresponding srcu_read_unlock() after doing the final preempt_disable(). This means that rcu_tasks_kthread() can do synchronize_srcu() to wait for all mostly-exited tasks to reach their final preempt_disable() region, and then use synchronize_sched() to wait for those tasks to finish exiting. Reported-by: Oleg Nesterov Suggested-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 3 +++ kernel/exit.c | 3 +++ kernel/rcu/update.c | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 640152fedcde..54b2ebb20313 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -321,6 +321,8 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, * macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU +#define TASKS_RCU(x) x +extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ preempt_disable(); /* Exclude synchronize_sched(); */ \ @@ -329,6 +331,7 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, preempt_enable(); \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ +#define TASKS_RCU(x) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #endif /* #else #ifdef CONFIG_TASKS_RCU */ diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a3..d13f2eec4bb8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -667,6 +667,7 @@ void do_exit(long code) { struct task_struct *tsk = current; int group_dead; + TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); @@ -775,6 +776,7 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA @@ -814,6 +816,7 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); /* * The setting of TASK_RUNNING by try_to_wake_up() may be delayed diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5fd1ddbfcc55..403fc4ae539e 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -367,6 +367,13 @@ static struct rcu_head *rcu_tasks_cbs_head; static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_SRCU(tasks_rcu_exit_srcu); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 3; +module_param(rcu_task_stall_timeout, int, 0644); + /* Post an RCU-tasks callback. */ void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) { @@ -517,6 +524,15 @@ static int __noreturn rcu_tasks_kthread(void *arg) } rcu_read_unlock(); + /* + * Wait for tasks that are in the process of exiting. + * This does only part of the job, ensuring that all + * tasks that were previously exiting reach the point + * where they have disabled preemption, allowing the + * later synchronize_sched() to finish the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + /* * Each pass through the following loop scans the list * of holdout tasks, removing any that are no longer @@ -546,6 +562,11 @@ static int __noreturn rcu_tasks_kthread(void *arg) * ->rcu_tasks_holdout accesses to be within the grace * period, avoiding the need for memory barriers for * ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_sched() waits for exiting + * tasks to complete their final preempt_disable() region + * of execution, cleaning up after the synchronize_srcu() + * above. */ synchronize_sched(); -- cgit v1.2.3 From 69c604557ce34015629b325b85ff1a4996038a3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2014 11:59:36 -0700 Subject: rcutorture: Add torture tests for RCU-tasks This commit adds torture tests for RCU-tasks. It also fixes a bug that would segfault for an RCU flavor lacking a callback-barrier function. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + kernel/rcu/rcutorture.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 54b2ebb20313..a3123f53a4ce 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -55,6 +55,7 @@ enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, RCU_SCHED_FLAVOR, + RCU_TASKS_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 178716713e11..75b1abf78c48 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -601,6 +601,52 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks torture testing. + */ + +static int tasks_torture_read_lock(void) +{ + return 0; +} + +static void tasks_torture_read_unlock(int idx) +{ +} + +static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_ops = { + .ttype = RCU_TASKS_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = tasks_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_tasks_torture_deferred_free, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .call = call_rcu_tasks, + .cb_barrier = rcu_barrier_tasks, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks" +}; + +#define RCUTORTURE_TASKS_OPS &tasks_ops, + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUTORTURE_TASKS_OPS + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1295,7 +1341,8 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); - cur_ops->cb_barrier(); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; @@ -1534,6 +1581,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, + RCUTORTURE_TASKS_OPS }; if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) -- cgit v1.2.3 From 176f8f7a52cc6d09d686f0d900abda6942a52fbb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Aug 2014 17:43:50 -0700 Subject: rcu: Make TASKS_RCU handle nohz_full= CPUs Currently TASKS_RCU would ignore a CPU running a task in nohz_full= usermode execution. There would be neither a context switch nor a scheduling-clock interrupt to tell TASKS_RCU that the task in question had passed through a quiescent state. The grace period would therefore extend indefinitely. This commit therefore makes RCU's dyntick-idle subsystem record the task_struct structure of the task that is running in dyntick-idle mode on each CPU. The TASKS_RCU grace period can then access this information and record a quiescent state on behalf of any CPU running in dyntick-idle usermode. Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 3 ++- include/linux/sched.h | 2 ++ kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 16 ++++++++++++++++ kernel/rcu/update.c | 4 +++- 6 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index dffd9258ee60..03b274873b06 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -121,7 +121,8 @@ extern struct group_info init_groups; #define INIT_TASK_RCU_TASKS(tsk) \ .rcu_tasks_holdout = false, \ .rcu_tasks_holdout_list = \ - LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list), + LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list), \ + .rcu_tasks_idle_cpu = -1, #else #define INIT_TASK_RCU_TASKS(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index eaacac4ae77d..ec8b34722bcc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1274,6 +1274,7 @@ struct task_struct { unsigned long rcu_tasks_nvcsw; bool rcu_tasks_holdout; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_idle_cpu; #endif /* #ifdef CONFIG_TASKS_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2020,6 +2021,7 @@ static inline void rcu_copy_process(struct task_struct *p) #ifdef CONFIG_TASKS_RCU p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); + p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e23dad0661e2..c880f5387b1f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -526,6 +526,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, atomic_inc(&rdtp->dynticks); smp_mb__after_atomic(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_task_enter(); /* * It is illegal to enter an extended quiescent state while @@ -642,6 +643,7 @@ void rcu_irq_exit(void) static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, int user) { + rcu_dynticks_task_exit(); smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a86eb7bac45..3a92000c354f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -605,6 +605,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); +static void rcu_dynticks_task_enter(void); +static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7672586d3920..e466b40052a7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -3036,3 +3036,19 @@ static void rcu_bind_gp_kthread(void) housekeeping_affine(current); #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } + +/* Record the current task on dyntick-idle entry. */ +static void rcu_dynticks_task_enter(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Record no current task on dyntick-idle exit. */ +static void rcu_dynticks_task_exit(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e1d71741958f..2658de4a5975 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -463,7 +463,9 @@ static void check_holdout_task(struct task_struct *t, { if (!ACCESS_ONCE(t->rcu_tasks_holdout) || t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || - !ACCESS_ONCE(t->on_rq)) { + !ACCESS_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { ACCESS_ONCE(t->rcu_tasks_holdout) = false; list_del_rcu(&t->rcu_tasks_holdout_list); put_task_struct(t); -- cgit v1.2.3 From 01a81330344b09028881c953a51d1106a9e63518 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Aug 2014 05:23:35 -0700 Subject: rcu: Remove redundant preempt_disable() from rcu_note_voluntary_context_switch() In theory, synchronize_sched() requires a read-side critical section to order against. In practice, preemption can be thought of as being disabled across every machine instruction, at least for those machine instructions that are not in the idle loop and not on offline CPUs. So this commit removes the redundant preempt_disable() from rcu_note_voluntary_context_switch(). Please note that the single instruction in question is the store of zero to ->rcu_tasks_holdout. The "if" is simply a performance optimization that avoids unnecessary stores. To see this, keep in mind that both the "if" condition and the store are in a quiescent state. Therefore, even if the task is preempted for a full grace period (presumably due to its having done a context switch beforehand), the store will be recording a legitimate quiescent state. Reported-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Conflicts: include/linux/rcupdate.h --- include/linux/rcupdate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a3123f53a4ce..132e1e34cdca 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -326,10 +326,8 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ - preempt_disable(); /* Exclude synchronize_sched(); */ \ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ - preempt_enable(); \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -- cgit v1.2.3 From 1d082fd061884a587c490c4fc8a2056ce1e47624 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Aug 2014 16:01:53 -0700 Subject: rcu: Remove local_irq_disable() in rcu_preempt_note_context_switch() The rcu_preempt_note_context_switch() function is on a scheduling fast path, so it would be good to avoid disabling irqs. The reason that irqs are disabled is to synchronize process-level and irq-handler access to the task_struct ->rcu_read_unlock_special bitmask. This commit therefore makes ->rcu_read_unlock_special instead be a union of bools with a short allowing single-access checks in RCU's __rcu_read_unlock(). This results in the process-level and irq-handler accesses being simple loads and stores, so that irqs need no longer be disabled. This commit therefore removes the irq disabling from rcu_preempt_note_context_switch(). Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 2 +- include/linux/sched.h | 16 +++++++++------- kernel/rcu/tree_plugin.h | 32 +++++++++++++++----------------- kernel/rcu/update.c | 2 +- 4 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 03b274873b06..77fc43f8fb72 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -111,7 +111,7 @@ extern struct group_info init_groups; #ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ - .rcu_read_unlock_special = 0, \ + .rcu_read_unlock_special.s = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ INIT_TASK_RCU_TREE_PREEMPT() #else diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8b34722bcc..42888d715fb1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1212,6 +1212,13 @@ struct sched_dl_entity { struct hrtimer dl_timer; }; +union rcu_special { + struct { + bool blocked; + bool need_qs; + } b; + short s; +}; struct rcu_node; enum perf_event_task_context { @@ -1264,7 +1271,7 @@ struct task_struct { #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; - char rcu_read_unlock_special; + union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TREE_PREEMPT_RCU @@ -2005,16 +2012,11 @@ extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask); -#ifdef CONFIG_PREEMPT_RCU -#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - static inline void rcu_copy_process(struct task_struct *p) { #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; - p->rcu_read_unlock_special = 0; + p->rcu_read_unlock_special.s = 0; p->rcu_blocked_node = NULL; INIT_LIST_HEAD(&p->rcu_node_entry); #endif /* #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e466b40052a7..0981c0cd70fe 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -155,9 +155,8 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. + * As with the other rcu_*_qs() functions, callers to this function + * must disable preemption. */ static void rcu_preempt_qs(int cpu) { @@ -166,7 +165,7 @@ static void rcu_preempt_qs(int cpu) if (rdp->passed_quiesce == 0) trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + current->rcu_read_unlock_special.b.need_qs = false; } /* @@ -190,14 +189,14 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_node *rnp; if (t->rcu_read_lock_nesting > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; /* @@ -239,7 +238,7 @@ static void rcu_preempt_note_context_switch(int cpu) : rnp->gpnum + 1); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special) { + t->rcu_read_unlock_special.s) { /* * Complete exit from RCU read-side critical section on @@ -257,9 +256,7 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - local_irq_save(flags); rcu_preempt_qs(cpu); - local_irq_restore(flags); } /* @@ -340,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t) bool drop_boost_mutex = false; #endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; - int special; + union rcu_special special; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -350,12 +347,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. + * let it know that we have done so. Because irqs are disabled, + * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) { + if (special.b.need_qs) { rcu_preempt_qs(smp_processor_id()); - if (!t->rcu_read_unlock_special) { + if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; } @@ -368,8 +366,8 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + if (special.b.blocked) { + t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The @@ -658,7 +656,7 @@ static void rcu_preempt_check_callbacks(int cpu) } if (t->rcu_read_lock_nesting > 0 && per_cpu(rcu_preempt_data, cpu).qs_pending) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; + t->rcu_read_unlock_special.b.need_qs = true; } #ifdef CONFIG_RCU_BOOST @@ -941,7 +939,7 @@ void exit_rcu(void) return; t->rcu_read_lock_nesting = 1; barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; __rcu_read_unlock(); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 9487b4898e51..6fb911558562 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -93,7 +93,7 @@ void __rcu_read_unlock(void) barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; -- cgit v1.2.3 From 284a8c93af47306beed967a303d84730b32bab39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Aug 2014 16:38:46 -0700 Subject: rcu: Per-CPU operation cleanups to rcu_*_qs() functions The rcu_bh_qs(), rcu_preempt_qs(), and rcu_sched_qs() functions use old-style per-CPU variable access and write to ->passed_quiesce even if it is already set. This commit therefore updates to use the new-style per-CPU variable access functions and avoids the spurious writes. This commit also eliminates the "cpu" argument to these functions because they are always invoked on the indicated CPU. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ++-- include/linux/rcutiny.h | 2 +- kernel/rcu/tiny.c | 10 +++++----- kernel/rcu/tree.c | 34 ++++++++++++++++++---------------- kernel/rcu/tree_plugin.h | 27 +++++++++++++++------------ kernel/softirq.c | 2 +- 6 files changed, 42 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 132e1e34cdca..2fab0e37afe0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -261,8 +261,8 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); -void rcu_sched_qs(int cpu); -void rcu_bh_qs(int cpu); +void rcu_sched_qs(void); +void rcu_bh_qs(void); void rcu_check_callbacks(int cpu, int user); struct notifier_block; void rcu_idle_enter(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d40a6a451330..38cc5b1e252d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -80,7 +80,7 @@ static inline void kfree_call_rcu(struct rcu_head *head, static inline void rcu_note_context_switch(int cpu) { - rcu_sched_qs(cpu); + rcu_sched_qs(); } /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 717f00854fc0..61b8d2ccc2cb 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval) current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + rcu_sched_qs(); /* implies rcu_bh_inc() */ barrier(); rcu_dynticks_nesting = newval; } @@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. */ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { unsigned long flags; @@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu) /* * Record an rcu_bh quiescent state. */ -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { unsigned long flags; @@ -251,9 +251,9 @@ void rcu_check_callbacks(int cpu, int user) { RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) - rcu_sched_qs(cpu); + rcu_sched_qs(); else if (!in_softirq()) - rcu_bh_qs(cpu); + rcu_bh_qs(); if (user) rcu_note_voluntary_context_switch(current); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c880f5387b1f..4c340625ffd4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -188,22 +188,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) * one since the start of the grace period, this just sets a flag. * The caller must have disabled preemption. */ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + } } -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_bh"), + __this_cpu_read(rcu_bh_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + } } static DEFINE_PER_CPU(int, rcu_sched_qs_mask); @@ -278,7 +280,7 @@ static void rcu_momentary_dyntick_idle(void) void rcu_note_context_switch(int cpu) { trace_rcu_utilization(TPS("Start context switch")); - rcu_sched_qs(cpu); + rcu_sched_qs(); rcu_preempt_note_context_switch(cpu); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); @@ -2395,8 +2397,8 @@ void rcu_check_callbacks(int cpu, int user) * at least not while the corresponding CPU is online. */ - rcu_sched_qs(cpu); - rcu_bh_qs(cpu); + rcu_sched_qs(); + rcu_bh_qs(); } else if (!in_softirq()) { @@ -2407,7 +2409,7 @@ void rcu_check_callbacks(int cpu, int user) * critical section, so note it. */ - rcu_bh_qs(cpu); + rcu_bh_qs(); } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0981c0cd70fe..25e692a36280 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -158,14 +158,16 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * As with the other rcu_*_qs() functions, callers to this function * must disable preemption. */ -static void rcu_preempt_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; - current->rcu_read_unlock_special.b.need_qs = false; +static void rcu_preempt_qs(void) +{ + if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_preempt"), + __this_cpu_read(rcu_preempt_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); + barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ + current->rcu_read_unlock_special.b.need_qs = false; + } } /* @@ -256,7 +258,7 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - rcu_preempt_qs(cpu); + rcu_preempt_qs(); } /* @@ -352,7 +354,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ special = t->rcu_read_unlock_special; if (special.b.need_qs) { - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; @@ -651,11 +653,12 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(cpu); + rcu_preempt_qs(); return; } if (t->rcu_read_lock_nesting > 0 && - per_cpu(rcu_preempt_data, cpu).qs_pending) + per_cpu(rcu_preempt_data, cpu).qs_pending && + !per_cpu(rcu_preempt_data, cpu).passed_quiesce) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 5918d227730f..348ec763b104 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -278,7 +278,7 @@ restart: pending >>= softirq_bit; } - rcu_bh_qs(smp_processor_id()); + rcu_bh_qs(); local_irq_disable(); pending = local_softirq_pending(); -- cgit v1.2.3 From 908c7f1949cb7cc6e92ba8f18f2998e87e265b8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:29 +0900 Subject: percpu_counter: add @gfp to percpu_counter_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_counter_init() so that !GFP_KERNEL allocation masks can be used with percpu_counters too. We could have left percpu_counter_init() alone and added percpu_counter_init_gfp(); however, the number of users isn't that high and introducing _gfp variants to all percpu data structures would be quite ugly, so let's just do the conversion. This is the one with the most users. Other percpu data structures are a lot easier to convert. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Jan Kara Acked-by: "David S. Miller" Cc: x86@kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andrew Morton --- arch/x86/kvm/mmu.c | 2 +- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/extent-tree.c | 2 +- fs/ext2/super.c | 6 +++--- fs/ext3/super.c | 6 +++--- fs/ext4/super.c | 14 +++++++++----- fs/file_table.c | 2 +- fs/quota/dquot.c | 2 +- fs/super.c | 3 ++- include/linux/percpu_counter.h | 10 ++++++---- include/net/dst_ops.h | 2 +- include/net/inet_frag.h | 2 +- lib/flex_proportions.c | 4 ++-- lib/percpu_counter.c | 4 ++-- lib/proportions.c | 6 +++--- mm/backing-dev.c | 2 +- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/shmem.c | 2 +- net/dccp/proto.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_memcontrol.c | 2 +- net/sctp/protocol.c | 2 +- 23 files changed, 49 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 931467881da7..5bd53f206f4f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4534,7 +4534,7 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto nomem; register_shrinker(&mmu_shrinker); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9cf2aa..61dae01788d7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1180,7 +1180,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0); + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_bdi; @@ -2193,13 +2193,13 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0); + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813537f362f9..94ec71eda86b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3493,7 +3493,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0); + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); if (ret) { kfree(found); return ret; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b88edc05c230..170dc41e8bf4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext2_count_free_blocks(sb)); + ext2_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext2_count_free_inodes(sb)); + ext2_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext2_count_dirs(sb)); + ext2_count_dirs(sb), GFP_KERNEL); } if (err) { ext2_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 08cdfe5461e3..eba021b88440 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount2; } err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); + ext3_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); + ext3_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); + ext3_count_dirs(sb), GFP_KERNEL); } if (err) { ext3_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 32b43ad154b9..e25ca8fdde7d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3891,7 +3891,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Register extent status tree shrinker */ ext4_es_register_shrinker(sbi); - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -4105,17 +4106,20 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); + ext4_count_dirs(sb), GFP_KERNEL); if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; diff --git a/fs/file_table.c b/fs/file_table.c index 385bfd31512a..0bab12b20460 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f2d0eee9d1f1..8b663b2d9562 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2725,7 +2725,7 @@ static int __init dquot_init(void) panic("Cannot create dquot hash table"); for (i = 0; i < _DQST_DQSTAT_LAST; i++) { - ret = percpu_counter_init(&dqstats.counter[i], 0); + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); if (ret) panic("Cannot create dquot stat counters"); } diff --git a/fs/super.c b/fs/super.c index b9a214d2fe98..1b836107acee 100644 --- a/fs/super.c +++ b/fs/super.c @@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + if (percpu_counter_init(&s->s_writers.counter[i], 0, + GFP_KERNEL) < 0) goto fail; lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], &type->s_writers_key[i], 0); diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index d5dd4657c8d6..50e50095c8d1 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_SMP @@ -26,14 +27,14 @@ struct percpu_counter { extern int percpu_counter_batch; -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key); -#define percpu_counter_init(fbc, value) \ +#define percpu_counter_init(fbc, value, gfp) \ ({ \ static struct lock_class_key __key; \ \ - __percpu_counter_init(fbc, value, &__key); \ + __percpu_counter_init(fbc, value, gfp, &__key); \ }) void percpu_counter_destroy(struct percpu_counter *fbc); @@ -89,7 +90,8 @@ struct percpu_counter { s64 count; }; -static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, + gfp_t gfp) { fbc->count = amount; return 0; diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 2f26dfb8450e..1f99a1de0e4f 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -63,7 +63,7 @@ static inline void dst_entries_add(struct dst_ops *dst, int val) static inline int dst_entries_init(struct dst_ops *dst) { - return percpu_counter_init(&dst->pcpuc_entries, 0); + return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); } static inline void dst_entries_destroy(struct dst_ops *dst) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 65a8855e99fe..8d1765577acc 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -151,7 +151,7 @@ static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i) static inline void init_frag_mem_limit(struct netns_frags *nf) { - percpu_counter_init(&nf->mem, 0); + percpu_counter_init(&nf->mem, 0, GFP_KERNEL); } static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index ebf3bac460b0..b9d026bfcf38 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -40,7 +40,7 @@ int fprop_global_init(struct fprop_global *p) p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... */ - err = percpu_counter_init(&p->events, 1); + err = percpu_counter_init(&p->events, 1, GFP_KERNEL); if (err) return err; seqcount_init(&p->sequence); @@ -172,7 +172,7 @@ int fprop_local_init_percpu(struct fprop_local_percpu *pl) { int err; - err = percpu_counter_init(&pl->events, 0); + err = percpu_counter_init(&pl->events, 0, GFP_KERNEL); if (err) return err; pl->period = 0; diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 3fde78275cd1..48144cdae819 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -112,7 +112,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) } EXPORT_SYMBOL(__percpu_counter_sum); -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { unsigned long flags __maybe_unused; @@ -120,7 +120,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; - fbc->counters = alloc_percpu(s32); + fbc->counters = alloc_percpu_gfp(s32, gfp); if (!fbc->counters) return -ENOMEM; diff --git a/lib/proportions.c b/lib/proportions.c index 05df84801b56..ca95f8d54384 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) pd->index = 0; pd->pg[0].shift = shift; mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0); + err = percpu_counter_init(&pd->pg[0].events, 0, GFP_KERNEL); if (err) goto out; - err = percpu_counter_init(&pd->pg[1].events, 0); + err = percpu_counter_init(&pd->pg[1].events, 0, GFP_KERNEL); if (err) percpu_counter_destroy(&pd->pg[0].events); @@ -193,7 +193,7 @@ int prop_local_init_percpu(struct prop_local_percpu *pl) raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; - return percpu_counter_init(&pl->events, 0); + return percpu_counter_init(&pl->events, 0, GFP_KERNEL); } void prop_local_destroy_percpu(struct prop_local_percpu *pl) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..f19a818be2d3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); if (err) goto err; } diff --git a/mm/mmap.c b/mm/mmap.c index c1f2ea4a0b99..d7ec93e25fa1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3196,7 +3196,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index a881d9673c6b..bd1808e194a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -539,7 +539,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } diff --git a/mm/shmem.c b/mm/shmem.c index 0e5fb225007c..d4bc55d3f107 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2993,7 +2993,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index de2c1e719305..e421eddf67b4 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1115,7 +1115,7 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); - rc = percpu_counter_init(&dccp_orphan_count, 0); + rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); if (rc) goto out_fail; rc = -ENOBUFS; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541f26a67ba2..d59c2604c247 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3188,8 +3188,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); - percpu_counter_init(&tcp_sockets_allocated, 0); - percpu_counter_init(&tcp_orphan_count, 0); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 3af522622fad..1d191357bf88 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) res_parent = &parent_cg->memory_allocated; res_counter_init(&cg_proto->memory_allocated, res_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0); + percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6240834f4b95..f00a85a3fddd 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1341,7 +1341,7 @@ static __init int sctp_init(void) if (!sctp_chunk_cachep) goto err_chunk_cachep; - status = percpu_counter_init(&sctp_sockets_allocated, 0); + status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; -- cgit v1.2.3 From 20ae00792c6f1f18fc4fc5965445a145df92827e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: proportions: add @gfp to init functions Percpu allocator now supports allocation mask. Add @gfp to [flex_]proportions init functions so that !GFP_KERNEL allocation masks can be used with them too. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Peter Zijlstra --- include/linux/flex_proportions.h | 5 +++-- include/linux/proportions.h | 5 +++-- lib/flex_proportions.c | 8 ++++---- lib/proportions.c | 10 +++++----- mm/backing-dev.c | 2 +- mm/page-writeback.c | 2 +- 6 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index 4ebc49fae391..0d348e011a6e 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * When maximum proportion of some event type is specified, this is the @@ -32,7 +33,7 @@ struct fprop_global { seqcount_t sequence; }; -int fprop_global_init(struct fprop_global *p); +int fprop_global_init(struct fprop_global *p, gfp_t gfp); void fprop_global_destroy(struct fprop_global *p); bool fprop_new_period(struct fprop_global *p, int periods); @@ -79,7 +80,7 @@ struct fprop_local_percpu { raw_spinlock_t lock; /* Protect period and numerator */ }; -int fprop_local_init_percpu(struct fprop_local_percpu *pl); +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 26a8a4ed9b07..00e8e8fa7358 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -12,6 +12,7 @@ #include #include #include +#include struct prop_global { /* @@ -40,7 +41,7 @@ struct prop_descriptor { struct mutex mutex; /* serialize the prop_global switch */ }; -int prop_descriptor_init(struct prop_descriptor *pd, int shift); +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); void prop_change_shift(struct prop_descriptor *pd, int new_shift); /* @@ -61,7 +62,7 @@ struct prop_local_percpu { raw_spinlock_t lock; /* protect the snapshot state */ }; -int prop_local_init_percpu(struct prop_local_percpu *pl); +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); void prop_local_destroy_percpu(struct prop_local_percpu *pl); void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index b9d026bfcf38..8f25652f40d4 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -34,13 +34,13 @@ */ #include -int fprop_global_init(struct fprop_global *p) +int fprop_global_init(struct fprop_global *p, gfp_t gfp) { int err; p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... */ - err = percpu_counter_init(&p->events, 1, GFP_KERNEL); + err = percpu_counter_init(&p->events, 1, gfp); if (err) return err; seqcount_init(&p->sequence); @@ -168,11 +168,11 @@ void fprop_fraction_single(struct fprop_global *p, */ #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int fprop_local_init_percpu(struct fprop_local_percpu *pl) +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) { int err; - err = percpu_counter_init(&pl->events, 0, GFP_KERNEL); + err = percpu_counter_init(&pl->events, 0, gfp); if (err) return err; pl->period = 0; diff --git a/lib/proportions.c b/lib/proportions.c index ca95f8d54384..6f724298f67a 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -73,7 +73,7 @@ #include #include -int prop_descriptor_init(struct prop_descriptor *pd, int shift) +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) { int err; @@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) pd->index = 0; pd->pg[0].shift = shift; mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, GFP_KERNEL); + err = percpu_counter_init(&pd->pg[0].events, 0, gfp); if (err) goto out; - err = percpu_counter_init(&pd->pg[1].events, 0, GFP_KERNEL); + err = percpu_counter_init(&pd->pg[1].events, 0, gfp); if (err) percpu_counter_destroy(&pd->pg[0].events); @@ -188,12 +188,12 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int prop_local_init_percpu(struct prop_local_percpu *pl) +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) { raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; - return percpu_counter_init(&pl->events, 0, GFP_KERNEL); + return percpu_counter_init(&pl->events, 0, gfp); } void prop_local_destroy_percpu(struct prop_local_percpu *pl) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f19a818be2d3..64ec49d1772b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = fprop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); if (err) { err: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef1744d..508599403721 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** -- cgit v1.2.3 From a34375ef9e65340a138fc0be287de5c940d260fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: percpu-refcount: add @gfp to percpu_ref_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_ref_init() so that !GFP_KERNEL allocation masks can be used with percpu_refs too. This patch doesn't make any functional difference. v2: blk-mq conversion was missing. Updated. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Benjamin LaHaise Cc: Li Zefan Cc: Nicholas A. Bellinger Cc: Jens Axboe --- block/blk-mq.c | 3 ++- drivers/target/target_core_tpg.c | 3 ++- fs/aio.c | 4 ++-- include/linux/percpu-refcount.h | 3 ++- kernel/cgroup.c | 6 +++--- lib/percpu-refcount.c | 6 ++++-- 6 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..702df07b980d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1776,7 +1776,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, + GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index fddfae61222f..4ab6da338585 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,8 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, + GFP_KERNEL); if (ret < 0) return ret; diff --git a/fs/aio.c b/fs/aio.c index bd7ec2cc2674..93fbcc0f5696 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -666,10 +666,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3dfbf237cd8f..ee8325122dbd 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -49,6 +49,7 @@ #include #include #include +#include struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); @@ -66,7 +67,7 @@ struct percpu_ref { }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release); + percpu_ref_func_t *release, gfp_t gfp); void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..589b4d89a0a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1628,7 +1628,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out; @@ -4487,7 +4487,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release); + err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL); if (err) goto err_free_css; @@ -4555,7 +4555,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index fe5a3342e960..ff9903264a91 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -40,6 +40,7 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @gfp: allocation mask to use * * Initializes the refcount in single atomic counter mode with a refcount of 1; * analagous to atomic_set(ref, 1). @@ -47,11 +48,12 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ -int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) +int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, + gfp_t gfp) { atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); + ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned, gfp); if (!ref->pcpu_count_ptr) return -ENOMEM; -- cgit v1.2.3 From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/exit.c | 4 ++++ kernel/fork.c | 1 + kernel/sched/cputime.c | 33 ++++++++++++++++++++------------- kernel/sys.c | 2 -- kernel/time/posix-cpu-timers.c | 14 -------------- 6 files changed, 26 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52b..dd9eb4807389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -645,6 +645,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; diff --git a/kernel/exit.c b/kernel/exit.c index b93d46dab6fc..fa09b86609db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) * the signal_struct. */ task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e491..9387ae8ab048 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836359ba..49b7cfe98f7a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; rcu_read_lock(); - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + read_seqbegin_or_lock(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry(&sig->stats_lock, seq); rcu_read_unlock(); } @@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. - */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..b6636643cbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v1.2.3 From ff9ea323816dc1c8ac7144afd4eab3ac97704430 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:03:56 +0900 Subject: block, bdi: an active gendisk always has a request_queue associated with it bdev_get_queue() returns the request_queue associated with the specified block_device. blk_get_backing_dev_info() makes use of bdev_get_queue() to determine the associated bdi given a block_device. All the callers of bdev_get_queue() including blk_get_backing_dev_info() assume that bdev_get_queue() may return NULL and implement NULL handling; however, bdev_get_queue() requires the passed in block_device is opened and attached to its gendisk. Because an active gendisk always has a valid request_queue associated with it, bdev_get_queue() can never return NULL and neither can blk_get_backing_dev_info(). Make it clear that neither of the two functions can return NULL and remove NULL handling from all the callers. Signed-off-by: Tejun Heo Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++------- block/compat_ioctl.c | 4 ---- block/ioctl.c | 4 ---- fs/block_dev.c | 2 -- fs/btrfs/disk-io.c | 2 +- fs/xfs/xfs_buf.c | 2 -- include/linux/blkdev.h | 2 +- 7 files changed, 5 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 93603e6ff479..817446175489 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -83,18 +83,14 @@ void blk_queue_congestion_threshold(struct request_queue *q) * @bdev: device * * Locates the passed device's request queue and returns the address of its - * backing_dev_info - * - * Will return NULL if the request queue cannot be located. + * backing_dev_info. This function can only be called if @bdev is opened + * and the return value is never NULL. */ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { - struct backing_dev_info *ret = NULL; struct request_queue *q = bdev_get_queue(bdev); - if (q) - ret = &q->backing_dev_info; - return ret; + return &q->backing_dev_info; } EXPORT_SYMBOL(blk_get_backing_dev_info); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 18b282ce361e..f678c733df40 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -709,8 +709,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return compat_put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: /* compatible */ @@ -731,8 +729,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: diff --git a/block/ioctl.c b/block/ioctl.c index d6cda8147c91..6c7bf903742f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -356,8 +356,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); @@ -386,8 +384,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKBSZSET: diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d7274619bf9..d3251eca6429 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1173,8 +1173,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - bdi = &default_backing_dev_info; bdev_inode_switch_bdi(bdev->bd_inode, bdi); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d0ed9e664f7d..39ff591ae1b4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1694,7 +1694,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); - if (bdi && bdi_congested(bdi, bdi_bits)) { + if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cd7b8ca9b064..497fcde381d7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1678,8 +1678,6 @@ xfs_alloc_buftarg( btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; btp->bt_bdi = blk_get_backing_dev_info(bdev); - if (!btp->bt_bdi) - goto error; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 518b46555b80..e267bf0db559 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -865,7 +865,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { - return bdev->bd_disk->queue; + return bdev->bd_disk->queue; /* this is never NULL */ } /* -- cgit v1.2.3 From e36f1dfce0b45d347927568efe1088821758cc3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:03:57 +0900 Subject: bdi: remove unused stuff Two flags and one bdi_writeback field are no longer used. Remove them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e488e9459a93..2103a7fa3fd8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -28,12 +28,10 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ BDI_registered, /* bdi_register() was done */ BDI_writeback_running, /* Writeback is in progress */ - BDI_unused, /* Available bits start here */ }; typedef int (congested_fn)(void *, int); @@ -50,7 +48,6 @@ enum bdi_stat_item { struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ - unsigned int nr; unsigned long last_old_flush; /* last old data flush */ -- cgit v1.2.3 From 018a17bdc8658ad448497c84d4ba21b6985820ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:04:01 +0900 Subject: bdi: reimplement bdev_inode_switch_bdi() A block_device may be attached to different gendisks and thus different bdis over time. bdev_inode_switch_bdi() is used to switch the associated bdi. The function assumes that the inode could be dirty and transfers it between bdis if so. This is a bit nasty in that it reaches into bdi internals. This patch reimplements the function so that it writes out the inode if dirty. This is a lot simpler and can be implemented without exposing bdi internals. Signed-off-by: Tejun Heo Cc: Alexander Viro Signed-off-by: Jens Axboe --- fs/block_dev.c | 32 +++++++++++--------------------- include/linux/backing-dev.h | 1 - mm/backing-dev.c | 2 +- 3 files changed, 12 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index d3251eca6429..cc8d68ac29aa 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -50,32 +50,22 @@ inline struct block_device *I_BDEV(struct inode *inode) EXPORT_SYMBOL(I_BDEV); /* - * Move the inode from its current bdi to a new bdi. If the inode is dirty we - * need to move it onto the dirty list of @dst so that the inode is always on - * the right list. + * Move the inode from its current bdi to a new bdi. Make sure the inode + * is clean before moving so that it doesn't linger on the old bdi. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - struct backing_dev_info *old = inode->i_data.backing_dev_info; - bool wakeup_bdi = false; - - if (unlikely(dst == old)) /* deadlock avoidance */ - return; - bdi_lock_two(&old->wb, &dst->wb); - spin_lock(&inode->i_lock); - inode->i_data.backing_dev_info = dst; - if (inode->i_state & I_DIRTY) { - if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) - wakeup_bdi = true; - list_move(&inode->i_wb_list, &dst->wb.b_dirty); + while (true) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_DIRTY)) { + inode->i_data.backing_dev_info = dst; + spin_unlock(&inode->i_lock); + return; + } + spin_unlock(&inode->i_lock); + WARN_ON_ONCE(write_inode_now(inode, true)); } - spin_unlock(&inode->i_lock); - spin_unlock(&old->wb.list_lock); - spin_unlock(&dst->wb.list_lock); - - if (wakeup_bdi) - bdi_wakeup_thread_delayed(dst); } /* Kill _all_ buffers and pagecache , dirty or not.. */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2103a7fa3fd8..5da6012b7a14 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -121,7 +121,6 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi); void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); -void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b65fe93ad612..7d63d5e9d3de 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -40,7 +40,7 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) +static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { spin_lock(&wb1->list_lock); -- cgit v1.2.3 From 55e15c949fd05d247a889df0ed0177a676fec665 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:17 +0200 Subject: PM / domains: Remove the pm_genpd_add|remove_callbacks APIs There are no users of these APIs. To simplify the generic power domain let's remove them. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 106 -------------------------------------------- include/linux/pm_domain.h | 19 -------- 2 files changed, 125 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index eee55c1e5fde..e613e3cb96d8 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1743,112 +1743,6 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, return ret; } -/** - * pm_genpd_add_callbacks - Add PM domain callbacks to a given device. - * @dev: Device to add the callbacks to. - * @ops: Set of callbacks to add. - * @td: Timing data to add to the device along with the callbacks (optional). - * - * Every call to this routine should be balanced with a call to - * __pm_genpd_remove_callbacks() and they must not be nested. - */ -int pm_genpd_add_callbacks(struct device *dev, struct gpd_dev_ops *ops, - struct gpd_timing_data *td) -{ - struct generic_pm_domain_data *gpd_data_new, *gpd_data = NULL; - int ret = 0; - - if (!(dev && ops)) - return -EINVAL; - - gpd_data_new = __pm_genpd_alloc_dev_data(dev); - if (!gpd_data_new) - return -ENOMEM; - - pm_runtime_disable(dev); - device_pm_lock(); - - ret = dev_pm_get_subsys_data(dev); - if (ret) - goto out; - - spin_lock_irq(&dev->power.lock); - - if (dev->power.subsys_data->domain_data) { - gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); - } else { - gpd_data = gpd_data_new; - dev->power.subsys_data->domain_data = &gpd_data->base; - } - gpd_data->refcount++; - gpd_data->ops = *ops; - if (td) - gpd_data->td = *td; - - spin_unlock_irq(&dev->power.lock); - - out: - device_pm_unlock(); - pm_runtime_enable(dev); - - if (gpd_data != gpd_data_new) - __pm_genpd_free_dev_data(dev, gpd_data_new); - - return ret; -} -EXPORT_SYMBOL_GPL(pm_genpd_add_callbacks); - -/** - * __pm_genpd_remove_callbacks - Remove PM domain callbacks from a given device. - * @dev: Device to remove the callbacks from. - * @clear_td: If set, clear the device's timing data too. - * - * This routine can only be called after pm_genpd_add_callbacks(). - */ -int __pm_genpd_remove_callbacks(struct device *dev, bool clear_td) -{ - struct generic_pm_domain_data *gpd_data = NULL; - bool remove = false; - int ret = 0; - - if (!(dev && dev->power.subsys_data)) - return -EINVAL; - - pm_runtime_disable(dev); - device_pm_lock(); - - spin_lock_irq(&dev->power.lock); - - if (dev->power.subsys_data->domain_data) { - gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); - gpd_data->ops = (struct gpd_dev_ops){ NULL }; - if (clear_td) - gpd_data->td = (struct gpd_timing_data){ 0 }; - - if (--gpd_data->refcount == 0) { - dev->power.subsys_data->domain_data = NULL; - remove = true; - } - } else { - ret = -EINVAL; - } - - spin_unlock_irq(&dev->power.lock); - - device_pm_unlock(); - pm_runtime_enable(dev); - - if (ret) - return ret; - - dev_pm_put_subsys_data(dev); - if (remove) - __pm_genpd_free_dev_data(dev, gpd_data); - - return 0; -} -EXPORT_SYMBOL_GPL(__pm_genpd_remove_callbacks); - /** * pm_genpd_attach_cpuidle - Connect the given PM domain with cpuidle. * @genpd: PM domain to be connected with cpuidle. diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index ebc4c76ffb73..9ae2b9e88d61 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -151,10 +151,6 @@ extern int pm_genpd_add_subdomain_names(const char *master_name, const char *subdomain_name); extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, struct generic_pm_domain *target); -extern int pm_genpd_add_callbacks(struct device *dev, - struct gpd_dev_ops *ops, - struct gpd_timing_data *td); -extern int __pm_genpd_remove_callbacks(struct device *dev, bool clear_td); extern int pm_genpd_attach_cpuidle(struct generic_pm_domain *genpd, int state); extern int pm_genpd_name_attach_cpuidle(const char *name, int state); extern int pm_genpd_detach_cpuidle(struct generic_pm_domain *genpd); @@ -217,16 +213,6 @@ static inline int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, { return -ENOSYS; } -static inline int pm_genpd_add_callbacks(struct device *dev, - struct gpd_dev_ops *ops, - struct gpd_timing_data *td) -{ - return -ENOSYS; -} -static inline int __pm_genpd_remove_callbacks(struct device *dev, bool clear_td) -{ - return -ENOSYS; -} static inline int pm_genpd_attach_cpuidle(struct generic_pm_domain *genpd, int st) { return -ENOSYS; @@ -281,11 +267,6 @@ static inline int pm_genpd_name_add_device(const char *domain_name, return __pm_genpd_name_add_device(domain_name, dev, NULL); } -static inline int pm_genpd_remove_callbacks(struct device *dev) -{ - return __pm_genpd_remove_callbacks(dev, true); -} - #ifdef CONFIG_PM_GENERIC_DOMAINS_RUNTIME extern void genpd_queue_power_off_work(struct generic_pm_domain *genpd); extern void pm_genpd_poweroff_unused(void); -- cgit v1.2.3 From 67da6d4bf43c4208433ef8f3ee487401b4dc9c74 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:18 +0200 Subject: PM / domains: Ignore callbacks for subsys generic_pm_domain_data In a step of simplifying the generic power domain let's move away from using these callbacks. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 131 +++----------------------------------------- include/linux/pm_domain.h | 1 - 2 files changed, 8 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e613e3cb96d8..aa5b14c1e643 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -25,10 +25,6 @@ __routine = genpd->dev_ops.callback; \ if (__routine) { \ __ret = __routine(dev); \ - } else { \ - __routine = dev_gpd_data(dev)->ops.callback; \ - if (__routine) \ - __ret = __routine(dev); \ } \ __ret; \ }) @@ -1871,10 +1867,6 @@ static int pm_genpd_default_save_state(struct device *dev) { int (*cb)(struct device *__dev); - cb = dev_gpd_data(dev)->ops.save_state; - if (cb) - return cb(dev); - if (dev->type && dev->type->pm) cb = dev->type->pm->runtime_suspend; else if (dev->class && dev->class->pm) @@ -1898,10 +1890,6 @@ static int pm_genpd_default_restore_state(struct device *dev) { int (*cb)(struct device *__dev); - cb = dev_gpd_data(dev)->ops.restore_state; - if (cb) - return cb(dev); - if (dev->type && dev->type->pm) cb = dev->type->pm->runtime_resume; else if (dev->class && dev->class->pm) @@ -1917,109 +1905,6 @@ static int pm_genpd_default_restore_state(struct device *dev) return cb ? cb(dev) : 0; } -#ifdef CONFIG_PM_SLEEP - -/** - * pm_genpd_default_suspend - Default "device suspend" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_suspend(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.suspend; - - return cb ? cb(dev) : pm_generic_suspend(dev); -} - -/** - * pm_genpd_default_suspend_late - Default "late device suspend" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_suspend_late(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.suspend_late; - - return cb ? cb(dev) : pm_generic_suspend_late(dev); -} - -/** - * pm_genpd_default_resume_early - Default "early device resume" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_resume_early(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.resume_early; - - return cb ? cb(dev) : pm_generic_resume_early(dev); -} - -/** - * pm_genpd_default_resume - Default "device resume" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_resume(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.resume; - - return cb ? cb(dev) : pm_generic_resume(dev); -} - -/** - * pm_genpd_default_freeze - Default "device freeze" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_freeze(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.freeze; - - return cb ? cb(dev) : pm_generic_freeze(dev); -} - -/** - * pm_genpd_default_freeze_late - Default "late device freeze" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_freeze_late(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.freeze_late; - - return cb ? cb(dev) : pm_generic_freeze_late(dev); -} - -/** - * pm_genpd_default_thaw_early - Default "early device thaw" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_thaw_early(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.thaw_early; - - return cb ? cb(dev) : pm_generic_thaw_early(dev); -} - -/** - * pm_genpd_default_thaw - Default "device thaw" for PM domians. - * @dev: Device to handle. - */ -static int pm_genpd_default_thaw(struct device *dev) -{ - int (*cb)(struct device *__dev) = dev_gpd_data(dev)->ops.thaw; - - return cb ? cb(dev) : pm_generic_thaw(dev); -} - -#else /* !CONFIG_PM_SLEEP */ - -#define pm_genpd_default_suspend NULL -#define pm_genpd_default_suspend_late NULL -#define pm_genpd_default_resume_early NULL -#define pm_genpd_default_resume NULL -#define pm_genpd_default_freeze NULL -#define pm_genpd_default_freeze_late NULL -#define pm_genpd_default_thaw_early NULL -#define pm_genpd_default_thaw NULL - -#endif /* !CONFIG_PM_SLEEP */ - /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. @@ -2071,14 +1956,14 @@ void pm_genpd_init(struct generic_pm_domain *genpd, genpd->domain.ops.complete = pm_genpd_complete; genpd->dev_ops.save_state = pm_genpd_default_save_state; genpd->dev_ops.restore_state = pm_genpd_default_restore_state; - genpd->dev_ops.suspend = pm_genpd_default_suspend; - genpd->dev_ops.suspend_late = pm_genpd_default_suspend_late; - genpd->dev_ops.resume_early = pm_genpd_default_resume_early; - genpd->dev_ops.resume = pm_genpd_default_resume; - genpd->dev_ops.freeze = pm_genpd_default_freeze; - genpd->dev_ops.freeze_late = pm_genpd_default_freeze_late; - genpd->dev_ops.thaw_early = pm_genpd_default_thaw_early; - genpd->dev_ops.thaw = pm_genpd_default_thaw; + genpd->dev_ops.suspend = pm_generic_suspend; + genpd->dev_ops.suspend_late = pm_generic_suspend_late; + genpd->dev_ops.resume_early = pm_generic_resume_early; + genpd->dev_ops.resume = pm_generic_resume; + genpd->dev_ops.freeze = pm_generic_freeze; + genpd->dev_ops.freeze_late = pm_generic_freeze_late; + genpd->dev_ops.thaw_early = pm_generic_thaw_early; + genpd->dev_ops.thaw = pm_generic_thaw; mutex_lock(&gpd_list_lock); list_add(&genpd->gpd_list_node, &gpd_list); mutex_unlock(&gpd_list_lock); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 9ae2b9e88d61..436ea149d40c 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -108,7 +108,6 @@ struct gpd_timing_data { struct generic_pm_domain_data { struct pm_domain_data base; - struct gpd_dev_ops ops; struct gpd_timing_data td; struct notifier_block nb; struct mutex lock; -- cgit v1.2.3 From 1e0407ca54d28db8e5f02e437ff21cc6416c0be8 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:19 +0200 Subject: PM / domains: Remove system PM callbacks from gpd_dev_ops There no users of these callbacks, let's simplify the generic power domain by removing them. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 64 ++++++--------------------------------------- include/linux/pm_domain.h | 8 ------ 2 files changed, 8 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index aa5b14c1e643..e777609ccc77 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -770,46 +770,6 @@ static bool genpd_dev_active_wakeup(struct generic_pm_domain *genpd, return GENPD_DEV_CALLBACK(genpd, bool, active_wakeup, dev); } -static int genpd_suspend_dev(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, suspend, dev); -} - -static int genpd_suspend_late(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, suspend_late, dev); -} - -static int genpd_resume_early(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, resume_early, dev); -} - -static int genpd_resume_dev(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, resume, dev); -} - -static int genpd_freeze_dev(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, freeze, dev); -} - -static int genpd_freeze_late(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, freeze_late, dev); -} - -static int genpd_thaw_early(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, thaw_early, dev); -} - -static int genpd_thaw_dev(struct generic_pm_domain *genpd, struct device *dev) -{ - return GENPD_DEV_CALLBACK(genpd, int, thaw, dev); -} - /** * pm_genpd_sync_poweroff - Synchronously power off a PM domain and its masters. * @genpd: PM domain to power off, if possible. @@ -991,7 +951,7 @@ static int pm_genpd_suspend(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_suspend_dev(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_suspend(dev); } /** @@ -1012,7 +972,7 @@ static int pm_genpd_suspend_late(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_suspend_late(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_suspend_late(dev); } /** @@ -1099,7 +1059,7 @@ static int pm_genpd_resume_early(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_resume_early(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_resume_early(dev); } /** @@ -1120,7 +1080,7 @@ static int pm_genpd_resume(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_resume_dev(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_resume(dev); } /** @@ -1141,7 +1101,7 @@ static int pm_genpd_freeze(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_freeze_dev(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_freeze(dev); } /** @@ -1163,7 +1123,7 @@ static int pm_genpd_freeze_late(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_freeze_late(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_freeze_late(dev); } /** @@ -1227,7 +1187,7 @@ static int pm_genpd_thaw_early(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_thaw_early(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_thaw_early(dev); } /** @@ -1248,7 +1208,7 @@ static int pm_genpd_thaw(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - return genpd->suspend_power_off ? 0 : genpd_thaw_dev(genpd, dev); + return genpd->suspend_power_off ? 0 : pm_generic_thaw(dev); } /** @@ -1956,14 +1916,6 @@ void pm_genpd_init(struct generic_pm_domain *genpd, genpd->domain.ops.complete = pm_genpd_complete; genpd->dev_ops.save_state = pm_genpd_default_save_state; genpd->dev_ops.restore_state = pm_genpd_default_restore_state; - genpd->dev_ops.suspend = pm_generic_suspend; - genpd->dev_ops.suspend_late = pm_generic_suspend_late; - genpd->dev_ops.resume_early = pm_generic_resume_early; - genpd->dev_ops.resume = pm_generic_resume; - genpd->dev_ops.freeze = pm_generic_freeze; - genpd->dev_ops.freeze_late = pm_generic_freeze_late; - genpd->dev_ops.thaw_early = pm_generic_thaw_early; - genpd->dev_ops.thaw = pm_generic_thaw; mutex_lock(&gpd_list_lock); list_add(&genpd->gpd_list_node, &gpd_list); mutex_unlock(&gpd_list_lock); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 436ea149d40c..b6343d4b9b04 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -35,14 +35,6 @@ struct gpd_dev_ops { int (*stop)(struct device *dev); int (*save_state)(struct device *dev); int (*restore_state)(struct device *dev); - int (*suspend)(struct device *dev); - int (*suspend_late)(struct device *dev); - int (*resume_early)(struct device *dev); - int (*resume)(struct device *dev); - int (*freeze)(struct device *dev); - int (*freeze_late)(struct device *dev); - int (*thaw_early)(struct device *dev); - int (*thaw)(struct device *dev); bool (*active_wakeup)(struct device *dev); }; -- cgit v1.2.3 From c5d79ec2a5715489cff16a0d1cf4fa9108a5509e Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:22 +0200 Subject: PM / domains: Remove dev_irq_safe from genpd config The genpd dev_irq_safe configuration somewhat overlaps with the runtime PM pm_runtime_irq_safe() option. Also, currently genpd don't have a good way to deal with these device. So, until we figured out if and how to support this in genpd, let's remove the option to configure it. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 4 ---- include/linux/pm_domain.h | 1 - 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e777609ccc77..4298a2bcd228 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -615,8 +615,6 @@ static int pm_genpd_runtime_suspend(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - might_sleep_if(!genpd->dev_irq_safe); - stop_ok = genpd->gov ? genpd->gov->stop_ok : NULL; if (stop_ok && !stop_ok(dev)) return -EBUSY; @@ -661,8 +659,6 @@ static int pm_genpd_runtime_resume(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - might_sleep_if(!genpd->dev_irq_safe); - /* If power.irq_safe, the PM domain is never powered off. */ if (dev->power.irq_safe) return genpd_start_dev_no_timing(genpd, dev); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index b6343d4b9b04..360c94ceeebb 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -63,7 +63,6 @@ struct generic_pm_domain { unsigned int suspended_count; /* System suspend device counter */ unsigned int prepared_count; /* Suspend counter of prepared devices */ bool suspend_power_off; /* Power status before system suspend */ - bool dev_irq_safe; /* Device callbacks are IRQ-safe */ int (*power_off)(struct generic_pm_domain *domain); s64 power_off_latency_ns; int (*power_on)(struct generic_pm_domain *domain); -- cgit v1.2.3 From d47e6464ae6c96735d4706f5cb0537fe717b6b00 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:24 +0200 Subject: PM / domains: Remove pm_genpd_syscore_switch() API The pm_genpd_syscore_poweroff() API and pm_genpd_syscore_poweron() API makes the pm_genpd_syscore_switch() API redundant. Moreover, since there are no active users, let's just remove it. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 17 ++++++++++++++--- include/linux/pm_domain.h | 16 ++++------------ 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index b910d0f6ff60..601e35b2fa71 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1292,13 +1292,13 @@ static void pm_genpd_complete(struct device *dev) } /** - * pm_genpd_syscore_switch - Switch power during system core suspend or resume. + * genpd_syscore_switch - Switch power during system core suspend or resume. * @dev: Device that normally is marked as "always on" to switch power for. * * This routine may only be called during the system core (syscore) suspend or * resume phase for devices whose "always on" flags are set. */ -void pm_genpd_syscore_switch(struct device *dev, bool suspend) +static void genpd_syscore_switch(struct device *dev, bool suspend) { struct generic_pm_domain *genpd; @@ -1314,7 +1314,18 @@ void pm_genpd_syscore_switch(struct device *dev, bool suspend) genpd->suspended_count--; } } -EXPORT_SYMBOL_GPL(pm_genpd_syscore_switch); + +void pm_genpd_syscore_poweroff(struct device *dev) +{ + genpd_syscore_switch(dev, true); +} +EXPORT_SYMBOL_GPL(pm_genpd_syscore_poweroff); + +void pm_genpd_syscore_poweron(struct device *dev) +{ + genpd_syscore_switch(dev, false); +} +EXPORT_SYMBOL_GPL(pm_genpd_syscore_poweron); #else diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 360c94ceeebb..3997af691600 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -266,19 +266,11 @@ static inline void pm_genpd_poweroff_unused(void) {} #endif #ifdef CONFIG_PM_GENERIC_DOMAINS_SLEEP -extern void pm_genpd_syscore_switch(struct device *dev, bool suspend); +extern void pm_genpd_syscore_poweroff(struct device *dev); +extern void pm_genpd_syscore_poweron(struct device *dev); #else -static inline void pm_genpd_syscore_switch(struct device *dev, bool suspend) {} +static inline void pm_genpd_syscore_poweroff(struct device *dev) {} +static inline void pm_genpd_syscore_poweron(struct device *dev) {} #endif -static inline void pm_genpd_syscore_poweroff(struct device *dev) -{ - pm_genpd_syscore_switch(dev, true); -} - -static inline void pm_genpd_syscore_poweron(struct device *dev) -{ - pm_genpd_syscore_switch(dev, false); -} - #endif /* _LINUX_PM_DOMAIN_H */ -- cgit v1.2.3 From d971f0b0eaaf3f2086bf21bbd64f7ea7e2f28459 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:25 +0200 Subject: PM / domains: Remove genpd_queue_power_off_work() API There are no active users of this API. Let's remove it and if future needs shows up we could consider to have a get/put API instead. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 5 ++++- include/linux/pm_domain.h | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 601e35b2fa71..d71d0458d41f 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -422,7 +422,7 @@ static bool genpd_abort_poweroff(struct generic_pm_domain *genpd) * Queue up the execution of pm_genpd_poweroff() unless it's already been done * before. */ -void genpd_queue_power_off_work(struct generic_pm_domain *genpd) +static void genpd_queue_power_off_work(struct generic_pm_domain *genpd) { queue_work(pm_wq, &genpd->power_off_work); } @@ -729,6 +729,9 @@ static inline int genpd_dev_pm_qos_notifier(struct notifier_block *nb, return NOTIFY_DONE; } +static inline void +genpd_queue_power_off_work(struct generic_pm_domain *genpd) {} + static inline void genpd_power_off_work_fn(struct work_struct *work) {} #define pm_genpd_runtime_suspend NULL diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 3997af691600..c19c90b839b8 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -258,10 +258,8 @@ static inline int pm_genpd_name_add_device(const char *domain_name, } #ifdef CONFIG_PM_GENERIC_DOMAINS_RUNTIME -extern void genpd_queue_power_off_work(struct generic_pm_domain *genpd); extern void pm_genpd_poweroff_unused(void); #else -static inline void genpd_queue_power_off_work(struct generic_pm_domain *gpd) {} static inline void pm_genpd_poweroff_unused(void) {} #endif -- cgit v1.2.3 From 0f574d4c3a7a325cbbef28ee738dedca9851e957 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:30 +0200 Subject: PM / domains: Remove default_stop_ok() API There are currently no need to export default_stop_ok() as an API, instead let's keep it local to the PM domain governor. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain_governor.c | 7 ++----- include/linux/pm_domain.h | 6 ------ 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c index a089e3bcdfbc..d88a62e104d4 100644 --- a/drivers/base/power/domain_governor.c +++ b/drivers/base/power/domain_governor.c @@ -42,7 +42,7 @@ static int dev_update_qos_constraint(struct device *dev, void *data) * default_stop_ok - Default PM domain governor routine for stopping devices. * @dev: Device to check. */ -bool default_stop_ok(struct device *dev) +static bool default_stop_ok(struct device *dev) { struct gpd_timing_data *td = &dev_gpd_data(dev)->td; unsigned long flags; @@ -229,10 +229,7 @@ static bool always_on_power_down_ok(struct dev_pm_domain *domain) #else /* !CONFIG_PM_RUNTIME */ -bool default_stop_ok(struct device *dev) -{ - return false; -} +static inline bool default_stop_ok(struct device *dev) { return false; } #define default_power_down_ok NULL #define always_on_power_down_ok NULL diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index c19c90b839b8..3b59f296e6ec 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -151,8 +151,6 @@ extern void pm_genpd_init(struct generic_pm_domain *genpd, extern int pm_genpd_poweron(struct generic_pm_domain *genpd); extern int pm_genpd_name_poweron(const char *domain_name); -extern bool default_stop_ok(struct device *dev); - extern struct dev_power_governor pm_domain_always_on_gov; #else @@ -231,10 +229,6 @@ static inline int pm_genpd_name_poweron(const char *domain_name) { return -ENOSYS; } -static inline bool default_stop_ok(struct device *dev) -{ - return false; -} #define simple_qos_governor NULL #define pm_domain_always_on_gov NULL #endif -- cgit v1.2.3 From ae3c511c2d72161b11e93866203b59a3a37dfac7 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 3 Sep 2014 12:52:31 +0200 Subject: PM / domains: Keep declaration of dev_power_governors together This is a pure code cleanup in the header file for the PM domain. No functional change. Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 3b59f296e6ec..aa03586c94a2 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -117,8 +117,6 @@ static inline struct generic_pm_domain_data *dev_gpd_data(struct device *dev) return to_gpd_data(dev->power.subsys_data->domain_data); } -extern struct dev_power_governor simple_qos_governor; - extern struct generic_pm_domain *dev_to_genpd(struct device *dev); extern int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, @@ -151,6 +149,7 @@ extern void pm_genpd_init(struct generic_pm_domain *genpd, extern int pm_genpd_poweron(struct generic_pm_domain *genpd); extern int pm_genpd_name_poweron(const char *domain_name); +extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; #else -- cgit v1.2.3 From 413fffc3a1db7f270afdf1ecb35c1edc013acc68 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 28 Aug 2014 11:22:23 +0530 Subject: cpufreq: Add support for per-policy driver data Drivers supporting multiple clusters or multiple 'struct cpufreq_policy' instances may need to keep per-policy data. If the core doesn't provide support for that, they might do it in the most unoptimized way: 'per-cpu' data. This patch adds another field in struct cpufreq_policy: 'driver_data'. It isn't accessed by core and is for driver's internal use only. Tested-by: Stephen Boyd Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7d1955afa62c..138336b6bb04 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -112,6 +112,9 @@ struct cpufreq_policy { spinlock_t transition_lock; wait_queue_head_t transition_wait; struct task_struct *transition_task; /* Task which is doing the transition */ + + /* For cpufreq driver's internal use */ + void *driver_data; }; /* Only for ACPI */ -- cgit v1.2.3 From 8ca28610e5e37193cd61fefa4310941e28de10ca Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Thu, 7 Aug 2014 15:14:06 +0200 Subject: mmc: include linux/types.h for bool definition in atmel-mci.h This patch adds an include of linux/types.h to make sure bool is defined before utilized in this header file. Signed-off-by: Hans-Christian Egtvedt Acked-by: Ludovic Desroches Signed-off-by: Ulf Hansson --- include/linux/atmel-mci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h index 4c7a4b2104bf..91b77f8d495d 100644 --- a/include/linux/atmel-mci.h +++ b/include/linux/atmel-mci.h @@ -1,6 +1,8 @@ #ifndef __LINUX_ATMEL_MCI_H #define __LINUX_ATMEL_MCI_H +#include + #define ATMCI_MAX_NR_SLOTS 2 /** -- cgit v1.2.3 From b3683994843a0ede0e19daccd1ac32a46b21eb39 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Wed, 13 Aug 2014 13:34:01 +0800 Subject: mmc: Correct the value of MMC_NUM_PHY_PARTITION eMMC card can support up to 7 physical partitions, including 2 boot, 1 RPMB and 4 GPs. Change MMC_NUM_PHY_PARTITION from 6 to 7, which is the correct value. Signed-off-by: Yi Sun Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index d424b9de3aff..bde5147a4221 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -214,11 +214,12 @@ enum mmc_blk_status { }; /* The number of MMC physical partitions. These consist of: - * boot partitions (2), general purpose partitions (4) in MMC v4.4. + * boot partitions (2), general purpose partitions (4) and + * RPMB partition (1) in MMC v4.4. */ #define MMC_NUM_BOOT_PARTITION 2 #define MMC_NUM_GP_PARTITION 4 -#define MMC_NUM_PHY_PARTITION 6 +#define MMC_NUM_PHY_PARTITION 7 #define MAX_MMC_PART_NAME_LEN 20 /* -- cgit v1.2.3 From 3d705d14fe4c72be83bae1610680e209ee226b9d Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Tue, 19 Aug 2014 10:45:51 +0200 Subject: mmc: implement Driver Stage Register handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some eMMC and SD cards implement a DSR register that allows to tune raise/fall times and drive strength of the CMD and DATA outputs. The values to use depend on the card in use and the host. It might be needed to reduce the drive strength to prevent voltage peaks above the host's specification. Implement a 'dsr' devicetree property that allows to specify the value to set the DSR to. For non-dt setups the new members of mmc_host can be set by board code. This patch was initially authored by Sascha Hauer. It contains improvements authored by Markus Niebel and Uwe Kleine-König. Signed-off-by: Sascha Hauer Signed-off-by: Markus Niebel Signed-off-by: Uwe Kleine-König Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/mmc.txt | 2 ++ drivers/mmc/core/host.c | 8 ++++++++ drivers/mmc/core/mmc.c | 8 ++++++++ drivers/mmc/core/mmc_ops.c | 20 ++++++++++++++++++++ drivers/mmc/core/mmc_ops.h | 1 + drivers/mmc/core/sd.c | 8 ++++++++ include/linux/mmc/card.h | 3 ++- include/linux/mmc/host.h | 3 +++ 8 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/mmc/mmc.txt b/Documentation/devicetree/bindings/mmc/mmc.txt index 431716e37a39..b52628b18a53 100644 --- a/Documentation/devicetree/bindings/mmc/mmc.txt +++ b/Documentation/devicetree/bindings/mmc/mmc.txt @@ -40,6 +40,8 @@ Optional properties: - mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported - mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported - mmc-hs400-1_2v: eMMC HS400 mode(1.2V I/O) is supported +- dsr: Value the card's (optional) Driver Stage Register (DSR) should be + programmed with. Valid range: [0 .. 0xffff]. *NOTE* on CD and WP polarity. To use common for all SD/MMC host controllers line polarity properties, we have to fix the meaning of the "normal" and "inverted" diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 95cceae96944..d572b2beb65a 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -452,6 +452,14 @@ int mmc_of_parse(struct mmc_host *host) if (of_find_property(np, "mmc-hs400-1_2v", &len)) host->caps2 |= MMC_CAP2_HS400_1_2V | MMC_CAP2_HS200_1_2V_SDR; + host->dsr_req = !of_property_read_u32(np, "dsr", &host->dsr); + if (host->dsr_req && (host->dsr & ~0xffff)) { + dev_err(host->parent, + "device tree specified broken value for DSR: 0x%x, ignoring\n", + host->dsr); + host->dsr_req = 0; + } + return 0; out: diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 1eda8dd8c867..31c43165f8bc 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -162,6 +162,7 @@ static int mmc_decode_csd(struct mmc_card *card) csd->read_partial = UNSTUFF_BITS(resp, 79, 1); csd->write_misalign = UNSTUFF_BITS(resp, 78, 1); csd->read_misalign = UNSTUFF_BITS(resp, 77, 1); + csd->dsr_imp = UNSTUFF_BITS(resp, 76, 1); csd->r2w_factor = UNSTUFF_BITS(resp, 26, 3); csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4); csd->write_partial = UNSTUFF_BITS(resp, 21, 1); @@ -1271,6 +1272,13 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, goto free_card; } + /* + * handling only for cards supporting DSR and hosts requesting + * DSR configuration + */ + if (card->csd.dsr_imp && host->dsr_req) + mmc_set_dsr(host); + /* * Select card, as all following commands rely on that. */ diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index f51b5ba3bbea..ba0275e90617 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -93,6 +93,26 @@ int mmc_deselect_cards(struct mmc_host *host) return _mmc_select_card(host, NULL); } +/* + * Write the value specified in the device tree or board code into the optional + * 16 bit Driver Stage Register. This can be used to tune raise/fall times and + * drive strength of the DAT and CMD outputs. The actual meaning of a given + * value is hardware dependant. + * The presence of the DSR register can be determined from the CSD register, + * bit 76. + */ +int mmc_set_dsr(struct mmc_host *host) +{ + struct mmc_command cmd = {0}; + + cmd.opcode = MMC_SET_DSR; + + cmd.arg = (host->dsr << 16) | 0xffff; + cmd.flags = MMC_RSP_NONE | MMC_CMD_AC; + + return mmc_wait_for_cmd(host, &cmd, MMC_CMD_RETRIES); +} + int mmc_go_idle(struct mmc_host *host) { int err; diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h index 80ae9f4e0293..390dac665b2a 100644 --- a/drivers/mmc/core/mmc_ops.h +++ b/drivers/mmc/core/mmc_ops.h @@ -14,6 +14,7 @@ int mmc_select_card(struct mmc_card *card); int mmc_deselect_cards(struct mmc_host *host); +int mmc_set_dsr(struct mmc_host *host); int mmc_go_idle(struct mmc_host *host); int mmc_send_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr); int mmc_all_send_cid(struct mmc_host *host, u32 *cid); diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 0c44510bf717..25913889cbaa 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -127,6 +127,7 @@ static int mmc_decode_csd(struct mmc_card *card) csd->read_partial = UNSTUFF_BITS(resp, 79, 1); csd->write_misalign = UNSTUFF_BITS(resp, 78, 1); csd->read_misalign = UNSTUFF_BITS(resp, 77, 1); + csd->dsr_imp = UNSTUFF_BITS(resp, 76, 1); csd->r2w_factor = UNSTUFF_BITS(resp, 26, 3); csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4); csd->write_partial = UNSTUFF_BITS(resp, 21, 1); @@ -953,6 +954,13 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr, mmc_decode_cid(card); } + /* + * handling only for cards supporting DSR and hosts requesting + * DSR configuration + */ + if (card->csd.dsr_imp && host->dsr_req) + mmc_set_dsr(host); + /* * Select card, as all following commands rely on that. */ diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index bde5147a4221..0ea795f5feb9 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -42,7 +42,8 @@ struct mmc_csd { unsigned int read_partial:1, read_misalign:1, write_partial:1, - write_misalign:1; + write_misalign:1, + dsr_imp:1; }; struct mmc_ext_csd { diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 7960424d0bc0..4cbf61476999 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -365,6 +365,9 @@ struct mmc_host { unsigned int slotno; /* used for sdio acpi binding */ + int dsr_req; /* DSR value is valid */ + u32 dsr; /* optional driver stage (DSR) value */ + unsigned long private[0] ____cacheline_aligned; }; -- cgit v1.2.3 From 384b2cbd56a02efb16358ed7c0c039e4afca5ed0 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Sun, 24 Aug 2014 19:58:48 -0700 Subject: mmc: tmio: care about DMA tx/rx addr offset Basically, SD_BUF0 Tx/Rx addresses are same in normal TMIO controller, but, it is different on Renesas R-Car SDHI controller if it uses DMAC (Rx address needs to add 0x2000 to Tx address) This patch adds new .dma_rx_offset and cares it Tested-by: Nguyen Xuan Nui Tested-by: Hiep Cao Minh Acked-by: Laurent Pinchart Acked-by: Ben Dooks Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 3 +++ drivers/mmc/host/tmio_mmc_dma.c | 2 +- include/linux/mfd/tmio.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 2dafe28c8df9..f0818cd0242c 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -39,6 +39,7 @@ struct sh_mobile_sdhi_of_data { unsigned long tmio_flags; unsigned long capabilities; unsigned long capabilities2; + dma_addr_t dma_rx_offset; }; static const struct sh_mobile_sdhi_of_data sh_mobile_sdhi_of_cfg[] = { @@ -56,6 +57,7 @@ static const struct sh_mobile_sdhi_of_data of_rcar_gen2_compatible = { .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ, .capabilities2 = MMC_CAP2_NO_MULTI_READ, + .dma_rx_offset = 0x2000, }; static const struct of_device_id sh_mobile_sdhi_of_match[] = { @@ -228,6 +230,7 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) mmc_data->flags |= of_data->tmio_flags; mmc_data->capabilities |= of_data->capabilities; mmc_data->capabilities2 |= of_data->capabilities2; + dma_priv->dma_rx_offset = of_data->dma_rx_offset; } /* SD control register space size is 0x100, 0x200 for bus_shift=1 */ diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index eb8f1d5c34b1..bd646b041085 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -312,7 +312,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (pdata->dma->chan_priv_rx) cfg.slave_id = pdata->dma->slave_id_rx; cfg.direction = DMA_DEV_TO_MEM; - cfg.src_addr = cfg.dst_addr; + cfg.src_addr = cfg.dst_addr + pdata->dma->dma_rx_offset; cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.dst_addr = 0; ret = dmaengine_slave_config(host->chan_rx, &cfg); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 8f6f2e91e7ae..777e29b1ad0f 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -96,6 +96,7 @@ struct tmio_mmc_dma { int slave_id_tx; int slave_id_rx; int alignment_shift; + dma_addr_t dma_rx_offset; bool (*filter)(struct dma_chan *chan, void *arg); }; -- cgit v1.2.3 From b8d11962c2d83c984d5afd091e5b725ad2fd5607 Mon Sep 17 00:00:00 2001 From: Shinobu Uehara Date: Sun, 24 Aug 2014 20:00:25 -0700 Subject: mmc: tmio: control multiple block transfer mode Renesas SDHI has "Multiple Block Transfer Mode" settings on SD_CMD register which controls CMD12 automatically. This patch cares it, because CMD12 is not needed when CMD53 (= SD_IO_RW_EXTENDED) [Kuninori Morimoto: tidyuped for upstreaming enabled this flags for all SH-Mobile/R-Car] Tested-by: Nguyen Xuan Nui Tested-by: Hiep Cao Minh Signed-off-by: Shinobu Uehara Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 5 +++++ drivers/mmc/host/tmio_mmc_pio.c | 10 ++++++++++ include/linux/mfd/tmio.h | 6 ++++++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index f0818cd0242c..4727586b063e 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -225,6 +225,11 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) */ mmc_data->flags |= TMIO_MMC_SDIO_IRQ; + /* + * All SDHI have CMD12 controll bit + */ + mmc_data->flags |= TMIO_MMC_HAVE_CMD12_CTRL; + if (of_id && of_id->data) { const struct sh_mobile_sdhi_of_data *of_data = of_id->data; mmc_data->flags |= of_data->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 3d0ad737abea..c2804827be9f 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -310,6 +311,7 @@ static void tmio_mmc_done_work(struct work_struct *work) #define TRANSFER_READ 0x1000 #define TRANSFER_MULTI 0x2000 #define SECURITY_CMD 0x4000 +#define NO_CMD12_ISSUE 0x4000 /* TMIO_MMC_HAVE_CMD12_CTRL */ static int tmio_mmc_start_command(struct tmio_mmc_host *host, struct mmc_command *cmd) { @@ -346,6 +348,14 @@ static int tmio_mmc_start_command(struct tmio_mmc_host *host, struct mmc_command if (data->blocks > 1) { sd_ctrl_write16(host, CTL_STOP_INTERNAL_ACTION, 0x100); c |= TRANSFER_MULTI; + + /* + * Disable auto CMD12 at IO_RW_EXTENDED when + * multiple block transfer + */ + if ((host->pdata->flags & TMIO_MMC_HAVE_CMD12_CTRL) && + (cmd->opcode == SD_IO_RW_EXTENDED)) + c |= NO_CMD12_ISSUE; } if (data->flags & MMC_DATA_READ) c |= TRANSFER_READ; diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 777e29b1ad0f..7432d95b08e2 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -83,6 +83,12 @@ */ #define TMIO_MMC_HAVE_HIGH_REG (1 << 6) +/* + * Some controllers have CMD12 automatically + * issue/non-issue register + */ +#define TMIO_MMC_HAVE_CMD12_CTRL (1 << 7) + int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); void tmio_core_mmc_pwr(void __iomem *cnf, int shift, int state); -- cgit v1.2.3 From 6b98757e53cb0e93b02db4067c14afcb32c90615 Mon Sep 17 00:00:00 2001 From: Shinobu Uehara Date: Sun, 24 Aug 2014 20:00:52 -0700 Subject: mmc: tmio: add TMIO_MMC_SDIO_STATUS_QUIRK Renesas R-Car SDHI should set reserved bits on CTL_SDIO_STATUS register when writing. This patch adds new TMIO_MMC_SDIO_STATUS_QUIRK flags for this purpose [Kuninori Morimoto: tidyuped for upstreaming enabled this flags for all SH-Mobile/R-Car] Tested-by: Nguyen Xuan Nui Tested-by: Hiep Cao Minh Signed-off-by: Shinobu Uehara Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 5 +++++ drivers/mmc/host/tmio_mmc_pio.c | 7 ++++++- include/linux/mfd/tmio.h | 5 +++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 4727586b063e..ebcfc659b799 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -230,6 +230,11 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) */ mmc_data->flags |= TMIO_MMC_HAVE_CMD12_CTRL; + /* + * All SDHI need SDIO_INFO1 reserved bit + */ + mmc_data->flags |= TMIO_MMC_SDIO_STATUS_QUIRK; + if (of_id && of_id->data) { const struct sh_mobile_sdhi_of_data *of_data = of_id->data; mmc_data->flags |= of_data->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index c2804827be9f..c5ffa92ccb18 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -665,6 +665,7 @@ irqreturn_t tmio_mmc_sdio_irq(int irq, void *devid) struct mmc_host *mmc = host->mmc; struct tmio_mmc_data *pdata = host->pdata; unsigned int ireg, status; + unsigned int sdio_status; if (!(pdata->flags & TMIO_MMC_SDIO_IRQ)) return IRQ_HANDLED; @@ -672,7 +673,11 @@ irqreturn_t tmio_mmc_sdio_irq(int irq, void *devid) status = sd_ctrl_read16(host, CTL_SDIO_STATUS); ireg = status & TMIO_SDIO_MASK_ALL & ~host->sdcard_irq_mask; - sd_ctrl_write16(host, CTL_SDIO_STATUS, status & ~TMIO_SDIO_MASK_ALL); + sdio_status = status & ~TMIO_SDIO_MASK_ALL; + if (pdata->flags & TMIO_MMC_SDIO_STATUS_QUIRK) + sdio_status |= 6; + + sd_ctrl_write16(host, CTL_SDIO_STATUS, sdio_status); if (mmc->caps & MMC_CAP_SDIO_IRQ && ireg & TMIO_SDIO_STAT_IOIRQ) mmc_signal_sdio_irq(mmc); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 7432d95b08e2..a7493ae01b58 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -89,6 +89,11 @@ */ #define TMIO_MMC_HAVE_CMD12_CTRL (1 << 7) +/* + * Some controllers needs to set 1 on SDIO status reserved bits + */ +#define TMIO_MMC_SDIO_STATUS_QUIRK (1 << 8) + int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); void tmio_core_mmc_pwr(void __iomem *cnf, int shift, int state); -- cgit v1.2.3 From e85dd04ea8c8d32ba8eae278959d28df34338e9d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Sun, 24 Aug 2014 20:01:54 -0700 Subject: mmc: tmio: remove Renesas specific #ifdef This patch adds new TMIO_MMC_HAVE_CTL_DMA_REG flag, and remove Renesas specific #ifdef from tmio driver Tested-by: Nguyen Xuan Nui Tested-by: Hiep Cao Minh Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 5 +++++ drivers/mmc/host/tmio_mmc_dma.c | 6 ++---- include/linux/mfd/tmio.h | 5 +++++ 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index ebcfc659b799..a077da02bb8e 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -235,6 +235,11 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) */ mmc_data->flags |= TMIO_MMC_SDIO_STATUS_QUIRK; + /* + * All SDHI have DMA control register + */ + mmc_data->flags |= TMIO_MMC_HAVE_CTL_DMA_REG; + if (of_id && of_id->data) { const struct sh_mobile_sdhi_of_data *of_data = of_id->data; mmc_data->flags |= of_data->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index bd646b041085..7d077388b9eb 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -28,10 +28,8 @@ void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) if (!host->chan_tx || !host->chan_rx) return; -#if defined(CONFIG_SUPERH) || defined(CONFIG_ARCH_SHMOBILE) - /* Switch DMA mode on or off - SuperH specific? */ - sd_ctrl_write16(host, CTL_DMA_ENABLE, enable ? 2 : 0); -#endif + if (host->pdata->flags & TMIO_MMC_HAVE_CTL_DMA_REG) + sd_ctrl_write16(host, CTL_DMA_ENABLE, enable ? 2 : 0); } void tmio_mmc_abort_dma(struct tmio_mmc_host *host) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index a7493ae01b58..adcb0cdb2fdb 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -94,6 +94,11 @@ */ #define TMIO_MMC_SDIO_STATUS_QUIRK (1 << 8) +/* + * Some controllers have DMA enable/disable register + */ +#define TMIO_MMC_HAVE_CTL_DMA_REG (1 << 9) + int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); void tmio_core_mmc_pwr(void __iomem *cnf, int shift, int state); -- cgit v1.2.3 From da29fe2bf573f0ae56fdc2e790387cb73fc8c6f8 Mon Sep 17 00:00:00 2001 From: Shinobu Uehara Date: Sun, 24 Aug 2014 20:03:00 -0700 Subject: mmc: tmio: add actual clock support as option Some controller is supporting actual clock on SD_CLK_CTRL :: DIV[7:0]. Renesas SH-Mobile SDHI doesn't support, but, Renesas R-Car SDHI supports it. This patch adds new TMIO_MMC_CLK_ACTUAL flag for it. [Kuninori Morimoto: tidyuped for upstreaming] Tested-by: Nguyen Xuan Nui Tested-by: Hiep Cao Minh Signed-off-by: Shinobu Uehara Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 6 ++++-- drivers/mmc/host/tmio_mmc_pio.c | 5 +++++ include/linux/mfd/tmio.h | 5 +++++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index a077da02bb8e..d5d493719491 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -49,12 +49,14 @@ static const struct sh_mobile_sdhi_of_data sh_mobile_sdhi_of_cfg[] = { }; static const struct sh_mobile_sdhi_of_data of_rcar_gen1_compatible = { - .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE, + .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE | + TMIO_MMC_CLK_ACTUAL, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ, }; static const struct sh_mobile_sdhi_of_data of_rcar_gen2_compatible = { - .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE, + .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE | + TMIO_MMC_CLK_ACTUAL, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ, .capabilities2 = MMC_CAP2_NO_MULTI_READ, .dma_rx_offset = 0x2000, diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 7cfe939992de..ba454131f9a8 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -159,6 +159,11 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, for (clock = host->mmc->f_min, clk = 0x80000080; new_clock >= (clock<<1); clk >>= 1) clock <<= 1; + + /* 1/1 clock is option */ + if ((host->pdata->flags & TMIO_MMC_CLK_ACTUAL) && + ((clk >> 22) & 0x1)) + clk |= 0xff; } if (host->set_clk_div) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index adcb0cdb2fdb..90436d523e5e 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -99,6 +99,11 @@ */ #define TMIO_MMC_HAVE_CTL_DMA_REG (1 << 9) +/* + * Some controllers allows to set SDx actual clock + */ +#define TMIO_MMC_CLK_ACTUAL (1 << 10) + int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); void tmio_core_mmc_pwr(void __iomem *cnf, int shift, int state); -- cgit v1.2.3 From 51da2240906cb94e8f6ba55e403b6206df6fb2dd Mon Sep 17 00:00:00 2001 From: Yuvaraj CD Date: Fri, 22 Aug 2014 19:17:50 +0530 Subject: mmc: dw_mmc: use mmc_regulator_get_supply to handle regulators This patch makes use of mmc_regulator_get_supply() to handle the vmmc and vqmmc regulators.Also it moves the code handling the these regulators to dw_mci_set_ios().It turned on the vmmc and vqmmc during MMC_POWER_UP and MMC_POWER_ON,and turned off during MMC_POWER_OFF. Signed-off-by: Yuvaraj Kumar C D Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 72 ++++++++++++++++++++++------------------------ include/linux/mmc/dw_mmc.h | 2 +- 2 files changed, 36 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 7f227e964265..aadb0d6aa63f 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -936,6 +936,7 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) struct dw_mci_slot *slot = mmc_priv(mmc); const struct dw_mci_drv_data *drv_data = slot->host->drv_data; u32 regs; + int ret; switch (ios->bus_width) { case MMC_BUS_WIDTH_4: @@ -974,12 +975,38 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) switch (ios->power_mode) { case MMC_POWER_UP: + if (!IS_ERR(mmc->supply.vmmc)) { + ret = mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, + ios->vdd); + if (ret) { + dev_err(slot->host->dev, + "failed to enable vmmc regulator\n"); + /*return, if failed turn on vmmc*/ + return; + } + } + if (!IS_ERR(mmc->supply.vqmmc) && !slot->host->vqmmc_enabled) { + ret = regulator_enable(mmc->supply.vqmmc); + if (ret < 0) + dev_err(slot->host->dev, + "failed to enable vqmmc regulator\n"); + else + slot->host->vqmmc_enabled = true; + } set_bit(DW_MMC_CARD_NEED_INIT, &slot->flags); regs = mci_readl(slot->host, PWREN); regs |= (1 << slot->id); mci_writel(slot->host, PWREN, regs); break; case MMC_POWER_OFF: + if (!IS_ERR(mmc->supply.vmmc)) + mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0); + + if (!IS_ERR(mmc->supply.vqmmc) && slot->host->vqmmc_enabled) { + regulator_disable(mmc->supply.vqmmc); + slot->host->vqmmc_enabled = false; + } + regs = mci_readl(slot->host, PWREN); regs &= ~(1 << slot->id); mci_writel(slot->host, PWREN, regs); @@ -2110,7 +2137,13 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) mmc->f_max = freq[1]; } - mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; + /*if there are external regulators, get them*/ + ret = mmc_regulator_get_supply(mmc); + if (ret == -EPROBE_DEFER) + goto err_setup_bus; + + if (!mmc->ocr_avail) + mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; if (host->pdata->caps) mmc->caps = host->pdata->caps; @@ -2176,7 +2209,7 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id) err_setup_bus: mmc_free_host(mmc); - return -EINVAL; + return ret; } static void dw_mci_cleanup_slot(struct dw_mci_slot *slot, unsigned int id) @@ -2469,24 +2502,6 @@ int dw_mci_probe(struct dw_mci *host) } } - host->vmmc = devm_regulator_get_optional(host->dev, "vmmc"); - if (IS_ERR(host->vmmc)) { - ret = PTR_ERR(host->vmmc); - if (ret == -EPROBE_DEFER) - goto err_clk_ciu; - - dev_info(host->dev, "no vmmc regulator found: %d\n", ret); - host->vmmc = NULL; - } else { - ret = regulator_enable(host->vmmc); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(host->dev, - "regulator_enable fail: %d\n", ret); - goto err_clk_ciu; - } - } - host->quirks = host->pdata->quirks; spin_lock_init(&host->lock); @@ -2630,8 +2645,6 @@ err_workqueue: err_dmaunmap: if (host->use_dma && host->dma_ops->exit) host->dma_ops->exit(host); - if (host->vmmc) - regulator_disable(host->vmmc); err_clk_ciu: if (!IS_ERR(host->ciu_clk)) @@ -2667,9 +2680,6 @@ void dw_mci_remove(struct dw_mci *host) if (host->use_dma && host->dma_ops->exit) host->dma_ops->exit(host); - if (host->vmmc) - regulator_disable(host->vmmc); - if (!IS_ERR(host->ciu_clk)) clk_disable_unprepare(host->ciu_clk); @@ -2686,9 +2696,6 @@ EXPORT_SYMBOL(dw_mci_remove); */ int dw_mci_suspend(struct dw_mci *host) { - if (host->vmmc) - regulator_disable(host->vmmc); - return 0; } EXPORT_SYMBOL(dw_mci_suspend); @@ -2697,15 +2704,6 @@ int dw_mci_resume(struct dw_mci *host) { int i, ret; - if (host->vmmc) { - ret = regulator_enable(host->vmmc); - if (ret) { - dev_err(host->dev, - "failed to enable regulator: %d\n", ret); - return ret; - } - } - if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS)) { ret = -ENODEV; return ret; diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 29ce014ab421..84e2827d0f0b 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -188,7 +188,7 @@ struct dw_mci { /* Workaround flags */ u32 quirks; - struct regulator *vmmc; /* Power regulator */ + bool vqmmc_enabled; unsigned long irq_flags; /* IRQ flags */ int irq; }; -- cgit v1.2.3 From 0173055842cd1d9ed3984e70891c22dbf2f29372 Mon Sep 17 00:00:00 2001 From: Doug Anderson Date: Fri, 22 Aug 2014 19:17:51 +0530 Subject: mmc: dw_mmc: Support voltage changes For UHS cards we need the ability to switch voltages from 3.3V to 1.8V. Add support to the dw_mmc driver to handle this. Note that dw_mmc needs a little bit of extra code since the interface needs a special bit programmed to the CMD register while CMD11 is progressing. This means adding a few extra states to the state machine to track. Signed-off-by: Doug Anderson Signed-off-by: Yuvaraj Kumar C D Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 137 ++++++++++++++++++++++++++++++++++++++++++--- drivers/mmc/host/dw_mmc.h | 5 +- include/linux/mmc/dw_mmc.h | 2 + 3 files changed, 134 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index aadb0d6aa63f..23719249182b 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -234,10 +235,13 @@ err: } #endif /* defined(CONFIG_DEBUG_FS) */ +static void mci_send_cmd(struct dw_mci_slot *slot, u32 cmd, u32 arg); + static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd) { struct mmc_data *data; struct dw_mci_slot *slot = mmc_priv(mmc); + struct dw_mci *host = slot->host; const struct dw_mci_drv_data *drv_data = slot->host->drv_data; u32 cmdr; cmd->error = -EINPROGRESS; @@ -253,6 +257,34 @@ static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd) else if (cmd->opcode != MMC_SEND_STATUS && cmd->data) cmdr |= SDMMC_CMD_PRV_DAT_WAIT; + if (cmd->opcode == SD_SWITCH_VOLTAGE) { + u32 clk_en_a; + + /* Special bit makes CMD11 not die */ + cmdr |= SDMMC_CMD_VOLT_SWITCH; + + /* Change state to continue to handle CMD11 weirdness */ + WARN_ON(slot->host->state != STATE_SENDING_CMD); + slot->host->state = STATE_SENDING_CMD11; + + /* + * We need to disable low power mode (automatic clock stop) + * while doing voltage switch so we don't confuse the card, + * since stopping the clock is a specific part of the UHS + * voltage change dance. + * + * Note that low power mode (SDMMC_CLKEN_LOW_PWR) will be + * unconditionally turned back on in dw_mci_setup_bus() if it's + * ever called with a non-zero clock. That shouldn't happen + * until the voltage change is all done. + */ + clk_en_a = mci_readl(host, CLKENA); + clk_en_a &= ~(SDMMC_CLKEN_LOW_PWR << slot->id); + mci_writel(host, CLKENA, clk_en_a); + mci_send_cmd(slot, SDMMC_CMD_UPD_CLK | + SDMMC_CMD_PRV_DAT_WAIT, 0); + } + if (cmd->flags & MMC_RSP_PRESENT) { /* We expect a response, so set this bit */ cmdr |= SDMMC_CMD_RESP_EXP; @@ -775,11 +807,15 @@ static void dw_mci_setup_bus(struct dw_mci_slot *slot, bool force_clkinit) unsigned int clock = slot->clock; u32 div; u32 clk_en_a; + u32 sdmmc_cmd_bits = SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT; + + /* We must continue to set bit 28 in CMD until the change is complete */ + if (host->state == STATE_WAITING_CMD11_DONE) + sdmmc_cmd_bits |= SDMMC_CMD_VOLT_SWITCH; if (!clock) { mci_writel(host, CLKENA, 0); - mci_send_cmd(slot, - SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); + mci_send_cmd(slot, sdmmc_cmd_bits, 0); } else if (clock != host->current_speed || force_clkinit) { div = host->bus_hz / clock; if (host->bus_hz % clock && host->bus_hz > clock) @@ -803,15 +839,13 @@ static void dw_mci_setup_bus(struct dw_mci_slot *slot, bool force_clkinit) mci_writel(host, CLKSRC, 0); /* inform CIU */ - mci_send_cmd(slot, - SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); + mci_send_cmd(slot, sdmmc_cmd_bits, 0); /* set clock to desired speed */ mci_writel(host, CLKDIV, div); /* inform CIU */ - mci_send_cmd(slot, - SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); + mci_send_cmd(slot, sdmmc_cmd_bits, 0); /* enable clock; only low power if no SDIO */ clk_en_a = SDMMC_CLKEN_ENABLE << slot->id; @@ -820,8 +854,7 @@ static void dw_mci_setup_bus(struct dw_mci_slot *slot, bool force_clkinit) mci_writel(host, CLKENA, clk_en_a); /* inform CIU */ - mci_send_cmd(slot, - SDMMC_CMD_UPD_CLK | SDMMC_CMD_PRV_DAT_WAIT, 0); + mci_send_cmd(slot, sdmmc_cmd_bits, 0); /* keep the clock with reflecting clock dividor */ slot->__clk_old = clock << div; @@ -897,6 +930,17 @@ static void dw_mci_queue_request(struct dw_mci *host, struct dw_mci_slot *slot, slot->mrq = mrq; + if (host->state == STATE_WAITING_CMD11_DONE) { + dev_warn(&slot->mmc->class_dev, + "Voltage change didn't complete\n"); + /* + * this case isn't expected to happen, so we can + * either crash here or just try to continue on + * in the closest possible state + */ + host->state = STATE_IDLE; + } + if (host->state == STATE_IDLE) { host->state = STATE_SENDING_CMD; dw_mci_start_request(host, slot); @@ -973,6 +1017,9 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) /* Slot specific timing and width adjustment */ dw_mci_setup_bus(slot, false); + if (slot->host->state == STATE_WAITING_CMD11_DONE && ios->clock != 0) + slot->host->state = STATE_IDLE; + switch (ios->power_mode) { case MMC_POWER_UP: if (!IS_ERR(mmc->supply.vmmc)) { @@ -1016,6 +1063,59 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) } } +static int dw_mci_card_busy(struct mmc_host *mmc) +{ + struct dw_mci_slot *slot = mmc_priv(mmc); + u32 status; + + /* + * Check the busy bit which is low when DAT[3:0] + * (the data lines) are 0000 + */ + status = mci_readl(slot->host, STATUS); + + return !!(status & SDMMC_STATUS_BUSY); +} + +static int dw_mci_switch_voltage(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct dw_mci_slot *slot = mmc_priv(mmc); + struct dw_mci *host = slot->host; + u32 uhs; + u32 v18 = SDMMC_UHS_18V << slot->id; + int min_uv, max_uv; + int ret; + + /* + * Program the voltage. Note that some instances of dw_mmc may use + * the UHS_REG for this. For other instances (like exynos) the UHS_REG + * does no harm but you need to set the regulator directly. Try both. + */ + uhs = mci_readl(host, UHS_REG); + if (ios->signal_voltage == MMC_SIGNAL_VOLTAGE_330) { + min_uv = 2700000; + max_uv = 3600000; + uhs &= ~v18; + } else { + min_uv = 1700000; + max_uv = 1950000; + uhs |= v18; + } + if (!IS_ERR(mmc->supply.vqmmc)) { + ret = regulator_set_voltage(mmc->supply.vqmmc, min_uv, max_uv); + + if (ret) { + dev_err(&mmc->class_dev, + "Regulator set error %d: %d - %d\n", + ret, min_uv, max_uv); + return ret; + } + } + mci_writel(host, UHS_REG, uhs); + + return 0; +} + static int dw_mci_get_ro(struct mmc_host *mmc) { int read_only; @@ -1158,6 +1258,9 @@ static const struct mmc_host_ops dw_mci_ops = { .get_cd = dw_mci_get_cd, .enable_sdio_irq = dw_mci_enable_sdio_irq, .execute_tuning = dw_mci_execute_tuning, + .card_busy = dw_mci_card_busy, + .start_signal_voltage_switch = dw_mci_switch_voltage, + }; static void dw_mci_request_end(struct dw_mci *host, struct mmc_request *mrq) @@ -1181,7 +1284,11 @@ static void dw_mci_request_end(struct dw_mci *host, struct mmc_request *mrq) dw_mci_start_request(host, slot); } else { dev_vdbg(host->dev, "list empty\n"); - host->state = STATE_IDLE; + + if (host->state == STATE_SENDING_CMD11) + host->state = STATE_WAITING_CMD11_DONE; + else + host->state = STATE_IDLE; } spin_unlock(&host->lock); @@ -1292,8 +1399,10 @@ static void dw_mci_tasklet_func(unsigned long priv) switch (state) { case STATE_IDLE: + case STATE_WAITING_CMD11_DONE: break; + case STATE_SENDING_CMD11: case STATE_SENDING_CMD: if (!test_and_clear_bit(EVENT_CMD_COMPLETE, &host->pending_events)) @@ -1894,6 +2003,14 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id) } if (pending) { + /* Check volt switch first, since it can look like an error */ + if ((host->state == STATE_SENDING_CMD11) && + (pending & SDMMC_INT_VOLT_SWITCH)) { + mci_writel(host, RINTSTS, SDMMC_INT_VOLT_SWITCH); + pending &= ~SDMMC_INT_VOLT_SWITCH; + dw_mci_cmd_interrupt(host, pending); + } + if (pending & DW_MCI_CMD_ERROR_FLAGS) { mci_writel(host, RINTSTS, DW_MCI_CMD_ERROR_FLAGS); host->cmd_status = pending; @@ -1999,7 +2116,9 @@ static void dw_mci_work_routine_card(struct work_struct *work) switch (host->state) { case STATE_IDLE: + case STATE_WAITING_CMD11_DONE: break; + case STATE_SENDING_CMD11: case STATE_SENDING_CMD: mrq->cmd->error = -ENOMEDIUM; if (!mrq->data) diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 08fd956d81f3..01b99e8a9190 100644 --- a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -99,6 +99,7 @@ #define SDMMC_INT_HLE BIT(12) #define SDMMC_INT_FRUN BIT(11) #define SDMMC_INT_HTO BIT(10) +#define SDMMC_INT_VOLT_SWITCH BIT(10) /* overloads bit 10! */ #define SDMMC_INT_DRTO BIT(9) #define SDMMC_INT_RTO BIT(8) #define SDMMC_INT_DCRC BIT(7) @@ -113,6 +114,7 @@ /* Command register defines */ #define SDMMC_CMD_START BIT(31) #define SDMMC_CMD_USE_HOLD_REG BIT(29) +#define SDMMC_CMD_VOLT_SWITCH BIT(28) #define SDMMC_CMD_CCS_EXP BIT(23) #define SDMMC_CMD_CEATA_RD BIT(22) #define SDMMC_CMD_UPD_CLK BIT(21) @@ -130,6 +132,7 @@ /* Status register defines */ #define SDMMC_GET_FCNT(x) (((x)>>17) & 0x1FFF) #define SDMMC_STATUS_DMA_REQ BIT(31) +#define SDMMC_STATUS_BUSY BIT(9) /* FIFOTH register defines */ #define SDMMC_SET_FIFOTH(m, r, t) (((m) & 0x7) << 28 | \ ((r) & 0xFFF) << 16 | \ @@ -150,7 +153,7 @@ #define SDMMC_GET_VERID(x) ((x) & 0xFFFF) /* Card read threshold */ #define SDMMC_SET_RD_THLD(v, x) (((v) & 0x1FFF) << 16 | (x)) - +#define SDMMC_UHS_18V BIT(0) /* All ctrl reset bits */ #define SDMMC_CTRL_ALL_RESET_FLAGS \ (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | SDMMC_CTRL_DMA_RESET) diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 84e2827d0f0b..001366927cf4 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -26,6 +26,8 @@ enum dw_mci_state { STATE_DATA_BUSY, STATE_SENDING_STOP, STATE_DATA_ERROR, + STATE_SENDING_CMD11, + STATE_WAITING_CMD11_DONE, }; enum { -- cgit v1.2.3 From e99783a45220a2c5f5a598e0e81213ecf2dbcf2f Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Sat, 30 Aug 2014 12:40:40 +0900 Subject: mmc: sdhci: handle busy-end interrupt during command It is fully legal for a controller to start handling busy-end interrupt before it has signaled that the command has completed. So make sure we do things in the proper order, Or it results that command interrupt is ignored so it can cause unexpected operations. This is founded at some toshiba emmc with the bellow warning. "mmc0: Got command interrupt 0x00000001 even though no command operation was in progress." This issue has been also reported by Youssef TRIKI: It is not specific to Toshiba devices, and happens with eMMC devices as well as SD card which support Auto-CMD12 rather than CMD23. Also, similar patch is submitted by: Gwendal Grignou Changes since v1: Fixed conflict with the next of git.linaro.org/people/ulf.hansson/mmc.git and Tested if issue is fixed again. Signed-off-by: Hankyung Yu Signed-off-by: Chanho Min Tested-by: Youssef TRIKI Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 17 +++++++++++++++-- include/linux/mmc/sdhci.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index b5e22b8c4885..335cdf3b5224 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1016,6 +1016,7 @@ void sdhci_send_command(struct sdhci_host *host, struct mmc_command *cmd) mod_timer(&host->timer, timeout); host->cmd = cmd; + host->busy_handle = 0; sdhci_prepare_data(host, cmd); @@ -2261,8 +2262,12 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask) if (host->cmd->data) DBG("Cannot wait for busy signal when also " "doing a data transfer"); - else if (!(host->quirks & SDHCI_QUIRK_NO_BUSY_IRQ)) + else if (!(host->quirks & SDHCI_QUIRK_NO_BUSY_IRQ) + && !host->busy_handle) { + /* Mark that command complete before busy is ended */ + host->busy_handle = 1; return; + } /* The controller does not support the end-of-busy IRQ, * fall through and take the SDHCI_INT_RESPONSE */ @@ -2330,7 +2335,15 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) return; } if (intmask & SDHCI_INT_DATA_END) { - sdhci_finish_command(host); + /* + * Some cards handle busy-end interrupt + * before the command completed, so make + * sure we do things in the proper order. + */ + if (host->busy_handle) + sdhci_finish_command(host); + else + host->busy_handle = 1; return; } } diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 09ebe57d5ce9..0aa85ca0c788 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -146,6 +146,7 @@ struct sdhci_host { struct mmc_command *cmd; /* Current command */ struct mmc_data *data; /* Current data request */ unsigned int data_early:1; /* Data finished before cmd */ + unsigned int busy_handle:1; /* Handling the order of Busy-end */ struct sg_mapping_iter sg_miter; /* SG state for PIO */ unsigned int blocks; /* remaining PIO blocks */ -- cgit v1.2.3 From 2e47e84245adcb1b3872210678b6146f674fb3ff Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 2 Sep 2014 19:08:53 -0700 Subject: mmc: Add .multi_io_quirk callback for multi I/O HW bug Historically, we have been using MMC_CAP* to handle host HW issues and currently the block layer uses MMC_CAP2_NO_MULTI_READ flag for a multi I/O HW bug workaround. There are a few tweaks needed to make MMC_CAP2_NO_MULTI_READ suite all situations. Therefore let's add an optional host ops callback to enable host drivers to return the number of blocks it allows per request. In a future patch and when host drivers have converted to the new callback, MMC_CAP2_NO_MULTI_READ shall be removed. Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/card/block.c | 10 ++++++++++ include/linux/mmc/host.h | 7 +++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index ede41f05c392..adab9038f6f7 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1402,6 +1402,16 @@ static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, if (card->host->caps2 & MMC_CAP2_NO_MULTI_READ && rq_data_dir(req) == READ) brq->data.blocks = 1; + + /* + * Some controllers have HW issues while operating + * in multiple I/O mode + */ + if (card->host->ops->multi_io_quirk) + brq->data.blocks = card->host->ops->multi_io_quirk(card, + (rq_data_dir(req) == READ) ? + MMC_DATA_READ : MMC_DATA_WRITE, + brq->data.blocks); } if (brq->data.blocks > 1 || do_rel_wr) { diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 4cbf61476999..10e2bd6985ae 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -139,6 +139,13 @@ struct mmc_host_ops { int (*select_drive_strength)(unsigned int max_dtr, int host_drv, int card_drv); void (*hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); + + /* + * Optional callback to support controllers with HW issues for multiple + * I/O. Returns the number of supported blocks for the request. + */ + int (*multi_io_quirk)(struct mmc_card *card, + unsigned int direction, int blk_size); }; struct mmc_card; -- cgit v1.2.3 From bbf0208d39121bd8873b032459cb2b5f35e14593 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 8 Sep 2014 23:45:25 -0700 Subject: mmc: use .multi_io_quirk on tmio_mmc Now, tmio_mmc can use .multi_io_quirk callback instead of MMC_CAP2_NO_MULTI_READ flags. let's use it. Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_pio.c | 13 +++++++++++++ include/linux/mfd/tmio.h | 3 +++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index ba454131f9a8..ff5ff0f725c9 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -970,12 +970,25 @@ static int tmio_mmc_get_ro(struct mmc_host *mmc) return ret; } +static int tmio_multi_io_quirk(struct mmc_card *card, + unsigned int direction, int blk_size) +{ + struct tmio_mmc_host *host = mmc_priv(card->host); + struct tmio_mmc_data *pdata = host->pdata; + + if (pdata->multi_io_quirk) + return pdata->multi_io_quirk(card, direction, blk_size); + + return blk_size; +} + static const struct mmc_host_ops tmio_mmc_ops = { .request = tmio_mmc_request, .set_ios = tmio_mmc_set_ios, .get_ro = tmio_mmc_get_ro, .get_cd = mmc_gpio_get_cd, .enable_sdio_irq = tmio_mmc_enable_sdio_irq, + .multi_io_quirk = tmio_multi_io_quirk, }; static int tmio_mmc_init_ocr(struct tmio_mmc_host *host) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 90436d523e5e..57388171610d 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -142,6 +143,8 @@ struct tmio_mmc_data { /* clock management callbacks */ int (*clk_enable)(struct platform_device *pdev, unsigned int *f); void (*clk_disable)(struct platform_device *pdev); + int (*multi_io_quirk)(struct mmc_card *card, + unsigned int direction, int blk_size); }; /* -- cgit v1.2.3 From 0abb71feb228ddbd17e0dfa13216541e036bb549 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 8 Sep 2014 23:46:49 -0700 Subject: mmc: remove MMC_CAP2_NO_MULTI_READ flags Now, mmc framework uses multi_io_quirk for I/O HW bug workaround. MMC_CAP2_NO_MULTI_READ flag is no longer needed Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/card/block.c | 5 ----- include/linux/mmc/host.h | 1 - 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index adab9038f6f7..413f984cb76d 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1398,11 +1398,6 @@ static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, if (disable_multi) brq->data.blocks = 1; - /* Some controllers can't do multiblock reads due to hw bugs */ - if (card->host->caps2 & MMC_CAP2_NO_MULTI_READ && - rq_data_dir(req) == READ) - brq->data.blocks = 1; - /* * Some controllers have HW issues while operating * in multiple I/O mode diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 10e2bd6985ae..797ae657dc3d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -272,7 +272,6 @@ struct mmc_host { #define MMC_CAP2_BOOTPART_NOACC (1 << 0) /* Boot partition no access */ #define MMC_CAP2_FULL_PWR_CYCLE (1 << 2) /* Can do full power cycle */ -#define MMC_CAP2_NO_MULTI_READ (1 << 3) /* Multiblock reads don't work */ #define MMC_CAP2_HS200_1_8V_SDR (1 << 5) /* can support */ #define MMC_CAP2_HS200_1_2V_SDR (1 << 6) /* can support */ #define MMC_CAP2_HS200 (MMC_CAP2_HS200_1_8V_SDR | \ -- cgit v1.2.3 From 9d2fa2428ae149ba3a5b7a4ceb0a9e11f1882b3b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 27 Aug 2014 13:00:51 +0200 Subject: mmc: slot-gpio: add gpiod variant to get wp GPIO This makes it possible to get the write protect (read only) GPIO line from a GPIO descriptor. Written to exactly mirror the card detect function. Acked-by: Alexandre Courbot Signed-off-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/slot-gpio.c | 48 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mmc/slot-gpio.h | 3 +++ 2 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 908c2b29e79f..e3fce4493fab 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -325,6 +325,54 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, } EXPORT_SYMBOL(mmc_gpiod_request_cd); +/** + * mmc_gpiod_request_ro - request a gpio descriptor for write protection + * @host: mmc host + * @con_id: function within the GPIO consumer + * @idx: index of the GPIO to obtain in the consumer + * @override_active_level: ignore %GPIO_ACTIVE_LOW flag + * @debounce: debounce time in microseconds + * + * Use this function in place of mmc_gpio_request_ro() to use the GPIO + * descriptor API. Note that it is paired with mmc_gpiod_free_ro() not + * mmc_gpio_free_ro(). + * + * Returns zero on success, else an error. + */ +int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, + unsigned int idx, bool override_active_level, + unsigned int debounce) +{ + struct mmc_gpio *ctx; + struct gpio_desc *desc; + int ret; + + ret = mmc_gpio_alloc(host); + if (ret < 0) + return ret; + + ctx = host->slot.handler_priv; + + if (!con_id) + con_id = ctx->ro_label; + + desc = devm_gpiod_get_index(host->parent, con_id, idx, GPIOD_IN); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + if (debounce) { + ret = gpiod_set_debounce(desc, debounce); + if (ret < 0) + return ret; + } + + ctx->override_ro_active_level = override_active_level; + ctx->ro_gpio = desc; + + return 0; +} +EXPORT_SYMBOL(mmc_gpiod_request_ro); + /** * mmc_gpiod_free_cd - free the card-detection gpio descriptor * @host: mmc host diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index d2433381e828..a0d0442c15bf 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -25,6 +25,9 @@ void mmc_gpio_free_cd(struct mmc_host *host); int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce); +int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, + unsigned int idx, bool override_active_level, + unsigned int debounce); void mmc_gpiod_free_cd(struct mmc_host *host); void mmc_gpiod_request_cd_irq(struct mmc_host *host); -- cgit v1.2.3 From 3034a146820c26fe6da66a45f6340fe87fe0983a Mon Sep 17 00:00:00 2001 From: Dmitry Kasatkin Date: Fri, 27 Jun 2014 18:15:44 +0300 Subject: ima: pass 'opened' flag to identify newly created files Empty files and missing xattrs do not guarantee that a file was just created. This patch passes FILE_CREATED flag to IMA to reliably identify new files. Signed-off-by: Dmitry Kasatkin Signed-off-by: Mimi Zohar Cc: 3.14+ --- fs/namei.c | 2 +- fs/nfsd/vfs.c | 2 +- include/linux/ima.h | 4 ++-- security/integrity/ima/ima.h | 4 ++-- security/integrity/ima/ima_appraise.c | 4 ++-- security/integrity/ima/ima_main.c | 16 ++++++++-------- 6 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 985c6f368485..005771f97189 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3058,7 +3058,7 @@ opened: error = open_check_o_direct(file); if (error) goto exit_fput; - error = ima_file_check(file, op->acc_mode); + error = ima_file_check(file, op->acc_mode, *opened); if (error) goto exit_fput; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 140c496f612c..d49c778faecb 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -709,7 +709,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, host_err = PTR_ERR(*filp); *filp = NULL; } else { - host_err = ima_file_check(*filp, may_flags); + host_err = ima_file_check(*filp, may_flags, 0); if (may_flags & NFSD_MAY_64BIT_COOKIE) (*filp)->f_mode |= FMODE_64BITHASH; diff --git a/include/linux/ima.h b/include/linux/ima.h index 7cf5e9b32550..120ccc53fcb7 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -15,7 +15,7 @@ struct linux_binprm; #ifdef CONFIG_IMA extern int ima_bprm_check(struct linux_binprm *bprm); -extern int ima_file_check(struct file *file, int mask); +extern int ima_file_check(struct file *file, int mask, int opened); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); extern int ima_module_check(struct file *file); @@ -27,7 +27,7 @@ static inline int ima_bprm_check(struct linux_binprm *bprm) return 0; } -static inline int ima_file_check(struct file *file, int mask) +static inline int ima_file_check(struct file *file, int mask, int opened) { return 0; } diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 57da4bd7ba0c..0fb456c20eda 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -177,7 +177,7 @@ void ima_delete_rules(void); int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len); + int xattr_len, int opened); int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, @@ -193,7 +193,7 @@ static inline int ima_appraise_measurement(int func, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len) + int xattr_len, int opened) { return INTEGRITY_UNKNOWN; } diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index a4605d677248..225fd944a4ef 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -183,7 +183,7 @@ int ima_read_xattr(struct dentry *dentry, int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, - int xattr_len) + int xattr_len, int opened) { static const char op[] = "appraise_data"; char *cause = "unknown"; @@ -203,7 +203,7 @@ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, cause = "missing-hash"; status = INTEGRITY_NOLABEL; - if (inode->i_size == 0) { + if (opened & FILE_CREATED) { iint->flags |= IMA_NEW_FILE; status = INTEGRITY_PASS; } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 0a2298f90c9c..f82cf9b8e92b 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -157,7 +157,7 @@ void ima_file_free(struct file *file) } static int process_measurement(struct file *file, const char *filename, - int mask, int function) + int mask, int function, int opened) { struct inode *inode = file_inode(file); struct integrity_iint_cache *iint; @@ -226,7 +226,7 @@ static int process_measurement(struct file *file, const char *filename, xattr_value, xattr_len); if (action & IMA_APPRAISE_SUBMASK) rc = ima_appraise_measurement(_func, iint, file, pathname, - xattr_value, xattr_len); + xattr_value, xattr_len, opened); if (action & IMA_AUDIT) ima_audit_measurement(iint, pathname); kfree(pathbuf); @@ -255,7 +255,7 @@ out: int ima_file_mmap(struct file *file, unsigned long prot) { if (file && (prot & PROT_EXEC)) - return process_measurement(file, NULL, MAY_EXEC, MMAP_CHECK); + return process_measurement(file, NULL, MAY_EXEC, MMAP_CHECK, 0); return 0; } @@ -277,7 +277,7 @@ int ima_bprm_check(struct linux_binprm *bprm) return process_measurement(bprm->file, (strcmp(bprm->filename, bprm->interp) == 0) ? bprm->filename : bprm->interp, - MAY_EXEC, BPRM_CHECK); + MAY_EXEC, BPRM_CHECK, 0); } /** @@ -290,12 +290,12 @@ int ima_bprm_check(struct linux_binprm *bprm) * On success return 0. On integrity appraisal error, assuming the file * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. */ -int ima_file_check(struct file *file, int mask) +int ima_file_check(struct file *file, int mask, int opened) { ima_rdwr_violation_check(file); return process_measurement(file, NULL, mask & (MAY_READ | MAY_WRITE | MAY_EXEC), - FILE_CHECK); + FILE_CHECK, opened); } EXPORT_SYMBOL_GPL(ima_file_check); @@ -318,7 +318,7 @@ int ima_module_check(struct file *file) #endif return 0; /* We rely on module signature checking */ } - return process_measurement(file, NULL, MAY_EXEC, MODULE_CHECK); + return process_measurement(file, NULL, MAY_EXEC, MODULE_CHECK, 0); } int ima_fw_from_file(struct file *file, char *buf, size_t size) @@ -329,7 +329,7 @@ int ima_fw_from_file(struct file *file, char *buf, size_t size) return -EACCES; /* INTEGRITY_UNKNOWN */ return 0; } - return process_measurement(file, NULL, MAY_EXEC, FIRMWARE_CHECK); + return process_measurement(file, NULL, MAY_EXEC, FIRMWARE_CHECK, 0); } static int __init init_ima(void) -- cgit v1.2.3 From d0449b90f80f263e17e8b3ce31442e45121dc46c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 22 Aug 2014 10:18:42 -0400 Subject: locks: Remove unused conf argument from lm_grant This argument is always NULL so don't pass it around. [jlayton: remove dependencies on previous patches in series] Signed-off-by: Joe Perches Signed-off-by: Jeff Layton --- fs/dlm/plock.c | 8 ++++---- fs/lockd/svclock.c | 12 +++--------- include/linux/fs.h | 2 +- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index f704458ea5f5..e0ab3a93eeff 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -30,7 +30,7 @@ struct plock_op { struct plock_xop { struct plock_op xop; - void *callback; + int (*callback)(struct file_lock *fl, int result); void *fl; void *file; struct file_lock flc; @@ -190,7 +190,7 @@ static int dlm_plock_callback(struct plock_op *op) struct file *file; struct file_lock *fl; struct file_lock *flc; - int (*notify)(void *, void *, int) = NULL; + int (*notify)(struct file_lock *fl, int result) = NULL; struct plock_xop *xop = (struct plock_xop *)op; int rv = 0; @@ -209,7 +209,7 @@ static int dlm_plock_callback(struct plock_op *op) notify = xop->callback; if (op->info.rv) { - notify(fl, NULL, op->info.rv); + notify(fl, op->info.rv); goto out; } @@ -228,7 +228,7 @@ static int dlm_plock_callback(struct plock_op *op) (unsigned long long)op->info.number, file, fl); } - rv = notify(fl, NULL, 0); + rv = notify(fl, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ log_print("dlm_plock_callback: lock granted after lock request " diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index ab798a88ec1d..2a6170133c1d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -667,22 +667,16 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l * deferred rpc for GETLK and SETLK. */ static void -nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf, - int result) +nlmsvc_update_deferred_block(struct nlm_block *block, int result) { block->b_flags |= B_GOT_CALLBACK; if (result == 0) block->b_granted = 1; else block->b_flags |= B_TIMED_OUT; - if (conf) { - if (block->b_fl) - __locks_copy_lock(block->b_fl, conf); - } } -static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, - int result) +static int nlmsvc_grant_deferred(struct file_lock *fl, int result) { struct nlm_block *block; int rc = -ENOENT; @@ -697,7 +691,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, rc = -ENOLCK; break; } - nlmsvc_update_deferred_block(block, conf, result); + nlmsvc_update_deferred_block(block, result); } else if (result == 0) block->b_granted = 1; diff --git a/include/linux/fs.h b/include/linux/fs.h index 94187721ad41..908af4f81680 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -869,7 +869,7 @@ struct lock_manager_operations { int (*lm_compare_owner)(struct file_lock *, struct file_lock *); unsigned long (*lm_owner_key)(struct file_lock *); void (*lm_notify)(struct file_lock *); /* unblock callback */ - int (*lm_grant)(struct file_lock *, struct file_lock *, int); + int (*lm_grant)(struct file_lock *, int); void (*lm_break)(struct file_lock *); int (*lm_change)(struct file_lock **, int); }; -- cgit v1.2.3 From 3fe0fff18fe87c6a2179837de68d1174903c6367 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 22 Aug 2014 10:18:42 -0400 Subject: locks: Rename __locks_copy_lock() to locks_copy_conflock() Jeff advice, " Right now __locks_copy_lock is only used to copy conflocks. It would be good to rename that to something more distinct (i.e.locks_copy_conflock), to make it clear that we're generating a conflock there." v5: change order from 3/6 to 2/6 v4: new patch only renaming function name Signed-off-by: Kinglong Mee Signed-off-by: Jeff Layton --- fs/locks.c | 10 +++++----- include/linux/fs.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index bb08857f90b5..ec9becd02d3d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -281,7 +281,7 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl) /* * Initialize a new lock from an existing file_lock structure. */ -void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) +void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { new->fl_owner = fl->fl_owner; new->fl_pid = fl->fl_pid; @@ -293,14 +293,14 @@ void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) new->fl_ops = NULL; new->fl_lmops = NULL; } -EXPORT_SYMBOL(__locks_copy_lock); +EXPORT_SYMBOL(locks_copy_conflock); void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { /* "new" must be a freshly-initialized lock */ WARN_ON_ONCE(new->fl_ops); - __locks_copy_lock(new, fl); + locks_copy_conflock(new, fl); new->fl_file = fl->fl_file; new->fl_ops = fl->fl_ops; new->fl_lmops = fl->fl_lmops; @@ -735,7 +735,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) break; } if (cfl) { - __locks_copy_lock(fl, cfl); + locks_copy_conflock(fl, cfl); if (cfl->fl_nspid) fl->fl_pid = pid_vnr(cfl->fl_nspid); } else @@ -941,7 +941,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (!posix_locks_conflict(request, fl)) continue; if (conflock) - __locks_copy_lock(conflock, fl); + locks_copy_conflock(conflock, fl); error = -EAGAIN; if (!(request->fl_flags & FL_SLEEP)) goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 908af4f81680..5ab86f44b697 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -966,7 +966,7 @@ void locks_free_lock(struct file_lock *fl); extern void locks_init_lock(struct file_lock *); extern struct file_lock * locks_alloc_lock(void); extern void locks_copy_lock(struct file_lock *, struct file_lock *); -extern void __locks_copy_lock(struct file_lock *, const struct file_lock *); +extern void locks_copy_conflock(struct file_lock *, struct file_lock *); extern void locks_remove_posix(struct file *, fl_owner_t); extern void locks_remove_file(struct file *); extern void locks_release_private(struct file_lock *); @@ -1026,7 +1026,7 @@ static inline void locks_init_lock(struct file_lock *fl) return; } -static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl) +static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } -- cgit v1.2.3 From 5c97d7b1479982a48cf2129062b880c2555049ac Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 22 Aug 2014 10:18:43 -0400 Subject: locks: New ops in lock_manager_operations for get/put owner NFSD or other lockmanager may increase the owner's reference, so adds two new options for copying and releasing owner. v5: change order from 2/6 to 3/6 v4: rename lm_copy_owner/lm_release_owner to lm_get_owner/lm_put_owner Reviewed-by: Jeff Layton Signed-off-by: Kinglong Mee Signed-off-by: Jeff Layton --- fs/locks.c | 12 ++++++++++-- include/linux/fs.h | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index ec9becd02d3d..5e83f3a99377 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -230,8 +230,12 @@ void locks_release_private(struct file_lock *fl) fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; } - fl->fl_lmops = NULL; + if (fl->fl_lmops) { + if (fl->fl_lmops->lm_put_owner) + fl->fl_lmops->lm_put_owner(fl); + fl->fl_lmops = NULL; + } } EXPORT_SYMBOL_GPL(locks_release_private); @@ -274,8 +278,12 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl) fl->fl_ops->fl_copy_lock(new, fl); new->fl_ops = fl->fl_ops; } - if (fl->fl_lmops) + + if (fl->fl_lmops) { + if (fl->fl_lmops->lm_get_owner) + fl->fl_lmops->lm_get_owner(new, fl); new->fl_lmops = fl->fl_lmops; + } } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ab86f44b697..3b07ce2698de 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -868,6 +868,8 @@ struct file_lock_operations { struct lock_manager_operations { int (*lm_compare_owner)(struct file_lock *, struct file_lock *); unsigned long (*lm_owner_key)(struct file_lock *); + void (*lm_get_owner)(struct file_lock *, struct file_lock *); + void (*lm_put_owner)(struct file_lock *); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); void (*lm_break)(struct file_lock *); -- cgit v1.2.3 From 09802fd2a8caea2a2147fca8d7975697c5de573d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2014 10:18:44 -0400 Subject: lockd: rip out deferred lock handling from testlock codepath As Kinglong points out, the nlm_block->b_fl field is no longer used at all. Also, vfs_test_lock in the generic locking code will only return FILE_LOCK_DEFERRED if FL_SLEEP is set, and it isn't here. The only other place that returns that value is the DLM lock code, but it only does that in dlm_posix_lock, never in dlm_posix_get. Remove all of the deferred locking code from the testlock codepath since it doesn't appear to ever be used anyway. I do have a small concern that this might cause a behavior change in the case where you have a block already sitting on the list when the testlock request comes in, but that looks like it doesn't really work properly anyway. I think it's best to just pass that down to vfs_test_lock and let the filesystem report that instead of trying to infer what's going on with the lock by looking at an existing block. Cc: cluster-devel@redhat.com Signed-off-by: Jeff Layton Reviewed-by: Kinglong Mee --- fs/lockd/svclock.c | 55 +++++---------------------------------------- include/linux/lockd/lockd.h | 1 - 2 files changed, 6 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index acfa94d5b489..13db95f54176 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -245,7 +245,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, block->b_daemon = rqstp->rq_server; block->b_host = host; block->b_file = file; - block->b_fl = NULL; file->f_count++; /* Add to file's list of blocks */ @@ -295,7 +294,6 @@ static void nlmsvc_free_block(struct kref *kref) nlmsvc_freegrantargs(block->b_call); nlmsvc_release_call(block->b_call); nlm_release_file(block->b_file); - kfree(block->b_fl); kfree(block); } @@ -508,7 +506,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, struct nlm_lock *conflock, struct nlm_cookie *cookie) { - struct nlm_block *block = NULL; int error; __be32 ret; @@ -519,63 +516,26 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); - /* Get existing block (in case client is busy-waiting) */ - block = nlmsvc_lookup_block(file, lock); - - if (block == NULL) { - struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - - if (conf == NULL) - return nlm_granted; - block = nlmsvc_create_block(rqstp, host, file, lock, cookie); - if (block == NULL) { - kfree(conf); - return nlm_granted; - } - block->b_fl = conf; - } - if (block->b_flags & B_QUEUED) { - dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n", - block, block->b_flags, block->b_fl); - if (block->b_flags & B_TIMED_OUT) { - nlmsvc_unlink_block(block); - ret = nlm_lck_denied; - goto out; - } - if (block->b_flags & B_GOT_CALLBACK) { - nlmsvc_unlink_block(block); - if (block->b_fl != NULL - && block->b_fl->fl_type != F_UNLCK) { - lock->fl = *block->b_fl; - goto conf_lock; - } else { - ret = nlm_granted; - goto out; - } - } - ret = nlm_drop_reply; - goto out; - } - if (locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; } + error = vfs_test_lock(file->f_file, &lock->fl); - if (error == FILE_LOCK_DEFERRED) { - ret = nlmsvc_defer_lock_rqst(rqstp, block); - goto out; - } if (error) { + /* We can't currently deal with deferred test requests */ + if (error == FILE_LOCK_DEFERRED) + WARN_ON_ONCE(1); + ret = nlm_lck_denied_nolocks; goto out; } + if (lock->fl.fl_type == F_UNLCK) { ret = nlm_granted; goto out; } -conf_lock: dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -589,8 +549,6 @@ conf_lock: locks_release_private(&lock->fl); ret = nlm_lck_denied; out: - if (block) - nlmsvc_release_block(block); return ret; } @@ -661,7 +619,6 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l * This is a callback from the filesystem for VFS file lock requests. * It will be used if lm_grant is defined and the filesystem can not * respond to the request immediately. - * For GETLK request it will copy the reply to the nlm_block. * For SETLK or SETLKW request it will get the local posix lock. * In all cases it will move the block to the head of nlm_blocked q where * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 219d79627c05..ff82a32871b5 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -178,7 +178,6 @@ struct nlm_block { unsigned char b_granted; /* VFS granted lock */ struct nlm_file * b_file; /* file in question */ struct cache_req * b_cache_req; /* deferred request handling */ - struct file_lock * b_fl; /* set for GETLK */ struct cache_deferred_req * b_deferred_req; unsigned int b_flags; /* block flags */ #define B_QUEUED 1 /* lock queued */ -- cgit v1.2.3 From 699688a416524c3cea9eafaca69fc6c06c13c02e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2014 10:18:44 -0400 Subject: locks: remove lock_may_read and lock_may_write There are no callers of these functions. Signed-off-by: Jeff Layton --- fs/locks.c | 80 ------------------------------------------------------ include/linux/fs.h | 14 ---------- 2 files changed, 94 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index c3991dc80137..5200ffd2ba9b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2597,86 +2597,6 @@ static int __init proc_locks_init(void) module_init(proc_locks_init); #endif -/** - * lock_may_read - checks that the region is free of locks - * @inode: the inode that is being read - * @start: the first byte to read - * @len: the number of bytes to read - * - * Emulates Windows locking requirements. Whole-file - * mandatory locks (share modes) can prohibit a read and - * byte-range POSIX locks can prohibit a read if they overlap. - * - * N.B. this function is only ever called - * from knfsd and ownership of locks is never checked. - */ -int lock_may_read(struct inode *inode, loff_t start, unsigned long len) -{ - struct file_lock *fl; - int result = 1; - - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (IS_POSIX(fl)) { - if (fl->fl_type == F_RDLCK) - continue; - if ((fl->fl_end < start) || (fl->fl_start > (start + len))) - continue; - } else if (IS_FLOCK(fl)) { - if (!(fl->fl_type & LOCK_MAND)) - continue; - if (fl->fl_type & LOCK_READ) - continue; - } else - continue; - result = 0; - break; - } - spin_unlock(&inode->i_lock); - return result; -} - -EXPORT_SYMBOL(lock_may_read); - -/** - * lock_may_write - checks that the region is free of locks - * @inode: the inode that is being written - * @start: the first byte to write - * @len: the number of bytes to write - * - * Emulates Windows locking requirements. Whole-file - * mandatory locks (share modes) can prohibit a write and - * byte-range POSIX locks can prohibit a write if they overlap. - * - * N.B. this function is only ever called - * from knfsd and ownership of locks is never checked. - */ -int lock_may_write(struct inode *inode, loff_t start, unsigned long len) -{ - struct file_lock *fl; - int result = 1; - - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (IS_POSIX(fl)) { - if ((fl->fl_end < start) || (fl->fl_start > (start + len))) - continue; - } else if (IS_FLOCK(fl)) { - if (!(fl->fl_type & LOCK_MAND)) - continue; - if (fl->fl_type & LOCK_WRITE) - continue; - } else - continue; - result = 0; - break; - } - spin_unlock(&inode->i_lock); - return result; -} - -EXPORT_SYMBOL(lock_may_write); - static int __init filelock_init(void) { int i; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3b07ce2698de..458f733c96bd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -985,8 +985,6 @@ extern void lease_get_mtime(struct inode *, struct timespec *time); extern int generic_setlease(struct file *, long, struct file_lock **); extern int vfs_setlease(struct file *, long, struct file_lock **); extern int lease_modify(struct file_lock **, int); -extern int lock_may_read(struct inode *, loff_t start, unsigned long count); -extern int lock_may_write(struct inode *, loff_t start, unsigned long count); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1117,18 +1115,6 @@ static inline int lease_modify(struct file_lock **before, int arg) { return -EINVAL; } - -static inline int lock_may_read(struct inode *inode, loff_t start, - unsigned long len) -{ - return 1; -} - -static inline int lock_may_write(struct inode *inode, loff_t start, - unsigned long len) -{ - return 1; -} #endif /* !CONFIG_FILE_LOCKING */ -- cgit v1.2.3 From 1c994a0909a556508c2cc26ab5d9e13c5ce33aa0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 Aug 2014 06:49:41 -0400 Subject: locks: consolidate "nolease" routines GFS2 and NFS have setlease routines that always just return -EINVAL. Turn that into a generic routine that can live in fs/libfs.c. Cc: Cc: Steven Whitehouse Cc: Signed-off-by: Jeff Layton Acked-by: Trond Myklebust Reviewed-by: Christoph Hellwig --- fs/gfs2/file.c | 22 +--------------------- fs/libfs.c | 16 ++++++++++++++++ fs/nfs/file.c | 13 +------------ fs/nfs/internal.h | 1 - fs/nfs/nfs4file.c | 2 +- include/linux/fs.h | 1 + 6 files changed, 20 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 26b3f952e6b1..2c02478a86b0 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -912,26 +912,6 @@ out_uninit: #ifdef CONFIG_GFS2_FS_LOCKING_DLM -/** - * gfs2_setlease - acquire/release a file lease - * @file: the file pointer - * @arg: lease type - * @fl: file lock - * - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - * - * Locking: called under i_lock - * - * Returns: errno - */ - -static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) -{ - return -EINVAL; -} - /** * gfs2_lock - acquire/release a posix lock on a file * @file: the file pointer @@ -1069,7 +1049,7 @@ const struct file_operations gfs2_file_fops = { .flock = gfs2_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .setlease = gfs2_setlease, + .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, }; diff --git a/fs/libfs.c b/fs/libfs.c index 88e3e00e2eca..29012a303ef8 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1075,3 +1075,19 @@ struct inode *alloc_anon_inode(struct super_block *s) return inode; } EXPORT_SYMBOL(alloc_anon_inode); + +/** + * simple_nosetlease - generic helper for prohibiting leases + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: new lease supplied for insertion + * + * Generic helper for filesystems that do not wish to allow leases to be set. + * All arguments are ignored and it just returns -EINVAL. + */ +int +simple_nosetlease(struct file *filp, long arg, struct file_lock **flp) +{ + return -EINVAL; +} +EXPORT_SYMBOL(simple_nosetlease); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 524dd80d1898..8c4048ecdad1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -891,17 +891,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) } EXPORT_SYMBOL_GPL(nfs_flock); -/* - * There is no protocol support for leases, so we have no way to implement - * them correctly in the face of opens by other clients. - */ -int nfs_setlease(struct file *file, long arg, struct file_lock **fl) -{ - dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg); - return -EINVAL; -} -EXPORT_SYMBOL_GPL(nfs_setlease); - const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read = new_sync_read, @@ -918,6 +907,6 @@ const struct file_operations nfs_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = nfs_setlease, + .setlease = simple_nosetlease, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9056622d2230..94d922ebb5ac 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -346,7 +346,6 @@ int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); -int nfs_setlease(struct file *, long, struct file_lock **); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a816f0627a6c..3e987ad9ae25 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -131,5 +131,5 @@ const struct file_operations nfs4_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = nfs_setlease, + .setlease = simple_nosetlease, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 458f733c96bd..435e3d9ec5cf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2599,6 +2599,7 @@ extern int simple_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata); extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); +extern int simple_nosetlease(struct file *, long, struct file_lock **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); -- cgit v1.2.3 From e0b93eddfe17dcb7d644eb5d6ad02a86fc41a977 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2014 11:27:32 -0400 Subject: security: make security_file_set_fowner, f_setown and __f_setown void return security_file_set_fowner always returns 0, so make it f_setown and __f_setown void return functions and fix up the error handling in the callers. Cc: linux-security-module@vger.kernel.org Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig --- drivers/net/tun.c | 4 +--- drivers/tty/tty_io.c | 3 ++- fs/fcntl.c | 21 +++++++-------------- fs/locks.c | 2 +- fs/notify/dnotify/dnotify.c | 8 +------- include/linux/fs.h | 4 ++-- include/linux/security.h | 8 ++++---- net/socket.c | 3 ++- security/capability.c | 4 ++-- security/security.c | 4 ++-- security/selinux/hooks.c | 4 +--- security/smack/smack_lsm.c | 3 +-- 12 files changed, 26 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index acaaf6784179..186ce541c657 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2152,9 +2152,7 @@ static int tun_chr_fasync(int fd, struct file *file, int on) goto out; if (on) { - ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0); - if (ret) - goto out; + __f_setown(file, task_pid(current), PIDTYPE_PID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 8fbad3410c75..aea3b66f7bf2 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2163,8 +2163,9 @@ static int __tty_fasync(int fd, struct file *filp, int on) } get_pid(pid); spin_unlock_irqrestore(&tty->ctrl_lock, flags); - retval = __f_setown(filp, pid, type, 0); + __f_setown(filp, pid, type, 0); put_pid(pid); + retval = 0; } out: return retval; diff --git a/fs/fcntl.c b/fs/fcntl.c index 22d1c3df61ac..99d440a4a6ba 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -98,26 +98,19 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, write_unlock_irq(&filp->f_owner.lock); } -int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, +void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - int err; - - err = security_file_set_fowner(filp); - if (err) - return err; - + security_file_set_fowner(filp); f_modown(filp, pid, type, force); - return 0; } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +void f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; struct pid *pid; int who = arg; - int result; type = PIDTYPE_PID; if (who < 0) { type = PIDTYPE_PGID; @@ -125,9 +118,8 @@ int f_setown(struct file *filp, unsigned long arg, int force) } rcu_read_lock(); pid = find_vpid(who); - result = __f_setown(filp, pid, type, force); + __f_setown(filp, pid, type, force); rcu_read_unlock(); - return result; } EXPORT_SYMBOL(f_setown); @@ -181,7 +173,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) if (owner.pid && !pid) ret = -ESRCH; else - ret = __f_setown(filp, pid, type, 1); + __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; @@ -302,7 +294,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + f_setown(filp, arg, 1); + err = 0; break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); diff --git a/fs/locks.c b/fs/locks.c index 5200ffd2ba9b..f5f648e003dd 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1776,7 +1776,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) new = NULL; - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); out_unlock: spin_unlock(&inode->i_lock); if (fl) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index abc8cbcfe90e..caaaf9dfe353 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -346,13 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out; } - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - if (error) { - /* if we added, we must shoot */ - if (dn_mark == new_dn_mark) - destroy = 1; - goto out; - } + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 435e3d9ec5cf..96528f73dda4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1139,8 +1139,8 @@ extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); -extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); -extern int f_setown(struct file *filp, unsigned long arg, int force); +extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); +extern void f_setown(struct file *filp, unsigned long arg, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); diff --git a/include/linux/security.h b/include/linux/security.h index 623f90e5f38d..b10e7af95d3b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1559,7 +1559,7 @@ struct security_operations { int (*file_lock) (struct file *file, unsigned int cmd); int (*file_fcntl) (struct file *file, unsigned int cmd, unsigned long arg); - int (*file_set_fowner) (struct file *file); + void (*file_set_fowner) (struct file *file); int (*file_send_sigiotask) (struct task_struct *tsk, struct fown_struct *fown, int sig); int (*file_receive) (struct file *file); @@ -1834,7 +1834,7 @@ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot); int security_file_lock(struct file *file, unsigned int cmd); int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg); -int security_file_set_fowner(struct file *file); +void security_file_set_fowner(struct file *file); int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig); int security_file_receive(struct file *file); @@ -2312,9 +2312,9 @@ static inline int security_file_fcntl(struct file *file, unsigned int cmd, return 0; } -static inline int security_file_set_fowner(struct file *file) +static inline void security_file_set_fowner(struct file *file) { - return 0; + return; } static inline int security_file_send_sigiotask(struct task_struct *tsk, diff --git a/net/socket.c b/net/socket.c index 95ee7d8682e7..769c9671847e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1069,7 +1069,8 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; - err = f_setown(sock->file, pid, 1); + f_setown(sock->file, pid, 1); + err = 0; break; case FIOGETOWN: case SIOCGPGRP: diff --git a/security/capability.c b/security/capability.c index a74fde6a7468..d68c57a62bcf 100644 --- a/security/capability.c +++ b/security/capability.c @@ -343,9 +343,9 @@ static int cap_file_fcntl(struct file *file, unsigned int cmd, return 0; } -static int cap_file_set_fowner(struct file *file) +static void cap_file_set_fowner(struct file *file) { - return 0; + return; } static int cap_file_send_sigiotask(struct task_struct *tsk, diff --git a/security/security.c b/security/security.c index e41b1a8d7644..18b35c63fc0c 100644 --- a/security/security.c +++ b/security/security.c @@ -775,9 +775,9 @@ int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) return security_ops->file_fcntl(file, cmd, arg); } -int security_file_set_fowner(struct file *file) +void security_file_set_fowner(struct file *file) { - return security_ops->file_set_fowner(file); + security_ops->file_set_fowner(file); } int security_file_send_sigiotask(struct task_struct *tsk, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b0e940497e23..ada0d0bf3463 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3346,14 +3346,12 @@ static int selinux_file_fcntl(struct file *file, unsigned int cmd, return err; } -static int selinux_file_set_fowner(struct file *file) +static void selinux_file_set_fowner(struct file *file) { struct file_security_struct *fsec; fsec = file->f_security; fsec->fown_sid = current_sid(); - - return 0; } static int selinux_file_send_sigiotask(struct task_struct *tsk, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e6ab307ce86e..69e5635d89e5 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1390,12 +1390,11 @@ static int smack_mmap_file(struct file *file, * Returns 0 * Further research may be required on this one. */ -static int smack_file_set_fowner(struct file *file) +static void smack_file_set_fowner(struct file *file) { struct smack_known *skp = smk_of_current(); file->f_security = skp->smk_known; - return 0; } /** -- cgit v1.2.3 From 87354059881ce9315181604dc17076c535f4d744 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Jul 2014 20:41:42 -0400 Subject: ftrace: Add helper function ftrace_ops_get_func() Add the helper function to what the mcount trampoline is to call for a ftrace_ops function. This helper will be used by arch code in the future to set up dynamic trampolines. But as this does the same tests that are performed in choosing what function to call for the default mcount trampoline, might as well use it to clean up the existing code. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 47 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0b0edbf55a9..ef37286547fc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,6 +56,8 @@ struct ftrace_ops; typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); + /* * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are * set in the flags member. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 17b606362ab4..dabf734f909c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -259,20 +259,12 @@ static void update_ftrace_function(void) * then have the mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && - !FTRACE_FORCE_LIST_FUNC)) { + (ftrace_ops_list->next == &ftrace_list_end)) { + /* Set the ftrace_ops that the arch callback uses */ set_function_trace_op = ftrace_ops_list; - /* - * If the func handles its own recursion, call it directly. - * Otherwise call the recursion protected function that - * will call the ftrace ops function. - */ - if (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) - func = ftrace_ops_list->func; - else - func = ftrace_ops_recurs_func; + + func = ftrace_ops_get_func(ftrace_ops_list); } else { /* Just use the default ftrace_ops */ set_function_trace_op = &ftrace_list_end; @@ -4856,6 +4848,37 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, trace_clear_recursion(bit); } +/** + * ftrace_ops_get_func - get the function a trampoline should call + * @ops: the ops to get the function for + * + * Normally the mcount trampoline will call the ops->func, but there + * are times that it should not. For example, if the ops does not + * have its own recursion protection, then it should call the + * ftrace_ops_recurs_func() instead. + * + * Returns the function that the trampoline should call for @ops. + */ +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + /* + * If the func handles its own recursion, call it directly. + * Otherwise call the recursion protected function that + * will call the ftrace ops function. + */ + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) + return ftrace_ops_recurs_func; + + return ops->func; +} + static void clear_ftrace_swapper(void) { struct task_struct *p; -- cgit v1.2.3 From e1effa0144a1ddf5b456c388ffaf784f3c5163fd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Aug 2014 17:19:38 -0400 Subject: ftrace: Annotate the ops operation on update Add three new flags for ftrace_ops: FTRACE_OPS_FL_ADDING FTRACE_OPS_FL_REMOVING FTRACE_OPS_FL_MODIFYING These will be set for the ftrace_ops when they are first added to the function tracing, being removed from function tracing or just having their functions changed from function tracing, respectively. This will be needed to remove the tramp_hash, which can grow quite big. The tramp_hash is used to note what functions a ftrace_ops is using a trampoline for. Denoting which ftrace_ops is being modified, will allow us to use the ftrace_ops hashes themselves, which are much smaller as they have a global flag to denote if a ftrace_ops is tracing all functions, as well as a notrace hash if the ftrace_ops is tracing all but a few. The tramp_hash just creates a hash item for every function, which can go into the 10s of thousands if all functions are using the ftrace_ops trampoline. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 45 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ef37286547fc..d9216f6385d9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -91,6 +91,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * INITIALIZED - The ftrace_ops has already been initialized (first use time * register_ftrace_function() is called, it will initialized the ops) * DELETED - The ops are being deleted, do not let them be registered again. + * ADDING - The ops is in the process of being added. + * REMOVING - The ops is in the process of being removed. + * MODIFYING - The ops is in the process of changing its filter functions. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -102,6 +105,9 @@ enum { FTRACE_OPS_FL_STUB = 1 << 6, FTRACE_OPS_FL_INITIALIZED = 1 << 7, FTRACE_OPS_FL_DELETED = 1 << 8, + FTRACE_OPS_FL_ADDING = 1 << 9, + FTRACE_OPS_FL_REMOVING = 1 << 10, + FTRACE_OPS_FL_MODIFYING = 1 << 11, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 858ac16f8492..e43c793093e5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1057,6 +1057,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct ftrace_ops *removed_ops; +/* + * Set when doing a global update, like enabling all recs or disabling them. + * It is not set when just updating a single ftrace_ops. + */ +static bool update_all_ops; + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif @@ -2366,6 +2372,13 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); } +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) +{ + ops->flags |= FTRACE_OPS_FL_MODIFYING; + ftrace_run_update_code(command); + ops->flags &= ~FTRACE_OPS_FL_MODIFYING; +} + static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; @@ -2387,6 +2400,13 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } +static void ftrace_startup_all(int command) +{ + update_all_ops = true; + ftrace_startup_enable(command); + update_all_ops = false; +} + static int ftrace_startup(struct ftrace_ops *ops, int command) { int ret; @@ -2401,12 +2421,22 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - ops->flags |= FTRACE_OPS_FL_ENABLED; + /* + * Note that ftrace probes uses this to start up + * and modify functions it will probe. But we still + * set the ADDING flag for modification, as probes + * do not have trampolines. If they add them in the + * future, then the probes will need to distinguish + * between adding and updating probes. + */ + ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); + ops->flags &= ~FTRACE_OPS_FL_ADDING; + return 0; } @@ -2456,11 +2486,12 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If the ops uses a trampoline, then it needs to be * tested first on update. */ + ops->flags |= FTRACE_OPS_FL_REMOVING; removed_ops = ops; ftrace_run_update_code(command); - removed_ops = NULL; + ops->flags &= ~FTRACE_OPS_FL_REMOVING; /* * Dynamic ops may be freed, we must make sure that all @@ -3373,7 +3404,7 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); return; } @@ -3792,7 +3823,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) static void ftrace_ops_update_code(struct ftrace_ops *ops) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); } static int @@ -4717,6 +4748,7 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } +static inline void ftrace_startup_all(int command) { } /* Keep as macros so we do not need to define the commands */ # define ftrace_startup(ops, command) \ ({ \ @@ -5016,7 +5048,8 @@ static int ftrace_pid_add(int p) set_ftrace_pid_task(pid); ftrace_update_pid_func(); - ftrace_startup_enable(0); + + ftrace_startup_all(0); mutex_unlock(&ftrace_lock); return 0; @@ -5045,7 +5078,7 @@ static void ftrace_pid_reset(void) } ftrace_update_pid_func(); - ftrace_startup_enable(0); + ftrace_startup_all(0); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From fef5aeeee9e3717e7aea991a7ae9ff6a7a2d4c85 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 24 Jul 2014 12:25:47 -0400 Subject: ftrace: Replace tramp_hash with old_*_hash to save space Allowing function callbacks to declare their own trampolines requires that each ftrace_ops that has a trampoline must have some sort of accounting that keeps track of which ops has a trampoline attached to a record. The easy way to solve this was to add a "tramp_hash" that created a hash entry for every function that a ops uses with a trampoline. But since we can have literally tens of thousands of functions being traced, that means we need tens of thousands of descriptors to map the ops to the function in the hash. This is quite expensive and can cause enabling and disabling the function graph tracer to take some time to start and stop. It can take up to several seconds to disable or enable all functions in the function graph tracer for this reason. The better approach albeit more complex, is to keep track of how ops are being enabled and disabled, and use that along with the counting of the number of ops attached to records, to determive what ops has a trampoline attached to a record at enabling and disabling of tracing. To do this, the tramp_hash has been replaced with an old_filter_hash and old_notrace_hash, which get the copy of the ops filter_hash and notrace_hash respectively. The old hashes is kept until the ops has been modified or removed and the old hashes are used with the logic of the accounting to determine the ops that have the trampoline of a record. The reason this has less of a footprint is due to the trick that an "empty" hash in the filter_hash means "all functions" and an empty hash in the notrace hash means "no functions" in the hash. This is much more efficienct, doesn't have the delay, and takes up much less memory, as we do not need to map all the functions but just figure out which functions are mapped at the time it is enabled or disabled. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 239 +++++++++++++++++-------------------------------- 2 files changed, 85 insertions(+), 156 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d9216f6385d9..662697babd48 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -140,7 +140,7 @@ struct ftrace_ops { int nr_trampolines; struct ftrace_ops_hash local_hash; struct ftrace_ops_hash *func_hash; - struct ftrace_hash *tramp_hash; + struct ftrace_ops_hash old_hash; unsigned long trampoline; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e43c793093e5..d325a1e76554 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1373,6 +1373,21 @@ update: return 0; } +static bool hash_contains_ip(unsigned long ip, + struct ftrace_ops_hash *hash) +{ + /* + * The function record is a match if it exists in the filter + * hash and not in the notrace hash. Note, an emty hash is + * considered a match for the filter hash, but an empty + * notrace hash is considered not in the notrace hash. + */ + return (ftrace_hash_empty(hash->filter_hash) || + ftrace_lookup_ip(hash->filter_hash, ip)) && + (ftrace_hash_empty(hash->notrace_hash) || + !ftrace_lookup_ip(hash->notrace_hash, ip)); +} + /* * Test the hashes for this ops to see if we want to call * the ops->func or not. @@ -1388,8 +1403,7 @@ update: static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_ops_hash hash; int ret; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS @@ -1402,13 +1416,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); - if ((ftrace_hash_empty(filter_hash) || - ftrace_lookup_ip(filter_hash, ip)) && - (ftrace_hash_empty(notrace_hash) || - !ftrace_lookup_ip(notrace_hash, ip))) + if (hash_contains_ip(ip, &hash)) ret = 1; else ret = 0; @@ -1520,46 +1531,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void ftrace_remove_tramp(struct ftrace_ops *ops, - struct dyn_ftrace *rec) -{ - /* If TRAMP is not set, no ops should have a trampoline for this */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - return; - - rec->flags &= ~FTRACE_FL_TRAMP; - - if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || - ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) - return; - /* - * The tramp_hash entry will be removed at time - * of update. - */ - ops->nr_trampolines--; -} - -static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) -{ - struct ftrace_ops *op; - - /* If TRAMP is not set, no ops should have a trampoline for this */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - return; - - do_for_each_ftrace_op(op, ftrace_ops_list) { - /* - * This function is called to clear other tramps - * not the one that is being updated. - */ - if (op == ops) - continue; - if (op->nr_trampolines) - ftrace_remove_tramp(op, rec); - } while_for_each_ftrace_op(op); -} - static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1648,18 +1619,16 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * function, and the ops has a trampoline registered * for it, then we can call it directly. */ - if (ftrace_rec_count(rec) == 1 && ops->trampoline) { + if (ftrace_rec_count(rec) == 1 && ops->trampoline) rec->flags |= FTRACE_FL_TRAMP; - ops->nr_trampolines++; - } else { + else /* * If we are adding another function callback * to this function, and the previous had a * custom trampoline in use, then we need to go * back to the default trampoline. */ - ftrace_clear_tramps(rec, ops); - } + rec->flags &= ~FTRACE_FL_TRAMP; /* * If any ops wants regs saved for this function @@ -1672,9 +1641,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, return; rec->flags--; - if (ops->trampoline && !ftrace_rec_count(rec)) - ftrace_remove_tramp(ops, rec); - /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -1688,6 +1654,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags &= ~FTRACE_FL_REGS; } + /* + * If the rec had TRAMP enabled, then it needs to + * be cleared. As TRAMP can only be enabled iff + * there is only a single ops attached to it. + * In otherwords, always disable it on decrementing. + * In the future, we may set it if rec count is + * decremented to one, and the ops that is left + * has a trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + /* * flags will be cleared in ftrace_check_record() * if rec count is zero. @@ -1910,15 +1887,14 @@ static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; do_for_each_ftrace_op(op, ftrace_ops_list) { if (!op->trampoline) continue; - if (ftrace_lookup_ip(op->func_hash->filter_hash, rec->ip) && - (ftrace_hash_empty(op->func_hash->notrace_hash) || - !ftrace_lookup_ip(op->func_hash->notrace_hash, rec->ip))) + if (hash_contains_ip(ip, op->func_hash)) return op; } while_for_each_ftrace_op(op); @@ -1929,18 +1905,51 @@ static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; - /* Removed ops need to be tested first */ - if (removed_ops && removed_ops->tramp_hash) { - if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) + /* + * Need to check removed ops first. + * If they are being removed, and this rec has a tramp, + * and this rec is in the ops list, then it would be the + * one with the tramp. + */ + if (removed_ops) { + if (hash_contains_ip(ip, &removed_ops->old_hash)) return removed_ops; } + /* + * Need to find the current trampoline for a rec. + * Now, a trampoline is only attached to a rec if there + * was a single 'ops' attached to it. But this can be called + * when we are adding another op to the rec or removing the + * current one. Thus, if the op is being added, we can + * ignore it because it hasn't attached itself to the rec + * yet. That means we just need to find the op that has a + * trampoline and is not beeing added. + */ do_for_each_ftrace_op(op, ftrace_ops_list) { - if (!op->tramp_hash) + + if (!op->trampoline) + continue; + + /* + * If the ops is being added, it hasn't gotten to + * the point to be removed from this tree yet. + */ + if (op->flags & FTRACE_OPS_FL_ADDING) continue; - if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) + /* + * If the ops is not being added and has a trampoline, + * then it must be the one that we want! + */ + if (hash_contains_ip(ip, op->func_hash)) + return op; + + /* If the ops is being modified, it may be in the old hash. */ + if ((op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, &op->old_hash)) return op; } while_for_each_ftrace_op(op); @@ -1952,10 +1961,11 @@ static struct ftrace_ops * ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; do_for_each_ftrace_op(op, ftrace_ops_list) { /* pass rec in as regs to have non-NULL val */ - if (ftrace_ops_test(op, rec->ip, rec)) + if (hash_contains_ip(ip, op->func_hash)) return op; } while_for_each_ftrace_op(op); @@ -2262,92 +2272,6 @@ void __weak arch_ftrace_update_code(int command) ftrace_run_stop_machine(command); } -static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) -{ - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int size, bits; - int ret; - - size = ops->nr_trampolines; - bits = 0; - /* - * Make the hash size about 1/2 the # found - */ - for (size /= 2; size; size >>= 1) - bits++; - - ops->tramp_hash = alloc_ftrace_hash(bits); - /* - * TODO: a failed allocation is going to screw up - * the accounting of what needs to be modified - * and not. For now, we kill ftrace if we fail - * to allocate here. But there are ways around this, - * but that will take a little more work. - */ - if (!ops->tramp_hash) - return -ENOMEM; - - do_for_each_ftrace_rec(pg, rec) { - if (ftrace_rec_count(rec) == 1 && - ftrace_ops_test(ops, rec->ip, rec)) { - - /* - * If another ops adds to a rec, the rec will - * lose its trampoline and never get it back - * until all ops are off of it. - */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - continue; - - /* This record had better have a trampoline */ - if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) - return -1; - - ret = add_hash_entry(ops->tramp_hash, rec->ip); - if (ret < 0) - return ret; - } - } while_for_each_ftrace_rec(); - - /* The number of recs in the hash must match nr_trampolines */ - if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) - pr_warn("count=%ld trampolines=%d\n", - ops->tramp_hash->count, - ops->nr_trampolines); - - return 0; -} - -static int ftrace_save_tramp_hashes(void) -{ - struct ftrace_ops *op; - int ret; - - /* - * Now that any trampoline is being used, we need to save the - * hashes for the ops that have them. This allows the mapping - * back from the record to the ops that has the trampoline to - * know what code is being replaced. Modifying code must always - * verify what it is changing. - */ - do_for_each_ftrace_op(op, ftrace_ops_list) { - - /* The tramp_hash is recreated each time. */ - free_ftrace_hash(op->tramp_hash); - op->tramp_hash = NULL; - - if (op->nr_trampolines) { - ret = ftrace_save_ops_tramp_hash(op); - if (ret) - return ret; - } - - } while_for_each_ftrace_op(op); - - return 0; -} - static void ftrace_run_update_code(int command) { int ret; @@ -2367,9 +2291,6 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); - - ret = ftrace_save_tramp_hashes(); - FTRACE_WARN_ON(ret); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) @@ -2489,8 +2410,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) ops->flags |= FTRACE_OPS_FL_REMOVING; removed_ops = ops; + /* The trampoline logic checks the old hashes */ + ops->old_hash.filter_hash = ops->func_hash->filter_hash; + ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; + ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + + removed_ops = NULL; ops->flags &= ~FTRACE_OPS_FL_REMOVING; /* @@ -3017,7 +2946,7 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_ops *ops; ops = ftrace_find_tramp_ops_any(rec); - if (ops && ops->trampoline) + if (ops) seq_printf(m, "\ttramp: %pS", (void *)ops->trampoline); else -- cgit v1.2.3 From 3a630178fd5f30c285fd7016c5340a176b625913 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 7 Aug 2014 10:52:04 -0700 Subject: tracing: generate RCU warnings even when tracepoints are disabled Dave Jones reported seeing a bug from one of my TLB tracepoints: http://lkml.kernel.org/r/20140806181801.GA4605@redhat.com I've been running these patches for months and never saw this. But, a big chunk of my testing, especially with all the debugging enabled, was in a vm where intel_idle doesn't work. On the systems where I was using intel_idle, I never had lockdep enabled and this tracepoint on at the same time. This patch ensures that whenever we have lockdep available, we do _some_ RCU activity at the site of the tracepoint, despite whether the tracepoint's condition matches or even if the tracepoint itself is completely disabled. This is a bit of a hack, but it is pretty self-contained. I confirmed that with this patch plus lockdep I get the same splat as Dave Jones did, but without enabling the tracepoint explicitly. Link: http://lkml.kernel.org/p/20140807175204.C257CAC5@viggo.jf.intel.com Signed-off-by: Dave Hansen Cc: Dave Hansen Cc: Dave Jones , Cc: paulmck@linux.vnet.ibm.com Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index b1293f15f592..e08e21e5f601 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,6 +157,12 @@ extern void syscall_unregfunc(void); * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. + * + * When lockdep is enabled, we make sure to always do the RCU portions of + * the tracepoint code, regardless of whether tracing is on or we match the + * condition. This lets us find RCU issues triggered with tracepoints even + * when this tracepoint is off. This code has no purpose other than poking + * RCU a bit. */ #define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ extern struct tracepoint __tracepoint_##name; \ @@ -167,6 +173,11 @@ extern void syscall_unregfunc(void); TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond),,); \ + if (IS_ENABLED(CONFIG_LOCKDEP)) { \ + rcu_read_lock_sched_notrace(); \ + rcu_dereference_sched(__tracepoint_##name.funcs);\ + rcu_read_unlock_sched_notrace(); \ + } \ } \ __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ PARAMS(cond), PARAMS(data_proto), PARAMS(data_args)) \ -- cgit v1.2.3 From 6314b6796e3c070d4c8086b08dfd453a0aeac4cf Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 4 Sep 2014 23:37:49 -0700 Subject: clk: Don't hold prepare_lock across debugfs creation Rob Clark reports a lockdep splat that involves the prepare_lock chained with the mmap semaphore. ====================================================== [ INFO: possible circular locking dependency detected ] 3.17.0-rc1-00050-g07a489b #802 Tainted: G W ------------------------------------------------------- Xorg.bin/5413 is trying to acquire lock: (prepare_lock){+.+.+.}, at: [] clk_prepare_lock+0x88/0xfc but task is already holding lock: (qcom_iommu_lock){+.+...}, at: [] qcom_iommu_unmap+0x1c/0x1f0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 (qcom_iommu_lock){+.+...}: [] qcom_iommu_map+0x28/0x450 [] iommu_map+0xc8/0x12c [] msm_iommu_map+0xb4/0x130 [] msm_gem_get_iova_locked+0x9c/0xe8 [] msm_gem_get_iova+0x4c/0x64 [] mdp4_kms_init+0x4c4/0x6c0 [] msm_load+0x2ac/0x34c [] drm_dev_register+0xac/0x108 [] drm_platform_init+0x50/0xf0 [] try_to_bring_up_master.part.3+0xc8/0x108 [] component_master_add_with_match+0xa8/0x104 [] msm_pdev_probe+0x64/0x70 [] platform_drv_probe+0x2c/0x60 [] driver_probe_device+0x108/0x234 [] bus_for_each_drv+0x64/0x98 [] device_attach+0x78/0x8c [] bus_probe_device+0x88/0xac [] deferred_probe_work_func+0x68/0x9c [] process_one_work+0x1a0/0x40c [] worker_thread+0x44/0x4d8 [] kthread+0xd8/0xec [] ret_from_fork+0x14/0x2c -> #3 (&dev->struct_mutex){+.+.+.}: [] drm_gem_mmap+0x38/0xd0 [] msm_gem_mmap+0xc/0x5c [] mmap_region+0x35c/0x6c8 [] do_mmap_pgoff+0x314/0x398 [] vm_mmap_pgoff+0x84/0xb4 [] SyS_mmap_pgoff+0x94/0xbc [] ret_fast_syscall+0x0/0x48 -> #2 (&mm->mmap_sem){++++++}: [] filldir64+0x68/0x180 [] dcache_readdir+0x188/0x22c [] iterate_dir+0x9c/0x11c [] SyS_getdents64+0x78/0xe8 [] ret_fast_syscall+0x0/0x48 -> #1 (&sb->s_type->i_mutex_key#3){+.+.+.}: [] __create_file+0x58/0x1dc [] debugfs_create_dir+0x1c/0x24 [] clk_debug_create_subtree+0x20/0x170 [] clk_debug_init+0xec/0x14c [] do_one_initcall+0x8c/0x1c8 [] kernel_init_freeable+0x13c/0x1dc [] kernel_init+0x8/0xe8 [] ret_from_fork+0x14/0x2c -> #0 (prepare_lock){+.+.+.}: [] mutex_lock_nested+0x70/0x3e8 [] clk_prepare_lock+0x88/0xfc [] clk_prepare+0xc/0x24 [] __enable_clocks.isra.4+0x18/0xa4 [] __flush_iotlb_va+0xe0/0x114 [] qcom_iommu_unmap+0xac/0x1f0 [] iommu_unmap+0x9c/0xe8 [] msm_iommu_unmap+0x64/0x84 [] msm_gem_free_object+0x11c/0x338 [] drm_gem_object_handle_unreference_unlocked+0xfc/0x130 [] drm_gem_object_release_handle+0x50/0x68 [] idr_for_each+0xa8/0xdc [] drm_gem_release+0x1c/0x28 [] drm_release+0x370/0x428 [] __fput+0x98/0x1e8 [] task_work_run+0xb0/0xfc [] do_exit+0x2ec/0x948 [] do_group_exit+0x4c/0xb8 [] get_signal+0x28c/0x6ac [] do_signal+0xc4/0x3e4 [] do_work_pending+0xb4/0xc4 [] work_pending+0xc/0x20 other info that might help us debug this: Chain exists of: prepare_lock --> &dev->struct_mutex --> qcom_iommu_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(qcom_iommu_lock); lock(&dev->struct_mutex); lock(qcom_iommu_lock); lock(prepare_lock); *** DEADLOCK *** 3 locks held by Xorg.bin/5413: #0: (drm_global_mutex){+.+.+.}, at: [] drm_release+0x34/0x428 #1: (&dev->struct_mutex){+.+.+.}, at: [] drm_gem_object_handle_unreference_unlocked+0xcc/0x130 #2: (qcom_iommu_lock){+.+...}, at: [] qcom_iommu_unmap+0x1c/0x1f0 stack backtrace: CPU: 1 PID: 5413 Comm: Xorg.bin Tainted: G W 3.17.0-rc1-00050-g07a489b #802 [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x98/0xb8) [] (dump_stack) from [] (print_circular_bug+0x218/0x340) [] (print_circular_bug) from [] (__lock_acquire+0x1d24/0x20b8) [] (__lock_acquire) from [] (lock_acquire+0x9c/0xbc) [] (lock_acquire) from [] (mutex_lock_nested+0x70/0x3e8) [] (mutex_lock_nested) from [] (clk_prepare_lock+0x88/0xfc) [] (clk_prepare_lock) from [] (clk_prepare+0xc/0x24) [] (clk_prepare) from [] (__enable_clocks.isra.4+0x18/0xa4) [] (__enable_clocks.isra.4) from [] (__flush_iotlb_va+0xe0/0x114) [] (__flush_iotlb_va) from [] (qcom_iommu_unmap+0xac/0x1f0) [] (qcom_iommu_unmap) from [] (iommu_unmap+0x9c/0xe8) [] (iommu_unmap) from [] (msm_iommu_unmap+0x64/0x84) [] (msm_iommu_unmap) from [] (msm_gem_free_object+0x11c/0x338) [] (msm_gem_free_object) from [] (drm_gem_object_handle_unreference_unlocked+0xfc/0x130) [] (drm_gem_object_handle_unreference_unlocked) from [] (drm_gem_object_release_handle+0x50/0x68) [] (drm_gem_object_release_handle) from [] (idr_for_each+0xa8/0xdc) [] (idr_for_each) from [] (drm_gem_release+0x1c/0x28) [] (drm_gem_release) from [] (drm_release+0x370/0x428) [] (drm_release) from [] (__fput+0x98/0x1e8) [] (__fput) from [] (task_work_run+0xb0/0xfc) [] (task_work_run) from [] (do_exit+0x2ec/0x948) [] (do_exit) from [] (do_group_exit+0x4c/0xb8) [] (do_group_exit) from [] (get_signal+0x28c/0x6ac) [] (get_signal) from [] (do_signal+0xc4/0x3e4) [] (do_signal) from [] (do_work_pending+0xb4/0xc4) [] (do_work_pending) from [] (work_pending+0xc/0x20) We can break this chain if we don't hold the prepare_lock while creating debugfs directories. We only hold the prepare_lock right now because we're traversing the clock tree recursively and we don't want the hierarchy to change during the traversal. Replacing this traversal with a simple linked list walk allows us to only grab a list lock instead of the prepare_lock, thus breaking the lock chain. Signed-off-by: Stephen Boyd Signed-off-by: Mike Turquette --- drivers/clk/clk.c | 73 +++++++++++++++++---------------------------- include/linux/clk-private.h | 1 + 2 files changed, 28 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index b76fa69b44cb..8ca28189e4e9 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -100,6 +100,8 @@ static void clk_enable_unlock(unsigned long flags) static struct dentry *rootdir; static int inited = 0; +static DEFINE_MUTEX(clk_debug_lock); +static HLIST_HEAD(clk_debug_list); static struct hlist_head *all_lists[] = { &clk_root_list, @@ -300,28 +302,6 @@ out: return ret; } -/* caller must hold prepare_lock */ -static int clk_debug_create_subtree(struct clk *clk, struct dentry *pdentry) -{ - struct clk *child; - int ret = -EINVAL;; - - if (!clk || !pdentry) - goto out; - - ret = clk_debug_create_one(clk, pdentry); - - if (ret) - goto out; - - hlist_for_each_entry(child, &clk->children, child_node) - clk_debug_create_subtree(child, pdentry); - - ret = 0; -out: - return ret; -} - /** * clk_debug_register - add a clk node to the debugfs clk tree * @clk: the clk being added to the debugfs clk tree @@ -329,20 +309,21 @@ out: * Dynamically adds a clk to the debugfs clk tree if debugfs has been * initialized. Otherwise it bails out early since the debugfs clk tree * will be created lazily by clk_debug_init as part of a late_initcall. - * - * Caller must hold prepare_lock. Only clk_init calls this function (so - * far) so this is taken care. */ static int clk_debug_register(struct clk *clk) { int ret = 0; + mutex_lock(&clk_debug_lock); + hlist_add_head(&clk->debug_node, &clk_debug_list); + if (!inited) - goto out; + goto unlock; - ret = clk_debug_create_subtree(clk, rootdir); + ret = clk_debug_create_one(clk, rootdir); +unlock: + mutex_unlock(&clk_debug_lock); -out: return ret; } @@ -353,12 +334,18 @@ out: * Dynamically removes a clk and all it's children clk nodes from the * debugfs clk tree if clk->dentry points to debugfs created by * clk_debug_register in __clk_init. - * - * Caller must hold prepare_lock. */ static void clk_debug_unregister(struct clk *clk) { + mutex_lock(&clk_debug_lock); + if (!clk->dentry) + goto out; + + hlist_del_init(&clk->debug_node); debugfs_remove_recursive(clk->dentry); + clk->dentry = NULL; +out: + mutex_unlock(&clk_debug_lock); } struct dentry *clk_debugfs_add_file(struct clk *clk, char *name, umode_t mode, @@ -415,17 +402,12 @@ static int __init clk_debug_init(void) if (!d) return -ENOMEM; - clk_prepare_lock(); - - hlist_for_each_entry(clk, &clk_root_list, child_node) - clk_debug_create_subtree(clk, rootdir); - - hlist_for_each_entry(clk, &clk_orphan_list, child_node) - clk_debug_create_subtree(clk, rootdir); + mutex_lock(&clk_debug_lock); + hlist_for_each_entry(clk, &clk_debug_list, debug_node) + clk_debug_create_one(clk, rootdir); inited = 1; - - clk_prepare_unlock(); + mutex_unlock(&clk_debug_lock); return 0; } @@ -2087,14 +2069,16 @@ void clk_unregister(struct clk *clk) { unsigned long flags; - if (!clk || WARN_ON_ONCE(IS_ERR(clk))) - return; + if (!clk || WARN_ON_ONCE(IS_ERR(clk))) + return; + + clk_debug_unregister(clk); clk_prepare_lock(); if (clk->ops == &clk_nodrv_ops) { pr_err("%s: unregistered clock: %s\n", __func__, clk->name); - goto out; + return; } /* * Assign empty clock ops for consumers that might still hold @@ -2113,16 +2097,13 @@ void clk_unregister(struct clk *clk) clk_set_parent(child, NULL); } - clk_debug_unregister(clk); - hlist_del_init(&clk->child_node); if (clk->prepare_count) pr_warn("%s: unregistering prepared clock: %s\n", __func__, clk->name); - kref_put(&clk->ref, __clk_release); -out: + clk_prepare_unlock(); } EXPORT_SYMBOL_GPL(clk_unregister); diff --git a/include/linux/clk-private.h b/include/linux/clk-private.h index efbf70b9fd84..4ed34105c371 100644 --- a/include/linux/clk-private.h +++ b/include/linux/clk-private.h @@ -48,6 +48,7 @@ struct clk { unsigned long accuracy; struct hlist_head children; struct hlist_node child_node; + struct hlist_node debug_node; unsigned int notifier_count; #ifdef CONFIG_DEBUG_FS struct dentry *dentry; -- cgit v1.2.3 From 3d598f47e804a77208c6bb0a454123018e2f2281 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Aug 2014 20:29:12 +0300 Subject: dmaengine: dw: move dw_dmac.h to where it belongs to There is a common storage for platform data related structures and definitions inside kernel source tree. The patch moves file from include/linux to include/linux/platform_data and renames it acoordingly. The users are also updated. Signed-off-by: Andy Shevchenko Acked-by: Viresh Kumar [For the arch/avr32/.* and .*sound/atmel.*] Acked-by: Hans-Christian Egtvedt Signed-off-by: Vinod Koul --- MAINTAINERS | 2 +- arch/avr32/mach-at32ap/at32ap700x.c | 2 +- arch/avr32/mach-at32ap/include/mach/atmel-mci.h | 2 +- drivers/dma/dw/internal.h | 2 +- drivers/dma/dw/regs.h | 6 +- include/linux/dw_dmac.h | 111 ------------------------ include/linux/platform_data/dma-dw.h | 111 ++++++++++++++++++++++++ include/sound/atmel-abdac.h | 2 +- include/sound/atmel-ac97c.h | 2 +- sound/atmel/abdac.c | 2 +- sound/atmel/ac97c.c | 2 +- 11 files changed, 122 insertions(+), 122 deletions(-) delete mode 100644 include/linux/dw_dmac.h create mode 100644 include/linux/platform_data/dma-dw.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index aefa94841ff3..a10072963889 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7875,7 +7875,7 @@ SYNOPSYS DESIGNWARE DMAC DRIVER M: Viresh Kumar M: Andy Shevchenko S: Maintained -F: include/linux/dw_dmac.h +F: include/linux/platform_data/dma-dw.h F: drivers/dma/dw/ SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index db85b5ec3351..2a1aa712c6e7 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -7,7 +7,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/avr32/mach-at32ap/include/mach/atmel-mci.h b/arch/avr32/mach-at32ap/include/mach/atmel-mci.h index 4bba58561d5c..11d7f4b28dc8 100644 --- a/arch/avr32/mach-at32ap/include/mach/atmel-mci.h +++ b/arch/avr32/mach-at32ap/include/mach/atmel-mci.h @@ -1,7 +1,7 @@ #ifndef __MACH_ATMEL_MCI_H #define __MACH_ATMEL_MCI_H -#include +#include /** * struct mci_dma_data - DMA data for MCI interface diff --git a/drivers/dma/dw/internal.h b/drivers/dma/dw/internal.h index 32667f9e0dda..43cc1dfad5c9 100644 --- a/drivers/dma/dw/internal.h +++ b/drivers/dma/dw/internal.h @@ -12,7 +12,7 @@ #define _DW_DMAC_INTERNAL_H #include -#include +#include #include "regs.h" diff --git a/drivers/dma/dw/regs.h b/drivers/dma/dw/regs.h index bb98d3e91e8b..af02439155e9 100644 --- a/drivers/dma/dw/regs.h +++ b/drivers/dma/dw/regs.h @@ -11,7 +11,7 @@ #include #include -#include +#include #define DW_DMA_MAX_NR_CHANNELS 8 #define DW_DMA_MAX_NR_REQUESTS 16 @@ -161,7 +161,7 @@ struct dw_dma_regs { #define DWC_CTLH_DONE 0x00001000 #define DWC_CTLH_BLOCK_TS_MASK 0x00000fff -/* Bitfields in CFG_LO. Platform-configurable bits are in */ +/* Bitfields in CFG_LO. Platform-configurable bits are in */ #define DWC_CFGL_CH_PRIOR_MASK (0x7 << 5) /* priority mask */ #define DWC_CFGL_CH_PRIOR(x) ((x) << 5) /* priority */ #define DWC_CFGL_CH_SUSP (1 << 8) /* pause xfer */ @@ -172,7 +172,7 @@ struct dw_dma_regs { #define DWC_CFGL_RELOAD_SAR (1 << 30) #define DWC_CFGL_RELOAD_DAR (1 << 31) -/* Bitfields in CFG_HI. Platform-configurable bits are in */ +/* Bitfields in CFG_HI. Platform-configurable bits are in */ #define DWC_CFGH_DS_UPD_EN (1 << 5) #define DWC_CFGH_SS_UPD_EN (1 << 6) diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h deleted file mode 100644 index 68b4024184de..000000000000 --- a/include/linux/dw_dmac.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Driver for the Synopsys DesignWare DMA Controller - * - * Copyright (C) 2007 Atmel Corporation - * Copyright (C) 2010-2011 ST Microelectronics - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef DW_DMAC_H -#define DW_DMAC_H - -#include - -/** - * struct dw_dma_slave - Controller-specific information about a slave - * - * @dma_dev: required DMA master device. Depricated. - * @bus_id: name of this device channel, not just a device name since - * devices may have more than one channel e.g. "foo_tx" - * @cfg_hi: Platform-specific initializer for the CFG_HI register - * @cfg_lo: Platform-specific initializer for the CFG_LO register - * @src_master: src master for transfers on allocated channel. - * @dst_master: dest master for transfers on allocated channel. - */ -struct dw_dma_slave { - struct device *dma_dev; - u32 cfg_hi; - u32 cfg_lo; - u8 src_master; - u8 dst_master; -}; - -/** - * struct dw_dma_platform_data - Controller configuration parameters - * @nr_channels: Number of channels supported by hardware (max 8) - * @is_private: The device channels should be marked as private and not for - * by the general purpose DMA channel allocator. - * @chan_allocation_order: Allocate channels starting from 0 or 7 - * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0. - * @block_size: Maximum block size supported by the controller - * @nr_masters: Number of AHB masters supported by the controller - * @data_width: Maximum data width supported by hardware per AHB master - * (0 - 8bits, 1 - 16bits, ..., 5 - 256bits) - */ -struct dw_dma_platform_data { - unsigned int nr_channels; - bool is_private; -#define CHAN_ALLOCATION_ASCENDING 0 /* zero to seven */ -#define CHAN_ALLOCATION_DESCENDING 1 /* seven to zero */ - unsigned char chan_allocation_order; -#define CHAN_PRIORITY_ASCENDING 0 /* chan0 highest */ -#define CHAN_PRIORITY_DESCENDING 1 /* chan7 highest */ - unsigned char chan_priority; - unsigned short block_size; - unsigned char nr_masters; - unsigned char data_width[4]; -}; - -/* bursts size */ -enum dw_dma_msize { - DW_DMA_MSIZE_1, - DW_DMA_MSIZE_4, - DW_DMA_MSIZE_8, - DW_DMA_MSIZE_16, - DW_DMA_MSIZE_32, - DW_DMA_MSIZE_64, - DW_DMA_MSIZE_128, - DW_DMA_MSIZE_256, -}; - -/* Platform-configurable bits in CFG_HI */ -#define DWC_CFGH_FCMODE (1 << 0) -#define DWC_CFGH_FIFO_MODE (1 << 1) -#define DWC_CFGH_PROTCTL(x) ((x) << 2) -#define DWC_CFGH_SRC_PER(x) ((x) << 7) -#define DWC_CFGH_DST_PER(x) ((x) << 11) - -/* Platform-configurable bits in CFG_LO */ -#define DWC_CFGL_LOCK_CH_XFER (0 << 12) /* scope of LOCK_CH */ -#define DWC_CFGL_LOCK_CH_BLOCK (1 << 12) -#define DWC_CFGL_LOCK_CH_XACT (2 << 12) -#define DWC_CFGL_LOCK_BUS_XFER (0 << 14) /* scope of LOCK_BUS */ -#define DWC_CFGL_LOCK_BUS_BLOCK (1 << 14) -#define DWC_CFGL_LOCK_BUS_XACT (2 << 14) -#define DWC_CFGL_LOCK_CH (1 << 15) /* channel lockout */ -#define DWC_CFGL_LOCK_BUS (1 << 16) /* busmaster lockout */ -#define DWC_CFGL_HS_DST_POL (1 << 18) /* dst handshake active low */ -#define DWC_CFGL_HS_SRC_POL (1 << 19) /* src handshake active low */ - -/* DMA API extensions */ -struct dw_cyclic_desc { - struct dw_desc **desc; - unsigned long periods; - void (*period_callback)(void *param); - void *period_callback_param; -}; - -struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, - dma_addr_t buf_addr, size_t buf_len, size_t period_len, - enum dma_transfer_direction direction); -void dw_dma_cyclic_free(struct dma_chan *chan); -int dw_dma_cyclic_start(struct dma_chan *chan); -void dw_dma_cyclic_stop(struct dma_chan *chan); - -dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan); - -dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan); - -#endif /* DW_DMAC_H */ diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h new file mode 100644 index 000000000000..68b4024184de --- /dev/null +++ b/include/linux/platform_data/dma-dw.h @@ -0,0 +1,111 @@ +/* + * Driver for the Synopsys DesignWare DMA Controller + * + * Copyright (C) 2007 Atmel Corporation + * Copyright (C) 2010-2011 ST Microelectronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef DW_DMAC_H +#define DW_DMAC_H + +#include + +/** + * struct dw_dma_slave - Controller-specific information about a slave + * + * @dma_dev: required DMA master device. Depricated. + * @bus_id: name of this device channel, not just a device name since + * devices may have more than one channel e.g. "foo_tx" + * @cfg_hi: Platform-specific initializer for the CFG_HI register + * @cfg_lo: Platform-specific initializer for the CFG_LO register + * @src_master: src master for transfers on allocated channel. + * @dst_master: dest master for transfers on allocated channel. + */ +struct dw_dma_slave { + struct device *dma_dev; + u32 cfg_hi; + u32 cfg_lo; + u8 src_master; + u8 dst_master; +}; + +/** + * struct dw_dma_platform_data - Controller configuration parameters + * @nr_channels: Number of channels supported by hardware (max 8) + * @is_private: The device channels should be marked as private and not for + * by the general purpose DMA channel allocator. + * @chan_allocation_order: Allocate channels starting from 0 or 7 + * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0. + * @block_size: Maximum block size supported by the controller + * @nr_masters: Number of AHB masters supported by the controller + * @data_width: Maximum data width supported by hardware per AHB master + * (0 - 8bits, 1 - 16bits, ..., 5 - 256bits) + */ +struct dw_dma_platform_data { + unsigned int nr_channels; + bool is_private; +#define CHAN_ALLOCATION_ASCENDING 0 /* zero to seven */ +#define CHAN_ALLOCATION_DESCENDING 1 /* seven to zero */ + unsigned char chan_allocation_order; +#define CHAN_PRIORITY_ASCENDING 0 /* chan0 highest */ +#define CHAN_PRIORITY_DESCENDING 1 /* chan7 highest */ + unsigned char chan_priority; + unsigned short block_size; + unsigned char nr_masters; + unsigned char data_width[4]; +}; + +/* bursts size */ +enum dw_dma_msize { + DW_DMA_MSIZE_1, + DW_DMA_MSIZE_4, + DW_DMA_MSIZE_8, + DW_DMA_MSIZE_16, + DW_DMA_MSIZE_32, + DW_DMA_MSIZE_64, + DW_DMA_MSIZE_128, + DW_DMA_MSIZE_256, +}; + +/* Platform-configurable bits in CFG_HI */ +#define DWC_CFGH_FCMODE (1 << 0) +#define DWC_CFGH_FIFO_MODE (1 << 1) +#define DWC_CFGH_PROTCTL(x) ((x) << 2) +#define DWC_CFGH_SRC_PER(x) ((x) << 7) +#define DWC_CFGH_DST_PER(x) ((x) << 11) + +/* Platform-configurable bits in CFG_LO */ +#define DWC_CFGL_LOCK_CH_XFER (0 << 12) /* scope of LOCK_CH */ +#define DWC_CFGL_LOCK_CH_BLOCK (1 << 12) +#define DWC_CFGL_LOCK_CH_XACT (2 << 12) +#define DWC_CFGL_LOCK_BUS_XFER (0 << 14) /* scope of LOCK_BUS */ +#define DWC_CFGL_LOCK_BUS_BLOCK (1 << 14) +#define DWC_CFGL_LOCK_BUS_XACT (2 << 14) +#define DWC_CFGL_LOCK_CH (1 << 15) /* channel lockout */ +#define DWC_CFGL_LOCK_BUS (1 << 16) /* busmaster lockout */ +#define DWC_CFGL_HS_DST_POL (1 << 18) /* dst handshake active low */ +#define DWC_CFGL_HS_SRC_POL (1 << 19) /* src handshake active low */ + +/* DMA API extensions */ +struct dw_cyclic_desc { + struct dw_desc **desc; + unsigned long periods; + void (*period_callback)(void *param); + void *period_callback_param; +}; + +struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan, + dma_addr_t buf_addr, size_t buf_len, size_t period_len, + enum dma_transfer_direction direction); +void dw_dma_cyclic_free(struct dma_chan *chan); +int dw_dma_cyclic_start(struct dma_chan *chan); +void dw_dma_cyclic_stop(struct dma_chan *chan); + +dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan); + +dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan); + +#endif /* DW_DMAC_H */ diff --git a/include/sound/atmel-abdac.h b/include/sound/atmel-abdac.h index edff6a8ba1b5..a8f735d677fa 100644 --- a/include/sound/atmel-abdac.h +++ b/include/sound/atmel-abdac.h @@ -10,7 +10,7 @@ #ifndef __INCLUDE_SOUND_ATMEL_ABDAC_H #define __INCLUDE_SOUND_ATMEL_ABDAC_H -#include +#include /** * struct atmel_abdac_pdata - board specific ABDAC configuration diff --git a/include/sound/atmel-ac97c.h b/include/sound/atmel-ac97c.h index 00e6c289a936..f2a1cdc37661 100644 --- a/include/sound/atmel-ac97c.h +++ b/include/sound/atmel-ac97c.h @@ -10,7 +10,7 @@ #ifndef __INCLUDE_SOUND_ATMEL_AC97C_H #define __INCLUDE_SOUND_ATMEL_AC97C_H -#include +#include #define AC97C_CAPTURE 0x01 #define AC97C_PLAYBACK 0x02 diff --git a/sound/atmel/abdac.c b/sound/atmel/abdac.c index edf2ca72d518..154a7c44e38d 100644 --- a/sound/atmel/abdac.c +++ b/sound/atmel/abdac.c @@ -9,7 +9,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/sound/atmel/ac97c.c b/sound/atmel/ac97c.c index a04d23174dc2..1dfb35afef8f 100644 --- a/sound/atmel/ac97c.c +++ b/sound/atmel/ac97c.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include -- cgit v1.2.3 From 7e1e2f27c5508518e58e5cbb11e26cbb815f4c56 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Aug 2014 20:29:14 +0300 Subject: dmaengine: dw: convert dw_dma_slave to use explicit HS interfaces Instead of exposing the possibility to set DMA registers CFG_HI and CFG_LO strict user to provide handshake interfaces explicitly. Signed-off-by: Andy Shevchenko Acked-by: Hans-Christian Egtvedt Signed-off-by: Vinod Koul --- arch/avr32/mach-at32ap/at32ap700x.c | 15 +++++---------- drivers/dma/dw/core.c | 4 ++-- include/linux/platform_data/dma-dw.h | 10 ++++------ 3 files changed, 11 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index ec7be287a97e..37b75602adf6 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -1356,10 +1356,8 @@ at32_add_device_mci(unsigned int id, struct mci_platform_data *data) goto fail; slave->sdata.dma_dev = &dw_dmac0_device.dev; - slave->sdata.cfg_hi = (DWC_CFGH_SRC_PER(0) - | DWC_CFGH_DST_PER(1)); - slave->sdata.cfg_lo &= ~(DWC_CFGL_HS_DST_POL - | DWC_CFGL_HS_SRC_POL); + slave->sdata.src_id = 0; + slave->sdata.dst_id = 1; slave->sdata.src_master = 1; slave->sdata.dst_master = 0; @@ -2054,8 +2052,7 @@ at32_add_device_ac97c(unsigned int id, struct ac97c_platform_data *data, /* Check if DMA slave interface for capture should be configured. */ if (flags & AC97C_CAPTURE) { rx_dws->dma_dev = &dw_dmac0_device.dev; - rx_dws->cfg_hi = DWC_CFGH_SRC_PER(3); - rx_dws->cfg_lo &= ~(DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL); + rx_dws->src_id = 3; rx_dws->src_master = 0; rx_dws->dst_master = 1; } @@ -2063,8 +2060,7 @@ at32_add_device_ac97c(unsigned int id, struct ac97c_platform_data *data, /* Check if DMA slave interface for playback should be configured. */ if (flags & AC97C_PLAYBACK) { tx_dws->dma_dev = &dw_dmac0_device.dev; - tx_dws->cfg_hi = DWC_CFGH_DST_PER(4); - tx_dws->cfg_lo &= ~(DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL); + tx_dws->dst_id = 4; tx_dws->src_master = 0; tx_dws->dst_master = 1; } @@ -2136,8 +2132,7 @@ at32_add_device_abdac(unsigned int id, struct atmel_abdac_pdata *data) dws = &data->dws; dws->dma_dev = &dw_dmac0_device.dev; - dws->cfg_hi = DWC_CFGH_DST_PER(2); - dws->cfg_lo &= ~(DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL); + dws->dst_id = 2; dws->src_master = 0; dws->dst_master = 1; diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 1af731b83b3f..0a9c052d437c 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -155,8 +155,8 @@ static void dwc_initialize(struct dw_dma_chan *dwc) */ BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev); - cfghi = dws->cfg_hi; - cfglo |= dws->cfg_lo & ~DWC_CFGL_CH_PRIOR_MASK; + cfghi |= DWC_CFGH_DST_PER(dws->dst_id); + cfghi |= DWC_CFGH_SRC_PER(dws->src_id); } else { if (dwc->direction == DMA_MEM_TO_DEV) cfghi = DWC_CFGH_DST_PER(dwc->request_line); diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index 68b4024184de..bc411a1bf8e7 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -17,17 +17,15 @@ * struct dw_dma_slave - Controller-specific information about a slave * * @dma_dev: required DMA master device. Depricated. - * @bus_id: name of this device channel, not just a device name since - * devices may have more than one channel e.g. "foo_tx" - * @cfg_hi: Platform-specific initializer for the CFG_HI register - * @cfg_lo: Platform-specific initializer for the CFG_LO register + * @src_id: src request line + * @dst_id: dst request line * @src_master: src master for transfers on allocated channel. * @dst_master: dest master for transfers on allocated channel. */ struct dw_dma_slave { struct device *dma_dev; - u32 cfg_hi; - u32 cfg_lo; + u8 src_id; + u8 dst_id; u8 src_master; u8 dst_master; }; -- cgit v1.2.3 From fbfa398b84a5fc6e085dedba5ec3e94f21815d05 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 28 Aug 2014 12:21:44 -0600 Subject: PCI: Remove unused pci_configure_slot() All pci_configure_slot() uses have been removed, so remove the definition as well. Signed-off-by: Bjorn Helgaas Acked-by: Yinghai Lu --- drivers/pci/probe.c | 28 ---------------------------- include/linux/pci_hotplug.h | 2 -- 2 files changed, 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index e9482867555b..4b3b29bc7a82 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1358,34 +1358,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) */ } -void pci_configure_slot(struct pci_dev *dev) -{ - struct pci_dev *cdev; - struct hotplug_params hpp; - int ret; - - if (!(dev->hdr_type == PCI_HEADER_TYPE_NORMAL || - (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE && - (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI))) - return; - - pcie_bus_configure_settings(dev->bus); - - memset(&hpp, 0, sizeof(hpp)); - ret = pci_get_hp_params(dev, &hpp); - - program_hpp_type2(dev, hpp.t2); - program_hpp_type1(dev, hpp.t1); - program_hpp_type0(dev, hpp.t0); - - if (dev->subordinate) { - list_for_each_entry(cdev, &dev->subordinate->devices, - bus_list) - pci_configure_slot(cdev); - } -} -EXPORT_SYMBOL_GPL(pci_configure_slot); - static void pci_configure_device(struct pci_dev *dev) { struct hotplug_params hpp; diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 5f2e559af6b0..2706ee9a4327 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -187,6 +187,4 @@ static inline int pci_get_hp_params(struct pci_dev *dev, return -ENODEV; } #endif - -void pci_configure_slot(struct pci_dev *dev); #endif -- cgit v1.2.3 From a80e49e2cc3145af014a8ae44f575829cc236192 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Aug 2014 17:47:18 +0200 Subject: nohz: Move nohz full init call to tick init This way we unbloat a bit main.c and more importantly we initialize nohz full after init_IRQ(). This dependency will be needed in further patches because nohz full needs irq work to raise its own IRQ. Information about the support for this ability on ARM64 is obtained on init_IRQ() which initialize the pointer to __smp_call_function. Since tick_init() is called right after init_IRQ(), this is a good place to call tick_nohz_init() and prepare for that dependency. Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 2 -- init/main.c | 1 - kernel/time/tick-common.c | 1 + kernel/time/tick-internal.h | 7 +++++++ 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 9a82c7dc3fdd..595ee86f5e0d 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -181,14 +181,12 @@ static inline bool tick_nohz_full_cpu(int cpu) return cpumask_test_cpu(cpu, tick_nohz_full_mask); } -extern void tick_nohz_init(void); extern void __tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(struct task_struct *tsk); #else -static inline void tick_nohz_init(void) { } static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void __tick_nohz_full_check(void) { } diff --git a/init/main.c b/init/main.c index bb1aed928f21..8af2f1abfe38 100644 --- a/init/main.c +++ b/init/main.c @@ -577,7 +577,6 @@ asmlinkage __visible void __init start_kernel(void) local_irq_disable(); idr_init_cache(); rcu_init(); - tick_nohz_init(); context_tracking_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..052b4b53c3d6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -400,4 +400,5 @@ void tick_resume(void) void __init tick_init(void) { tick_broadcast_init(); + tick_nohz_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c19c1d84b6f3..366aeb4f2c66 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline bool tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif + /* * Broadcasting support */ -- cgit v1.2.3 From c5c38ef3d70377dc504a6a3f611a3ec814bc757b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 Sep 2014 15:43:02 +0200 Subject: irq_work: Introduce arch_irq_work_has_interrupt() The nohz full code needs irq work to trigger its own interrupt so that the subsystem can work even when the tick is stopped. Lets introduce arch_irq_work_has_interrupt() that archs can override to tell about their support for this ability. Signed-off-by: Peter Zijlstra Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/arm64/include/asm/Kbuild | 3 ++- arch/avr32/include/asm/Kbuild | 1 + arch/blackfin/include/asm/Kbuild | 1 + arch/c6x/include/asm/Kbuild | 1 + arch/cris/include/asm/Kbuild | 1 + arch/frv/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/ia64/include/asm/Kbuild | 1 + arch/m32r/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/metag/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/mn10300/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/s390/include/asm/Kbuild | 1 + arch/score/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/tile/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/unicore32/include/asm/Kbuild | 1 + arch/x86/include/asm/Kbuild | 1 + arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/irq_work.h | 10 ++++++++++ include/linux/irq_work.h | 2 ++ 31 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 include/asm-generic/irq_work.h (limited to 'include/linux') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index e858aa0ad8af..a52cbf178c3a 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += clkdev.h generic-y += cputime.h generic-y += exec.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += scatterlist.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index e76fd79f32b0..b8fffc1a2ac2 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -18,6 +18,7 @@ generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kmap_types.h generic-y += kvm_para.h generic-y += local.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 70cd84eb7fda..202905e7ea0c 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -11,6 +11,7 @@ generic-y += hash.h generic-y += ioctl.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += local.h generic-y += local64.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 0b3fcf86e6ba..d617789b1ebd 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -9,8 +9,8 @@ generic-y += current.h generic-y += delay.h generic-y += div64.h generic-y += dma.h -generic-y += emergency-restart.h generic-y += early_ioremap.h +generic-y += emergency-restart.h generic-y += errno.h generic-y += ftrace.h generic-y += hash.h @@ -19,6 +19,7 @@ generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += kvm_para.h diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index 00a0f3ccd6eb..2a71b1cb9848 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += exec.h generic-y += futex.h generic-y += hash.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 0d93b9a79ca9..46ed6bb9c679 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -15,6 +15,7 @@ generic-y += hw_irq.h generic-y += ioctl.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += kvm_para.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 8dbdce8421b0..e77e0c1dbe75 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += local.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 31742dfadff9..802b94c4ca86 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -8,6 +8,7 @@ generic-y += clkdev.h generic-y += cputime.h generic-y += exec.h generic-y += hash.h +generic-y += irq_work.h generic-y += kvm_para.h generic-y += linkage.h generic-y += mcs_spinlock.h diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index 5b73921b6e9d..3caf05cabfc5 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += clkdev.h generic-y += cputime.h generic-y += exec.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += scatterlist.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 0e69796b58c7..5f234a5a2320 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += ioctls.h generic-y += iomap.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += local.h diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index e8317d2d6c8d..747320be9d0e 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += clkdev.h generic-y += exec.h generic-y += hash.h +generic-y += irq_work.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += preempt.h diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index accc10a3dc78..e02448b0648b 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += clkdev.h generic-y += cputime.h generic-y += exec.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += module.h generic-y += preempt.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index c67c94a2d672..dbaf9f3065e8 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -11,6 +11,7 @@ generic-y += hw_irq.h generic-y += ioctl.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += kvm_para.h diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild index c29ead89a317..7b8111c8f937 100644 --- a/arch/metag/include/asm/Kbuild +++ b/arch/metag/include/asm/Kbuild @@ -19,6 +19,7 @@ generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += kvm_para.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 27a3acda6c19..448143b8cabd 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += cputime.h generic-y += device.h generic-y += exec.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += scatterlist.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 335e5290ec75..57012ef1f51e 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += cputime.h generic-y += current.h generic-y += emergency-restart.h generic-y += hash.h +generic-y += irq_work.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mutex.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ecbd6676bd33..77eb1a68d13b 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += clkdev.h generic-y += cputime.h generic-y += exec.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += scatterlist.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 480af0d9c2f5..89b61d7dc790 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -31,6 +31,7 @@ generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += kvm_para.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index ecf25e6678ad..ffb024b8423f 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -10,6 +10,7 @@ generic-y += exec.h generic-y += hash.h generic-y += hw_irq.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kvm_para.h generic-y += local.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 7f23f162ce9c..31e8f59aff38 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += clkdev.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += rwsem.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index b3fea0722ff1..773f86676588 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += clkdev.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += scatterlist.h diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index aad209199f7e..1909d2a5b82f 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -6,6 +6,7 @@ generic-y += barrier.h generic-y += clkdev.h generic-y += cputime.h generic-y += hash.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += scatterlist.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index c19e47dacb31..5a6c9acff0d2 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -12,6 +12,7 @@ generic-y += hash.h generic-y += ioctl.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kvm_para.h generic-y += local.h generic-y += local64.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index cdd1b447bb6c..f5f94ce1692c 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -8,6 +8,7 @@ generic-y += emergency-restart.h generic-y += exec.h generic-y += hash.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += linkage.h generic-y += local.h generic-y += local64.h diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index 0aa5675e7025..e6462b8a6284 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -17,6 +17,7 @@ generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 7bd64aa2e94a..244b12c8cb39 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -14,6 +14,7 @@ generic-y += hash.h generic-y += hw_irq.h generic-y += io.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mutex.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index 1e5fb872a4aa..5a2bb53faa42 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += local.h diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 3bf000fab0ae..8fa909cc6553 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -7,5 +7,6 @@ genhdr-y += unistd_x32.h generic-y += clkdev.h generic-y += cputime.h generic-y += early_ioremap.h +generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += scatterlist.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index c3d20ba6eb86..105d38922c44 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -12,6 +12,7 @@ generic-y += hardirq.h generic-y += hash.h generic-y += ioctl.h generic-y += irq_regs.h +generic-y += irq_work.h generic-y += kdebug.h generic-y += kmap_types.h generic-y += kvm_para.h diff --git a/include/asm-generic/irq_work.h b/include/asm-generic/irq_work.h new file mode 100644 index 000000000000..a44f452c6590 --- /dev/null +++ b/include/asm-generic/irq_work.h @@ -0,0 +1,10 @@ +#ifndef __ASM_IRQ_WORK_H +#define __ASM_IRQ_WORK_H + +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} + +#endif /* __ASM_IRQ_WORK_H */ + diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index bf9422c3aefe..6b47b2ede405 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -42,6 +42,8 @@ void irq_work_run(void); void irq_work_sync(struct irq_work *work); #ifdef CONFIG_IRQ_WORK +#include + bool irq_work_needs_cpu(void); #else static inline bool irq_work_needs_cpu(void) { return false; } -- cgit v1.2.3 From 76a33061b9323b7fdb220ae5fa116c10833ec22e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Aug 2014 18:37:19 +0200 Subject: irq_work: Force raised irq work to run on irq work interrupt The nohz full kick, which restarts the tick when any resource depend on it, can't be executed anywhere given the operation it does on timers. If it is called from the scheduler or timers code, chances are that we run into a deadlock. This is why we run the nohz full kick from an irq work. That way we make sure that the kick runs on a virgin context. However if that's the case when irq work runs in its own dedicated self-ipi, things are different for the big bunch of archs that don't support the self triggered way. In order to support them, irq works are also handled by the timer interrupt as fallback. Now when irq works run on the timer interrupt, the context isn't blank. More precisely, they can run in the context of the hrtimer that runs the tick. But the nohz kick cancels and restarts this hrtimer and cancelling an hrtimer from itself isn't allowed. This is why we run in an endless loop: Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2 CPU: 2 PID: 7538 Comm: kworker/u8:8 Not tainted 3.16.0+ #34 Workqueue: btrfs-endio-write normal_work_helper [btrfs] ffff880244c06c88 000000001b486fe1 ffff880244c06bf0 ffffffff8a7f1e37 ffffffff8ac52a18 ffff880244c06c78 ffffffff8a7ef928 0000000000000010 ffff880244c06c88 ffff880244c06c20 000000001b486fe1 0000000000000000 Call Trace: ] dump_stack+0x4e/0x7a [] panic+0xd4/0x207 [] watchdog_overflow_callback+0x118/0x120 [] __perf_event_overflow+0xae/0x350 [] ? perf_event_task_disable+0xa0/0xa0 [] ? x86_perf_event_set_period+0xbf/0x150 [] perf_event_overflow+0x14/0x20 [] intel_pmu_handle_irq+0x206/0x410 [] perf_event_nmi_handler+0x2b/0x50 [] nmi_handle+0xd2/0x390 [] ? nmi_handle+0x5/0x390 [] ? match_held_lock+0x8/0x1b0 [] default_do_nmi+0x72/0x1c0 [] do_nmi+0xb8/0x100 [] end_repeat_nmi+0x1e/0x2e [] ? match_held_lock+0x8/0x1b0 [] ? match_held_lock+0x8/0x1b0 [] ? match_held_lock+0x8/0x1b0 <] lock_acquired+0xaf/0x450 [] ? lock_hrtimer_base.isra.20+0x25/0x50 [] _raw_spin_lock_irqsave+0x78/0x90 [] ? lock_hrtimer_base.isra.20+0x25/0x50 [] lock_hrtimer_base.isra.20+0x25/0x50 [] hrtimer_try_to_cancel+0x33/0x1e0 [] hrtimer_cancel+0x1a/0x30 [] tick_nohz_restart+0x17/0x90 [] __tick_nohz_full_check+0xc3/0x100 [] nohz_full_kick_work_func+0xe/0x10 [] irq_work_run_list+0x44/0x70 [] irq_work_run+0x2a/0x50 [] update_process_times+0x5b/0x70 [] tick_sched_handle.isra.21+0x25/0x60 [] tick_sched_timer+0x41/0x60 [] __run_hrtimer+0x72/0x470 [] ? tick_sched_do_timer+0xb0/0xb0 [] hrtimer_interrupt+0x117/0x270 [] local_apic_timer_interrupt+0x37/0x60 [] smp_apic_timer_interrupt+0x3f/0x50 [] apic_timer_interrupt+0x6f/0x80 To fix this we force non-lazy irq works to run on irq work self-IPIs when available. That ability of the arch to trigger irq work self IPIs is available with arch_irq_work_has_interrupt(). Reported-by: Catalin Iacob Reported-by: Dave Jones Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/linux/irq_work.h | 1 + kernel/irq_work.c | 15 +++++++++++++-- kernel/time/timer.c | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 6b47b2ede405..bf3fe719c7ce 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -39,6 +39,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu); #endif void irq_work_run(void); +void irq_work_tick(void); void irq_work_sync(struct irq_work *work); #ifdef CONFIG_IRQ_WORK diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e6bcbe756663..385b85aded19 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -115,8 +115,10 @@ bool irq_work_needs_cpu(void) raised = &__get_cpu_var(raised_list); lazy = &__get_cpu_var(lazy_list); - if (llist_empty(raised) && llist_empty(lazy)) - return false; + + if (llist_empty(raised) || arch_irq_work_has_interrupt()) + if (llist_empty(lazy)) + return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -171,6 +173,15 @@ void irq_work_run(void) } EXPORT_SYMBOL_GPL(irq_work_run); +void irq_work_tick(void) +{ + struct llist_head *raised = &__get_cpu_var(raised_list); + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); + irq_work_run_list(&__get_cpu_var(lazy_list)); +} + /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..9bbb8344ed3b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1385,7 +1385,7 @@ void update_process_times(int user_tick) rcu_check_callbacks(cpu, user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) - irq_work_run(); + irq_work_tick(); #endif scheduler_tick(); run_posix_cpu_timers(p); -- cgit v1.2.3 From 1d46fea7d091f9dc2d4fd3fcb9f0117ca288f9a5 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 27 Aug 2014 00:42:56 +0200 Subject: drm/rcar-du: Use struct videomode in platform data In preparation for DT support where panel timings will be described by a DRM-agnostic video mode, replace the struct drm_mode_modeinfo instance in the panel platform data with a struct videomode. Signed-off-by: Laurent Pinchart --- arch/arm/mach-shmobile/board-koelsch-reference.c | 19 +++++++++---------- arch/arm/mach-shmobile/board-koelsch.c | 19 +++++++++---------- arch/arm/mach-shmobile/board-lager-reference.c | 19 +++++++++---------- arch/arm/mach-shmobile/board-lager.c | 19 +++++++++---------- arch/arm/mach-shmobile/board-marzen.c | 19 +++++++++---------- drivers/gpu/drm/rcar-du/Kconfig | 1 + drivers/gpu/drm/rcar-du/rcar_du_lvdscon.c | 15 +++------------ include/linux/platform_data/rcar-du.h | 4 ++-- 8 files changed, 51 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-shmobile/board-koelsch-reference.c b/arch/arm/mach-shmobile/board-koelsch-reference.c index 3ff88c138896..364e69bf85d4 100644 --- a/arch/arm/mach-shmobile/board-koelsch-reference.c +++ b/arch/arm/mach-shmobile/board-koelsch-reference.c @@ -41,16 +41,15 @@ static struct rcar_du_encoder_data koelsch_du_encoders[] = { .width_mm = 210, .height_mm = 158, .mode = { - .clock = 65000, - .hdisplay = 1024, - .hsync_start = 1048, - .hsync_end = 1184, - .htotal = 1344, - .vdisplay = 768, - .vsync_start = 771, - .vsync_end = 777, - .vtotal = 806, - .flags = 0, + .pixelclock = 65000000, + .hactive = 1024, + .hfront_porch = 20, + .hback_porch = 160, + .hsync_len = 136, + .vactive = 768, + .vfront_porch = 3, + .vback_porch = 29, + .vsync_len = 6, }, }, }, diff --git a/arch/arm/mach-shmobile/board-koelsch.c b/arch/arm/mach-shmobile/board-koelsch.c index b7d5bc7659cd..ad10ddb6a321 100644 --- a/arch/arm/mach-shmobile/board-koelsch.c +++ b/arch/arm/mach-shmobile/board-koelsch.c @@ -63,16 +63,15 @@ static struct rcar_du_encoder_data koelsch_du_encoders[] = { .width_mm = 210, .height_mm = 158, .mode = { - .clock = 65000, - .hdisplay = 1024, - .hsync_start = 1048, - .hsync_end = 1184, - .htotal = 1344, - .vdisplay = 768, - .vsync_start = 771, - .vsync_end = 777, - .vtotal = 806, - .flags = 0, + .pixelclock = 65000000, + .hactive = 1024, + .hfront_porch = 20, + .hback_porch = 160, + .hsync_len = 136, + .vactive = 768, + .vfront_porch = 3, + .vback_porch = 29, + .vsync_len = 6, }, }, }, diff --git a/arch/arm/mach-shmobile/board-lager-reference.c b/arch/arm/mach-shmobile/board-lager-reference.c index 41c808e56005..12a53a1c3d02 100644 --- a/arch/arm/mach-shmobile/board-lager-reference.c +++ b/arch/arm/mach-shmobile/board-lager-reference.c @@ -43,16 +43,15 @@ static struct rcar_du_encoder_data lager_du_encoders[] = { .width_mm = 210, .height_mm = 158, .mode = { - .clock = 65000, - .hdisplay = 1024, - .hsync_start = 1048, - .hsync_end = 1184, - .htotal = 1344, - .vdisplay = 768, - .vsync_start = 771, - .vsync_end = 777, - .vtotal = 806, - .flags = 0, + .pixelclock = 65000000, + .hactive = 1024, + .hfront_porch = 20, + .hback_porch = 160, + .hsync_len = 136, + .vactive = 768, + .vfront_porch = 3, + .vback_porch = 29, + .vsync_len = 6, }, }, }, diff --git a/arch/arm/mach-shmobile/board-lager.c b/arch/arm/mach-shmobile/board-lager.c index e1d8215da0b0..80576c2ee668 100644 --- a/arch/arm/mach-shmobile/board-lager.c +++ b/arch/arm/mach-shmobile/board-lager.c @@ -99,16 +99,15 @@ static struct rcar_du_encoder_data lager_du_encoders[] = { .width_mm = 210, .height_mm = 158, .mode = { - .clock = 65000, - .hdisplay = 1024, - .hsync_start = 1048, - .hsync_end = 1184, - .htotal = 1344, - .vdisplay = 768, - .vsync_start = 771, - .vsync_end = 777, - .vtotal = 806, - .flags = 0, + .pixelclock = 65000000, + .hactive = 1024, + .hfront_porch = 20, + .hback_porch = 160, + .hsync_len = 136, + .vactive = 768, + .vfront_porch = 3, + .vback_porch = 29, + .vsync_len = 6, }, }, }, diff --git a/arch/arm/mach-shmobile/board-marzen.c b/arch/arm/mach-shmobile/board-marzen.c index e5cf4201e769..ce33d7825c49 100644 --- a/arch/arm/mach-shmobile/board-marzen.c +++ b/arch/arm/mach-shmobile/board-marzen.c @@ -192,16 +192,15 @@ static struct rcar_du_encoder_data du_encoders[] = { .width_mm = 210, .height_mm = 158, .mode = { - .clock = 65000, - .hdisplay = 1024, - .hsync_start = 1048, - .hsync_end = 1184, - .htotal = 1344, - .vdisplay = 768, - .vsync_start = 771, - .vsync_end = 777, - .vtotal = 806, - .flags = 0, + .pixelclock = 65000000, + .hactive = 1024, + .hfront_porch = 20, + .hback_porch = 160, + .hsync_len = 136, + .vactive = 768, + .vfront_porch = 3, + .vback_porch = 29, + .vsync_len = 6, }, }, }, diff --git a/drivers/gpu/drm/rcar-du/Kconfig b/drivers/gpu/drm/rcar-du/Kconfig index 2e3d7b5b0ad7..c96f6089f8bf 100644 --- a/drivers/gpu/drm/rcar-du/Kconfig +++ b/drivers/gpu/drm/rcar-du/Kconfig @@ -6,6 +6,7 @@ config DRM_RCAR_DU select DRM_KMS_CMA_HELPER select DRM_GEM_CMA_HELPER select DRM_KMS_FB_HELPER + select VIDEOMODE_HELPERS help Choose this option if you have an R-Car chipset. If M is selected the module will be called rcar-du-drm. diff --git a/drivers/gpu/drm/rcar-du/rcar_du_lvdscon.c b/drivers/gpu/drm/rcar-du/rcar_du_lvdscon.c index cfcf6e74ad0a..d29544121658 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_lvdscon.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_lvdscon.c @@ -40,18 +40,9 @@ static int rcar_du_lvds_connector_get_modes(struct drm_connector *connector) return 0; mode->type = DRM_MODE_TYPE_PREFERRED | DRM_MODE_TYPE_DRIVER; - mode->clock = lvdscon->panel->mode.clock; - mode->hdisplay = lvdscon->panel->mode.hdisplay; - mode->hsync_start = lvdscon->panel->mode.hsync_start; - mode->hsync_end = lvdscon->panel->mode.hsync_end; - mode->htotal = lvdscon->panel->mode.htotal; - mode->vdisplay = lvdscon->panel->mode.vdisplay; - mode->vsync_start = lvdscon->panel->mode.vsync_start; - mode->vsync_end = lvdscon->panel->mode.vsync_end; - mode->vtotal = lvdscon->panel->mode.vtotal; - mode->flags = lvdscon->panel->mode.flags; - - drm_mode_set_name(mode); + + drm_display_mode_from_videomode(&lvdscon->panel->mode, mode); + drm_mode_probed_add(connector, mode); return 1; diff --git a/include/linux/platform_data/rcar-du.h b/include/linux/platform_data/rcar-du.h index 1a2e9901a22e..a5f045e1d8fe 100644 --- a/include/linux/platform_data/rcar-du.h +++ b/include/linux/platform_data/rcar-du.h @@ -14,7 +14,7 @@ #ifndef __RCAR_DU_H__ #define __RCAR_DU_H__ -#include +#include