From 6ecec74520d8a357726e6c12f99080dbe7b347dd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 26 Sep 2012 12:49:27 -0600 Subject: NVMe: Define SMART log This data structure is defined in the NVMe specification. It's not used by the kernel, but is available for use by userspace software. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c25cccaa555a..4fa3b0b9b071 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -137,6 +137,34 @@ enum { NVME_LBAF_RP_DEGRADED = 3, }; +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __u8 rsvd192[320]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + struct nvme_lba_range_type { __u8 type; __u8 attributes; -- cgit v1.2.3 From 45d9550a0e7e9230606ca3c4c6f4dc6297848b2f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 19 Feb 2013 12:17:01 -0800 Subject: workqueue: allow more off-queue flag space When a work item is off-queue, its work->data contains WORK_STRUCT_* and WORK_OFFQ_* flags. As WORK_OFFQ_* flags are used only while a work item is off-queue, it can occupy bits of work->data which aren't used while off-queue. WORK_OFFQ_* currently only use bits used by on-queue CWQ pointer. As color bits aren't used while off-queue, there's no reason to not use them. Lower WORK_OFFQ_FLAG_BASE from WORK_STRUCT_FLAG_BITS to WORK_STRUCT_COLOR_SHIFT thus giving 4 more bits to off-queue flag space which is also used to record worker_pool ID while off-queue. This doesn't introduce any visible behavior difference. tj: Rewrote the description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8afab27cdbc2..5bd030f630a9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -68,7 +68,7 @@ enum { WORK_STRUCT_COLOR_BITS, /* data contains off-queue information when !WORK_STRUCT_PWQ */ - WORK_OFFQ_FLAG_BASE = WORK_STRUCT_FLAG_BITS, + WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE), -- cgit v1.2.3 From d84ff0512f1bfc0d8c864efadb4523fce68919cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: workqueue: consistently use int for @cpu variables Workqueue is mixing unsigned int and int for @cpu variables. There's no point in using unsigned int for cpus - many of cpu related APIs take int anyway. Consistently use int for @cpu variables so that we can use negative values to mark special ones. This patch doesn't introduce any visible behavior changes. 
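Returning to the SMART log layout defined in the first patch above, a minimal userspace consumer might look like the following sketch. It only decodes fields and flags shown in that patch; fetching the 512-byte log page (NVMe admin Get Log Page) is omitted, the header path is assumed, and print_smart() is an illustrative name.

#include <stdio.h>
#include <linux/nvme.h>		/* assumed location of struct nvme_smart_log */

/* Sketch: decode a SMART log that has already been read from a device. */
static void print_smart(const struct nvme_smart_log *log)
{
	/* temperature is a 2-byte little-endian value in Kelvin */
	unsigned int temp_k = log->temperature[0] | (log->temperature[1] << 8);

	if (log->critical_warning & NVME_SMART_CRIT_SPARE)
		puts("warning: available spare below threshold");
	if (log->critical_warning & NVME_SMART_CRIT_TEMPERATURE)
		puts("warning: temperature over threshold");
	if (log->critical_warning & NVME_SMART_CRIT_MEDIA)
		puts("warning: media placed in read-only mode");

	printf("temperature:     %u K\n", temp_k);
	printf("available spare: %u%%\n", log->avail_spare);
	printf("percentage used: %u%%\n", log->percent_used);
}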
Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 6 +++--- kernel/workqueue.c | 24 +++++++++++------------- kernel/workqueue_internal.h | 5 ++--- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5bd030f630a9..899be6636d20 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -435,7 +435,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); -extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq); +extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); /* @@ -466,12 +466,12 @@ static inline bool __deprecated flush_delayed_work_sync(struct delayed_work *dwo } #ifndef CONFIG_SMP -static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); +long work_on_cpu(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 26c67c76b6c5..73c5f68065b5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,7 +124,7 @@ enum { struct worker_pool { spinlock_t lock; /* the pool lock */ - unsigned int cpu; /* I: the associated cpu */ + int cpu; /* I: the associated cpu */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -467,8 +467,7 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static struct pool_workqueue *get_pwq(unsigned int cpu, - struct workqueue_struct *wq) +static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) @@ -730,7 +729,7 @@ static void wake_up_worker(struct worker_pool *pool) * CONTEXT: * spin_lock_irq(rq->lock) */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_waking_up(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task); @@ -755,8 +754,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) * RETURNS: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -1159,7 +1157,7 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, +static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; @@ -1714,7 +1712,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(pool->cpu), - "kworker/%u:%d%s", pool->cpu, id, pri); + "kworker/%d:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d%s", id, pri); @@ -3345,7 +3343,7 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * RETURNS: * %true if congested, %false otherwise. 
*/ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq = get_pwq(cpu, wq); @@ -3461,7 +3459,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct worker_pool *pool; switch (action & ~CPU_TASKS_FROZEN) { @@ -3507,7 +3505,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct work_struct unbind_work; switch (action & ~CPU_TASKS_FROZEN) { @@ -3547,7 +3545,7 @@ static void work_for_cpu_fn(struct work_struct *work) * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; @@ -3705,7 +3703,7 @@ out_unlock: static int __init init_workqueues(void) { - unsigned int cpu; + int cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index f9c887731e2b..f116f071d919 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -59,8 +59,7 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_waking_up(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 7a4e344c5675eefbde93ed9a98ef45e0e4957bc2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: introduce workqueue_attrs Introduce struct workqueue_attrs which carries worker attributes - currently the nice level and allowed cpumask along with helper routines alloc_workqueue_attrs() and free_workqueue_attrs(). Each worker_pool now carries ->attrs describing the attributes of its workers. All functions dealing with cpumask and nice level of workers are updated to follow worker_pool->attrs instead of determining them from other characteristics of the worker_pool, and init_workqueues() is updated to set worker_pool->attrs appropriately for all standard pools. Note that create_worker() is updated to always perform set_user_nice() and use set_cpus_allowed_ptr() combined with manual assertion of PF_THREAD_BOUND instead of kthread_bind(). This simplifies handling random attributes without affecting the outcome. This patch doesn't introduce any behavior changes. v2: Missing cpumask_var_t definition caused build failure on some archs. linux/cpumask.h included. 
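The new helpers are straightforward to use. As a rough sketch (the function name, CPU number and nice value are illustrative; apply_workqueue_attrs(), the main consumer of such attributes, only arrives later in this series):

/* Sketch: build attributes that confine workers to CPU 0 at nice -5. */
static struct workqueue_attrs *example_make_attrs(void)
{
	struct workqueue_attrs *attrs;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);	/* cpumask defaults to all CPUs */
	if (!attrs)
		return NULL;

	attrs->nice = -5;
	cpumask_clear(attrs->cpumask);
	cpumask_set_cpu(0, attrs->cpumask);
	return attrs;			/* caller frees with free_workqueue_attrs() */
}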
Signed-off-by: Tejun Heo Reported-by: kbuild test robot Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 13 ++++++ kernel/workqueue.c | 103 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 94 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 899be6636d20..00c1b9ba8252 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -11,6 +11,7 @@ #include #include #include +#include struct workqueue_struct; @@ -115,6 +116,15 @@ struct delayed_work { int cpu; }; +/* + * A struct for workqueue attributes. This can be used to change + * attributes of an unbound workqueue. + */ +struct workqueue_attrs { + int nice; /* nice level */ + cpumask_var_t cpumask; /* allowed CPUs */ +}; + static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); @@ -399,6 +409,9 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, extern void destroy_workqueue(struct workqueue_struct *wq); +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); +void free_workqueue_attrs(struct workqueue_attrs *attrs); + extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 094f16668e1b..b0d3cbb83f63 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ + struct workqueue_attrs *attrs; /* I: worker attributes */ + /* * The current concurrency level. As it's likely to be accessed * from other CPUs during try_to_wake_up(), put it in a separate @@ -1566,14 +1568,13 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, pool->attrs->cpumask); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; if (task_cpu(current) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(pool->cpu))) + cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) return true; spin_unlock_irq(&pool->lock); @@ -1679,7 +1680,7 @@ static void rebind_workers(struct worker_pool *pool) * wq doesn't really matter but let's keep @worker->pool * and @pwq->pool consistent for sanity. */ - if (std_worker_pool_pri(worker->pool)) + if (worker->pool->attrs->nice < 0) wq = system_highpri_wq; else wq = system_wq; @@ -1721,7 +1722,7 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = std_worker_pool_pri(pool) ? "H" : ""; + const char *pri = pool->attrs->nice < 0 ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1751,24 +1752,23 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; - if (std_worker_pool_pri(pool)) - set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + set_user_nice(worker->task, pool->attrs->nice); + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * Determine CPU binding of the new worker depending on - * %POOL_DISASSOCIATED. The caller is responsible for ensuring the - * flag remains stable across this function. See the comments - * above the flag definition for details. 
- * - * As an unbound worker may later become a regular one if CPU comes - * online, make sure every worker has %PF_THREAD_BOUND set. + * %PF_THREAD_BOUND is used to prevent userland from meddling with + * cpumask of workqueue workers. This is an abuse. We need + * %PF_NO_SETAFFINITY. */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - kthread_bind(worker->task, pool->cpu); - } else { - worker->task->flags |= PF_THREAD_BOUND; + worker->task->flags |= PF_THREAD_BOUND; + + /* + * The caller is responsible for ensuring %POOL_DISASSOCIATED + * remains stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - } return worker; fail: @@ -3123,7 +3123,52 @@ int keventd_up(void) return system_wq != NULL; } -static void init_worker_pool(struct worker_pool *pool) +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free + * + * Undo alloc_workqueue_attrs(). + */ +void free_workqueue_attrs(struct workqueue_attrs *attrs) +{ + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } +} + +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. Returns NULL on failure. + */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +{ + struct workqueue_attrs *attrs; + + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_setall(attrs->cpumask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; +} + +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Returns 0 on success, -errno on failure. + */ +static int init_worker_pool(struct worker_pool *pool) { spin_lock_init(&pool->lock); pool->flags |= POOL_DISASSOCIATED; @@ -3141,6 +3186,11 @@ static void init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) @@ -3792,7 +3842,8 @@ out_unlock: static int __init init_workqueues(void) { - int cpu; + int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int i, cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < @@ -3809,10 +3860,18 @@ static int __init init_workqueues(void) for_each_wq_cpu(cpu) { struct worker_pool *pool; + i = 0; for_each_std_worker_pool(pool, cpu) { - init_worker_pool(pool); + BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; + if (cpu != WORK_CPU_UNBOUND) + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + else + cpumask_setall(pool->attrs->cpumask); + + pool->attrs->nice = std_nice[i++]; + /* alloc pool ID */ BUG_ON(worker_pool_assign_id(pool)); } -- cgit v1.2.3 From 493008a8e475771a2126e0ce95a73e35b371d277 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: workqueue: drop WQ_RESCUER and test workqueue->rescuer for NULL instead WQ_RESCUER is superflous. WQ_MEM_RECLAIM indicates that the user wants a rescuer and testing wq->rescuer for NULL can answer whether a given workqueue has a rescuer or not. 
Drop WQ_RESCUER and test wq->rescuer directly. This will help simplifying __alloc_workqueue_key() failure path by allowing it to use destroy_workqueue() on a partially constructed workqueue, which in turn will help implementing dynamic management of pool_workqueues. While at it, clear wq->rescuer after freeing it in destroy_workqueue(). This is a precaution as scheduled changes will make destruction more complex. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 1 - kernel/workqueue.c | 22 ++++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 00c1b9ba8252..c270b4eedf16 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,7 +295,6 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ - WQ_RESCUER = 1 << 7, /* internal: workqueue has rescuer */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a8b86f7b6e34..7ff2b9c5cc3a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1827,7 +1827,7 @@ static void send_mayday(struct work_struct *work) lockdep_assert_held(&workqueue_lock); - if (!(wq->flags & WQ_RESCUER)) + if (!wq->rescuer) return; /* mayday mayday mayday */ @@ -2285,7 +2285,7 @@ sleep: * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each - * workqueue which has WQ_RESCUER set. + * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of @@ -2769,7 +2769,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) lock_map_acquire(&pwq->wq->lockdep_map); else lock_map_acquire_read(&pwq->wq->lockdep_map); @@ -3412,13 +3412,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, va_end(args); va_end(args1); - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) - flags |= WQ_RESCUER; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3449,7 +3442,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } local_irq_enable(); - if (flags & WQ_RESCUER) { + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. 
+ */ + if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; wq->rescuer = rescuer = alloc_worker(); @@ -3533,9 +3530,10 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); - if (wq->flags & WQ_RESCUER) { + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); + wq->rescuer = NULL; } /* -- cgit v1.2.3 From 9e8cd2f5898ab6710ad81f4583fada08bf8049a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: implement apply_workqueue_attrs() Implement apply_workqueue_attrs() which applies workqueue_attrs to the specified unbound workqueue by creating a new pwq (pool_workqueue) linked to worker_pool with the specified attributes. A new pwq is linked at the head of wq->pwqs instead of tail and __queue_work() verifies that the first unbound pwq has positive refcnt before choosing it for the actual queueing. This is to cover the case where creation of a new pwq races with queueing. As base ref on a pwq won't be dropped without making another pwq the first one, __queue_work() is guaranteed to make progress and not add work item to a dead pwq. init_and_link_pwq() is updated to return the last first pwq the new pwq replaced, which is put by apply_workqueue_attrs(). Note that apply_workqueue_attrs() is almost identical to unbound pwq part of alloc_and_link_pwqs(). The only difference is that there is no previous first pwq. apply_workqueue_attrs() is implemented to handle such cases and replaces unbound pwq handling in alloc_and_link_pwqs(). Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 91 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 73 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c270b4eedf16..e152394fa7eb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,6 +410,8 @@ extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); void free_workqueue_attrs(struct workqueue_attrs *attrs); +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16fb6747276a..2a67fbbd192c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1228,7 +1228,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; - +retry: /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) { if (cpu == WORK_CPU_UNBOUND) @@ -1262,6 +1262,25 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, spin_lock(&pwq->pool->lock); } + /* + * pwq is determined and locked. For unbound pools, we could have + * raced with pwq release and it could already be dead. If its + * refcnt is zero, repeat pwq selection. Note that pwqs never die + * without another pwq replacing it as the first pwq or while a + * work item is executing on it, so the retying is guaranteed to + * make forward-progress. 
+ */ + if (unlikely(!pwq->refcnt)) { + if (wq->flags & WQ_UNBOUND) { + spin_unlock(&pwq->pool->lock); + cpu_relax(); + goto retry; + } + /* oops */ + WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", + wq->name, cpu); + } + /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); @@ -3425,7 +3444,8 @@ static void pwq_unbound_release_workfn(struct work_struct *work) static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, - struct worker_pool *pool) + struct worker_pool *pool, + struct pool_workqueue **p_last_pwq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); @@ -3445,13 +3465,58 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); + if (p_last_pwq) + *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&workqueue_lock); mutex_unlock(&wq->flush_mutex); } +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. If @attrs doesn't match the + * current attributes, a new pwq is created and made the first pwq which + * will serve all new work items. Older pwqs are released as in-flight + * work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on + * failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct pool_workqueue *pwq, *last_pwq; + struct worker_pool *pool; + + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) + return -ENOMEM; + + pool = get_unbound_pool(attrs); + if (!pool) { + kmem_cache_free(pwq_cache, pwq); + return -ENOMEM; + } + + init_and_link_pwq(pwq, wq, pool, &last_pwq); + if (last_pwq) { + spin_lock_irq(&last_pwq->pool->lock); + put_pwq(last_pwq); + spin_unlock_irq(&last_pwq->pool->lock); + } + + return 0; +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3468,26 +3533,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - init_and_link_pwq(pwq, wq, &cpu_pools[highpri]); + init_and_link_pwq(pwq, wq, &cpu_pools[highpri], NULL); } + return 0; } else { - struct pool_workqueue *pwq; - struct worker_pool *pool; - - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) - return -ENOMEM; - - pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); - if (!pool) { - kmem_cache_free(pwq_cache, pwq); - return -ENOMEM; - } - - init_and_link_pwq(pwq, wq, pool); + return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - - return 0; } static int wq_clamp_max_active(int max_active, unsigned int flags, -- cgit v1.2.3 From 618b01eb426dd2d73a4b5e5ebc6379e4eee3b123 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: make it clear that WQ_DRAINING is an internal flag We're gonna add another internal WQ flag. Let's make the distinction clear. Prefix WQ_DRAINING with __ and move it to bit 16. 
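Stepping back to apply_workqueue_attrs() from the patch above, a hypothetical caller could look like the sketch below. The function name, CPU number and nice value are illustrative; as the sysfs store helpers added later in this series do, the caller keeps ownership of the attrs and frees them after applying.

/* Sketch: retune an unbound workqueue using the interface added above. */
static int example_retune(struct workqueue_struct *unbound_wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	attrs->nice = 10;				/* run the workers at low priority */
	cpumask_copy(attrs->cpumask, cpumask_of(0));	/* confine them to CPU 0 */

	ret = apply_workqueue_attrs(unbound_wq, attrs);	/* -EINVAL unless WQ_UNBOUND */
	free_workqueue_attrs(attrs);
	return ret;
}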
Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index e152394fa7eb..1751ec4c47c9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -294,7 +294,7 @@ enum { WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ + __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2a67fbbd192c..590f4d048ec7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1225,7 +1225,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DRAINING) && + if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; retry: @@ -2763,11 +2763,11 @@ void drain_workqueue(struct workqueue_struct *wq) /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. - * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ spin_lock_irq(&workqueue_lock); if (!wq->nr_drainers++) - wq->flags |= WQ_DRAINING; + wq->flags |= __WQ_DRAINING; spin_unlock_irq(&workqueue_lock); reflush: flush_workqueue(wq); @@ -2795,7 +2795,7 @@ reflush: spin_lock(&workqueue_lock); if (!--wq->nr_drainers) - wq->flags &= ~WQ_DRAINING; + wq->flags &= ~__WQ_DRAINING; spin_unlock(&workqueue_lock); local_irq_enable(); -- cgit v1.2.3 From 8719dceae2f98a578507c0f6b49c93f320bd729c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: reject adjusting max_active or applying attrs to ordered workqueues Adjusting max_active of or applying new workqueue_attrs to an ordered workqueue breaks its ordering guarantee. The former is obvious. The latter is because applying attrs creates a new pwq (pool_workqueue) and there is no ordering constraint between the old and new pwqs. Make apply_workqueue_attrs() and workqueue_set_max_active() trigger WARN_ON() if those operations are requested on an ordered workqueue and fail / ignore respectively. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 3 ++- kernel/workqueue.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1751ec4c47c9..5668ab249af5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,6 +295,7 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ + __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -397,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) 
\ - alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 590f4d048ec7..cecd4ffe2c40 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3494,9 +3494,14 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, struct pool_workqueue *pwq, *last_pwq; struct worker_pool *pool; + /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; + /* creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) return -ENOMEM; @@ -3752,6 +3757,10 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { struct pool_workqueue *pwq; + /* disallow meddling with max_active for ordered workqueues */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return; + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); spin_lock_irq(&workqueue_lock); -- cgit v1.2.3 From ba630e4940924ad1962883c207a62890778ced63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: cpumask: implement cpumask_parse() We have cpulist_parse() but not cpumask_parse(). Implement it using bitmap_parse(). bitmap_parse() is weird in that it takes @len for a string in kernel-memory which also is inconsistent with bitmap_parselist(). Make cpumask_parse() calculate the length and don't expose the inconsistency to cpumask users. Maybe we can fix up bitmap_parse() later. This will be used to expose workqueue cpumask knobs to userland via sysfs. Signed-off-by: Tejun Heo Cc: Rusty Russell --- include/linux/cpumask.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 032560295fcb..d08e4d2a9b92 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -590,6 +590,21 @@ static inline int cpulist_scnprintf(char *buf, int len, nr_cpumask_bits); } +/** + * cpumask_parse - extract a cpumask from from a string + * @buf: the buffer to extract from + * @dstp: the cpumask to set. + * + * Returns -errno, or 0 for success. + */ +static inline int cpumask_parse(const char *buf, struct cpumask *dstp) +{ + char *nl = strchr(buf, '\n'); + int len = nl ? nl - buf : strlen(buf); + + return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); +} + /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from -- cgit v1.2.3 From d73ce004225a7b2ed75f4340bb63721d55552265 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:05 -0700 Subject: driver/base: implement subsys_virtual_register() Kay tells me the most appropriate place to expose workqueues to userland would be /sys/devices/virtual/workqueues/WQ_NAME which is symlinked to /sys/bus/workqueue/devices/WQ_NAME and that we're lacking a way to do that outside of driver core as virtual_device_parent() isn't exported and there's no inteface to conveniently create a virtual subsystem. This patch implements subsys_virtual_register() by factoring out subsys_register() from subsys_system_register() and using it with virtual_device_parent() as the origin directory. 
It's identical to subsys_system_register() other than the origin directory but we aren't gonna restrict the device names which should be used under it. This will be used to expose workqueue attributes to userland. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Kay Sievers --- drivers/base/base.h | 2 ++ drivers/base/bus.c | 73 +++++++++++++++++++++++++++++++++++--------------- drivers/base/core.c | 2 +- include/linux/device.h | 2 ++ 4 files changed, 57 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 6ee17bb391a9..b8bdfe61daa6 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -101,6 +101,8 @@ static inline int hypervisor_init(void) { return 0; } extern int platform_bus_init(void); extern void cpu_dev_init(void); +struct kobject *virtual_device_parent(struct device *dev); + extern int bus_add_device(struct device *dev); extern void bus_probe_device(struct device *dev); extern void bus_remove_device(struct device *dev); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 519865b53f76..2ae2d2f92b6b 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -1205,26 +1205,10 @@ static void system_root_device_release(struct device *dev) { kfree(dev); } -/** - * subsys_system_register - register a subsystem at /sys/devices/system/ - * @subsys: system subsystem - * @groups: default attributes for the root device - * - * All 'system' subsystems have a /sys/devices/system/ root device - * with the name of the subsystem. The root device can carry subsystem- - * wide attributes. All registered devices are below this single root - * device and are named after the subsystem with a simple enumeration - * number appended. The registered devices are not explicitely named; - * only 'id' in the device needs to be set. - * - * Do not use this interface for anything new, it exists for compatibility - * with bad ideas only. New subsystems should use plain subsystems; and - * add the subsystem-wide attributes should be added to the subsystem - * directory itself and not some create fake root-device placed in - * /sys/devices/system/. - */ -int subsys_system_register(struct bus_type *subsys, - const struct attribute_group **groups) + +static int subsys_register(struct bus_type *subsys, + const struct attribute_group **groups, + struct kobject *parent_of_root) { struct device *dev; int err; @@ -1243,7 +1227,7 @@ int subsys_system_register(struct bus_type *subsys, if (err < 0) goto err_name; - dev->kobj.parent = &system_kset->kobj; + dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; @@ -1263,8 +1247,55 @@ err_dev: bus_unregister(subsys); return err; } + +/** + * subsys_system_register - register a subsystem at /sys/devices/system/ + * @subsys: system subsystem + * @groups: default attributes for the root device + * + * All 'system' subsystems have a /sys/devices/system/ root device + * with the name of the subsystem. The root device can carry subsystem- + * wide attributes. All registered devices are below this single root + * device and are named after the subsystem with a simple enumeration + * number appended. The registered devices are not explicitely named; + * only 'id' in the device needs to be set. + * + * Do not use this interface for anything new, it exists for compatibility + * with bad ideas only. 
New subsystems should use plain subsystems; and + * add the subsystem-wide attributes should be added to the subsystem + * directory itself and not some create fake root-device placed in + * /sys/devices/system/. + */ +int subsys_system_register(struct bus_type *subsys, + const struct attribute_group **groups) +{ + return subsys_register(subsys, groups, &system_kset->kobj); +} EXPORT_SYMBOL_GPL(subsys_system_register); +/** + * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ + * @subsys: virtual subsystem + * @groups: default attributes for the root device + * + * All 'virtual' subsystems have a /sys/devices/system/ root device + * with the name of the subystem. The root device can carry subsystem-wide + * attributes. All registered devices are below this single root device. + * There's no restriction on device naming. This is for kernel software + * constructs which need sysfs interface. + */ +int subsys_virtual_register(struct bus_type *subsys, + const struct attribute_group **groups) +{ + struct kobject *virtual_dir; + + virtual_dir = virtual_device_parent(NULL); + if (!virtual_dir) + return -ENOMEM; + + return subsys_register(subsys, groups, virtual_dir); +} + int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); diff --git a/drivers/base/core.c b/drivers/base/core.c index 56536f4b0f6b..f58084a86e8c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -690,7 +690,7 @@ void device_initialize(struct device *dev) set_dev_node(dev, -1); } -static struct kobject *virtual_device_parent(struct device *dev) +struct kobject *virtual_device_parent(struct device *dev) { static struct kobject *virtual_dir = NULL; diff --git a/include/linux/device.h b/include/linux/device.h index 9d6464ea99c6..ee10d4e7be1a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -302,6 +302,8 @@ void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups); +int subsys_virtual_register(struct bus_type *subsys, + const struct attribute_group **groups); /** * struct class - device classes -- cgit v1.2.3 From 226223ab3c4118ddd10688cc2c131135848371ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:05 -0700 Subject: workqueue: implement sysfs interface for workqueues There are cases where workqueue users want to expose control knobs to userland. e.g. Unbound workqueues with custom attributes are scheduled to be used for writeback workers and depending on configuration it can be useful to allow admins to tinker with the priority or allowed CPUs. This patch implements workqueue_sysfs_register(), which makes the workqueue visible under /sys/bus/workqueue/devices/WQ_NAME. There currently are two attributes common to both per-cpu and unbound pools and extra attributes for unbound pools including nice level and cpumask. If alloc_workqueue*() is called with WQ_SYSFS, workqueue_sysfs_register() is called automatically as part of workqueue creation. This is the preferred method unless the workqueue user wants to apply workqueue_attrs before making the workqueue visible to userland. v2: Disallow exposing ordered workqueues as ordered workqueues can't be tuned in any way. 
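As a sketch of how a workqueue user would opt into this interface (the workqueue name is illustrative; per the description above, alloc_workqueue*() calls workqueue_sysfs_register() automatically when WQ_SYSFS is set):

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
	/*
	 * With WQ_SYSFS, per_cpu and max_active (plus pool_id, nice and
	 * cpumask for unbound workqueues) appear under
	 * /sys/bus/workqueue/devices/example_wq/.
	 */
	example_wq = alloc_workqueue("example_wq", WQ_UNBOUND | WQ_SYSFS, 0);
	return example_wq ? 0 : -ENOMEM;
}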
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 ++ kernel/workqueue.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5668ab249af5..7f6d29a417c0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -293,6 +293,7 @@ enum { WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ @@ -495,4 +496,11 @@ extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ +#ifdef CONFIG_SYSFS +int workqueue_sysfs_register(struct workqueue_struct *wq); +#else /* CONFIG_SYSFS */ +static inline int workqueue_sysfs_register(struct workqueue_struct *wq) +{ return 0; } +#endif /* CONFIG_SYSFS */ + #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cecd4ffe2c40..c82feac0a878 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -210,6 +210,8 @@ struct wq_flusher { struct completion done; /* flush completion */ }; +struct wq_device; + /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: @@ -233,6 +235,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved pwq max_active */ + +#ifdef CONFIG_SYSFS + struct wq_device *wq_dev; /* I: for sysfs interface */ +#endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -442,6 +448,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) @@ -3153,6 +3161,281 @@ int keventd_up(void) return system_wq != NULL; } +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. 
+ * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t wq_per_cpu_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} + +static ssize_t wq_max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t wq_max_active_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} + +static struct device_attribute wq_sysfs_attrs[] = { + __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), + __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), + __ATTR_NULL, +}; + +static ssize_t wq_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct worker_pool *pool; + int written; + + rcu_read_lock_sched(); + pool = first_pwq(wq)->pool; + written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + first_pwq(wq)->pool->attrs->nice); + rcu_read_unlock_sched(); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + rcu_read_lock_sched(); + copy_workqueue_attrs(attrs, first_pwq(wq)->pool->attrs); + rcu_read_unlock_sched(); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= -20 && attrs->nice <= 19) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = cpumask_scnprintf(buf, PAGE_SIZE, + first_pwq(wq)->pool->attrs->cpumask); + rcu_read_unlock_sched(); + + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = 
wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_attrs = wq_sysfs_attrs, +}; + +static int __init wq_sysfs_init(void) +{ + return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Returns 0 on success, -errno on failure. + */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; + + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free @@ -3625,6 +3908,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wake_up_process(rescuer->task); } + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; + /* * workqueue_lock protects global freeze state and workqueues * list. 
Grab it, set max_active accordingly and add the new @@ -3693,6 +3979,8 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); + workqueue_sysfs_unregister(wq); + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); -- cgit v1.2.3 From e62676169118bc2d42e5008b3f8872646313f077 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 17:41:37 -0700 Subject: workqueue: implement current_is_workqueue_rescuer() Implement a function which queries whether it currently is running off a workqueue rescuer. This will be used to convert writeback to workqueue. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7f6d29a417c0..df30763c8682 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -451,6 +451,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); +extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c82feac0a878..f5c8bbb9ada3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4071,6 +4071,19 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) } EXPORT_SYMBOL_GPL(workqueue_set_max_active); +/** + * current_is_workqueue_rescuer - is %current workqueue rescuer? + * + * Determine whether %current is a workqueue rescuer. Can be used from + * work functions to determine whether it's being run off the rescuer task. + */ +bool current_is_workqueue_rescuer(void) +{ + struct worker *worker = current_wq_worker(); + + return worker && worker == worker->current_pwq->wq->rescuer; +} + /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question -- cgit v1.2.3 From 0ec83bd2460ed6aed0e7f29f9e0633b054621c02 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 13 Mar 2013 17:38:57 +0900 Subject: extcon: max77693: Initialize register of MUIC device to bring up it without platform data This patch set default value of MUIC register to bring up MUIC device. If user don't set some initial value for MUIC device through platform data, extcon-max77693 driver use 'default_init_data' to bring up base operation of MAX77693 MUIC device. Signed-off-by: Chanwoo Choi Signed-off-by: Myungjoo Ham --- drivers/extcon/extcon-max77693.c | 93 ++++++++++++++++++++++++++---------- include/linux/mfd/max77693-private.h | 23 +++++++++ 2 files changed, 91 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c index fea10624f3e5..8f3c947b0029 100644 --- a/drivers/extcon/extcon-max77693.c +++ b/drivers/extcon/extcon-max77693.c @@ -32,6 +32,38 @@ #define DEV_NAME "max77693-muic" #define DELAY_MS_DEFAULT 20000 /* unit: millisecond */ +/* + * Default value of MAX77693 register to bring up MUIC device. + * If user don't set some initial value for MUIC device through platform data, + * extcon-max77693 driver use 'default_init_data' to bring up base operation + * of MAX77693 MUIC device. 
+ */ +struct max77693_reg_data default_init_data[] = { + { + /* STATUS2 - [3]ChgDetRun */ + .addr = MAX77693_MUIC_REG_STATUS2, + .data = STATUS2_CHGDETRUN_MASK, + }, { + /* INTMASK1 - Unmask [3]ADC1KM,[0]ADCM */ + .addr = MAX77693_MUIC_REG_INTMASK1, + .data = INTMASK1_ADC1K_MASK + | INTMASK1_ADC_MASK, + }, { + /* INTMASK2 - Unmask [0]ChgTypM */ + .addr = MAX77693_MUIC_REG_INTMASK2, + .data = INTMASK2_CHGTYP_MASK, + }, { + /* INTMASK3 - Mask all of interrupts */ + .addr = MAX77693_MUIC_REG_INTMASK3, + .data = 0x0, + }, { + /* CDETCTRL2 */ + .addr = MAX77693_MUIC_REG_CDETCTRL2, + .data = CDETCTRL2_VIDRMEN_MASK + | CDETCTRL2_DXOVPEN_MASK, + }, +}; + enum max77693_muic_adc_debounce_time { ADC_DEBOUNCE_TIME_5MS = 0, ADC_DEBOUNCE_TIME_10MS, @@ -1046,6 +1078,8 @@ static int max77693_muic_probe(struct platform_device *pdev) struct max77693_dev *max77693 = dev_get_drvdata(pdev->dev.parent); struct max77693_platform_data *pdata = dev_get_platdata(max77693->dev); struct max77693_muic_info *info; + struct max77693_reg_data *init_data; + int num_init_data; int delay_jiffies; int ret; int i; @@ -1144,35 +1178,44 @@ static int max77693_muic_probe(struct platform_device *pdev) goto err_irq; } + + /* Initialize MUIC register by using platform data or default data */ if (pdata->muic_data) { - struct max77693_muic_platform_data *muic_pdata = pdata->muic_data; + init_data = pdata->muic_data->init_data; + num_init_data = pdata->muic_data->num_init_data; + } else { + init_data = default_init_data; + num_init_data = ARRAY_SIZE(default_init_data); + } + + for (i = 0 ; i < num_init_data ; i++) { + enum max77693_irq_source irq_src + = MAX77693_IRQ_GROUP_NR; - /* Initialize MUIC register by using platform data */ - for (i = 0 ; i < muic_pdata->num_init_data ; i++) { - enum max77693_irq_source irq_src - = MAX77693_IRQ_GROUP_NR; - - max77693_write_reg(info->max77693->regmap_muic, - muic_pdata->init_data[i].addr, - muic_pdata->init_data[i].data); - - switch (muic_pdata->init_data[i].addr) { - case MAX77693_MUIC_REG_INTMASK1: - irq_src = MUIC_INT1; - break; - case MAX77693_MUIC_REG_INTMASK2: - irq_src = MUIC_INT2; - break; - case MAX77693_MUIC_REG_INTMASK3: - irq_src = MUIC_INT3; - break; - } - - if (irq_src < MAX77693_IRQ_GROUP_NR) - info->max77693->irq_masks_cur[irq_src] - = muic_pdata->init_data[i].data; + max77693_write_reg(info->max77693->regmap_muic, + init_data[i].addr, + init_data[i].data); + + switch (init_data[i].addr) { + case MAX77693_MUIC_REG_INTMASK1: + irq_src = MUIC_INT1; + break; + case MAX77693_MUIC_REG_INTMASK2: + irq_src = MUIC_INT2; + break; + case MAX77693_MUIC_REG_INTMASK3: + irq_src = MUIC_INT3; + break; } + if (irq_src < MAX77693_IRQ_GROUP_NR) + info->max77693->irq_masks_cur[irq_src] + = init_data[i].data; + } + + if (pdata->muic_data) { + struct max77693_muic_platform_data *muic_pdata = pdata->muic_data; + /* * Default usb/uart path whether UART/USB or AUX_UART/AUX_USB * h/w path of COMP2/COMN1 on CONTROL1 register. 
diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index 5b18ecde69b5..1aa4f13cdfa6 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -106,6 +106,29 @@ enum max77693_muic_reg { MAX77693_MUIC_REG_END, }; +/* MAX77693 INTMASK1~2 Register */ +#define INTMASK1_ADC1K_SHIFT 3 +#define INTMASK1_ADCERR_SHIFT 2 +#define INTMASK1_ADCLOW_SHIFT 1 +#define INTMASK1_ADC_SHIFT 0 +#define INTMASK1_ADC1K_MASK (1 << INTMASK1_ADC1K_SHIFT) +#define INTMASK1_ADCERR_MASK (1 << INTMASK1_ADCERR_SHIFT) +#define INTMASK1_ADCLOW_MASK (1 << INTMASK1_ADCLOW_SHIFT) +#define INTMASK1_ADC_MASK (1 << INTMASK1_ADC_SHIFT) + +#define INTMASK2_VIDRM_SHIFT 5 +#define INTMASK2_VBVOLT_SHIFT 4 +#define INTMASK2_DXOVP_SHIFT 3 +#define INTMASK2_DCDTMR_SHIFT 2 +#define INTMASK2_CHGDETRUN_SHIFT 1 +#define INTMASK2_CHGTYP_SHIFT 0 +#define INTMASK2_VIDRM_MASK (1 << INTMASK2_VIDRM_SHIFT) +#define INTMASK2_VBVOLT_MASK (1 << INTMASK2_VBVOLT_SHIFT) +#define INTMASK2_DXOVP_MASK (1 << INTMASK2_DXOVP_SHIFT) +#define INTMASK2_DCDTMR_MASK (1 << INTMASK2_DCDTMR_SHIFT) +#define INTMASK2_CHGDETRUN_MASK (1 << INTMASK2_CHGDETRUN_SHIFT) +#define INTMASK2_CHGTYP_MASK (1 << INTMASK2_CHGTYP_SHIFT) + /* MAX77693 MUIC - STATUS1~3 Register */ #define STATUS1_ADC_SHIFT (0) #define STATUS1_ADCLOW_SHIFT (5) -- cgit v1.2.3 From 8425e3d5bdbe8e741d2c73cf3189ed59b4038b84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: workqueue: inline trivial wrappers There's no reason to make these trivial wrappers full (exported) functions. Inline the followings. queue_work() queue_delayed_work() mod_delayed_work() schedule_work_on() schedule_work() schedule_delayed_work_on() schedule_delayed_work() keventd_up() Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 123 +++++++++++++++++++++++++++++++++++++++++----- kernel/workqueue.c | 111 ----------------------------------------- 2 files changed, 111 insertions(+), 123 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index df30763c8682..835d12b76960 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -417,28 +417,16 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); -extern bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); -extern bool mod_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); -extern bool schedule_work_on(int cpu, struct work_struct *work); -extern bool schedule_work(struct work_struct *work); -extern bool schedule_delayed_work_on(int cpu, struct delayed_work *work, - unsigned long delay); -extern bool schedule_delayed_work(struct delayed_work *work, - unsigned long delay); extern int schedule_on_each_cpu(work_func_t func); -extern int keventd_up(void); int execute_in_process_context(work_func_t fn, struct execute_work *); @@ -455,6 +443,117 @@ 
extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns %false if @work was already on a queue, %true otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. + */ +static inline bool queue_work(struct workqueue_struct *wq, + struct work_struct *work) +{ + return queue_work_on(WORK_CPU_UNBOUND, wq, work); +} + +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. + */ +static inline bool queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +static inline bool mod_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +static inline bool schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, system_wq, work); +} + +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. + */ +static inline bool schedule_work(struct work_struct *work) +{ + return queue_work(system_wq, work); +} + +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @dwork: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ +static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, system_wq, dwork, delay); +} + +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +static inline bool schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work(system_wq, dwork, delay); +} + +/** + * keventd_up - is workqueue initialized yet? + */ +static inline bool keventd_up(void) +{ + return system_wq != NULL; +} + /* * Like above, but uses del_timer() instead of del_timer_sync(). 
This means, * if it returns 0 the timer function may be running and the queueing is in diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 147fc5a784f0..f37421fb4f35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1340,22 +1340,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_work_on); -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns %false if @work was already on a queue, %true otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -bool queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - return queue_work_on(WORK_CPU_UNBOUND, wq, work); -} -EXPORT_SYMBOL_GPL(queue_work); - void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -1430,21 +1414,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Equivalent to queue_delayed_work_on() but tries to use the local CPU. - */ -bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on @@ -1483,21 +1452,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(mod_delayed_work_on); -/** - * mod_delayed_work - modify delay of or queue a delayed work - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * mod_delayed_work_on() on local CPU. - */ -bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, - unsigned long delay) -{ - return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(mod_delayed_work); - /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -3001,66 +2955,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -/** - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -bool schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns %false if @work was already on the kernel-global workqueue and - * %true otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -bool schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. 
- */ -bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. - */ -bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call @@ -3154,11 +3048,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -int keventd_up(void) -{ - return system_wq != NULL; -} - #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via -- cgit v1.2.3 From 5bc7c33ca93a285dcfe7b7fd64970f6314440ad1 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Wed, 13 Mar 2013 09:51:31 -0700 Subject: mtd: nand: reintroduce NAND_NO_READRDY as NAND_NEED_READRDY This partially reverts commit 1696e6bc2ae83734e64e206ac99766ea19e9a14e ("mtd: nand: kill NAND_NO_READRDY"). In that patch I overlooked a few things. The original documentation for NAND_NO_READRDY included "True for all large page devices, as they do not support autoincrement." I was conflating "not support autoincrement" with the NAND_NO_AUTOINCR option, which was in fact doing nothing. So, when I dropped NAND_NO_AUTOINCR, I concluded that I then could harmlessly drop NAND_NO_READRDY. But of course the fact the NAND_NO_AUTOINCR was doing nothing didn't mean NAND_NO_READRDY was doing nothing... So, NAND_NO_READRDY is re-introduced as NAND_NEED_READRDY and applied only to those few remaining small-page NAND which needed it in the first place. 
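[Editor's note: illustrative sketch only, not part of the patch. The reintroduced flag is consumed by the core read paths roughly as follows; the affected legacy parts pick it up through the SP_OPTIONS entries added to nand_flash_ids[] in the diff below:

        /* after transferring a page from a small-page chip (sketch) */
        if (chip->options & NAND_NEED_READRDY) {
                if (!chip->dev_ready)
                        udelay(chip->chip_delay);       /* no ready/busy line available */
                else
                        nand_wait_ready(mtd);           /* wait for R/B# to go ready */
        }

Without this wait, the next sequential read on such chips can be issued before the device is ready again, which is the regression being fixed here.]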
Cc: stable@kernel.org [3.5+] Reported-by: Alexander Shiyan Tested-by: Alexander Shiyan Signed-off-by: Brian Norris Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_base.c | 16 +++++++++ drivers/mtd/nand/nand_ids.c | 80 +++++++++++++++++++++++--------------------- include/linux/mtd/nand.h | 7 ++++ 3 files changed, 64 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 43214151b882..42c63927609d 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1523,6 +1523,14 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, oobreadlen -= toread; } } + + if (chip->options & NAND_NEED_READRDY) { + /* Apply delay or wait for ready/busy pin */ + if (!chip->dev_ready) + udelay(chip->chip_delay); + else + nand_wait_ready(mtd); + } } else { memcpy(buf, chip->buffers->databuf + col, bytes); buf += bytes; @@ -1787,6 +1795,14 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from, len = min(len, readlen); buf = nand_transfer_oob(chip, buf, ops, len); + if (chip->options & NAND_NEED_READRDY) { + /* Apply delay or wait for ready/busy pin */ + if (!chip->dev_ready) + udelay(chip->chip_delay); + else + nand_wait_ready(mtd); + } + readlen -= len; if (!readlen) break; diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index e3aa2748a6e7..9c612388e5de 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -22,49 +22,51 @@ * 512 512 Byte page size */ struct nand_flash_dev nand_flash_ids[] = { +#define SP_OPTIONS NAND_NEED_READRDY +#define SP_OPTIONS16 (SP_OPTIONS | NAND_BUSWIDTH_16) #ifdef CONFIG_MTD_NAND_MUSEUM_IDS - {"NAND 1MiB 5V 8-bit", 0x6e, 256, 1, 0x1000, 0}, - {"NAND 2MiB 5V 8-bit", 0x64, 256, 2, 0x1000, 0}, - {"NAND 4MiB 5V 8-bit", 0x6b, 512, 4, 0x2000, 0}, - {"NAND 1MiB 3,3V 8-bit", 0xe8, 256, 1, 0x1000, 0}, - {"NAND 1MiB 3,3V 8-bit", 0xec, 256, 1, 0x1000, 0}, - {"NAND 2MiB 3,3V 8-bit", 0xea, 256, 2, 0x1000, 0}, - {"NAND 4MiB 3,3V 8-bit", 0xd5, 512, 4, 0x2000, 0}, - {"NAND 4MiB 3,3V 8-bit", 0xe3, 512, 4, 0x2000, 0}, - {"NAND 4MiB 3,3V 8-bit", 0xe5, 512, 4, 0x2000, 0}, - {"NAND 8MiB 3,3V 8-bit", 0xd6, 512, 8, 0x2000, 0}, - - {"NAND 8MiB 1,8V 8-bit", 0x39, 512, 8, 0x2000, 0}, - {"NAND 8MiB 3,3V 8-bit", 0xe6, 512, 8, 0x2000, 0}, - {"NAND 8MiB 1,8V 16-bit", 0x49, 512, 8, 0x2000, NAND_BUSWIDTH_16}, - {"NAND 8MiB 3,3V 16-bit", 0x59, 512, 8, 0x2000, NAND_BUSWIDTH_16}, + {"NAND 1MiB 5V 8-bit", 0x6e, 256, 1, 0x1000, SP_OPTIONS}, + {"NAND 2MiB 5V 8-bit", 0x64, 256, 2, 0x1000, SP_OPTIONS}, + {"NAND 4MiB 5V 8-bit", 0x6b, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 1MiB 3,3V 8-bit", 0xe8, 256, 1, 0x1000, SP_OPTIONS}, + {"NAND 1MiB 3,3V 8-bit", 0xec, 256, 1, 0x1000, SP_OPTIONS}, + {"NAND 2MiB 3,3V 8-bit", 0xea, 256, 2, 0x1000, SP_OPTIONS}, + {"NAND 4MiB 3,3V 8-bit", 0xd5, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 4MiB 3,3V 8-bit", 0xe3, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 4MiB 3,3V 8-bit", 0xe5, 512, 4, 0x2000, SP_OPTIONS}, + {"NAND 8MiB 3,3V 8-bit", 0xd6, 512, 8, 0x2000, SP_OPTIONS}, + + {"NAND 8MiB 1,8V 8-bit", 0x39, 512, 8, 0x2000, SP_OPTIONS}, + {"NAND 8MiB 3,3V 8-bit", 0xe6, 512, 8, 0x2000, SP_OPTIONS}, + {"NAND 8MiB 1,8V 16-bit", 0x49, 512, 8, 0x2000, SP_OPTIONS16}, + {"NAND 8MiB 3,3V 16-bit", 0x59, 512, 8, 0x2000, SP_OPTIONS16}, #endif - {"NAND 16MiB 1,8V 8-bit", 0x33, 512, 16, 0x4000, 0}, - {"NAND 16MiB 3,3V 8-bit", 0x73, 512, 16, 0x4000, 0}, - {"NAND 16MiB 1,8V 16-bit", 0x43, 512, 16, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 16MiB 3,3V 
16-bit", 0x53, 512, 16, 0x4000, NAND_BUSWIDTH_16}, - - {"NAND 32MiB 1,8V 8-bit", 0x35, 512, 32, 0x4000, 0}, - {"NAND 32MiB 3,3V 8-bit", 0x75, 512, 32, 0x4000, 0}, - {"NAND 32MiB 1,8V 16-bit", 0x45, 512, 32, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 32MiB 3,3V 16-bit", 0x55, 512, 32, 0x4000, NAND_BUSWIDTH_16}, - - {"NAND 64MiB 1,8V 8-bit", 0x36, 512, 64, 0x4000, 0}, - {"NAND 64MiB 3,3V 8-bit", 0x76, 512, 64, 0x4000, 0}, - {"NAND 64MiB 1,8V 16-bit", 0x46, 512, 64, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 64MiB 3,3V 16-bit", 0x56, 512, 64, 0x4000, NAND_BUSWIDTH_16}, - - {"NAND 128MiB 1,8V 8-bit", 0x78, 512, 128, 0x4000, 0}, - {"NAND 128MiB 1,8V 8-bit", 0x39, 512, 128, 0x4000, 0}, - {"NAND 128MiB 3,3V 8-bit", 0x79, 512, 128, 0x4000, 0}, - {"NAND 128MiB 1,8V 16-bit", 0x72, 512, 128, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 128MiB 1,8V 16-bit", 0x49, 512, 128, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 128MiB 3,3V 16-bit", 0x74, 512, 128, 0x4000, NAND_BUSWIDTH_16}, - {"NAND 128MiB 3,3V 16-bit", 0x59, 512, 128, 0x4000, NAND_BUSWIDTH_16}, - - {"NAND 256MiB 3,3V 8-bit", 0x71, 512, 256, 0x4000, 0}, + {"NAND 16MiB 1,8V 8-bit", 0x33, 512, 16, 0x4000, SP_OPTIONS}, + {"NAND 16MiB 3,3V 8-bit", 0x73, 512, 16, 0x4000, SP_OPTIONS}, + {"NAND 16MiB 1,8V 16-bit", 0x43, 512, 16, 0x4000, SP_OPTIONS16}, + {"NAND 16MiB 3,3V 16-bit", 0x53, 512, 16, 0x4000, SP_OPTIONS16}, + + {"NAND 32MiB 1,8V 8-bit", 0x35, 512, 32, 0x4000, SP_OPTIONS}, + {"NAND 32MiB 3,3V 8-bit", 0x75, 512, 32, 0x4000, SP_OPTIONS}, + {"NAND 32MiB 1,8V 16-bit", 0x45, 512, 32, 0x4000, SP_OPTIONS16}, + {"NAND 32MiB 3,3V 16-bit", 0x55, 512, 32, 0x4000, SP_OPTIONS16}, + + {"NAND 64MiB 1,8V 8-bit", 0x36, 512, 64, 0x4000, SP_OPTIONS}, + {"NAND 64MiB 3,3V 8-bit", 0x76, 512, 64, 0x4000, SP_OPTIONS}, + {"NAND 64MiB 1,8V 16-bit", 0x46, 512, 64, 0x4000, SP_OPTIONS16}, + {"NAND 64MiB 3,3V 16-bit", 0x56, 512, 64, 0x4000, SP_OPTIONS16}, + + {"NAND 128MiB 1,8V 8-bit", 0x78, 512, 128, 0x4000, SP_OPTIONS}, + {"NAND 128MiB 1,8V 8-bit", 0x39, 512, 128, 0x4000, SP_OPTIONS}, + {"NAND 128MiB 3,3V 8-bit", 0x79, 512, 128, 0x4000, SP_OPTIONS}, + {"NAND 128MiB 1,8V 16-bit", 0x72, 512, 128, 0x4000, SP_OPTIONS16}, + {"NAND 128MiB 1,8V 16-bit", 0x49, 512, 128, 0x4000, SP_OPTIONS16}, + {"NAND 128MiB 3,3V 16-bit", 0x74, 512, 128, 0x4000, SP_OPTIONS16}, + {"NAND 128MiB 3,3V 16-bit", 0x59, 512, 128, 0x4000, SP_OPTIONS16}, + + {"NAND 256MiB 3,3V 8-bit", 0x71, 512, 256, 0x4000, SP_OPTIONS}, /* * These are the new chips with large page size. The pagesize and the diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7ccb3c59ed60..ef52d9c91459 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -187,6 +187,13 @@ typedef enum { * This happens with the Renesas AG-AND chips, possibly others. */ #define BBT_AUTO_REFRESH 0x00000080 +/* + * Chip requires ready check on read (for auto-incremented sequential read). + * True only for small page devices; large page devices do not support + * autoincrement. 
+ */ +#define NAND_NEED_READRDY 0x00000100 + /* Chip does not allow subpage writes */ #define NAND_NO_SUBPAGE_WRITE 0x00000200 -- cgit v1.2.3 From 16fad69cfe4adbbfa813de516757b87bcae36d93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2013 05:40:32 +0000 Subject: tcp: fix skb_availroom() Chrome OS team reported a crash on a Pixel ChromeBook in TCP stack : https://code.google.com/p/chromium/issues/detail?id=182056 commit a21d45726acac (tcp: avoid order-1 allocations on wifi and tx path) did a poor choice adding an 'avail_size' field to skb, while what we really needed was a 'reserved_tailroom' one. It would have avoided commit 22b4a4f22da (tcp: fix retransmit of partially acked frames) and this commit. Crash occurs because skb_split() is not aware of the 'avail_size' management (and should not be aware) Signed-off-by: Eric Dumazet Reported-by: Mukesh Agrawal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 1 - 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 821c7f45d2a7..6f2bb860e051 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -500,7 +500,7 @@ struct sk_buff { union { __u32 mark; __u32 dropcount; - __u32 avail_size; + __u32 reserved_tailroom; }; sk_buff_data_t inner_transport_header; @@ -1447,7 +1447,10 @@ static inline int skb_tailroom(const struct sk_buff *skb) */ static inline int skb_availroom(const struct sk_buff *skb) { - return skb_is_nonlinear(skb) ? 0 : skb->avail_size - skb->len; + if (skb_is_nonlinear(skb)) + return 0; + + return skb->end - skb->tail - skb->reserved_tailroom; } /** diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 47e854fcae24..e22020790709 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -775,7 +775,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ - skb->avail_size = size; + skb->reserved_tailroom = skb->end - skb->tail - size; return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2b4461074da..817fbb396bc8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1298,7 +1298,6 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); - skb->avail_size -= eat; len -= eat; if (!len) return; -- cgit v1.2.3 From cca7af3889bfa343d33d5e657a38d876abd10e58 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 14 Mar 2013 03:29:40 +0000 Subject: skb: Propagate pfmemalloc on skb from head page only Hi. I'm trying to send big chunks of memory from application address space via TCP socket using vmsplice + splice like this mem = mmap(128Mb); vmsplice(pipe[1], mem); /* splice memory into pipe */ splice(pipe[0], tcp_socket); /* send it into network */ When I'm lucky and a huge page splices into the pipe and then into the socket _and_ client and server ends of the TCP connection are on the same host, communicating via lo, the whole connection gets stuck! The sending queue becomes full and app stops writing/splicing more into it, but the receiving queue remains empty, and that's why. The __skb_fill_page_desc observes a tail page of a huge page and erroneously propagates its page->pfmemalloc value onto socket (the pfmemalloc on tail pages contain garbage). 
Then this skb->pfmemalloc leaks through lo and due to the tcp_v4_rcv sk_filter if (skb->pfmemalloc && !sock_flag(sk, SOCK_MEMALLOC)) /* true */ return -ENOMEM goto release_and_discard; no packets reach the socket. Even TCP re-transmits are dropped by this, as skb cloning clones the pfmemalloc flag as well. That said, here's the proper page->pfmemalloc propagation onto socket: we must check the huge-page's head page only, other pages' pfmemalloc and mapping values do not contain what is expected in this place. However, I'm not sure whether this fix is _complete_, since pfmemalloc propagation via lo also oesn't look great. Both, bit propagation from page to skb and this check in sk_filter, were introduced by c48a11c7 (netvm: propagate page->pfmemalloc to skb), in v3.5 so Mel and stable@ are in Cc. Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Acked-by: Mel Gorman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f2bb860e051..441f5bfdab8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1288,11 +1288,13 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, * do not lose pfmemalloc information as the pages would not be * allocated using __GFP_MEMALLOC. */ - if (page->pfmemalloc && !page->mapping) - skb->pfmemalloc = true; frag->page.p = page; frag->page_offset = off; skb_frag_size_set(frag, size); + + page = compound_head(page); + if (page->pfmemalloc && !page->mapping) + skb->pfmemalloc = true; } /** -- cgit v1.2.3 From 1eef1282549d7accdd33ee36d409b039b1f911fb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 11 Mar 2013 09:07:46 -0300 Subject: amd64_edac: Correct DIMM sizes We were filling the csrow size with a wrong value. 16a528ee3975 ("EDAC: Fix csrow size reported in sysfs") tried to address the issue. It fixed the report with the old API but not with the new one. Correct it for the new API too. 
Signed-off-by: Mauro Carvalho Chehab [ make it a per-csrow accounting regardless of ->channel_count ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 14 +++++++++----- drivers/edac/edac_mc_sysfs.c | 13 +++---------- include/linux/edac.h | 1 - 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 910b0116c128..532de775a184 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2048,12 +2048,18 @@ static int init_csrows(struct mem_ctl_info *mci) edac_dbg(1, "MC node: %d, csrow: %d\n", pvt->mc_node_id, i); - if (row_dct0) + if (row_dct0) { nr_pages = amd64_csrow_nr_pages(pvt, 0, i); + csrow->channels[0]->dimm->nr_pages = nr_pages; + } /* K8 has only one DCT */ - if (boot_cpu_data.x86 != 0xf && row_dct1) - nr_pages += amd64_csrow_nr_pages(pvt, 1, i); + if (boot_cpu_data.x86 != 0xf && row_dct1) { + int row_dct1_pages = amd64_csrow_nr_pages(pvt, 1, i); + + csrow->channels[1]->dimm->nr_pages = row_dct1_pages; + nr_pages += row_dct1_pages; + } mtype = amd64_determine_memory_type(pvt, i); @@ -2072,9 +2078,7 @@ static int init_csrows(struct mem_ctl_info *mci) dimm = csrow->channels[j]->dimm; dimm->mtype = mtype; dimm->edac_mode = edac_mode; - dimm->nr_pages = nr_pages; } - csrow->nr_pages = nr_pages; } return empty; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 0cbf670efa23..4c50a4760db7 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -180,9 +180,6 @@ static ssize_t csrow_size_show(struct device *dev, int i; u32 nr_pages = 0; - if (csrow->mci->csbased) - return sprintf(data, "%u\n", PAGES_TO_MiB(csrow->nr_pages)); - for (i = 0; i < csrow->nr_channels; i++) nr_pages += csrow->channels[i]->dimm->nr_pages; return sprintf(data, "%u\n", PAGES_TO_MiB(nr_pages)); @@ -778,14 +775,10 @@ static ssize_t mci_size_mb_show(struct device *dev, for (csrow_idx = 0; csrow_idx < mci->nr_csrows; csrow_idx++) { struct csrow_info *csrow = mci->csrows[csrow_idx]; - if (csrow->mci->csbased) { - total_pages += csrow->nr_pages; - } else { - for (j = 0; j < csrow->nr_channels; j++) { - struct dimm_info *dimm = csrow->channels[j]->dimm; + for (j = 0; j < csrow->nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j]->dimm; - total_pages += dimm->nr_pages; - } + total_pages += dimm->nr_pages; } } diff --git a/include/linux/edac.h b/include/linux/edac.h index 4fd4999ccb5b..ab1ea98e767c 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -561,7 +561,6 @@ struct csrow_info { u32 ue_count; /* Uncorrectable Errors for this csrow */ u32 ce_count; /* Correctable Errors for this csrow */ - u32 nr_pages; /* combined pages count of all channels */ struct mem_ctl_info *mci; /* the parent */ -- cgit v1.2.3 From 9713faecff3d071de1208b081d4943b002e9cb1c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 11 Mar 2013 09:28:48 -0300 Subject: EDAC: Merge mci.mem_is_per_rank with mci.csbased Both mci.mem_is_per_rank and mci.csbased denote the same thing: the memory controller is csrows based. Merge both fields into one. 
There's no need for the driver to actually fill it, as the core detects it by checking if one of the layers has the csrows type as part of the memory hierarchy: if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) per_rank = true; Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 1 - drivers/edac/edac_mc.c | 6 +++--- drivers/edac/edac_mc_sysfs.c | 2 +- include/linux/edac.h | 6 ++---- 4 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 532de775a184..e1d13c463c90 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2423,7 +2423,6 @@ static int amd64_init_one_instance(struct pci_dev *F2) mci->pvt_info = pvt; mci->pdev = &pvt->F2->dev; - mci->csbased = 1; setup_mci_misc_attrs(mci, fam_type); diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index cdb81aa73ab7..27e86d938262 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -86,7 +86,7 @@ static void edac_mc_dump_dimm(struct dimm_info *dimm, int number) edac_dimm_info_location(dimm, location, sizeof(location)); edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n", - dimm->mci->mem_is_per_rank ? "rank" : "dimm", + dimm->mci->csbased ? "rank" : "dimm", number, location, dimm->csrow, dimm->cschannel); edac_dbg(4, " dimm = %p\n", dimm); edac_dbg(4, " dimm->label = '%s'\n", dimm->label); @@ -341,7 +341,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, memcpy(mci->layers, layers, sizeof(*layer) * n_layers); mci->nr_csrows = tot_csrows; mci->num_cschannel = tot_channels; - mci->mem_is_per_rank = per_rank; + mci->csbased = per_rank; /* * Alocate and fill the csrow/channels structs @@ -1235,7 +1235,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, * incrementing the compat API counters */ edac_dbg(4, "%s csrows map: (%d,%d)\n", - mci->mem_is_per_rank ? "rank" : "dimm", + mci->csbased ? "rank" : "dimm", dimm->csrow, dimm->cschannel); if (row == -1) row = dimm->csrow; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 4c50a4760db7..5899a76eec3b 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -609,7 +609,7 @@ static int edac_create_dimm_object(struct mem_ctl_info *mci, device_initialize(&dimm->dev); dimm->dev.parent = &mci->dev; - if (mci->mem_is_per_rank) + if (mci->csbased) dev_set_name(&dimm->dev, "rank%d", index); else dev_set_name(&dimm->dev, "dimm%d", index); diff --git a/include/linux/edac.h b/include/linux/edac.h index ab1ea98e767c..0b763276f619 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -675,11 +675,11 @@ struct mem_ctl_info { * sees memory sticks ("dimms"), and the ones that sees memory ranks. * All old memory controllers enumerate memories per rank, but most * of the recent drivers enumerate memories per DIMM, instead. - * When the memory controller is per rank, mem_is_per_rank is true. + * When the memory controller is per rank, csbased is true. */ unsigned n_layers; struct edac_mc_layer *layers; - bool mem_is_per_rank; + bool csbased; /* * DIMM info. 
Will eventually remove the entire csrows_info some day @@ -740,8 +740,6 @@ struct mem_ctl_info { u32 fake_inject_ue; u16 fake_inject_count; #endif - __u8 csbased : 1, /* csrow-based memory controller */ - __resv : 7; }; #endif -- cgit v1.2.3 From 1e8bbe6cd02fc300c88bd48244ce61ad9c7d1776 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Thu, 14 Mar 2013 01:05:13 +0000 Subject: net: cdc_ncm, cdc_mbim: allow user to prefer NCM for backwards compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bd329e1 ("net: cdc_ncm: do not bind to NCM compatible MBIM devices") introduced a new policy, preferring MBIM for dual NCM/MBIM functions if the cdc_mbim driver was enabled. This caused a regression for users wanting to use NCM. Devices implementing NCM backwards compatibility according to section 3.2 of the MBIM v1.0 specification allow either NCM or MBIM on a single USB function, using different altsettings. The cdc_ncm and cdc_mbim drivers will both probe such functions, and must agree on a common policy for selecting either MBIM or NCM. Until now, this policy has been set at build time based on CONFIG_USB_NET_CDC_MBIM. Use a module parameter to set the system policy at runtime, allowing the user to prefer NCM on systems with the cdc_mbim driver. Cc: Greg Suarez Cc: Alexey Orishko Reported-by: Geir Haatveit Reported-by: Tommi Kyntola Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=54791 Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_mbim.c | 11 +--------- drivers/net/usb/cdc_ncm.c | 49 +++++++++++++++++++++++++++++---------------- include/linux/usb/cdc_ncm.h | 1 + 3 files changed, 34 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 248d2dc765a5..16c842997291 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -68,18 +68,9 @@ static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf) struct cdc_ncm_ctx *ctx; struct usb_driver *subdriver = ERR_PTR(-ENODEV); int ret = -ENODEV; - u8 data_altsetting = CDC_NCM_DATA_ALTSETTING_NCM; + u8 data_altsetting = cdc_ncm_select_altsetting(dev, intf); struct cdc_mbim_state *info = (void *)&dev->data; - /* see if interface supports MBIM alternate setting */ - if (intf->num_altsetting == 2) { - if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) - usb_set_interface(dev->udev, - intf->cur_altsetting->desc.bInterfaceNumber, - CDC_NCM_COMM_ALTSETTING_MBIM); - data_altsetting = CDC_NCM_DATA_ALTSETTING_MBIM; - } - /* Probably NCM, defer for cdc_ncm_bind */ if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) goto err; diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 61b74a2b89ac..4709fa3497cf 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -55,6 +55,14 @@ #define DRIVER_VERSION "14-Mar-2012" +#if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM) +static bool prefer_mbim = true; +#else +static bool prefer_mbim; +#endif +module_param(prefer_mbim, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions"); + static void cdc_ncm_txpath_bh(unsigned long param); static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx); static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer); @@ -550,9 +558,12 @@ void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf) } EXPORT_SYMBOL_GPL(cdc_ncm_unbind); -static int cdc_ncm_bind(struct usbnet *dev, struct 
usb_interface *intf) +/* Select the MBIM altsetting iff it is preferred and available, + * returning the number of the corresponding data interface altsetting + */ +u8 cdc_ncm_select_altsetting(struct usbnet *dev, struct usb_interface *intf) { - int ret; + struct usb_host_interface *alt; /* The MBIM spec defines a NCM compatible default altsetting, * which we may have matched: @@ -568,23 +579,27 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * endpoint descriptors, shall be constructed according to * the rules given in section 6 (USB Device Model) of this * specification." - * - * Do not bind to such interfaces, allowing cdc_mbim to handle - * them */ -#if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM) - if ((intf->num_altsetting == 2) && - !usb_set_interface(dev->udev, - intf->cur_altsetting->desc.bInterfaceNumber, - CDC_NCM_COMM_ALTSETTING_MBIM)) { - if (cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) - return -ENODEV; - else - usb_set_interface(dev->udev, - intf->cur_altsetting->desc.bInterfaceNumber, - CDC_NCM_COMM_ALTSETTING_NCM); + if (prefer_mbim && intf->num_altsetting == 2) { + alt = usb_altnum_to_altsetting(intf, CDC_NCM_COMM_ALTSETTING_MBIM); + if (alt && cdc_ncm_comm_intf_is_mbim(alt) && + !usb_set_interface(dev->udev, + intf->cur_altsetting->desc.bInterfaceNumber, + CDC_NCM_COMM_ALTSETTING_MBIM)) + return CDC_NCM_DATA_ALTSETTING_MBIM; } -#endif + return CDC_NCM_DATA_ALTSETTING_NCM; +} +EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); + +static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int ret; + + /* MBIM backwards compatible function? */ + cdc_ncm_select_altsetting(dev, intf); + if (cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) + return -ENODEV; /* NCM data altsetting is always 1 */ ret = cdc_ncm_bind_common(dev, intf, 1); diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 3b8f9d4fc3fe..cc25b70af33c 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -127,6 +127,7 @@ struct cdc_ncm_ctx { u16 connected; }; +extern u8 cdc_ncm_select_altsetting(struct usbnet *dev, struct usb_interface *intf); extern int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting); extern void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf); extern struct sk_buff *cdc_ncm_fill_tx_frame(struct cdc_ncm_ctx *ctx, struct sk_buff *skb, __le32 sign); -- cgit v1.2.3 From 65c10553552b487a71bf5e4676743435046fae6f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Mar 2013 20:52:30 +0900 Subject: kprobes: Make hash_64() as always inlined Because hash_64() is called from the get_kprobe() inside int3 handler, kernel causes int3 recursion and crashes if kprobes user puts a probe on it. Usually hash_64() is inlined into caller function, but in some cases, it has instances by gcc's interprocedural constant propagation. This patch uses __always_inline instead of inline to prevent gcc from doing such things. Reported-by: Timo Juhani Lindfors Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Pavel Emelyanov Cc: Jiri Kosina Cc: Nadia Yvette Chambers Cc: yrl.pp-manager.tt@hitachi.com Cc: David S. 
Miller Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20130314115230.19690.39387.stgit@mhiramat-M0-7522 Signed-off-by: Ingo Molnar --- include/linux/hash.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hash.h b/include/linux/hash.h index 61c97ae22e01..f09a0ae4d858 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -15,6 +15,7 @@ */ #include +#include /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define GOLDEN_RATIO_PRIME_32 0x9e370001UL @@ -31,7 +32,7 @@ #error Wordsize not 32 or 64 #endif -static inline u64 hash_64(u64 val, unsigned int bits) +static __always_inline u64 hash_64(u64 val, unsigned int bits) { u64 hash = val; -- cgit v1.2.3 From f445f11eb2cc265dd47da5b2e864df46cd6e5a82 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 14 Mar 2013 17:13:46 -0700 Subject: KVM: allow host header to be included even for !CONFIG_KVM The new context tracking subsystem unconditionally includes kvm_host.h headers for the guest enter/exit macros. This causes a compile failure when KVM is not enabled. Fix by adding an IS_ENABLED(CONFIG_KVM) check to kvm_host so it can be included/compiled even when KVM is not enabled. Cc: Frederic Weisbecker Signed-off-by: Kevin Hilman Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cad77fe09d77..a9428635c9fd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1,6 +1,8 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H +#if IS_ENABLED(CONFIG_KVM) + /* * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -1055,5 +1057,8 @@ static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ +#else +static inline void __guest_enter(void) { return; } +static inline void __guest_exit(void) { return; } +#endif /* IS_ENABLED(CONFIG_KVM) */ #endif - -- cgit v1.2.3 From 6a15075eced2d780fa6c5d83682410f47f2e800b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 18 Mar 2013 19:24:02 +0100 Subject: ARM: video: mxs: Fix mxsfb misconfiguring VDCTRL0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The issue fixed by this patch manifests only then using X11 with mxsfb driver. The X11 will display either shifted image or otherwise distorted image on the LCD. The problem is that the X11 tries to reconfigure the framebuffer and along the way calls fb_ops.fb_set_par() with X11's desired configuration values. The field of particular interest is fb_info->var.sync which contains non-standard values if configured by kernel. These are either FB_SYNC_DATA_ENABLE_HIGH_ACT, FB_SYNC_DOTCLK_FAILING_ACT or both, depending on the platform configuration. Both of these values are defined in the include/linux/mxsfb.h file. The driver interprets these values and configures the LCD controller accordingly. Yet X11 only has access to the standard values for this field defined in include/uapi/linux/fb.h and thus, unlike kernel, omits these special values. This results in distorted image on the LCD. This patch moves these non-standard values into new field of the mxsfb_platform_data structure so the driver can in turn check this field instead of the video mode field for these specific portions. 
Moreover, this patch prefixes these values with MXSFB_SYNC_ prefix instead of FB_SYNC_ prefix to prevent confusion of subsequent users. Signed-off-by: Marek Vasut Cc: Fabio Estevam Cc: Linux ARM Cc: Linux FBDEV Cc: Lothar Waßmann Cc: Sascha Hauer Tested-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/mach-mxs/mach-mxs.c | 24 ++++++++++++------------ drivers/video/mxsfb.c | 7 +++++-- include/linux/mxsfb.h | 7 +++++-- 3 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-mxs/mach-mxs.c b/arch/arm/mach-mxs/mach-mxs.c index 3218f1f2c0e0..e7b781d3788f 100644 --- a/arch/arm/mach-mxs/mach-mxs.c +++ b/arch/arm/mach-mxs/mach-mxs.c @@ -41,8 +41,6 @@ static struct fb_videomode mx23evk_video_modes[] = { .lower_margin = 4, .hsync_len = 1, .vsync_len = 1, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, }, }; @@ -59,8 +57,6 @@ static struct fb_videomode mx28evk_video_modes[] = { .lower_margin = 10, .hsync_len = 10, .vsync_len = 10, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, }, }; @@ -77,7 +73,6 @@ static struct fb_videomode m28evk_video_modes[] = { .lower_margin = 45, .hsync_len = 1, .vsync_len = 1, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT, }, }; @@ -94,9 +89,7 @@ static struct fb_videomode apx4devkit_video_modes[] = { .lower_margin = 13, .hsync_len = 48, .vsync_len = 3, - .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT | - FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, + .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, }, }; @@ -113,9 +106,7 @@ static struct fb_videomode apf28dev_video_modes[] = { .lower_margin = 0x15, .hsync_len = 64, .vsync_len = 4, - .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT | - FB_SYNC_DATA_ENABLE_HIGH_ACT | - FB_SYNC_DOTCLK_FAILING_ACT, + .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, }, }; @@ -132,7 +123,6 @@ static struct fb_videomode cfa10049_video_modes[] = { .lower_margin = 2, .hsync_len = 15, .vsync_len = 15, - .sync = FB_SYNC_DATA_ENABLE_HIGH_ACT }, }; @@ -259,6 +249,8 @@ static void __init imx23_evk_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(mx23evk_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_24BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; } static inline void enable_clk_enet_out(void) @@ -278,6 +270,8 @@ static void __init imx28_evk_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(mx28evk_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_24BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; mxs_saif_clkmux_select(MXS_DIGCTL_SAIF_CLKMUX_EXTMSTR0); } @@ -297,6 +291,7 @@ static void __init m28evk_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(m28evk_video_modes); mxsfb_pdata.default_bpp = 16; mxsfb_pdata.ld_intf_width = STMLCDIF_18BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT; } static void __init sc_sps1_init(void) @@ -322,6 +317,8 @@ static void __init apx4devkit_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(apx4devkit_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_24BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; } #define ENET0_MDC__GPIO_4_0 MXS_GPIO_NR(4, 0) @@ -407,6 +404,7 @@ static void __init cfa10049_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(cfa10049_video_modes); mxsfb_pdata.default_bpp = 32; mxsfb_pdata.ld_intf_width = STMLCDIF_18BIT; + mxsfb_pdata.sync = 
MXSFB_SYNC_DATA_ENABLE_HIGH_ACT; } static void __init cfa10037_init(void) @@ -423,6 +421,8 @@ static void __init apf28_init(void) mxsfb_pdata.mode_count = ARRAY_SIZE(apf28dev_video_modes); mxsfb_pdata.default_bpp = 16; mxsfb_pdata.ld_intf_width = STMLCDIF_16BIT; + mxsfb_pdata.sync = MXSFB_SYNC_DATA_ENABLE_HIGH_ACT | + MXSFB_SYNC_DOTCLK_FAILING_ACT; } static void __init mxs_machine_init(void) diff --git a/drivers/video/mxsfb.c b/drivers/video/mxsfb.c index 755556ca5b2d..45169cbaba6e 100644 --- a/drivers/video/mxsfb.c +++ b/drivers/video/mxsfb.c @@ -169,6 +169,7 @@ struct mxsfb_info { unsigned dotclk_delay; const struct mxsfb_devdata *devdata; int mapped; + u32 sync; }; #define mxsfb_is_v3(host) (host->devdata->ipversion == 3) @@ -456,9 +457,9 @@ static int mxsfb_set_par(struct fb_info *fb_info) vdctrl0 |= VDCTRL0_HSYNC_ACT_HIGH; if (fb_info->var.sync & FB_SYNC_VERT_HIGH_ACT) vdctrl0 |= VDCTRL0_VSYNC_ACT_HIGH; - if (fb_info->var.sync & FB_SYNC_DATA_ENABLE_HIGH_ACT) + if (host->sync & MXSFB_SYNC_DATA_ENABLE_HIGH_ACT) vdctrl0 |= VDCTRL0_ENABLE_ACT_HIGH; - if (fb_info->var.sync & FB_SYNC_DOTCLK_FAILING_ACT) + if (host->sync & MXSFB_SYNC_DOTCLK_FAILING_ACT) vdctrl0 |= VDCTRL0_DOTCLK_ACT_FAILING; writel(vdctrl0, host->base + LCDC_VDCTRL0); @@ -861,6 +862,8 @@ static int mxsfb_probe(struct platform_device *pdev) INIT_LIST_HEAD(&fb_info->modelist); + host->sync = pdata->sync; + ret = mxsfb_init_fbinfo(host); if (ret != 0) goto error_init_fb; diff --git a/include/linux/mxsfb.h b/include/linux/mxsfb.h index f14943d55315..f80af8674342 100644 --- a/include/linux/mxsfb.h +++ b/include/linux/mxsfb.h @@ -24,8 +24,8 @@ #define STMLCDIF_18BIT 2 /** pixel data bus to the display is of 18 bit width */ #define STMLCDIF_24BIT 3 /** pixel data bus to the display is of 24 bit width */ -#define FB_SYNC_DATA_ENABLE_HIGH_ACT (1 << 6) -#define FB_SYNC_DOTCLK_FAILING_ACT (1 << 7) /* failing/negtive edge sampling */ +#define MXSFB_SYNC_DATA_ENABLE_HIGH_ACT (1 << 6) +#define MXSFB_SYNC_DOTCLK_FAILING_ACT (1 << 7) /* failing/negtive edge sampling */ struct mxsfb_platform_data { struct fb_videomode *mode_list; @@ -44,6 +44,9 @@ struct mxsfb_platform_data { * allocated. If specified,fb_size must also be specified. * fb_phys must be unused by Linux. */ + u32 sync; /* sync mask, contains MXSFB specifics not + * carried in fb_info->var.sync + */ }; #endif /* __LINUX_MXSFB_H */ -- cgit v1.2.3 From 14a40ffccd6163bbcd1d6f32b28a88ffe6149fc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:20 -0700 Subject: sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY PF_THREAD_BOUND was originally used to mark kernel threads which were bound to a specific CPU using kthread_bind() and a task with the flag set allows cpus_allowed modifications only to itself. Workqueue is currently abusing it to prevent userland from meddling with cpus_allowed of workqueue workers. What we need is a flag to prevent userland from messing with cpus_allowed of certain kernel tasks. In kernel, anyone can (incorrectly) squash the flag, and, for worker-type usages, restricting cpus_allowed modification to the task itself doesn't provide meaningful extra proection as other tasks can inject work items to the task anyway. This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY. sched_setaffinity() checks the flag and return -EINVAL if set. set_cpus_allowed_ptr() is no longer affected by the flag. This will allow simplifying workqueue worker CPU affinity management. 
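[Editor's note: minimal illustration, not part of the patch. A kernel task opts out of userland affinity changes simply by setting the new flag on itself, which is what the workqueue hunks below do for pool workers and rescuers:

        /* prevent userland from meddling with this task's cpumask (sketch) */
        worker->task->flags |= PF_NO_SETAFFINITY;

After this, sched_setaffinity() from userspace returns -EINVAL for the task, while in-kernel set_cpus_allowed_ptr() is no longer restricted by the flag.]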
Signed-off-by: Tejun Heo Acked-by: Ingo Molnar Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Thomas Gleixner --- include/linux/sched.h | 2 +- kernel/cgroup.c | 4 ++-- kernel/cpuset.c | 16 ++++++++-------- kernel/kthread.c | 2 +- kernel/sched/core.c | 9 ++++----- kernel/workqueue.c | 10 +++------- 6 files changed, 19 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..e5c64f7b8c1d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1793,7 +1793,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ -#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..3852d926322c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2224,11 +2224,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..f22e94792707 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1388,16 +1388,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) cgroup_taskset_for_each(task, cgrp, tset) { /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ ret = -EINVAL; - if (task->flags & PF_THREAD_BOUND) + if (task->flags & PF_NO_SETAFFINITY) goto out_unlock; ret = security_task_setscheduler(task); if (ret) diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..a2fbbb782bad 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -260,7 +260,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) { /* It's safe because the task is inactive. 
*/ do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + p->flags |= PF_NO_SETAFFINITY; } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..23606ee961b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4126,6 +4126,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4773,11 +4777,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 969be0b72071..39a591f65b08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1757,12 +1757,8 @@ static struct worker *create_worker(struct worker_pool *pool) set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* - * %PF_THREAD_BOUND is used to prevent userland from meddling with - * cpumask of workqueue workers. This is an abuse. We need - * %PF_NO_SETAFFINITY. - */ - worker->task->flags |= PF_THREAD_BOUND; + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; /* * The caller is responsible for ensuring %POOL_DISASSOCIATED @@ -3876,7 +3872,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_THREAD_BOUND; + rescuer->task->flags |= PF_NO_SETAFFINITY; wake_up_process(rescuer->task); } -- cgit v1.2.3 From 7fa4cd1a78ea5af688ffce45553abbee9d7afd84 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 Mar 2013 10:35:44 -0300 Subject: usb: ulpi: Define a *otg_ulpi_create no-op Building a kernel for imx_v4_v5_defconfig with CONFIG_USB_ULPI disabled, results in the following error: arch/arm/mach-imx/built-in.o: In function 'pca100_init': platform-mx2-emma.c:(.init.text+0x6788): undefined reference to 'otg_ulpi_create' platform-mx2-emma.c:(.init.text+0x682c): undefined reference to 'mxc_ulpi_access_ops' Fix this by providing a no-op definition of *otg_ulpi_create for the case when CONFIG_USB_ULPI is not defined. Acked-by: Igor Grinberg Signed-off-by: Fabio Estevam Signed-off-by: Felipe Balbi --- include/linux/usb/ulpi.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 6f033a415ecb..5c295c26ad37 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -181,8 +181,16 @@ /*-------------------------------------------------------------------------*/ +#if IS_ENABLED(CONFIG_USB_ULPI) struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, unsigned int flags); +#else +static inline struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, + unsigned int flags) +{ + return NULL; +} +#endif #ifdef CONFIG_USB_ULPI_VIEWPORT /* access ops for controllers with a viewport register */ -- cgit v1.2.3 From 44046a593eb770dbecdabf1c82bcd252f2a8337b Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:12 +0000 Subject: udp: add encap_destroy callback Users of udp encapsulation currently have an encap_rcv callback which they can use to hook into the udp receive path. 
In situations where a encapsulation user allocates resources associated with a udp encap socket, it may be convenient to be able to also hook the proto .destroy operation. For example, if an encap user holds a reference to the udp socket, the destroy hook might be used to relinquish this reference. This patch adds a socket destroy hook into udp, which is set and enabled in the same way as the existing encap_rcv hook. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- include/linux/udp.h | 1 + net/ipv4/udp.c | 7 +++++++ net/ipv6/udp.c | 8 ++++++++ 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 9d81de123c90..42278bbf7a88 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -68,6 +68,7 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + void (*encap_destroy)(struct sock *sk); }; static inline struct udp_sock *udp_sk(const struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 265c42cf963c..0a073a263720 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1762,9 +1762,16 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); + if (static_key_false(&udp_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } } /* diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 599e1ba6d1ce..d8e5e852fc7a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1285,10 +1285,18 @@ do_confirm: void udpv6_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); lock_sock(sk); udp_v6_flush_pending_frames(sk); release_sock(sk); + if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + inet6_destroy_sock(sk); } -- cgit v1.2.3 From 73214f5d9f33b79918b1f7babddd5c8af28dd23d Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 19 Mar 2013 01:47:28 +0000 Subject: thermal: shorten too long mcast group name The original name is too long. Signed-off-by: Masatake YAMATO Signed-off-by: David S. Miller --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index f0bd7f90a90d..e3c0ae9bb1fa 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -44,7 +44,7 @@ /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" #define THERMAL_GENL_VERSION 0x01 -#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_group" +#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_grp" /* Default Thermal Governor */ #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) -- cgit v1.2.3 From e5b33dc9d16053c2ae4c2c669cf008829530364b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 19 Mar 2013 09:21:10 +0100 Subject: USB: serial: add modem-status-change wait queue Add modem-status-change wait queue to struct usb_serial_port that subdrivers can use to implement TIOCMIWAIT. Currently subdrivers use a private wait queue which may have been released when waking up after device disconnected. 
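[Editor's note: usage sketch, not part of this patch. With the queue moved into struct usb_serial_port, a subdriver's TIOCMIWAIT path can block on it and be woken from its status/interrupt completion handler, along these lines (the priv->msr_delta field is hypothetical):

        /* TIOCMIWAIT handler (sketch) */
        ret = wait_event_interruptible(port->delta_msr_wait, priv->msr_delta);

        /* status URB completion handler (sketch) */
        priv->msr_delta = 1;
        wake_up_interruptible(&port->delta_msr_wait);

Because the wait queue now lives in the core usb_serial_port structure, it is not freed along with subdriver-private state at disconnect, which is the problem described above.]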
Note that we're adding a new wait queue rather than reusing the tty-port one as we do not want to get woken up at hangup (yet). Cc: stable Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/serial.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index ef9be7e1e190..1819b59aab2a 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -66,6 +66,7 @@ * port. * @flags: usb serial port flags * @write_wait: a wait_queue_head_t used by the port. + * @delta_msr_wait: modem-status-change wait queue * @work: work queue entry for the line discipline waking up. * @throttled: nonzero if the read urb is inactive to throttle the device * @throttle_req: nonzero if the tty wants to throttle us @@ -112,6 +113,7 @@ struct usb_serial_port { unsigned long flags; wait_queue_head_t write_wait; + wait_queue_head_t delta_msr_wait; struct work_struct work; char throttled; char throttle_req; -- cgit v1.2.3 From 09a6e1f4ad32243989b30485f78985c0923284cd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 22 Mar 2013 08:08:06 -0300 Subject: Revert "KVM: allow host header to be included even for !CONFIG_KVM" This reverts commit f445f11eb2cc265dd47da5b2e864df46cd6e5a82 as it breaks PPC with CONFIG_KVM=n. Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a9428635c9fd..cad77fe09d77 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1,8 +1,6 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H -#if IS_ENABLED(CONFIG_KVM) - /* * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -1057,8 +1055,5 @@ static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ -#else -static inline void __guest_enter(void) { return; } -static inline void __guest_exit(void) { return; } -#endif /* IS_ENABLED(CONFIG_KVM) */ #endif + -- cgit v1.2.3 From fe8d52614bd419cedef85ef55850fd090373f481 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 22 Mar 2013 15:04:37 -0700 Subject: irq_work.h: fix warning when CONFIG_IRQ_WORK=n A randconfig caught repeated compiler warnings when CONFIG_IRQ_WORK=n due to the definition of a non-inline static function in : include/linux/irq_work.h +40 : warning: 'irq_work_needs_cpu' defined but not used Make it inline to supress the warning. This is caused commit 00b42959106a ("irq_work: Don't stop the tick with pending works") merged in v3.9-rc1. 
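[Editor's note: background sketch, not part of the patch. A plain "static" function defined in a header is a separate definition in every translation unit that includes it, so any file that never calls it gets gcc's "defined but not used" warning; marking it "static inline" avoids that:

        /* in a header included from many files (sketch) */
        static bool foo_needed(void) { return false; }          /* warns where unused */
        static inline bool bar_needed(void) { return false; }   /* no warning */

The one-line change below applies exactly this to the !CONFIG_IRQ_WORK stub of irq_work_needs_cpu().]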
Signed-off-by: James Hogan Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irq_work.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index f5dbce50466e..66017028dcb3 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -37,7 +37,7 @@ void irq_work_sync(struct irq_work *work); #ifdef CONFIG_IRQ_WORK bool irq_work_needs_cpu(void); #else -static bool irq_work_needs_cpu(void) { return false; } +static inline bool irq_work_needs_cpu(void) { return false; } #endif #endif /* _LINUX_IRQ_WORK_H */ -- cgit v1.2.3 From dc72c32e1fd872a9a4fdfe645283c9dcd68e556d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 22 Mar 2013 15:04:39 -0700 Subject: printk: Provide a wake_up_klogd() off-case wake_up_klogd() is useless when CONFIG_PRINTK=n because neither printk() nor printk_sched() are in use and there are actually no waiter on log_wait waitqueue. It should be a stub in this case for users like bust_spinlocks(). Otherwise this results in this warning when CONFIG_PRINTK=n and CONFIG_IRQ_WORK=n: kernel/built-in.o In function `wake_up_klogd': (.text.wake_up_klogd+0xb4): undefined reference to `irq_work_queue' To fix this, provide an off-case for wake_up_klogd() when CONFIG_PRINTK=n. There is much more from console_unlock() and other console related code in printk.c that should be moved under CONFIG_PRINTK. But for now, focus on a minimal fix as we passed the merged window already. [akpm@linux-foundation.org: include printk.h in bust_spinlocks.c] Signed-off-by: Frederic Weisbecker Reported-by: James Hogan Cc: James Hogan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 - include/linux/printk.h | 6 ++++ kernel/printk.c | 80 ++++++++++++++++++++++++-------------------------- lib/bust_spinlocks.c | 3 +- 4 files changed, 46 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80d36874689b..79fdd80a42d4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -390,7 +390,6 @@ extern struct pid *session_of_pgrp(struct pid *pgrp); unsigned long int_sqrt(unsigned long); extern void bust_spinlocks(int yes); -extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; diff --git a/include/linux/printk.h b/include/linux/printk.h index 1249a54d17e0..822171fcb1c8 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -134,6 +134,8 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; +extern void wake_up_klogd(void); + void log_buf_kexec_setup(void); void __init setup_log_buf(int early); #else @@ -162,6 +164,10 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } +static inline void wake_up_klogd(void) +{ +} + static inline void log_buf_kexec_setup(void) { } diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335a..abbdd9e2ac82 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); - int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ @@ -224,6 +222,7 @@ struct log { static DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -1957,45 +1956,6 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ - int pending = __this_cpu_xchg(printk_pending, 0); - - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ - preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - } - preempt_enable(); -} - static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2458,6 +2418,44 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} int printk_sched(const char *fmt, ...) { diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 9681d54b95d1..f8e0e5367398 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -28,5 +29,3 @@ void __attribute__((weak)) bust_spinlocks(int yes) wake_up_klogd(); } } - - -- cgit v1.2.3 From f9228b204f789493117e458d2fefae937edb7272 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 22 Mar 2013 15:04:43 -0700 Subject: mm: zone_end_pfn is too small Booting with 32 TBytes memory hits BUG at mm/page_alloc.c:552! (output below). The key hint is "page 4294967296 outside zone". 4294967296 = 0x100000000 (bit 32 is set). 
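For reference, the arithmetic behind that hint (assuming 4 KiB pages): 32 TiB = 2^45 bytes, which is 2^45 / 2^12 = 2^33 page frames, so zone end PFNs above 4294967295 get truncated when returned as a 32-bit "unsigned", and in-range pages such as PFN 4294967296 (2^32) are then flagged as outside their zone.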
The problem is in include/linux/mmzone.h: 530 static inline unsigned zone_end_pfn(const struct zone *zone) 531 { 532 return zone->zone_start_pfn + zone->spanned_pages; 533 } zone_end_pfn is "unsigned" (32 bits). Changing it to "unsigned long" (64 bits) fixes the problem. zone_end_pfn() was added recently in commit 108bcc96ef70 ("mm: add & use zone_end_pfn() and zone_spans_pfn()") Output from the failure. No AGP bridge found page 4294967296 outside zone [ 4294967296 - 4327469056 ] ------------[ cut here ]------------ kernel BUG at mm/page_alloc.c:552! invalid opcode: 0000 [#1] SMP Modules linked in: CPU 0 Pid: 0, comm: swapper Not tainted 3.9.0-rc2.dtp+ #10 RIP: free_one_page+0x382/0x430 Process swapper (pid: 0, threadinfo ffffffff81942000, task ffffffff81955420) Call Trace: __free_pages_ok+0x96/0xb0 __free_pages+0x25/0x50 __free_pages_bootmem+0x8a/0x8c __free_memory_core+0xea/0x131 free_low_memory_core_early+0x4a/0x98 free_all_bootmem+0x45/0x47 mem_init+0x7b/0x14c start_kernel+0x216/0x433 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x144/0x153 Code: 89 f1 ba 01 00 00 00 31 f6 d3 e2 4c 89 ef e8 66 a4 01 00 e9 2c fe ff ff 0f 0b eb fe 0f 0b 66 66 2e 0f 1f 84 00 00 00 00 00 eb f3 <0f> 0b eb fe 0f 0b 0f 1f 84 00 00 00 00 00 eb f6 0f 0b eb fe 49 Signed-off-by: Russ Anderson Reported-by: George Beshers Acked-by: Hedi Berriche Cc: Cody P Schafer Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ede274957e05..c74092eebf5c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -527,7 +527,7 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -static inline unsigned zone_end_pfn(const struct zone *zone) +static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } -- cgit v1.2.3 From 3f5eb14135ba9d97ba4b8514fc7ef5e0dac2abf4 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Tue, 19 Mar 2013 16:48:12 +0800 Subject: usb: add find_raw_port_number callback to struct hc_driver() xhci driver divides the root hub into two logical hubs which work respectively for usb 2.0 and usb 3.0 devices. They are independent devices in the usb core. But in the ACPI table, it's one device node and all usb2.0 and usb3.0 ports are under it. Binding usb port with its acpi node needs the raw port number which is reflected in the xhci extended capabilities table. This patch is to add find_raw_port_number callback to struct hc_driver(), fill it with xhci_find_raw_port_number() which will return raw port number and add a wrap usb_hcd_find_raw_port_number(). Otherwise, refactor xhci_find_real_port_number(). Using xhci_find_raw_port_number() to get real index in the HW port status registers instead of scanning through the xHCI roothub port array. This can help to speed up. All addresses in xhci->usb2_ports and xhci->usb3_ports array are kown good ports and don't include following bad ports in the extended capabilities talbe. (1) root port that doesn't have an entry (2) root port with unknown speed (3) root port that is listed twice and with different speeds. So xhci_find_raw_port_number() will only return port num of good ones and never touch bad ports above. 
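A sketch of the intended call pattern on the consumer side (the surrounding ACPI-binding code is an assumption for illustration; only usb_hcd_find_raw_port_number() and the new callback come from this patch):

	/*
	 * Translate the logical (per-roothub) port number into the raw
	 * index used by the firmware tables. HCDs that do not implement
	 * ->find_raw_port_number simply get port1 back unchanged.
	 */
	int raw_port = usb_hcd_find_raw_port_number(hcd, port1);
	/* ... look up the ACPI port object matching raw_port ... */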
Signed-off-by: Lan Tianyu Signed-off-by: Sarah Sharp --- drivers/usb/core/hcd.c | 8 ++++++++ drivers/usb/host/xhci-mem.c | 36 ++++++++---------------------------- drivers/usb/host/xhci-pci.c | 1 + drivers/usb/host/xhci.c | 22 ++++++++++++++++++++++ drivers/usb/host/xhci.h | 1 + include/linux/usb/hcd.h | 2 ++ 6 files changed, 42 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 99b34a30354f..f9ec44cbb82f 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2412,6 +2412,14 @@ int usb_hcd_is_primary_hcd(struct usb_hcd *hcd) } EXPORT_SYMBOL_GPL(usb_hcd_is_primary_hcd); +int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1) +{ + if (!hcd->driver->find_raw_port_number) + return port1; + + return hcd->driver->find_raw_port_number(hcd, port1); +} + static int usb_hcd_request_irqs(struct usb_hcd *hcd, unsigned int irqnum, unsigned long irqflags) { diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 35616ffbe3ae..6dc238c592bc 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1022,44 +1022,24 @@ void xhci_copy_ep0_dequeue_into_input_ctx(struct xhci_hcd *xhci, * is attached to (or the roothub port its ancestor hub is attached to). All we * know is the index of that port under either the USB 2.0 or the USB 3.0 * roothub, but that doesn't give us the real index into the HW port status - * registers. Scan through the xHCI roothub port array, looking for the Nth - * entry of the correct port speed. Return the port number of that entry. + * registers. Call xhci_find_raw_port_number() to get real index. */ static u32 xhci_find_real_port_number(struct xhci_hcd *xhci, struct usb_device *udev) { struct usb_device *top_dev; - unsigned int num_similar_speed_ports; - unsigned int faked_port_num; - int i; + struct usb_hcd *hcd; + + if (udev->speed == USB_SPEED_SUPER) + hcd = xhci->shared_hcd; + else + hcd = xhci->main_hcd; for (top_dev = udev; top_dev->parent && top_dev->parent->parent; top_dev = top_dev->parent) /* Found device below root hub */; - faked_port_num = top_dev->portnum; - for (i = 0, num_similar_speed_ports = 0; - i < HCS_MAX_PORTS(xhci->hcs_params1); i++) { - u8 port_speed = xhci->port_array[i]; - - /* - * Skip ports that don't have known speeds, or have duplicate - * Extended Capabilities port speed entries. - */ - if (port_speed == 0 || port_speed == DUPLICATE_ENTRY) - continue; - /* - * USB 3.0 ports are always under a USB 3.0 hub. USB 2.0 and - * 1.1 ports are under the USB 2.0 hub. If the port speed - * matches the device speed, it's a similar speed port. 
- */ - if ((port_speed == 0x03) == (udev->speed == USB_SPEED_SUPER)) - num_similar_speed_ports++; - if (num_similar_speed_ports == faked_port_num) - /* Roothub ports are numbered from 1 to N */ - return i+1; - } - return 0; + return xhci_find_raw_port_number(hcd, top_dev->portnum); } /* Setup an xHCI virtual device for a Set Address command */ diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index af259e0ec172..1a30c380043c 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -313,6 +313,7 @@ static const struct hc_driver xhci_pci_hc_driver = { .set_usb2_hw_lpm = xhci_set_usb2_hardware_lpm, .enable_usb3_lpm_timeout = xhci_enable_usb3_lpm_timeout, .disable_usb3_lpm_timeout = xhci_disable_usb3_lpm_timeout, + .find_raw_port_number = xhci_find_raw_port_number, }; /*-------------------------------------------------------------------------*/ diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 849470b18831..53b8f89a0b1c 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -3779,6 +3779,28 @@ int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) return 0; } +/* + * Transfer the port index into real index in the HW port status + * registers. Caculate offset between the port's PORTSC register + * and port status base. Divide the number of per port register + * to get the real index. The raw port number bases 1. + */ +int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1) +{ + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + __le32 __iomem *base_addr = &xhci->op_regs->port_status_base; + __le32 __iomem *addr; + int raw_port; + + if (hcd->speed != HCD_USB3) + addr = xhci->usb2_ports[port1 - 1]; + else + addr = xhci->usb3_ports[port1 - 1]; + + raw_port = (addr - base_addr)/NUM_PORT_REGS + 1; + return raw_port; +} + #ifdef CONFIG_USB_SUSPEND /* BESL to HIRD Encoding array for USB2 LPM */ diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 2c510e4a7d4c..d798b6931914 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1829,6 +1829,7 @@ void xhci_test_and_clear_bit(struct xhci_hcd *xhci, __le32 __iomem **port_array, int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength); int xhci_hub_status_data(struct usb_hcd *hcd, char *buf); +int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1); #ifdef CONFIG_PM int xhci_bus_suspend(struct usb_hcd *hcd); diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 0a78df5f6cfd..59694b5e5e90 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -357,6 +357,7 @@ struct hc_driver { */ int (*disable_usb3_lpm_timeout)(struct usb_hcd *, struct usb_device *, enum usb3_link_state state); + int (*find_raw_port_number)(struct usb_hcd *, int); }; extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb); @@ -396,6 +397,7 @@ extern int usb_hcd_is_primary_hcd(struct usb_hcd *hcd); extern int usb_add_hcd(struct usb_hcd *hcd, unsigned int irqnum, unsigned long irqflags); extern void usb_remove_hcd(struct usb_hcd *hcd); +extern int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1); struct platform_device; extern void usb_hcd_platform_shutdown(struct platform_device *dev); -- cgit v1.2.3 From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be access that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be access can not be violated by changing the root directory. Anyone who runs a processes in a chroot and would like to use user namespace can setup the same view of filesystems with a mount namespace instead. With this result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb45..a3035223d421 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24f..2b93a9a5a1e6 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d342043..0f1e42884577 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. -- cgit v1.2.3 From 90563b198e4c6674c63672fae1923da467215f45 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Fri, 22 Mar 2013 03:10:15 -0700 Subject: vfs: Add a mount flag to lock read only bind mounts When a read-only bind mount is copied from mount namespace in a higher privileged user namespace to a mount namespace in a lesser privileged user namespace, it should not be possible to remove the the read-only restriction. Add a MNT_LOCK_READONLY mount flag to indicate that a mount must remain read-only. CC: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 3 +++ include/linux/mount.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index a3035223d421..8505b5ece5de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1713,6 +1713,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else diff --git a/include/linux/mount.h b/include/linux/mount.h index d7029f4a191a..73005f9957ea 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -47,6 +47,8 @@ struct mnt_namespace; #define MNT_INTERNAL 0x4000 +#define MNT_LOCK_READONLY 0x400000 + struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ -- cgit v1.2.3 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae03..d581e45c0a9f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..9c7fab1d23f0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec733..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce009324933..b6b215f13b45 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03b..8e635a18ab52 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e42884577..a54f26f82eb2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3 From 09a9f1d27892255cfb9c91203f19476765e2d8d1 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 28 Mar 2013 16:26:23 -0700 Subject: Revert "mm: introduce VM_POPULATE flag to better deal with racy userspace programs" This reverts commit 186930500985 ("mm: introduce VM_POPULATE flag to better deal with racy 
userspace programs"). VM_POPULATE only has any effect when userspace plays racy games with vmas by trying to unmap and remap memory regions that mmap or mlock are operating on. Also, the only effect of VM_POPULATE when userspace plays such games is that it avoids populating new memory regions that get remapped into the address range that was being operated on by the original mmap or mlock calls. Let's remove VM_POPULATE as there isn't any strong argument to mandate a new vm_flag. Signed-off-by: Michel Lespinasse Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - include/linux/mman.h | 4 +--- mm/fremap.c | 12 ++---------- mm/mlock.c | 11 +++++------ mm/mmap.c | 4 +++- 5 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7acc9dc73c9f..e19ff30ad0a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -87,7 +87,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ -#define VM_POPULATE 0x00001000 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 61c7a87e5d2b..9aa863da287f 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -79,8 +79,6 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) | - (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ? - VM_POPULATE : 0); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); } #endif /* _LINUX_MMAN_H */ diff --git a/mm/fremap.c b/mm/fremap.c index 4723ac8d2fc2..87da3590c61e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -204,10 +204,8 @@ get_write_lock: unsigned long addr; struct file *file = get_file(vma->vm_file); - vm_flags = vma->vm_flags; - if (!(flags & MAP_NONBLOCK)) - vm_flags |= VM_POPULATE; - addr = mmap_region(file, start, size, vm_flags, pgoff); + addr = mmap_region(file, start, size, + vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -226,12 +224,6 @@ get_write_lock: mutex_unlock(&mapping->i_mmap_mutex); } - if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { - if (!has_write_lock) - goto get_write_lock; - vma->vm_flags |= VM_POPULATE; - } - if (vma->vm_flags & VM_LOCKED) { /* * drop PG_Mlocked flag for over-mapped range diff --git a/mm/mlock.c b/mm/mlock.c index 1c5e33fce639..79b7cf7d1bca 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on) newflags = vma->vm_flags & ~VM_LOCKED; if (on) - newflags |= VM_LOCKED | VM_POPULATE; + newflags |= VM_LOCKED; tmp = vma->vm_end; if (tmp > end) @@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * range with the first VMA. Also, skip undesirable VMA types. 
*/ nend = min(end, vma->vm_end); - if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != - VM_POPULATE) + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; @@ -492,9 +491,9 @@ static int do_mlockall(int flags) struct vm_area_struct * vma, * prev = NULL; if (flags & MCL_FUTURE) - current->mm->def_flags |= VM_LOCKED | VM_POPULATE; + current->mm->def_flags |= VM_LOCKED; else - current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); + current->mm->def_flags &= ~VM_LOCKED; if (flags == MCL_FUTURE) goto out; @@ -503,7 +502,7 @@ static int do_mlockall(int flags) newflags = vma->vm_flags & ~VM_LOCKED; if (flags & MCL_CURRENT) - newflags |= VM_LOCKED | VM_POPULATE; + newflags |= VM_LOCKED; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); diff --git a/mm/mmap.c b/mm/mmap.c index 2664a47cec93..6466699b16cb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1306,7 +1306,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } addr = mmap_region(file, addr, len, vm_flags, pgoff); - if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } -- cgit v1.2.3 From dbf520a9d7d4d5ba28d2947be11e34099a5e3e20 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Sun, 31 Mar 2013 00:04:40 +0000 Subject: Revert "lockdep: check that no locks held at freeze time" This reverts commit 6aa9707099c4b25700940eb3d016f16c4434360d. Commit 6aa9707099c4 ("lockdep: check that no locks held at freeze time") causes problems with NFS root filesystems. The failures were noticed on OMAP2 and 3 boards during kernel init: [ BUG: swapper/0/1 still has locks held! ] 3.9.0-rc3-00344-ga937536 #1 Not tainted ------------------------------------- 1 lock held by swapper/0/1: #0: (&type->s_umount_key#13/1){+.+.+.}, at: [] sget+0x248/0x574 stack backtrace: rpc_wait_bit_killable __wait_on_bit out_of_line_wait_on_bit __rpc_execute rpc_run_task rpc_call_sync nfs_proc_get_root nfs_get_root nfs_fs_mount_common nfs_try_mount nfs_fs_mount mount_fs vfs_kern_mount do_mount sys_mount do_mount_root mount_root prepare_namespace kernel_init_freeable kernel_init Although the rootfs mounts, the system is unstable. Here's a transcript from a PM test: http://www.pwsan.com/omap/testlogs/test_v3.9-rc3/20130317194234/pm/37xxevm/37xxevm_log.txt Here's what the test log should look like: http://www.pwsan.com/omap/testlogs/test_v3.8/20130218214403/pm/37xxevm/37xxevm_log.txt Mailing list discussion is here: http://lkml.org/lkml/2013/3/4/221 Deal with this for v3.9 by reverting the problem commit, until folks can figure out the right long-term course of action. Signed-off-by: Paul Walmsley Cc: Mandeep Singh Baines Cc: Jeff Layton Cc: Shawn Guo Cc: Cc: Fengguang Wu Cc: Trond Myklebust Cc: Ingo Molnar Cc: Ben Chan Cc: Oleg Nesterov Cc: Tejun Heo Cc: Rafael J. 
Wysocki Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debug_locks.h | 4 ++-- include/linux/freezer.h | 3 --- kernel/exit.c | 2 +- kernel/lockdep.c | 17 +++++++++-------- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index a975de1ff59f..3bd46f766751 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -51,7 +51,7 @@ struct task_struct; extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void debug_check_no_locks_held(void); +extern void debug_check_no_locks_held(struct task_struct *task); #else static inline void debug_show_all_locks(void) { @@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len) } static inline void -debug_check_no_locks_held(void) +debug_check_no_locks_held(struct task_struct *task) { } #endif diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 043a5cf8b5ba..e70df40d84f6 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -3,7 +3,6 @@ #ifndef FREEZER_H_INCLUDED #define FREEZER_H_INCLUDED -#include #include #include #include @@ -49,8 +48,6 @@ extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { - if (!(current->flags & PF_NOFREEZE)) - debug_check_no_locks_held(); might_sleep(); if (likely(!freezing(current))) return false; diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca9935..60bc027c61c3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(); + debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d9..8a0efac4f99d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr) { if (!debug_locks_off()) return; @@ -4097,21 +4097,22 @@ static void print_held_locks_bug(void) printk("\n"); printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", - current->comm, task_pid_nr(current)); + printk("[ BUG: lock held at task exit time! ]\n"); print_kernel_ident(); printk("-------------------------------------\n"); - lockdep_print_held_locks(current); + printk("%s/%d is exiting with locks still held!\n", + curr->comm, task_pid_nr(curr)); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task) { - if (unlikely(current->lockdep_depth > 0)) - print_held_locks_bug(); + if (unlikely(task->lockdep_depth > 0)) + print_held_locks_bug(task); } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3 From d55262c4d164759a8debe772da6c9b16059dec47 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:38 -0700 Subject: workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity Unbound workqueues are now NUMA aware. 
Let's add some control knobs and update sysfs interface accordingly. * Add kernel param workqueue.numa_disable which disables NUMA affinity globally. * Replace sysfs file "pool_id" with "pool_ids" which contain node:pool_id pairs. This change is userland-visible but "pool_id" hasn't seen a release yet, so this is okay. * Add a new sysf files "numa" which can toggle NUMA affinity on individual workqueues. This is implemented as attrs->no_numa whichn is special in that it isn't part of a pool's attributes. It only affects how apply_workqueue_attrs() picks which pools to use. After "pool_ids" change, first_pwq() doesn't have any user left. Removed. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- Documentation/kernel-parameters.txt | 9 ++++ include/linux/workqueue.h | 5 +++ kernel/workqueue.c | 82 ++++++++++++++++++++++++++----------- 3 files changed, 73 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..c75ea0b8ec59 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. or other driver-specific files in the Documentation/watchdog/ directory. + workqueue.disable_numa + By default, all work items queued to unbound + workqueues are affine to the NUMA nodes they're + issued on, which results in better behavior in + general. If NUMA affinity needs to be disabled for + whatever reason, this option can be used. Note + that this also can be controlled per-workqueue for + workqueues visible under /sys/bus/workqueue/. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 835d12b76960..717975639378 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -119,10 +119,15 @@ struct delayed_work { /* * A struct for workqueue attributes. This can be used to change * attributes of an unbound workqueue. + * + * Unlike other fields, ->no_numa isn't a property of a worker_pool. It + * only modifies how apply_workqueue_attrs() select pools and thus doesn't + * participate in pool hash calculations or equality comparisons. */ struct workqueue_attrs { int nice; /* nice level */ cpumask_var_t cpumask; /* allowed CPUs */ + bool no_numa; /* disable NUMA affinity */ }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 57cd77de4a4f..729ac6a44860 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -516,21 +519,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * first_pwq - return the first pool_workqueue of the specified workqueue - * @wq: the target workqueue - * - * This must be called either with wq->mutex held or sched RCU read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. 
- */ -static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) -{ - assert_rcu_or_wq_mutex(wq); - return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, - pwqs_node); -} - /** * unbound_pwq_by_node - return the unbound pool_workqueue for the given node * @wq: the target workqueue @@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = { __ATTR_NULL, }; -static ssize_t wq_pool_id_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); - struct worker_pool *pool; - int written; + const char *delim = ""; + int node, written = 0; rcu_read_lock_sched(); - pool = first_pwq(wq)->pool; - written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock_sched(); return written; @@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev, return ret ?: count; } +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), __ATTR_NULL, }; @@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { - if (!wq_numa_enabled) + if (!wq_numa_enabled || attrs->no_numa) goto use_dfl; /* does @node have any online CPUs @attrs wants? */ @@ -3951,6 +3980,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, cpumask = target_attrs->cpumask; mutex_lock(&wq->mutex); + if (wq->unbound_attrs->no_numa) + goto out_unlock; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); @@ -4763,6 +4794,11 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; + if (wq_disable_numa) { + pr_info("workqueue: NUMA affinity support disabled\n"); + return; + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); BUG_ON(!wq_update_unbound_numa_attrs_buf); -- cgit v1.2.3 From 181387da2d64c3129e5b5186c4dd388bc5041d53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: remove unused bdi_pending_list There's no user left. Remove it. 
Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu --- include/linux/backing-dev.h | 1 - mm/backing-dev.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 350459910fe1..a5ef27f5411a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -130,7 +130,6 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41733c5dc820..657569b3fcf6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -31,13 +31,11 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as - * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side * locking. */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); -LIST_HEAD(bdi_pending_list); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { -- cgit v1.2.3 From 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: replace custom worker pool implementation with unbound workqueue Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread which forks off worker threads for other bdis. there's no reason for writeback to implement its own worker pool when using unbound workqueue instead is much simpler and more efficient. This patch replaces custom worker pool implementation in writeback with an unbound workqueue. The conversion isn't too complicated but the followings are worth mentioning. * bdi_writeback->last_active, task and wakeup_timer are removed. delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary. Everything works by either queueing / modding / flushing / canceling the delayed_work item. * bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there are more things to do. The function also handles low-mem condition, which used to be handled by the forker thread. If the function is running off a rescuer thread, it only writes out limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher creation failure behavior of the forker thread. * INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about on-going bdi unregistration so that it always drains work_list even if it's running off the rescuer. Note that the original code was broken in this regard. Under memory pressure, a bdi could finish unregistration with non-empty work_list. * The default bdi is no longer special. It now is treated the same as any other bdi and bdi_cap_flush_forker() is removed. * BDI_pending is no longer used. Removed. * Some tracepoints become non-applicable. The following TPs are removed - writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start, writeback_thread_stop. 
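A minimal sketch of the resulting delayed_work pattern described above (condensed from the diff below):

	/* wake the flusher now, instead of waking a dedicated thread */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);

	/* wake it after the dirty writeback interval, instead of arming a timer */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork,
			 msecs_to_jiffies(dirty_writeback_interval * 10));

	/* shutdown: force a final run, wait for it, then cancel anything left */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
	flush_delayed_work(&bdi->wb.dwork);
	cancel_delayed_work_sync(&bdi->wb.dwork);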
Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer --- fs/fs-writeback.c | 102 +++++----------- include/linux/backing-dev.h | 15 +-- include/trace/events/writeback.h | 5 - mm/backing-dev.c | 255 +++++---------------------------------- 4 files changed, 65 insertions(+), 312 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..8067d3719e94 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. 
Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a5ef27f5411a..c3881553f7d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,6 +18,7 @@ #include #include #include +#include struct page; struct device; @@ -27,7 +28,6 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_pending, /* On its way to being activated */ BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ @@ -53,10 +53,8 @@ struct bdi_writeback { unsigned int nr; unsigned long last_old_flush; /* last old data flush */ - unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ + struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -123,7 +121,7 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_thread(void *data); +void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); @@ -131,6 +129,8 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct workqueue_struct *bdi_wq; + static inline int wb_has_dirty_io(struct 
bdi_writeback *wb) { return !list_empty(&wb->b_dirty) || @@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) return bdi->capabilities & BDI_CAP_SWAP_BACKED; } -static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) -{ - return bdi == &default_backing_dev_info; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 6a16fd2e70ed..464ea82e10db 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class, DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ TP_ARGS(bdi, work)) -DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \ DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_wake_thread); -DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); -DEFINE_WRITEBACK_EVENT(writeback_thread_start); -DEFINE_WRITEBACK_EVENT(writeback_thread_stop); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 657569b3fcf6..2857d4f6bca4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -37,6 +37,9 @@ static struct class *bdi_class; DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; + void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -255,6 +258,11 @@ static int __init default_bdi_init(void) { int err; + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND, 0); + if (!bdi_wq) + return -ENOMEM; + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - /* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the @@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. 
- */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. - */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. 
- */
- spin_lock_bh(&bdi->wb_lock);
- bdi->wb.task = task;
- spin_unlock_bh(&bdi->wb_lock);
- wake_up_process(task);
- }
- bdi_clear_pending(bdi);
- break;
-
- case KILL_THREAD:
- __set_current_state(TASK_RUNNING);
- kthread_stop(task);
- bdi_clear_pending(bdi);
- break;
-
- case NO_ACTION:
- if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
- /*
- * There are no dirty data. The only thing we
- * should now care about is checking for
- * inactive bdi threads and killing them. Thus,
- * let's sleep for longer time, save energy and
- * be friendly for battery-driven devices.
- */
- schedule_timeout(bdi_longest_inactive());
- else
- schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
- try_to_freeze();
- break;
- }
- }
-
- return 0;
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
 }
 /*
@@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 spin_unlock_bh(&bdi_lock);
 synchronize_rcu_expedited();
+
+ /* bdi_list is now unused, clear it to mark @bdi dying */
+ INIT_LIST_HEAD(&bdi->bdi_list);
 }
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 bdi->dev = dev;
- /*
- * Just start the forker thread for our default backing_dev_info,
- * and add other bdi's to the list. They will get a thread created
- * on-demand when they need it.
- */
- if (bdi_cap_flush_forker(bdi)) {
- struct bdi_writeback *wb = &bdi->wb;
-
- wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
- dev_name(dev));
- if (IS_ERR(wb->task))
- return PTR_ERR(wb->task);
- }
-
 bdi_debug_register(bdi, dev_name(dev));
 set_bit(BDI_registered, &bdi->state);
@@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
 */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
- struct task_struct *task;
-
 if (!bdi_cap_writeback_dirty(bdi))
 return;
@@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 bdi_remove_from_list(bdi);
 /*
- * If setup is pending, wait for that to complete first
+ * Drain work list and shutdown the delayed_work. At this point,
+ * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
+ * is dying and its work_list needs to be drained no matter what.
 */
- wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ flush_delayed_work(&bdi->wb.dwork);
+ WARN_ON(!list_empty(&bdi->work_list));
 /*
- * This shouldn't be necessary unless @bdi for some reason has
- * unflushed dirty IO after work_list is drained. Do it anyway
- * just in case.
+ * This shouldn't be necessary unless @bdi for some reason has
+ * unflushed dirty IO after work_list is drained. Do it anyway
+ * just in case.
 */
- spin_lock_bh(&bdi->wb_lock);
- task = bdi->wb.task;
- bdi->wb.task = NULL;
- spin_unlock_bh(&bdi->wb_lock);
-
- if (task)
- kthread_stop(task);
+ cancel_delayed_work_sync(&bdi->wb.dwork);
 }
 /*
@@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
 bdi_set_min_ratio(bdi, 0);
 trace_writeback_bdi_unregister(bdi);
 bdi_prune_sb(bdi);
- del_timer_sync(&bdi->wb.wakeup_timer);
- if (!bdi_cap_flush_forker(bdi))
- bdi_wb_shutdown(bdi);
+ bdi_wb_shutdown(bdi);
 bdi_debug_unregister(bdi);
 spin_lock_bh(&bdi->wb_lock);
@@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 INIT_LIST_HEAD(&wb->b_io);
 INIT_LIST_HEAD(&wb->b_more_io);
 spin_lock_init(&wb->list_lock);
- setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+ INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
 /*
@@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
 bdi_unregister(bdi);
 /*
- * If bdi_unregister() had already been called earlier, the
- * wakeup_timer could still be armed because bdi_prune_sb()
- * can race with the bdi_wakeup_thread_delayed() calls from
- * __mark_inode_dirty().
+ * If bdi_unregister() had already been called earlier, the dwork
+ * could still be pending because bdi_prune_sb() can race with the
+ * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
 */
- del_timer_sync(&bdi->wb.wakeup_timer);
+ cancel_delayed_work_sync(&bdi->wb.dwork);
 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 percpu_counter_destroy(&bdi->bdi_stat[i]);
-- cgit v1.2.3
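The conversion above boils down to a single pattern: one unbound workqueue (bdi_wq) plus a per-bdi delayed_work that is armed with mod_delayed_work() and drained at shutdown with flush_delayed_work()/cancel_delayed_work_sync(). The following minimal kernel-module sketch shows that pattern in isolation; every demo_* name is invented for illustration, and only the workqueue API calls mirror what mm/backing-dev.c does after this patch.

/*
 * Sketch of the delayed_work pattern adopted by the writeback conversion:
 * an unbound, freezable, reclaim-safe workqueue plus one delayed_work,
 * re-armed with mod_delayed_work() and torn down with flush/cancel.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;	/* stand-in for bdi_wq */

struct demo_dev {
	struct delayed_work dwork;		/* stand-in for bdi->wb.dwork */
};

static struct demo_dev demo;

static void demo_workfn(struct work_struct *work)
{
	struct demo_dev *dev = container_of(to_delayed_work(work),
					    struct demo_dev, dwork);

	/* real code would write back dirty pages for @dev here */
	pr_info("demo_workfn ran for %p\n", dev);
}

static int __init demo_init(void)
{
	/* same flags the patch uses for the "writeback" workqueue */
	demo_wq = alloc_workqueue("demo_writeback",
				  WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, 0);
	if (!demo_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo.dwork, demo_workfn);

	/* analogous to bdi_wakeup_thread_delayed(): (re)arm the work */
	mod_delayed_work(demo_wq, &demo.dwork, msecs_to_jiffies(5000));
	return 0;
}

static void __exit demo_exit(void)
{
	/* analogous to bdi_wb_shutdown(): run it now, wait, then cancel */
	mod_delayed_work(demo_wq, &demo.dwork, 0);
	flush_delayed_work(&demo.dwork);
	cancel_delayed_work_sync(&demo.dwork);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The choice of mod_delayed_work() over queue_delayed_work() is what makes the timer-replacement work: it re-arms an already-pending item to the new timeout, which is what bdi_wakeup_thread_delayed() relies on, and a zero delay makes the pending work run immediately, which is how bdi_wb_shutdown() forces a final drain before cancelling.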