From 4b8164b91d9fdff4dbac0a742d076bdff7fda21b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 31 Jan 2015 20:08:47 -0500 Subject: new helper: dup_iter() Copy iter and kmemdup the underlying array for the copy. Returns a pointer to result of kmemdup() to be kfree()'d later. Signed-off-by: Al Viro --- include/linux/uio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 07a022641996..71880299ed48 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -98,6 +98,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); +const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); + static inline size_t iov_iter_count(struct iov_iter *i) { return i->count; -- cgit v1.2.3 From 846cd66788b11105a62785078360c8854aa98310 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 18 Feb 2015 11:38:06 +0100 Subject: net: Initialize all members in skb_gro_remcsum_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit skb_gro_remcsum_init() initializes the gro_remcsum.delta member only, leading to compiler warnings about a possibly uninitialized gro_remcsum.offset member: drivers/net/vxlan.c: In function ‘vxlan_gro_receive’: drivers/net/vxlan.c:602: warning: ‘grc.offset’ may be used uninitialized in this function net/ipv4/fou.c: In function ‘gue_gro_receive’: net/ipv4/fou.c:262: warning: ‘grc.offset’ may be used uninitialized in this function While these are harmless for now: - skb_gro_remcsum_process() sets offset before changing delta, - skb_gro_remcsum_cleanup() checks if delta is non-zero before accessing offset, it's safer to let the initialization function initialize all members. Signed-off-by: Geert Uytterhoeven Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5897b4ea5a3f..429d1790a27e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2342,6 +2342,7 @@ struct gro_remcsum { static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) { + grc->offset = 0; grc->delta = 0; } -- cgit v1.2.3 From b9ebafbe8cfeeddec881504c446cccd0d87a51b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Feb 2015 06:48:57 -0800 Subject: rhashtable: ensure cache line alignment on bucket_table struct bucket_table contains mostly read fields : size, locks_mask, locks. Make sure these are not sharing a cache line with buckets[] Signed-off-by: Eric Dumazet Acked-by: Daniel Borkmann Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 58851275fed9..cb2104be2135 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -54,10 +54,11 @@ struct rhash_head { * @buckets: size * hash buckets */ struct bucket_table { - size_t size; - unsigned int locks_mask; - spinlock_t *locks; - struct rhash_head __rcu *buckets[]; + size_t size; + unsigned int locks_mask; + spinlock_t *locks; + + struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); -- cgit v1.2.3 From 737eb0301f296d55c22350c6968ff1ef51bacb5f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 20 Feb 2015 14:53:46 +0000 Subject: genirq / PM: better describe IRQF_NO_SUSPEND semantics The IRQF_NO_SUSPEND flag is intended to be used for interrupts required to be enabled during the suspend-resume cycle. This mostly consists of IPIs and timer interrupts, potentially including chained irqchip interrupts if these are necessary to handle timers or IPIs. If an interrupt does not fall into one of the aforementioned categories, requesting it with IRQF_NO_SUSPEND is likely incorrect. Using IRQF_NO_SUSPEND does not guarantee that the interrupt can wake the system from a suspended state. For an interrupt to be able to trigger a wakeup, it may be necessary to program various components of the system. In these cases it is necessary to use {enable,disabled}_irq_wake. Unfortunately, several drivers assume that IRQF_NO_SUSPEND ensures that an IRQ can wake up the system, and the documentation can be read ambiguously w.r.t. this property. This patch updates the documentation regarding IRQF_NO_SUSPEND to make this caveat explicit, hopefully making future misuse rarer. Cleanup of existing misuse will occur as part of later patch series. Signed-off-by: Mark Rutland Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- Documentation/power/suspend-and-interrupts.txt | 6 ++++-- include/linux/interrupt.h | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt index 2f9c5a5fcb25..50493c9284b4 100644 --- a/Documentation/power/suspend-and-interrupts.txt +++ b/Documentation/power/suspend-and-interrupts.txt @@ -40,8 +40,10 @@ but also to IPIs and to some other special-purpose interrupts. The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when requesting a special-purpose interrupt. It causes suspend_device_irqs() to -leave the corresponding IRQ enabled so as to allow the interrupt to work all -the time as expected. +leave the corresponding IRQ enabled so as to allow the interrupt to work as +expected during the suspend-resume cycle, but does not guarantee that the +interrupt will wake the system from a suspended state -- for such cases it is +necessary to use enable_irq_wake(). Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one user of it. Thus, if the IRQ is shared, all of the interrupt handlers installed diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index d9b05b5bf8c7..606771c7cac2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -52,7 +52,9 @@ * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. - * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend + * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee + * that this interrupt will wake the system from a suspended + * state. See Documentation/power/suspend-and-interrupts.txt * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device -- cgit v1.2.3 From bc4b1f486fe69b86769e07c8edce472327a8462b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Sun, 15 Feb 2015 11:57:53 +0700 Subject: Revert "USB: serial: make bulk_out_size a lower limit" This reverts commit 5083fd7bdfe6760577235a724cf6dccae13652c2. A bulk-out size smaller than the end-point size is indeed valid. The offending commit broke the usb-debug driver for EHCI debug devices, which use 8-byte buffers. Fixes: 5083fd7bdfe6 ("USB: serial: make bulk_out_size a lower limit") Reported-by: "Li, Elvin" Cc: stable # v3.15 Signed-off-by: Johan Hovold --- drivers/usb/serial/usb-serial.c | 5 +++-- include/linux/usb/serial.h | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 475723c006f9..19842370a07f 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -940,8 +940,9 @@ static int usb_serial_probe(struct usb_interface *interface, port = serial->port[i]; if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) goto probe_error; - buffer_size = max_t(int, serial->type->bulk_out_size, - usb_endpoint_maxp(endpoint)); + buffer_size = serial->type->bulk_out_size; + if (!buffer_size) + buffer_size = usb_endpoint_maxp(endpoint); port->bulk_out_size = buffer_size; port->bulk_out_endpointAddress = endpoint->bEndpointAddress; diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 9bb547c7bce7..704a1ab8240c 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -190,8 +190,7 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) * @num_ports: the number of different ports this device will have. * @bulk_in_size: minimum number of bytes to allocate for bulk-in buffer * (0 = end-point size) - * @bulk_out_size: minimum number of bytes to allocate for bulk-out buffer - * (0 = end-point size) + * @bulk_out_size: bytes to allocate for bulk-out buffer (0 = end-point size) * @calc_num_ports: pointer to a function to determine how many ports this * device has dynamically. It will be called after the probe() * callback is called, but before attach() -- cgit v1.2.3 From 09ee96b21456883e108c3b00597bb37ec512151b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 26 Feb 2015 11:41:28 -0500 Subject: dm snapshot: suspend merging snapshot when doing exception handover The "dm snapshot: suspend origin when doing exception handover" commit fixed a exception store handover bug associated with pending exceptions to the "snapshot-origin" target. However, a similar problem exists in snapshot merging. When snapshot merging is in progress, we use the target "snapshot-merge" instead of "snapshot-origin". Consequently, during exception store handover, we must find the snapshot-merge target and suspend its associated mapped_device. To avoid lockdep warnings, the target must be suspended and resumed without holding _origins_lock. Introduce a dm_hold() function that grabs a reference on a mapped_device, but unlike dm_get(), it doesn't crash if the device has the DMF_FREEING flag set, it returns an error in this case. In snapshot_resume() we grab the reference to the origin device using dm_hold() while holding _origins_lock (_origins_lock guarantees that the device won't disappear). Then we release _origins_lock, suspend the device and grab _origins_lock again. NOTE to stable@ people: When backporting to kernels 3.18 and older, use dm_internal_suspend and dm_internal_resume instead of dm_internal_suspend_fast and dm_internal_resume_fast. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-snap.c | 35 +++++++++++++++++++++++++++++------ drivers/md/dm.c | 13 +++++++++++++ include/linux/device-mapper.h | 1 + 3 files changed, 43 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c2bf822bad6f..f83a0f3fc365 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1888,20 +1888,39 @@ static int snapshot_preresume(struct dm_target *ti) static void snapshot_resume(struct dm_target *ti) { struct dm_snapshot *s = ti->private; - struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL; struct dm_origin *o; struct mapped_device *origin_md = NULL; + bool must_restart_merging = false; down_read(&_origins_lock); o = __lookup_dm_origin(s->origin->bdev); if (o) origin_md = dm_table_get_md(o->ti->table); + if (!origin_md) { + (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging); + if (snap_merging) + origin_md = dm_table_get_md(snap_merging->ti->table); + } if (origin_md == dm_table_get_md(ti->table)) origin_md = NULL; + if (origin_md) { + if (dm_hold(origin_md)) + origin_md = NULL; + } - if (origin_md) + up_read(&_origins_lock); + + if (origin_md) { dm_internal_suspend_fast(origin_md); + if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) { + must_restart_merging = true; + stop_merge(snap_merging); + } + } + + down_read(&_origins_lock); (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { @@ -1912,11 +1931,15 @@ static void snapshot_resume(struct dm_target *ti) up_write(&snap_src->lock); } - if (origin_md) - dm_internal_resume_fast(origin_md); - up_read(&_origins_lock); + if (origin_md) { + if (must_restart_merging) + start_merge(snap_merging); + dm_internal_resume_fast(origin_md); + dm_put(origin_md); + } + /* Now we have correct chunk size, reregister */ reregister_snapshot(s); @@ -2360,7 +2383,7 @@ static struct target_type snapshot_target = { static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e2b2e97abe9..9b641b38b857 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2616,6 +2616,19 @@ void dm_get(struct mapped_device *md) BUG_ON(test_bit(DMF_FREEING, &md->flags)); } +int dm_hold(struct mapped_device *md) +{ + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags)) { + spin_unlock(&_minor_lock); + return -EBUSY; + } + dm_get(md); + spin_unlock(&_minor_lock); + return 0; +} +EXPORT_SYMBOL_GPL(dm_hold); + const char *dm_device_name(struct mapped_device *md) { return md->name; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2646aed1d3fe..fd23978d93fe 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -375,6 +375,7 @@ int dm_create(int minor, struct mapped_device **md); */ struct mapped_device *dm_get_md(dev_t dev); void dm_get(struct mapped_device *md); +int dm_hold(struct mapped_device *md); void dm_put(struct mapped_device *md); /* -- cgit v1.2.3 From 4c4b52d9b2df45e8216d3e30b5452e4a364d2cac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 25 Feb 2015 16:31:54 +0100 Subject: rhashtable: remove indirection for grow/shrink decision functions Currently, all real users of rhashtable default their grow and shrink decision functions to rht_grow_above_75() and rht_shrink_below_30(), so that there's currently no need to have this explicitly selectable. It can/should be generic and private inside rhashtable until a real use case pops up. Since we can make this private, we'll save us this additional indirection layer and can improve insertion/deletion time as well. Reference: http://patchwork.ozlabs.org/patch/443040/ Suggested-by: David S. Miller Signed-off-by: Daniel Borkmann Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 13 ----------- lib/rhashtable.c | 56 ++++++++++++++-------------------------------- lib/test_rhashtable.c | 1 + net/netfilter/nft_hash.c | 2 -- net/netlink/af_netlink.c | 2 -- net/tipc/socket.c | 2 -- 6 files changed, 18 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index cb2104be2135..d438eeb08bff 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -79,12 +79,6 @@ struct rhashtable; * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object - * @grow_decision: If defined, may return true if table should expand - * @shrink_decision: If defined, may return true if table should shrink - * - * Note: when implementing the grow and shrink decision function, min/max - * shift must be enforced, otherwise, resizing watermarks they set may be - * useless. */ struct rhashtable_params { size_t nelem_hint; @@ -98,10 +92,6 @@ struct rhashtable_params { size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; - bool (*grow_decision)(const struct rhashtable *ht, - size_t new_size); - bool (*shrink_decision)(const struct rhashtable *ht, - size_t new_size); }; /** @@ -193,9 +183,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); - int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index bcf119bfdef4..090641db4c0d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -247,26 +247,24 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, * @ht: hash table * @new_size: new table size */ -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) +static bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (new_size / 4 * 3) && (!ht->p.max_shift || atomic_read(&ht->shift) < ht->p.max_shift); } -EXPORT_SYMBOL_GPL(rht_grow_above_75); /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @new_size: new table size */ -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) +static bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (new_size * 3 / 10) && (atomic_read(&ht->shift) > ht->p.min_shift); } -EXPORT_SYMBOL_GPL(rht_shrink_below_30); static void lock_buckets(struct bucket_table *new_tbl, struct bucket_table *old_tbl, unsigned int hash) @@ -528,40 +526,19 @@ static void rht_deferred_worker(struct work_struct *work) list_for_each_entry(walker, &ht->walkers, list) walker->resize = true; - if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + if (rht_grow_above_75(ht, tbl->size)) rhashtable_expand(ht); - else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) + else if (rht_shrink_below_30(ht, tbl->size)) rhashtable_shrink(ht); - unlock: mutex_unlock(&ht->mutex); } -static void rhashtable_probe_expand(struct rhashtable *ht) -{ - const struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Only adjust the table if no resizing is currently in progress. */ - if (tbl == new_tbl && ht->p.grow_decision && - ht->p.grow_decision(ht, tbl->size)) - schedule_work(&ht->run_work); -} - -static void rhashtable_probe_shrink(struct rhashtable *ht) -{ - const struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - /* Only adjust the table if no resizing is currently in progress. */ - if (tbl == new_tbl && ht->p.shrink_decision && - ht->p.shrink_decision(ht, tbl->size)) - schedule_work(&ht->run_work); -} - static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, - struct bucket_table *tbl, u32 hash) + struct bucket_table *tbl, + const struct bucket_table *old_tbl, u32 hash) { + bool no_resize_running = tbl == old_tbl; struct rhash_head *head; hash = rht_bucket_index(tbl, hash); @@ -577,8 +554,8 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); - - rhashtable_probe_expand(ht); + if (no_resize_running && rht_grow_above_75(ht, tbl->size)) + schedule_work(&ht->run_work); } /** @@ -608,7 +585,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); lock_buckets(tbl, old_tbl, hash); - __rhashtable_insert(ht, obj, tbl, hash); + __rhashtable_insert(ht, obj, tbl, old_tbl, hash); unlock_buckets(tbl, old_tbl, hash); rcu_read_unlock(); @@ -690,8 +667,11 @@ found: unlock_buckets(new_tbl, old_tbl, new_hash); if (ret) { + bool no_resize_running = new_tbl == old_tbl; + atomic_dec(&ht->nelems); - rhashtable_probe_shrink(ht); + if (no_resize_running && rht_shrink_below_30(ht, new_tbl->size)) + schedule_work(&ht->run_work); } rcu_read_unlock(); @@ -861,7 +841,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, goto exit; } - __rhashtable_insert(ht, obj, new_tbl, new_hash); + __rhashtable_insert(ht, obj, new_tbl, old_tbl, new_hash); exit: unlock_buckets(new_tbl, old_tbl, new_hash); @@ -1123,8 +1103,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (!ht->p.hash_rnd) get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); - if (ht->p.grow_decision || ht->p.shrink_decision) - INIT_WORK(&ht->run_work, rht_deferred_worker); + INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } @@ -1142,8 +1121,7 @@ void rhashtable_destroy(struct rhashtable *ht) { ht->being_destroyed = true; - if (ht->p.grow_decision || ht->p.shrink_decision) - cancel_work_sync(&ht->run_work); + cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); bucket_table_free(rht_dereference(ht->tbl, ht)); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index f9e9d734446a..67c7593d1dd6 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -201,6 +201,7 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, + .max_shift = 1, /* we expand/shrink manually here */ .nulls_base = (3U << RHT_BASE_SHIFT), }; int err; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 61e6c407476a..c82df0a48fcd 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -192,8 +192,6 @@ static int nft_hash_init(const struct nft_set *set, .key_offset = offsetof(struct nft_hash_elem, key), .key_len = set->klen, .hashfn = jhash, - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; return rhashtable_init(priv, ¶ms); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2702673f0f23..05919bf3f670 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3126,8 +3126,6 @@ static int __init netlink_proto_init(void) .key_len = sizeof(u32), /* portid */ .hashfn = jhash, .max_shift = 16, /* 64K */ - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; if (err != 0) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f73e975af80b..b4d4467d0bb0 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2364,8 +2364,6 @@ int tipc_sk_rht_init(struct net *net) .hashfn = jhash, .max_shift = 20, /* 1M */ .min_shift = 8, /* 256 */ - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, }; return rhashtable_init(&tn->sk_rht, &rht_params); -- cgit v1.2.3 From 140e049c64ce848392adbf4678983ecc76888dde Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 17:42:42 -0500 Subject: NFS: Add a helper to set attribute barriers Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 16 ++++++++++++++++ include/linux/nfs_fs.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 83107be3dd01..b0cbc1ba82da 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1260,6 +1260,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_fattr_init); +/** + * nfs_fattr_set_barrier + * @fattr: attributes + * + * Used to set a barrier after an attribute was updated. This + * barrier ensures that older attributes from RPC calls that may + * have raced with our update cannot clobber these new values. + * Note that you are still responsible for ensuring that other + * operations which change the attribute on the server do not + * collide. + */ +void nfs_fattr_set_barrier(struct nfs_fattr *fattr) +{ + fattr->gencount = nfs_inc_attr_generation_counter(); +} + struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 2f77e0c651c8..3a4ffb5856cd 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -369,6 +369,7 @@ extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ct extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); +extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr); extern unsigned long nfs_inc_attr_generation_counter(void); extern struct nfs_fattr *nfs_alloc_fattr(void); -- cgit v1.2.3 From f044636d972246d451e06226cc1675d5da389762 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 16:09:04 -0500 Subject: NFS: Add attribute update barriers to nfs_setattr_update_inode() Ensure that other operations which raced with our setattr RPC call cannot revert the file attribute changes that were made on the server. To do so, we artificially bump the attribute generation counter on the inode so that all calls to nfs_fattr_init() that precede ours will be dropped. The motivation for the patch came from Chuck Lever's reports of readaheads racing with truncate operations and causing the file size to be reverted. Reported-by: Chuck Lever Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 17 ++++++++++++----- fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4proc.c | 6 +++--- fs/nfs/proc.c | 2 +- include/linux/nfs_fs.h | 2 +- 5 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b0cbc1ba82da..3a2d127de499 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -556,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr); * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. + * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { @@ -565,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) if (err) goto out; - spin_lock(&inode->i_lock); i_size_write(inode, offset); /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); + spin_lock(&inode->i_lock); out: return err; } @@ -585,10 +586,15 @@ out: * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ -void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, + struct nfs_fattr *fattr) { + /* Barrier: bump the attribute generation count. */ + nfs_fattr_set_barrier(fattr); + + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { - spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -600,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_gid = attr->ia_gid; nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); - spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } + nfs_update_inode(inode, fattr); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 78e557c3ab87..11109a137c0c 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e41340e957d..c499e02a58ca 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2416,8 +2416,8 @@ static int _nfs4_do_open(struct inode *dir, opendata->o_res.f_attr, sattr, state, label, olabel); if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } @@ -3291,7 +3291,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); if (status == 0) { - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); } nfs4_label_free(label); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b09cc23d6f43..6202bc0f11bb 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 3a4ffb5856cd..f26e64e0aff8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -356,7 +356,7 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); -extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); +extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); -- cgit v1.2.3 From a08a8cd375db9769588257e7782f6b6b68561b88 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Feb 2015 17:36:09 -0500 Subject: NFS: Add attribute update barriers to NFS writebacks Ensure that other operations that race with our write RPC calls cannot revert the file size updates that were made on the server. Signed-off-by: Trond Myklebust Tested-by: Chuck Lever --- fs/nfs/inode.c | 25 ++++++++++++++++++++++--- fs/nfs/internal.h | 1 + fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4proc.c | 2 +- fs/nfs/proc.c | 4 +--- fs/nfs/write.c | 30 ++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 7 files changed, 57 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 299bf7171a4d..ff9a6795da46 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1491,7 +1491,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** - * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode - pointer to inode * @fattr - updated attributes * @@ -1501,11 +1501,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); * * This function is mainly designed to be used by the ->write_done() functions. */ -int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int status; - spin_lock(&inode->i_lock); /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { @@ -1537,6 +1536,26 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b802fb3a2d99..9e6475bc5ba2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -459,6 +459,7 @@ void nfs_mark_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 11109a137c0c..1f11d2533ee4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -834,7 +834,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c499e02a58ca..b022e64b76a5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4237,7 +4237,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); - nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr); + nfs_writeback_update_inode(hdr); } return 0; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 6202bc0f11bb..c63189acd052 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = hdr->inode; - if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 595d81e354d1..849ed784d6ac 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1377,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode) return 0; } +static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, + struct nfs_fattr *fattr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return; + if (argp->offset + resp->count != fattr->size) + return; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + return; + /* Set attribute barrier */ + nfs_fattr_set_barrier(fattr); +} + +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) +{ + struct nfs_fattr *fattr = hdr->res.fattr; + struct inode *inode = hdr->inode; + + if (fattr == NULL) + return; + spin_lock(&inode->i_lock); + nfs_writeback_check_extend(hdr, fattr); + nfs_post_op_update_inode_force_wcc_locked(inode, fattr); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_writeback_update_inode); + /* * This function is called when the WRITE call is complete. */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f26e64e0aff8..59b1516b9fd4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -343,6 +343,7 @@ extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); +extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); -- cgit v1.2.3 From f5956fafb00afab474c3886b6297f9b5e7aff722 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 2 Mar 2015 18:22:15 +0200 Subject: net/mlx4_core: Fix wrong mask and error flow for the update-qp command The bit mask for currently supported driver features (MLX4_UPDATE_QP_SUPPORTED_ATTRS) of the update-qp command was defined twice (using enum value and pre-processor define directive) and wrong. The return value of the call to mlx4_update_qp() from within the SRIOV resource-tracker was wrongly voided down. Fix both issues. issue: none Fixes: 09e05c3f78e9 ('net/mlx4: Set vlan stripping policy by the right command') Fixes: ce8d9e0d6746 ('net/mlx4_core: Add UPDATE_QP SRIOV wrapper support') Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/qp.c | 1 - drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 9 ++++++--- include/linux/mlx4/qp.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 2bb8553bd905..eda29dbbfcd2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -412,7 +412,6 @@ err_icm: EXPORT_SYMBOL_GPL(mlx4_qp_alloc); -#define MLX4_UPDATE_QP_SUPPORTED_ATTRS MLX4_UPDATE_QP_SMAC int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn, enum mlx4_update_qp_attr attr, struct mlx4_update_qp_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 486e3d26cd4a..d97ca88c55b5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -713,7 +713,7 @@ static int update_vport_qp_param(struct mlx4_dev *dev, struct mlx4_vport_oper_state *vp_oper; struct mlx4_priv *priv; u32 qp_type; - int port; + int port, err = 0; port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1; priv = mlx4_priv(dev); @@ -738,7 +738,9 @@ static int update_vport_qp_param(struct mlx4_dev *dev, } else { struct mlx4_update_qp_params params = {.flags = 0}; - mlx4_update_qp(dev, qpn, MLX4_UPDATE_QP_VSD, ¶ms); + err = mlx4_update_qp(dev, qpn, MLX4_UPDATE_QP_VSD, ¶ms); + if (err) + goto out; } } @@ -773,7 +775,8 @@ static int update_vport_qp_param(struct mlx4_dev *dev, qpc->pri_path.feup |= MLX4_FSM_FORCE_ETH_SRC_MAC; qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx; } - return 0; +out: + return err; } static int mpt_mask(struct mlx4_dev *dev) diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 2bbc62aa818a..551f85456c11 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -427,7 +427,7 @@ struct mlx4_wqe_inline_seg { enum mlx4_update_qp_attr { MLX4_UPDATE_QP_SMAC = 1 << 0, - MLX4_UPDATE_QP_VSD = 1 << 2, + MLX4_UPDATE_QP_VSD = 1 << 1, MLX4_UPDATE_QP_SUPPORTED_ATTRS = (1 << 2) - 1 }; -- cgit v1.2.3 From c6331ba3d2d68758f36dbc3e09e648d312c24d97 Mon Sep 17 00:00:00 2001 From: Marcin Bis Date: Sun, 1 Mar 2015 13:49:32 +0100 Subject: spi: fix a typo in comment. alway -> always Signed-off-by: Marcin Bis Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index ed9489d893a4..856d34dde79b 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -649,7 +649,7 @@ struct spi_transfer { * sequence completes. On some systems, many such sequences can execute as * as single programmed DMA transfer. On all systems, these messages are * queued, and might complete after transactions to other devices. Messages - * sent to a given spi_device are alway executed in FIFO order. + * sent to a given spi_device are always executed in FIFO order. * * The code that submits an spi_message (and its spi_transfers) * to the lower layers is responsible for managing its memory. -- cgit v1.2.3 From 874f946376de57c8d6230b30ad71f742883fee3a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Mar 2015 23:32:08 -0500 Subject: NFS: Fix a regression in the read() syscall When invalidating the page cache for a regular file, we want to first sync all dirty data to disk and then call invalidate_inode_pages2(). The latter relies on nfs_launder_page() and nfs_release_page() to deal respectively with dirty pages, and unstable written pages. When commit 9590544694bec ("NFS: avoid deadlocks with loop-back mounted NFS filesystems.") changed the behaviour of nfs_release_page(), then it made it possible for invalidate_inode_pages2() to fail with an EBUSY. Unfortunately, that error is then propagated back to read(). Let's therefore work around the problem for now by protecting the call to sync the data and invalidate_inode_pages2() so that they are atomic w.r.t. the addition of new writes. Later on, we can revisit whether or not we still need nfs_launder_page() and nfs_release_page(). Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- fs/nfs/inode.c | 37 ++++++++++++++++++++++++++++++++++--- include/linux/nfs_fs.h | 1 + 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c045c7169fa0..41963ffca597 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -178,7 +178,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) @@ -199,7 +199,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5026c44a98e1..8edb7d049565 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1067,11 +1067,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) } /** - * nfs_revalidate_mapping - Revalidate the pagecache + * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping + * @may_lock - take inode->i_mutex? */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static int __nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping, + bool may_lock) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1120,7 +1123,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); + if (may_lock) { + mutex_lock(&inode->i_mutex); + ret = nfs_invalidate_mapping(inode, mapping); + mutex_unlock(&inode->i_mutex); + } else + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1130,6 +1138,29 @@ out: return ret; } +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, false); +} + +/** + * nfs_revalidate_mapping_protected - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * + * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex + * while invalidating the mapping. + */ +int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, true); +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 59b1516b9fd4..b01ccf371fdc 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -356,6 +356,7 @@ extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); +extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, -- cgit v1.2.3 From 17f480342026e54000731acaa69bf32787ce46cb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Feb 2015 00:07:55 +0100 Subject: genirq / PM: Add flag for shared NO_SUSPEND interrupt lines It currently is required that all users of NO_SUSPEND interrupt lines pass the IRQF_NO_SUSPEND flag when requesting the IRQ or the WARN_ON_ONCE() in irq_pm_install_action() will trigger. That is done to warn about situations in which unprepared interrupt handlers may be run unnecessarily for suspended devices and may attempt to access those devices by mistake. However, it may cause drivers that have no technical reasons for using IRQF_NO_SUSPEND to set that flag just because they happen to share the interrupt line with something like a timer. Moreover, the generic handling of wakeup interrupts introduced by commit 9ce7a25849e8 (genirq: Simplify wakeup mechanism) only works for IRQs without any NO_SUSPEND users, so the drivers of wakeup devices needing to use shared NO_SUSPEND interrupt lines for signaling system wakeup generally have to detect wakeup in their interrupt handlers. Thus if they happen to share an interrupt line with a NO_SUSPEND user, they also need to request that their interrupt handlers be run after suspend_device_irqs(). In both cases the reason for using IRQF_NO_SUSPEND is not because the driver in question has a genuine need to run its interrupt handler after suspend_device_irqs(), but because it happens to share the line with some other NO_SUSPEND user. Otherwise, the driver would do without IRQF_NO_SUSPEND just fine. To make it possible to specify that condition explicitly, introduce a new IRQ action handler flag for shared IRQs, IRQF_COND_SUSPEND, that, when set, will indicate to the IRQ core that the interrupt user is generally fine with suspending the IRQ, but it also can tolerate handler invocations after suspend_device_irqs() and, in particular, it is capable of detecting system wakeup and triggering it as appropriate from its interrupt handler. That will allow us to work around a problem with a shared timer interrupt line on at91 platforms. Link: http://marc.info/?l=linux-kernel&m=142252777602084&w=2 Link: http://marc.info/?t=142252775300011&r=1&w=2 Link: https://lkml.org/lkml/2014/12/15/552 Reported-by: Boris Brezillon Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland --- include/linux/interrupt.h | 5 +++++ include/linux/irqdesc.h | 1 + kernel/irq/manage.c | 7 ++++++- kernel/irq/pm.c | 7 ++++++- 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 606771c7cac2..2e88580194f0 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -59,6 +59,10 @@ * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. + * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this + * interrupt handler after suspending interrupts. For system + * wakeup devices users need to implement wakeup detection in + * their interrupt handlers. */ #define IRQF_DISABLED 0x00000020 #define IRQF_SHARED 0x00000080 @@ -72,6 +76,7 @@ #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 +#define IRQF_COND_SUSPEND 0x00040000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index faf433af425e..dd1109fb241e 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -78,6 +78,7 @@ struct irq_desc { #ifdef CONFIG_PM_SLEEP unsigned int nr_actions; unsigned int no_suspend_depth; + unsigned int cond_suspend_depth; unsigned int force_resume_depth; #endif #ifdef CONFIG_PROC_FS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 196a06fbc122..886d09e691d5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1474,8 +1474,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). + * + * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and + * it cannot be set along with IRQF_NO_SUSPEND. */ - if ((irqflags & IRQF_SHARED) && !dev_id) + if (((irqflags & IRQF_SHARED) && !dev_id) || + (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || + ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 3ca532592704..5204a6d1b985 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -43,9 +43,12 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && - desc->no_suspend_depth != desc->nr_actions); + (desc->no_suspend_depth + + desc->cond_suspend_depth) != desc->nr_actions); } /* @@ -61,6 +64,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc, int irq) -- cgit v1.2.3 From 40eeb111d7c88bfbc38e1dfe330bc4cec05e0806 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 5 Mar 2015 10:08:14 +0100 Subject: Revert "pinctrl: consumer: use correct retval for placeholder functions" This reverts commit 5a7d2efdd93f6c4bb6cd3d5df3d2f5611c9b87ac. As per discussion on the mailing list, this is not the right thing to do. NULL cookies are valid in the stubs. Reported-by: Wolfram Sang Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 72c0415d6c21..18eccefea06e 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -82,7 +82,7 @@ static inline int pinctrl_gpio_direction_output(unsigned gpio) static inline struct pinctrl * __must_check pinctrl_get(struct device *dev) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline void pinctrl_put(struct pinctrl *p) @@ -93,7 +93,7 @@ static inline struct pinctrl_state * __must_check pinctrl_lookup_state( struct pinctrl *p, const char *name) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline int pinctrl_select_state(struct pinctrl *p, @@ -104,7 +104,7 @@ static inline int pinctrl_select_state(struct pinctrl *p, static inline struct pinctrl * __must_check devm_pinctrl_get(struct device *dev) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline void devm_pinctrl_put(struct pinctrl *p) -- cgit v1.2.3 From 8603e1b30027f943cc9c1eef2b291d42c3347af1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Mar 2015 08:04:13 -0500 Subject: workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE cancel[_delayed]_work_sync() are implemented using __cancel_work_timer() which grabs the PENDING bit using try_to_grab_pending() and then flushes the work item with PENDING set to prevent the on-going execution of the work item from requeueing itself. try_to_grab_pending() can always grab PENDING bit without blocking except when someone else is doing the above flushing during cancelation. In that case, try_to_grab_pending() returns -ENOENT. In this case, __cancel_work_timer() currently invokes flush_work(). The assumption is that the completion of the work item is what the other canceling task would be waiting for too and thus waiting for the same condition and retrying should allow forward progress without excessive busy looping Unfortunately, this doesn't work if preemption is disabled or the latter task has real time priority. Let's say task A just got woken up from flush_work() by the completion of the target work item. If, before task A starts executing, task B gets scheduled and invokes __cancel_work_timer() on the same work item, its try_to_grab_pending() will return -ENOENT as the work item is still being canceled by task A and flush_work() will also immediately return false as the work item is no longer executing. This puts task B in a busy loop possibly preventing task A from executing and clearing the canceling state on the work item leading to a hang. task A task B worker executing work __cancel_work_timer() try_to_grab_pending() set work CANCELING flush_work() block for work completion completion, wakes up A __cancel_work_timer() while (forever) { try_to_grab_pending() -ENOENT as work is being canceled flush_work() false as work is no longer executing } This patch removes the possible hang by updating __cancel_work_timer() to explicitly wait for clearing of CANCELING rather than invoking flush_work() after try_to_grab_pending() fails with -ENOENT. Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com v3: bit_waitqueue() can't be used for work items defined in vmalloc area. Switched to custom wake function which matches the target work item and exclusive wait and wakeup. v2: v1 used wake_up() on bit_waitqueue() which leads to NULL deref if the target bit waitqueue has wait_bit_queue's on it. Use DEFINE_WAIT_BIT() and __wake_up_bit() instead. Reported by Tomeu Vizoso. Signed-off-by: Tejun Heo Reported-by: Rabin Vincent Cc: Tomeu Vizoso Cc: stable@vger.kernel.org Tested-by: Jesper Nilsson Tested-by: Rabin Vincent --- include/linux/workqueue.h | 3 ++- kernel/workqueue.c | 56 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 74db135f9957..f597846ff605 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -70,7 +70,8 @@ enum { /* data contains off-queue information when !WORK_STRUCT_PWQ */ WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, - WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE), + __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, + WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING), /* * When a work item is off queue, its high bits point to the last diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f28849394791..41ff75b478c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2728,19 +2728,57 @@ bool flush_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work); +struct cwt_wait { + wait_queue_t wait; + struct work_struct *work; +}; + +static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); + + if (cwait->work != key) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { ret = try_to_grab_pending(work, is_dwork, &flags); /* - * If someone else is canceling, wait for the same event it - * would be waiting for before retrying. + * If someone else is already canceling, wait for it to + * finish. flush_work() doesn't work for PREEMPT_NONE + * because we may get scheduled between @work's completion + * and the other canceling task resuming and clearing + * CANCELING - flush_work() will return false immediately + * as @work is no longer busy, try_to_grab_pending() will + * return -ENOENT as @work is still being canceled and the + * other canceling task won't be able to clear CANCELING as + * we're hogging the CPU. + * + * Let's wait for completion using a waitqueue. As this + * may lead to the thundering herd problem, use a custom + * wake function which matches @work along with exclusive + * wait and wakeup. */ - if (unlikely(ret == -ENOENT)) - flush_work(work); + if (unlikely(ret == -ENOENT)) { + struct cwt_wait cwait; + + init_wait(&cwait.wait); + cwait.wait.func = cwt_wakefn; + cwait.work = work; + + prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, + TASK_UNINTERRUPTIBLE); + if (work_is_canceling(work)) + schedule(); + finish_wait(&cancel_waitq, &cwait.wait); + } } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ @@ -2749,6 +2787,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) flush_work(work); clear_work_data(work); + + /* + * Paired with prepare_to_wait() above so that either + * waitqueue_active() is visible here or !work_is_canceling() is + * visible there. + */ + smp_mb(); + if (waitqueue_active(&cancel_waitq)) + __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); + return ret; } -- cgit v1.2.3 From ef2b22ac540c018bd574d1846ab95b9bfcf38702 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 2 Mar 2015 22:26:55 +0100 Subject: cpuidle / sleep: Use broadcast timer for states that stop local timer Commit 381063133246 (PM / sleep: Re-implement suspend-to-idle handling) overlooked the fact that entering some sufficiently deep idle states by CPUs may cause their local timers to stop and in those cases it is necessary to switch over to a broadcast timer prior to entering the idle state. If the cpuidle driver in use does not provide the new ->enter_freeze callback for any of the idle states, that problem affects suspend-to-idle too, but it is not taken into account after the changes made by commit 381063133246. Fix that by changing the definition of cpuidle_enter_freeze() and re-arranging of the code in cpuidle_idle_call(), so the former does not call cpuidle_enter() any more and the fallback case is handled by cpuidle_idle_call() directly. Fixes: 381063133246 (PM / sleep: Re-implement suspend-to-idle handling) Reported-and-tested-by: Lorenzo Pieralisi Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/cpuidle.c | 62 +++++++++++++++++------------------------------ include/linux/cpuidle.h | 17 +++++++++++-- kernel/sched/idle.c | 30 ++++++++++++++++------- 3 files changed, 58 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8b3e132b6a01..080bd2dbde4b 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -44,8 +44,8 @@ void disable_cpuidle(void) off = 1; } -static bool cpuidle_not_available(struct cpuidle_driver *drv, - struct cpuidle_device *dev) +bool cpuidle_not_available(struct cpuidle_driver *drv, + struct cpuidle_device *dev) { return off || !initialized || !drv || !dev || !dev->enabled; } @@ -72,14 +72,8 @@ int cpuidle_play_dead(void) return -ENODEV; } -/** - * cpuidle_find_deepest_state - Find deepest state meeting specific conditions. - * @drv: cpuidle driver for the given CPU. - * @dev: cpuidle device for the given CPU. - * @freeze: Whether or not the state should be suitable for suspend-to-idle. - */ -static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev, bool freeze) +static int find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev, bool freeze) { unsigned int latency_req = 0; int i, ret = freeze ? -1 : CPUIDLE_DRIVER_STATE_START - 1; @@ -98,6 +92,17 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return ret; } +/** + * cpuidle_find_deepest_state - Find the deepest available idle state. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. + */ +int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + return find_deepest_state(drv, dev, false); +} + static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { @@ -119,46 +124,26 @@ static void enter_freeze_proper(struct cpuidle_driver *drv, /** * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. * * If there are states with the ->enter_freeze callback, find the deepest of - * them and enter it with frozen tick. Otherwise, find the deepest state - * available and enter it normally. - * - * Returns with enabled interrupts. + * them and enter it with frozen tick. */ -void cpuidle_enter_freeze(void) +int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); - struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int index; - if (cpuidle_not_available(drv, dev)) - goto fallback; - /* * Find the deepest state with ->enter_freeze present, which guarantees * that interrupts won't be enabled when it exits and allows the tick to * be frozen safely. */ - index = cpuidle_find_deepest_state(drv, dev, true); - if (index >= 0) { + index = find_deepest_state(drv, dev, true); + if (index >= 0) enter_freeze_proper(drv, dev, index); - local_irq_enable(); - return; - } - /* - * It is not safe to freeze the tick, find the deepest state available - * at all and try to enter it normally. - */ - index = cpuidle_find_deepest_state(drv, dev, false); - if (index >= 0) { - cpuidle_enter(drv, dev, index); - return; - } - - fallback: - arch_cpu_idle(); + return index; } /** @@ -217,9 +202,6 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, */ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - if (cpuidle_not_available(drv, dev)) - return -ENODEV; - return cpuidle_curr_governor->select(drv, dev); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index f551a9299ac9..306178d7309f 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -126,6 +126,8 @@ struct cpuidle_driver { #ifdef CONFIG_CPU_IDLE extern void disable_cpuidle(void); +extern bool cpuidle_not_available(struct cpuidle_driver *drv, + struct cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev); @@ -150,11 +152,17 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); -extern void cpuidle_enter_freeze(void); +extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev); +extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, + struct cpuidle_device *dev); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else static inline void disable_cpuidle(void) { } +static inline bool cpuidle_not_available(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{return true; } static inline int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } @@ -183,7 +191,12 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } -static inline void cpuidle_enter_freeze(void) { } +static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{return -ENODEV; } +static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 84b93b68482a..80014a178342 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -82,6 +82,7 @@ static void cpuidle_idle_call(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; unsigned int broadcast; + bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -105,6 +106,9 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + if (cpuidle_not_available(drv, dev)) + goto use_default; + /* * Suspend-to-idle ("freeze") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only @@ -115,15 +119,22 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ if (idle_should_freeze()) { - cpuidle_enter_freeze(); - goto exit_idle; - } + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state >= 0) { + local_irq_enable(); + goto exit_idle; + } - /* - * Ask the cpuidle framework to choose a convenient idle state. - * Fall back to the default arch idle method on errors. - */ - next_state = cpuidle_select(drv, dev); + reflect = false; + next_state = cpuidle_find_deepest_state(drv, dev); + } else { + reflect = true; + /* + * Ask the cpuidle framework to choose a convenient idle state. + */ + next_state = cpuidle_select(drv, dev); + } + /* Fall back to the default arch idle method on errors. */ if (next_state < 0) goto use_default; @@ -170,7 +181,8 @@ static void cpuidle_idle_call(void) /* * Give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); + if (reflect) + cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); -- cgit v1.2.3 From 2bb785169e9709d41220e5c18b0270883a82f85c Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 1 Mar 2015 10:18:16 -0500 Subject: serial: core: Fix iotype userspace breakage commit 3ffb1a8193bea ("serial: core: Add big-endian iotype") re-numbered userspace-dependent values; ioctl(TIOCSSERIAL) can assign the port iotype (which is expected to match the selected i/o accessors), so iotype values must not be changed. Cc: Kevin Cernekee Cc: # 3.19+ Signed-off-by: Peter Hurley Reviewed-by: Kevin Cernekee Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index baf3e1d08416..1094f2d9cadb 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -147,9 +147,9 @@ struct uart_port { #define UPIO_HUB6 (1) /* Hub6 ISA card */ #define UPIO_MEM (2) /* 8b MMIO access */ #define UPIO_MEM32 (3) /* 32b little endian */ -#define UPIO_MEM32BE (4) /* 32b big endian */ -#define UPIO_AU (5) /* Au1x00 and RT288x type IO */ -#define UPIO_TSI (6) /* Tsi108/109 type IO */ +#define UPIO_AU (4) /* Au1x00 and RT288x type IO */ +#define UPIO_TSI (5) /* Tsi108/109 type IO */ +#define UPIO_MEM32BE (6) /* 32b big endian */ unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ -- cgit v1.2.3 From 647f162b8e7e446c4bade031eb8a1a0a83d3de82 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 1 Mar 2015 10:24:28 -0500 Subject: serial: uapi: Declare all userspace-visible io types ioctl(TIOCGSERIAL|TIOCSSERIAL) report and can change the port->iotype. UART drivers use the UPIO_* definitions, but the uapi header defines parallel values and userspace uses these parallel values for ioctls; thus the userspace values are definitive. Define UPIO_* iotypes in terms of the uapi defines, SERIAL_IO_*; extend the uapi defines to include all values in use by the serial core. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 14 +++++++------- include/uapi/linux/serial.h | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 1094f2d9cadb..d10965f0d8a4 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -143,13 +143,13 @@ struct uart_port { unsigned char iotype; /* io access style */ unsigned char unused1; -#define UPIO_PORT (0) /* 8b I/O port access */ -#define UPIO_HUB6 (1) /* Hub6 ISA card */ -#define UPIO_MEM (2) /* 8b MMIO access */ -#define UPIO_MEM32 (3) /* 32b little endian */ -#define UPIO_AU (4) /* Au1x00 and RT288x type IO */ -#define UPIO_TSI (5) /* Tsi108/109 type IO */ -#define UPIO_MEM32BE (6) /* 32b big endian */ +#define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ +#define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ +#define UPIO_MEM (SERIAL_IO_MEM) /* 8b MMIO access */ +#define UPIO_MEM32 (SERIAL_IO_MEM32) /* 32b little endian */ +#define UPIO_AU (SERIAL_IO_AU) /* Au1x00 and RT288x type IO */ +#define UPIO_TSI (SERIAL_IO_TSI) /* Tsi108/109 type IO */ +#define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ diff --git a/include/uapi/linux/serial.h b/include/uapi/linux/serial.h index 5e0d0ed61cf3..25331f9faa76 100644 --- a/include/uapi/linux/serial.h +++ b/include/uapi/linux/serial.h @@ -65,6 +65,10 @@ struct serial_struct { #define SERIAL_IO_PORT 0 #define SERIAL_IO_HUB6 1 #define SERIAL_IO_MEM 2 +#define SERIAL_IO_MEM32 3 +#define SERIAL_IO_AU 4 +#define SERIAL_IO_TSI 5 +#define SERIAL_IO_MEM32BE 6 #define UART_CLEAR_FIFO 0x01 #define UART_USE_FIFO 0x02 -- cgit v1.2.3 From f54b97ed0b17d3da5f98ba8188cd5646415a922d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 6 Mar 2015 16:37:41 +0000 Subject: irqchip: gicv3-its: Allocate enough memory for the full range of DeviceID The ITS table allocator is only allocating a single page per table. This works fine for most things, but leads to silent lack of interrupt delivery if we end-up with a device that has an ID that is out of the range defined by a single page of memory. Even worse, depending on the page size, behaviour changes, which is not a very good experience. A solution is actually to allocate memory for the full range of ID that the ITS supports. A massive waste memory wise, but at least a safe bet. Tested on a Phytium SoC. Tested-by: Chen Baozi Acked-by: Chen Baozi Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/1425659870-11832-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-gic-v3-its.c | 25 +++++++++++++++++++++---- include/linux/irqchip/arm-gic-v3.h | 2 ++ 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index c217ebcf7a48..733b32fda390 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -806,14 +806,31 @@ static int its_alloc_tables(struct its_node *its) u64 val = readq_relaxed(its->base + GITS_BASER + i * 8); u64 type = GITS_BASER_TYPE(val); u64 entry_size = GITS_BASER_ENTRY_SIZE(val); + int order = 0; + int alloc_size; u64 tmp; void *base; if (type == GITS_BASER_TYPE_NONE) continue; - /* We're lazy and only allocate a single page for now */ - base = (void *)get_zeroed_page(GFP_KERNEL); + /* + * Allocate as many entries as required to fit the + * range of device IDs that the ITS can grok... The ID + * space being incredibly sparse, this results in a + * massive waste of memory. + * + * For other tables, only allocate a single page. + */ + if (type == GITS_BASER_TYPE_DEVICE) { + u64 typer = readq_relaxed(its->base + GITS_TYPER); + u32 ids = GITS_TYPER_DEVBITS(typer); + + order = get_order((1UL << ids) * entry_size); + } + + alloc_size = (1 << order) * PAGE_SIZE; + base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); if (!base) { err = -ENOMEM; goto out_free; @@ -841,7 +858,7 @@ retry_baser: break; } - val |= (PAGE_SIZE / psz) - 1; + val |= (alloc_size / psz) - 1; writeq_relaxed(val, its->base + GITS_BASER + i * 8); tmp = readq_relaxed(its->base + GITS_BASER + i * 8); @@ -882,7 +899,7 @@ retry_baser: } pr_info("ITS: allocated %d %s @%lx (psz %dK, shr %d)\n", - (int)(PAGE_SIZE / entry_size), + (int)(alloc_size / entry_size), its_base_type_string[type], (unsigned long)virt_to_phys(base), psz / SZ_1K, (int)shr >> GITS_BASER_SHAREABILITY_SHIFT); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 800544bc7bfd..cbdd440d486d 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -166,6 +166,8 @@ #define GITS_TRANSLATER 0x10040 +#define GITS_TYPER_DEVBITS_SHIFT 13 +#define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) #define GITS_TYPER_PTA (1UL << 19) #define GITS_CBASER_VALID (1UL << 63) -- cgit v1.2.3 From 7cb991164a46992a499ecdc77b17f8ac94bdb75f Mon Sep 17 00:00:00 2001 From: Yun Wu Date: Fri, 6 Mar 2015 16:37:49 +0000 Subject: irqchip: gicv3-its: Define macros for GITS_CTLR fields Define macros for GITS_CTLR fields to avoid using magic numbers. Acked-by: Marc Zyngier Signed-off-by: Yun Wu Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/1425659870-11832-11-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-gic-v3-its.c | 2 +- include/linux/irqchip/arm-gic-v3.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index ec20d4a942e0..826da706be4b 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1388,7 +1388,7 @@ static int its_probe(struct device_node *node, struct irq_domain *parent) writeq_relaxed(baser, its->base + GITS_CBASER); tmp = readq_relaxed(its->base + GITS_CBASER); writeq_relaxed(0, its->base + GITS_CWRITER); - writel_relaxed(1, its->base + GITS_CTLR); + writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); if ((tmp ^ baser) & GITS_BASER_SHAREABILITY_MASK) { pr_info("ITS: using cache flushing for cmd queue\n"); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index cbdd440d486d..781974afff9f 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -166,6 +166,9 @@ #define GITS_TRANSLATER 0x10040 +#define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_QUIESCENT (1U << 31) + #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) #define GITS_TYPER_PTA (1UL << 19) -- cgit v1.2.3 From 3d3801effda19b21012b5d1981e96cc277df85fd Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Wed, 25 Feb 2015 09:11:01 -0800 Subject: clk: introduce clk_is_match Some drivers compare struct clk pointers as a means of knowing if the two pointers reference the same clock hardware. This behavior is dubious (drivers must not dereference struct clk), but did not cause any regressions until the per-user struct clk patch was merged. Now the test for matching clk's will always fail with per-user struct clk's. clk_is_match is introduced to fix the regression and prevent drivers from comparing the pointers manually. Fixes: 035a61c314eb ("clk: Make clk API return per-user struct clk instances") Cc: Russell King Cc: Shawn Guo Cc: Tomeu Vizoso Signed-off-by: Michael Turquette [arnd@arndb.de: Fix COMMON_CLK=N && HAS_CLK=Y config] Signed-off-by: Arnd Bergmann [sboyd@codeaurora.org: const arguments to clk_is_match() and remove unnecessary ternary operation] Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 26 ++++++++++++++++++++++++++ include/linux/clk.h | 18 ++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index b9f85fc2ce3f..237f23f68bfc 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -2169,6 +2169,32 @@ int clk_get_phase(struct clk *clk) return clk_core_get_phase(clk->core); } +/** + * clk_is_match - check if two clk's point to the same hardware clock + * @p: clk compared against q + * @q: clk compared against p + * + * Returns true if the two struct clk pointers both point to the same hardware + * clock node. Put differently, returns true if struct clk *p and struct clk *q + * share the same struct clk_core object. + * + * Returns false otherwise. Note that two NULL clks are treated as matching. + */ +bool clk_is_match(const struct clk *p, const struct clk *q) +{ + /* trivial case: identical struct clk's or both NULL */ + if (p == q) + return true; + + /* true if clk->core pointers match. Avoid derefing garbage */ + if (!IS_ERR_OR_NULL(p) && !IS_ERR_OR_NULL(q)) + if (p->core == q->core) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(clk_is_match); + /** * __clk_init - initialize the data structures in a struct clk * @dev: device initializing this clk, placeholder for now diff --git a/include/linux/clk.h b/include/linux/clk.h index 8381bbfbc308..68c16a6bedb3 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -125,6 +125,19 @@ int clk_set_phase(struct clk *clk, int degrees); */ int clk_get_phase(struct clk *clk); +/** + * clk_is_match - check if two clk's point to the same hardware clock + * @p: clk compared against q + * @q: clk compared against p + * + * Returns true if the two struct clk pointers both point to the same hardware + * clock node. Put differently, returns true if struct clk *p and struct clk *q + * share the same struct clk_core object. + * + * Returns false otherwise. Note that two NULL clks are treated as matching. + */ +bool clk_is_match(const struct clk *p, const struct clk *q); + #else static inline long clk_get_accuracy(struct clk *clk) @@ -142,6 +155,11 @@ static inline long clk_get_phase(struct clk *clk) return -ENOTSUPP; } +static inline bool clk_is_match(const struct clk *p, const struct clk *q) +{ + return p == q; +} + #endif /** -- cgit v1.2.3 From c29390c6dfeee0944ac6b5610ebbe403944378fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Mar 2015 18:42:02 -0700 Subject: xps: must clear sender_cpu before forwarding John reported that my previous commit added a regression on his router. This is because sender_cpu & napi_id share a common location, so get_xps_queue() can see garbage and perform an out of bound access. We need to make sure sender_cpu is cleared before doing the transmit, otherwise any NIC busy poll enabled (skb_mark_napi_id()) can trigger this bug. Signed-off-by: Eric Dumazet Reported-by: John Bisected-by: John Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices") Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++++ net/core/skbuff.c | 2 +- net/ipv4/ip_forward.c | 1 + net/ipv6/ip6_output.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 30007afe70b3..f54d6659713a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -948,6 +948,13 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline void skb_sender_cpu_clear(struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + skb->sender_cpu = 0; +#endif +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f80507823531..434e78e5254d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4173,7 +4173,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; - skb->sender_cpu = 0; + skb_sender_cpu_clear(skb); skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 787b3c294ce6..d9bc28ac5d1b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,6 +67,7 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + skb_sender_cpu_clear(skb); return dst_output(skb); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0a04a37305d5..7e80b61b51ff 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -318,6 +318,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct sk_buff *skb) { + skb_sender_cpu_clear(skb); return dst_output(skb); } -- cgit v1.2.3 From a5af5aa8b67dfdba36c853b70564fd2dfe73d478 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 12 Mar 2015 16:26:11 -0700 Subject: kasan, module, vmalloc: rework shadow allocation for modules Current approach in handling shadow memory for modules is broken. Shadow memory could be freed only after memory shadow corresponds it is no longer used. vfree() called from interrupt context could use memory its freeing to store 'struct llist_node' in it: void vfree(const void *addr) { ... if (unlikely(in_interrupt())) { struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); Later this list node used in free_work() which actually frees memory. Currently module_memfree() called in interrupt context will free shadow before freeing module's memory which could provoke kernel crash. So shadow memory should be freed after module's memory. However, such deallocation order could race with kasan_module_alloc() in module_alloc(). Free shadow right before releasing vm area. At this point vfree()'d memory is not used anymore and yet not available for other allocations. New VM_KASAN flag used to indicate that vm area has dynamically allocated shadow memory so kasan frees shadow only if it was previously allocated. Signed-off-by: Andrey Ryabinin Acked-by: Rusty Russell Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 5 +++-- include/linux/vmalloc.h | 1 + kernel/module.c | 2 -- mm/kasan/kasan.c | 14 +++++++++++--- mm/vmalloc.c | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 72ba725ddf9c..5fa48a21d73e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -5,6 +5,7 @@ struct kmem_cache; struct page; +struct vm_struct; #ifdef CONFIG_KASAN @@ -52,7 +53,7 @@ void kasan_slab_free(struct kmem_cache *s, void *object); #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) int kasan_module_alloc(void *addr, size_t size); -void kasan_module_free(void *addr); +void kasan_free_shadow(const struct vm_struct *vm); #else /* CONFIG_KASAN */ @@ -82,7 +83,7 @@ static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_module_free(void *addr) {} +static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* CONFIG_KASAN */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 7d7acb35603d..0ec598381f97 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -17,6 +17,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ /* bits [20..32] reserved for arch specific ioremap internals */ /* diff --git a/kernel/module.c b/kernel/module.c index cc93cf68653c..b3d634ed06c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { } void __weak module_memfree(void *module_region) { vfree(module_region); - kasan_module_free(module_region); } void __weak module_arch_cleanup(struct module *mod) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 78fee632a7ee..936d81661c47 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "kasan.h" @@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size) GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); - return ret ? 0 : -ENOMEM; + + if (ret) { + find_vm_area(addr)->flags |= VM_KASAN; + return 0; + } + + return -ENOMEM; } -void kasan_module_free(void *addr) +void kasan_free_shadow(const struct vm_struct *vm) { - vfree(kasan_mem_to_shadow(addr)); + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); } static void register_global(struct kasan_global *global) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35b25e1340ca..49abccf29a29 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); + kasan_free_shadow(vm); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; -- cgit v1.2.3 From d3733e5c98e952d419e77fa721912f09d15a2806 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 12 Mar 2015 16:26:14 -0700 Subject: kasan, module: move MODULE_ALIGN macro into include/linux/moduleloader.h is more suitable place for this macro. Also change alignment to PAGE_SIZE for CONFIG_KASAN=n as such alignment already assumed in several places. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ---- include/linux/moduleloader.h | 8 ++++++++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5fa48a21d73e..5bb074431eb0 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -50,15 +50,11 @@ void kasan_krealloc(const void *object, size_t new_size); void kasan_slab_alloc(struct kmem_cache *s, void *object); void kasan_slab_free(struct kmem_cache *s, void *object); -#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) - int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); #else /* CONFIG_KASAN */ -#define MODULE_ALIGN 1 - static inline void kasan_unpoison_shadow(const void *address, size_t size) {} static inline void kasan_enable_current(void) {} diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index f7556261fe3c..4d0cb9bba93e 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -84,4 +84,12 @@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod); + +#ifdef CONFIG_KASAN +#include +#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) +#else +#define MODULE_ALIGN PAGE_SIZE +#endif + #endif -- cgit v1.2.3 From a697c2efba03ac7bfdbffbba7f0f1aa294f7dee0 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 10 Mar 2015 20:31:04 -0700 Subject: of/platform: Fix sparc:allmodconfig build sparc:allmodconfig fails to build with: drivers/built-in.o: In function `platform_bus_init': (.init.text+0x3684): undefined reference to `of_platform_register_reconfig_notifier' of_platform_register_reconfig_notifier is only declared if both OF_ADDRESS and OF_DYNAMIC are configured. Yet, the include file only declares a dummy function if OF_DYNAMIC is not configured. The sparc architecture does not configure OF_ADDRESS, but does configure OF_DYNAMIC, causing above error. Fixes: 801d728c10db ("of/reconfig: Add OF_DYNAMIC notifier for platform_bus_type") Cc: Pantelis Antoniou Signed-off-by: Guenter Roeck Signed-off-by: Rob Herring --- include/linux/of_platform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 8a860f096c35..611a691145c4 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -84,7 +84,7 @@ static inline int of_platform_populate(struct device_node *root, static inline void of_platform_depopulate(struct device *parent) { } #endif -#ifdef CONFIG_OF_DYNAMIC +#if defined(CONFIG_OF_DYNAMIC) && defined(CONFIG_OF_ADDRESS) extern void of_platform_register_reconfig_notifier(void); #else static inline void of_platform_register_reconfig_notifier(void) { } -- cgit v1.2.3 From 8cb2c2dc472775479a1a7e78180955f6f1cb0b0a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 12 Mar 2015 12:55:13 +0100 Subject: livepatch: Fix subtle race with coming and going modules There is a notifier that handles live patches for coming and going modules. It takes klp_mutex lock to avoid races with coming and going patches but it does not keep the lock all the time. Therefore the following races are possible: 1. The notifier is called sometime in STATE_MODULE_COMING. The module is visible by find_module() in this state all the time. It means that new patch can be registered and enabled even before the notifier is called. It might create wrong order of stacked patches, see below for an example. 2. New patch could still see the module in the GOING state even after the notifier has been called. It will try to initialize the related object structures but the module could disappear at any time. There will stay mess in the structures. It might even cause an invalid memory access. This patch solves the problem by adding a boolean variable into struct module. The value is true after the coming and before the going handler is called. New patches need to be applied when the value is true and they need to ignore the module when the value is false. Note that we need to know state of all modules on the system. The races are related to new patches. Therefore we do not know what modules will get patched. Also note that we could not simply ignore going modules. The code from the module could be called even in the GOING state until mod->exit() finishes. If we start supporting patches with semantic changes between function calls, we need to apply new patches to any still usable code. See below for an example. Finally note that the patch solves only the situation when a new patch is registered. There are no such problems when the patch is being removed. It does not matter who disable the patch first, whether the normal disable_patch() or the module notifier. There is nothing to do once the patch is disabled. Alternative solutions: ====================== + reject new patches when a patched module is coming or going; this is ugly + wait with adding new patch until the module leaves the COMING and GOING states; this might be dangerous and complicated; we would need to release kgr_lock in the middle of the patch registration to avoid a deadlock with the coming and going handlers; also we might need a waitqueue for each module which seems to be even bigger overhead than the boolean + stop modules from entering COMING and GOING states; wait until modules leave these states when they are already there; looks complicated; we would need to ignore the module that asked to stop the others to avoid a deadlock; also it is unclear what to do when two modules asked to stop others and both are in COMING state (situation when two new patches are applied) + always register/enable new patches and fix up the potential mess (registered patches order) in klp_module_init(); this is nasty and prone to regressions in the future development + add another MODULE_STATE where the kallsyms are visible but the module is not used yet; this looks too complex; the module states are checked on "many" locations Example of patch stacking breakage: =================================== The notifier could _not_ _simply_ ignore already initialized module objects. For example, let's have three patches (P1, P2, P3) for functions a() and b() where a() is from vmcore and b() is from a module M. Something like: a() b() P1 a1() b1() P2 a2() b2() P3 a3() b3(3) If you load the module M after all patches are registered and enabled. The ftrace ops for function a() and b() has listed the functions in this order: ops_a->func_stack -> list(a3,a2,a1) ops_b->func_stack -> list(b3,b2,b1) , so the pointer to b3() is the first and will be used. Then you might have the following scenario. Let's start with state when patches P1 and P2 are registered and enabled but the module M is not loaded. Then ftrace ops for b() does not exist. Then we get into the following race: CPU0 CPU1 load_module(M) complete_formation() mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); klp_register_patch(P3); klp_enable_patch(P3); # STATE 1 klp_module_notify(M) klp_module_notify_coming(P1); klp_module_notify_coming(P2); klp_module_notify_coming(P3); # STATE 2 The ftrace ops for a() and b() then looks: STATE1: ops_a->func_stack -> list(a3,a2,a1); ops_b->func_stack -> list(b3); STATE2: ops_a->func_stack -> list(a3,a2,a1); ops_b->func_stack -> list(b2,b1,b3); therefore, b2() is used for the module but a3() is used for vmcore because they were the last added. Example of the race with going modules: ======================================= CPU0 CPU1 delete_module() #SYSCALL try_stop_module() mod->state = MODULE_STATE_GOING; mutex_unlock(&module_mutex); klp_register_patch() klp_enable_patch() #save place to switch universe b() # from module that is going a() # from core (patched) mod->exit(); Note that the function b() can be called until we call mod->exit(). If we do not apply patch against b() because it is in MODULE_STATE_GOING, it will call patched a() with modified semantic and things might get wrong. [jpoimboe@redhat.com: use one boolean instead of two] Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- include/linux/module.h | 4 ++++ kernel/livepatch/core.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index b653d7c0a05a..7232fde6a991 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -344,6 +344,10 @@ struct module { unsigned long *ftrace_callsites; #endif +#ifdef CONFIG_LIVEPATCH + bool klp_alive; +#endif + #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 01ca08804f51..3f9f1d6b4c2e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -89,16 +89,28 @@ static bool klp_is_object_loaded(struct klp_object *obj) /* sets obj->mod if object is not vmlinux and module is found */ static void klp_find_object_module(struct klp_object *obj) { + struct module *mod; + if (!klp_is_module(obj)) return; mutex_lock(&module_mutex); /* - * We don't need to take a reference on the module here because we have - * the klp_mutex, which is also taken by the module notifier. This - * prevents any module from unloading until we release the klp_mutex. + * We do not want to block removal of patched modules and therefore + * we do not take a reference here. The patches are removed by + * a going module handler instead. + */ + mod = find_module(obj->name); + /* + * Do not mess work of the module coming and going notifiers. + * Note that the patch might still be needed before the going handler + * is called. Module functions can be called even in the GOING state + * until mod->exit() finishes. This is especially important for + * patches that modify semantic of the functions. */ - obj->mod = find_module(obj->name); + if (mod && mod->klp_alive) + obj->mod = mod; + mutex_unlock(&module_mutex); } @@ -767,6 +779,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) return -EINVAL; obj->state = KLP_DISABLED; + obj->mod = NULL; klp_find_object_module(obj); @@ -961,6 +974,15 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action, mutex_lock(&klp_mutex); + /* + * Each module has to know that the notifier has been called. + * We never know what module will get patched by a new patch. + */ + if (action == MODULE_STATE_COMING) + mod->klp_alive = true; + else /* MODULE_STATE_GOING */ + mod->klp_alive = false; + list_for_each_entry(patch, &klp_patches, list) { for (obj = patch->objs; obj->funcs; obj++) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) -- cgit v1.2.3 From e03826d5045e81a66a4fad7be9a8ecdaeb7911cf Mon Sep 17 00:00:00 2001 From: Keerthy Date: Tue, 17 Mar 2015 15:56:04 +0530 Subject: regulator: palmas: Correct TPS659038 register definition for REGEN2 The register offset for REGEN2_CTRL in different for TPS659038 chip as when compared with other Palmas family PMICs. In the case of TPS659038 the wrong offset pointed to PLLEN_CTRL and was causing a hang. Correcting the same. Signed-off-by: Keerthy Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/regulator/palmas-regulator.c | 4 ++++ include/linux/mfd/palmas.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/palmas-regulator.c b/drivers/regulator/palmas-regulator.c index 9205f433573c..18198316b6cf 100644 --- a/drivers/regulator/palmas-regulator.c +++ b/drivers/regulator/palmas-regulator.c @@ -1572,6 +1572,10 @@ static int palmas_regulators_probe(struct platform_device *pdev) if (!pmic) return -ENOMEM; + if (of_device_is_compatible(node, "ti,tps659038-pmic")) + palmas_generic_regs_info[PALMAS_REG_REGEN2].ctrl_addr = + TPS659038_REGEN2_CTRL; + pmic->dev = &pdev->dev; pmic->palmas = palmas; palmas->pmic = pmic; diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h index fb0390a1a498..ee7b1ce7a6f8 100644 --- a/include/linux/mfd/palmas.h +++ b/include/linux/mfd/palmas.h @@ -2999,6 +2999,9 @@ enum usb_irq_events { #define PALMAS_GPADC_TRIM15 0x0E #define PALMAS_GPADC_TRIM16 0x0F +/* TPS659038 regen2_ctrl offset iss different from palmas */ +#define TPS659038_REGEN2_CTRL 0x12 + /* TPS65917 Interrupt registers */ /* Registers for function INTERRUPT */ -- cgit v1.2.3 From ad41faa88e39af451427c921a0f8b441e104b6fa Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 17 Mar 2015 11:16:00 +0100 Subject: netdevice.h: fix ndo_bridge_* comments The argument 'flags' was missing in ndo_bridge_setlink(). ndo_bridge_dellink() was missing. Fixes: 407af3299ef1 ("bridge: Add netlink interface to configure vlans on bridge ports") Fixes: add511b38266 ("bridge: add flags argument to ndo_bridge_setlink and ndo_bridge_dellink") CC: Vlad Yasevich CC: Roopa Prabhu Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 429d1790a27e..dcf6ec27739b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -965,9 +965,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. * - * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh) + * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, + * u16 flags) * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, * struct net_device *dev, u32 filter_mask) + * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, + * u16 flags); * * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); * Called to change device carrier. Soft-devices (like dummy, team, etc) -- cgit v1.2.3 From cf39284d41f67964cf42b21bb386c012cf5b7f65 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Wed, 18 Mar 2015 08:57:41 +0800 Subject: regulator: Fix documentation for regmap in the config dev_get_regulator() does not exist, fix the typo. Signed-off-by: Axel Lin Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index d4ad5b5a02bb..045f709cb89b 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -316,7 +316,7 @@ struct regulator_desc { * @driver_data: private regulator data * @of_node: OpenFirmware node to parse for device tree bindings (may be * NULL). - * @regmap: regmap to use for core regmap helpers if dev_get_regulator() is + * @regmap: regmap to use for core regmap helpers if dev_get_regmap() is * insufficient. * @ena_gpio_initialized: GPIO controlling regulator enable was properly * initialized, meaning that >= 0 is a valid gpio -- cgit v1.2.3 From 5067c0469c643512f24786990e315f9c15cc7d24 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Mar 2015 10:32:18 -0700 Subject: ata: Add a new flag to destinguish sas controller SAS controller has its own tag allocation, which doesn't directly match to ATA tag, so SAS and SATA have different code path for ata tags. Originally we use port->scsi_host (98bd4be1) to destinguish SAS controller, but libsas set ->scsi_host too, so we can't use it for the destinguish, we add a new flag for this purpose. Without this patch, the following oops can happen because scsi-mq uses a host-wide tag map shared among all devices with some integer tag values >= ATA_MAX_QUEUE. These unexpectedly high tag values cause __ata_qc_from_tag() to return NULL, which is then dereferenced in ata_qc_new_init(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 IP: [] ata_qc_new_init+0x3e/0x120 PGD 32adf0067 PUD 32adf1067 PMD 0 Oops: 0002 [#1] SMP DEBUG_PAGEALLOC Modules linked in: iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi igb i2c_algo_bit ptp pps_core pm80xx libsas scsi_transport_sas sg coretemp eeprom w83795 i2c_i801 CPU: 4 PID: 1450 Comm: cydiskbench Not tainted 4.0.0-rc3 #1 Hardware name: Supermicro X8DTH-i/6/iF/6F/X8DTH, BIOS 2.1b 05/04/12 task: ffff8800ba86d500 ti: ffff88032a064000 task.ti: ffff88032a064000 RIP: 0010:[] [] ata_qc_new_init+0x3e/0x120 RSP: 0018:ffff88032a067858 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff8800ba0d2230 RCX: 000000000000002a RDX: ffffffff80505ae0 RSI: 0000000000000020 RDI: ffff8800ba0d2230 RBP: ffff88032a067868 R08: 0000000000000201 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800ba0d0000 R13: ffff8800ba0d2230 R14: ffffffff80505ae0 R15: ffff8800ba0d0000 FS: 0000000041223950(0063) GS:ffff88033e480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000058 CR3: 000000032a0a3000 CR4: 00000000000006e0 Stack: ffff880329eee758 ffff880329eee758 ffff88032a0678a8 ffffffff80502dad ffff8800ba167978 ffff880329eee758 ffff88032bf9c520 ffff8800ba167978 ffff88032bf9c520 ffff88032bf9a290 ffff88032a0678b8 ffffffff80506909 Call Trace: [] ata_scsi_translate+0x3d/0x1b0 [] ata_sas_queuecmd+0x149/0x2a0 [] sas_queuecommand+0xa0/0x1f0 [libsas] [] scsi_dispatch_cmd+0xd4/0x1a0 [] scsi_queue_rq+0x66f/0x7f0 [] __blk_mq_run_hw_queue+0x208/0x3f0 [] blk_mq_run_hw_queue+0x88/0xc0 [] blk_mq_insert_request+0xc4/0x130 [] blk_execute_rq_nowait+0x73/0x160 [] sg_common_write+0x3da/0x720 [sg] [] sg_new_write+0x250/0x360 [sg] [] sg_write+0x13b/0x450 [sg] [] vfs_write+0xd1/0x1b0 [] SyS_write+0x54/0xc0 [] system_call_fastpath+0x12/0x17 tj: updated description. Fixes: 12cb5ce101ab ("libata: use blk taging") Reported-and-tested-by: Tony Battersby Signed-off-by: Shaohua Li Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 4 ++-- drivers/scsi/ipr.c | 3 ++- drivers/scsi/libsas/sas_ata.c | 3 ++- include/linux/libata.h | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4c35f0822d06..ef150ebb4c30 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4737,7 +4737,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) return NULL; /* libsas case */ - if (!ap->scsi_host) { + if (ap->flags & ATA_FLAG_SAS_HOST) { tag = ata_sas_allocate_tag(ap); if (tag < 0) return NULL; @@ -4776,7 +4776,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - if (!ap->scsi_host) + if (ap->flags & ATA_FLAG_SAS_HOST) ata_sas_free_tag(tag, ap); } } diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 9219953ee949..d9afc51af7d3 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -6815,7 +6815,8 @@ static struct ata_port_operations ipr_sata_ops = { }; static struct ata_port_info sata_port_info = { - .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA, + .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | + ATA_FLAG_SAS_HOST, .pio_mask = ATA_PIO4_ONLY, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA6, diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 932d9cc98d2f..9c706d8c1441 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -547,7 +547,8 @@ static struct ata_port_operations sas_sata_ops = { }; static struct ata_port_info sata_port_info = { - .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ, + .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ | + ATA_FLAG_SAS_HOST, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA6, diff --git a/include/linux/libata.h b/include/linux/libata.h index fc03efa64ffe..6b08cc106c21 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -232,6 +232,7 @@ enum { * led */ ATA_FLAG_NO_DIPM = (1 << 23), /* host not happy with DIPM */ ATA_FLAG_LOWTAG = (1 << 24), /* host wants lowest available tag */ + ATA_FLAG_SAS_HOST = (1 << 25), /* SAS host */ /* bits 24:31 of ap->flags are reserved for LLD specific flags */ -- cgit v1.2.3 From 074c238177a75f5e79af3b2cb6a84e54823ef950 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:42 -0700 Subject: mm: numa: slow PTE scan rate if migration failures occur Dave Chinner reported the following on https://lkml.org/lkml/2015/3/1/226 Across the board the 4.0-rc1 numbers are much slower, and the degradation is far worse when using the large memory footprint configs. Perf points straight at the cause - this is from 4.0-rc1 on the "-o bhash=101073" config: - 56.07% 56.07% [kernel] [k] default_send_IPI_mask_sequence_phys - default_send_IPI_mask_sequence_phys - 99.99% physflat_send_IPI_mask - 99.37% native_send_call_func_ipi smp_call_function_many - native_flush_tlb_others - 99.85% flush_tlb_page ptep_clear_flush try_to_unmap_one rmap_walk try_to_unmap migrate_pages migrate_misplaced_page - handle_mm_fault - 99.73% __do_page_fault trace_do_page_fault do_async_page_fault + async_page_fault 0.63% native_send_call_func_single_ipi generic_exec_single smp_call_function_single This is showing excessive migration activity even though excessive migrations are meant to get throttled. Normally, the scan rate is tuned on a per-task basis depending on the locality of faults. However, if migrations fail for any reason then the PTE scanner may scan faster if the faults continue to be remote. This means there is higher system CPU overhead and fault trapping at exactly the time we know that migrations cannot happen. This patch tracks when migration failures occur and slows the PTE scanner. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 +++++---- kernel/sched/fair.c | 8 ++++++-- mm/huge_memory.c | 3 ++- mm/memory.c | 3 ++- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..a419b65770d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1625,11 +1625,11 @@ struct task_struct { /* * numa_faults_locality tracks if faults recorded during the last - * scan window were remote/local. The task scan period is adapted - * based on the locality of the faults with different weights - * depending on whether they were shared or private faults + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults */ - unsigned long numa_faults_locality[2]; + unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1719,6 +1719,7 @@ struct task_struct { #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 #define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..bcfe32088b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0a42d1521aa4..51b3e7c64622 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } + } else + flags |= TNF_MIGRATE_FAIL; goto out; clear_pmdnuma: diff --git a/mm/memory.c b/mm/memory.c index d20e12da3a3c..97839f5c8c30 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; - } + } else + flags |= TNF_MIGRATE_FAIL; out: if (page_nid != -1) -- cgit v1.2.3