diff options
author | David S. Miller <davem@davemloft.net> | 2015-02-07 02:18:39 +0300 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-02-07 02:18:39 +0300 |
commit | 41e8b206f8df6daf90e132a73fc2c60f51a36c6b (patch) | |
tree | 709001bcb9373ca4d622f70d4c48095aea4d1a61 | |
parent | 2ca292d968ef20cb04f31192d1f626bd8d782960 (diff) | |
parent | cf52d52f9ccb9966ac019d9f79824195583e3e6c (diff) | |
download | linux-41e8b206f8df6daf90e132a73fc2c60f51a36c6b.tar.xz |
Merge branch 'rhashtable-next'
Thomas Graf says:
====================
rhashtable fixes
This series fixes all remaining known issues with rhashtable that
have been reported. In particular the race condition reported by
Ying Xue.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | lib/rhashtable.c | 305 |
1 files changed, 167 insertions, 138 deletions
diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 057919164e23..5919d63f58e4 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1,7 +1,7 @@ /* * Resizable, Scalable, Concurrent Hash Table * - * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch> + * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Based on the following paper: @@ -34,12 +34,17 @@ enum { RHT_LOCK_NORMAL, RHT_LOCK_NESTED, - RHT_LOCK_NESTED2, }; /* The bucket lock is selected based on the hash and protects mutations * on a group of hash buckets. * + * A maximum of tbl->size/2 bucket locks is allocated. This ensures that + * a single lock always covers both buckets which may both contains + * entries which link to the same bucket of the old table during resizing. + * This allows to simplify the locking as locking the bucket in both + * tables during resize always guarantee protection. + * * IMPORTANT: When holding the bucket lock of both the old and new table * during expansions and shrinking, the old bucket lock must always be * acquired first. @@ -49,26 +54,6 @@ static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) return &tbl->locks[hash & tbl->locks_mask]; } -#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) -#define ASSERT_BUCKET_LOCK(TBL, HASH) \ - BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH)) - -#ifdef CONFIG_PROVE_LOCKING -int lockdep_rht_mutex_is_held(struct rhashtable *ht) -{ - return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; -} -EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); - -int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) -{ - spinlock_t *lock = bucket_lock(tbl, hash); - - return (debug_locks) ? lockdep_is_held(lock) : 1; -} -EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); -#endif - static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (void *) he - ht->p.head_offset; @@ -94,13 +79,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) { - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - u32 hash; - - hash = ht->p.hashfn(key, len, ht->p.hash_rnd); - hash >>= HASH_RESERVED_SPACE; - - return rht_bucket_index(tbl, hash); + return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE; } static u32 head_hashfn(const struct rhashtable *ht, @@ -110,6 +89,77 @@ static u32 head_hashfn(const struct rhashtable *ht, return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); } +#ifdef CONFIG_PROVE_LOCKING +static void debug_dump_buckets(const struct rhashtable *ht, + const struct bucket_table *tbl) +{ + struct rhash_head *he; + unsigned int i, hash; + + for (i = 0; i < tbl->size; i++) { + pr_warn(" [Bucket %d] ", i); + rht_for_each_rcu(he, tbl, i) { + hash = head_hashfn(ht, tbl, he); + pr_cont("[hash = %#x, lock = %p] ", + hash, bucket_lock(tbl, hash)); + } + pr_cont("\n"); + } + +} + +static void debug_dump_table(struct rhashtable *ht, + const struct bucket_table *tbl, + unsigned int hash) +{ + struct bucket_table *old_tbl, *future_tbl; + + pr_emerg("BUG: lock for hash %#x in table %p not held\n", + hash, tbl); + + rcu_read_lock(); + future_tbl = rht_dereference_rcu(ht->future_tbl, ht); + old_tbl = rht_dereference_rcu(ht->tbl, ht); + if (future_tbl != old_tbl) { + pr_warn("Future table %p (size: %zd)\n", + future_tbl, future_tbl->size); + debug_dump_buckets(ht, future_tbl); + } + + pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size); + debug_dump_buckets(ht, old_tbl); + + rcu_read_unlock(); +} + +#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) +#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) \ + do { \ + if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) { \ + debug_dump_table(HT, TBL, HASH); \ + BUG(); \ + } \ + } while (0) + +int lockdep_rht_mutex_is_held(struct rhashtable *ht) +{ + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); + +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + spinlock_t *lock = bucket_lock(tbl, hash); + + return (debug_locks) ? lockdep_is_held(lock) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); +#else +#define ASSERT_RHT_MUTEX(HT) +#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) +#endif + + static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) { struct rhash_head __rcu **pprev; @@ -134,8 +184,8 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); - /* Never allocate more than one lock per bucket */ - size = min_t(unsigned int, size, tbl->size); + /* Never allocate more than 0.5 locks per bucket */ + size = min_t(unsigned int, size, tbl->size >> 1); if (sizeof(spinlock_t) != 0) { #ifdef CONFIG_NUMA @@ -217,25 +267,48 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) } EXPORT_SYMBOL_GPL(rht_shrink_below_30); -static void hashtable_chain_unzip(const struct rhashtable *ht, +static void lock_buckets(struct bucket_table *new_tbl, + struct bucket_table *old_tbl, unsigned int hash) + __acquires(old_bucket_lock) +{ + spin_lock_bh(bucket_lock(old_tbl, hash)); + if (new_tbl != old_tbl) + spin_lock_bh_nested(bucket_lock(new_tbl, hash), + RHT_LOCK_NESTED); +} + +static void unlock_buckets(struct bucket_table *new_tbl, + struct bucket_table *old_tbl, unsigned int hash) + __releases(old_bucket_lock) +{ + if (new_tbl != old_tbl) + spin_unlock_bh(bucket_lock(new_tbl, hash)); + spin_unlock_bh(bucket_lock(old_tbl, hash)); +} + +/** + * Unlink entries on bucket which hash to different bucket. + * + * Returns true if no more work needs to be performed on the bucket. + */ +static bool hashtable_chain_unzip(struct rhashtable *ht, const struct bucket_table *new_tbl, struct bucket_table *old_tbl, size_t old_hash) { struct rhash_head *he, *p, *next; - spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL; unsigned int new_hash, new_hash2; - ASSERT_BUCKET_LOCK(old_tbl, old_hash); + ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash); /* Old bucket empty, no work needed. */ p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, old_hash); if (rht_is_a_nulls(p)) - return; + return false; - new_hash = new_hash2 = head_hashfn(ht, new_tbl, p); - new_bucket_lock = bucket_lock(new_tbl, new_hash); + new_hash = head_hashfn(ht, new_tbl, p); + ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); /* Advance the old bucket pointer one or more times until it * reaches a node that doesn't hash to the same bucket as the @@ -243,22 +316,14 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, */ rht_for_each_continue(he, p->next, old_tbl, old_hash) { new_hash2 = head_hashfn(ht, new_tbl, he); + ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2); + if (new_hash != new_hash2) break; p = he; } rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); - spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); - - /* If we have encountered an entry that maps to a different bucket in - * the new table, lock down that bucket as well as we might cut off - * the end of the chain. - */ - new_bucket_lock2 = bucket_lock(new_tbl, new_hash); - if (new_bucket_lock != new_bucket_lock2) - spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2); - /* Find the subsequent node which does hash to the same * bucket as node P, or NULL if no such node exists. */ @@ -277,21 +342,18 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, */ rcu_assign_pointer(p->next, next); - if (new_bucket_lock != new_bucket_lock2) - spin_unlock_bh(new_bucket_lock2); - spin_unlock_bh(new_bucket_lock); + p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, + old_hash); + + return !rht_is_a_nulls(p); } -static void link_old_to_new(struct bucket_table *new_tbl, +static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, unsigned int new_hash, struct rhash_head *entry) { - spinlock_t *new_bucket_lock; + ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); - new_bucket_lock = bucket_lock(new_tbl, new_hash); - - spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); - spin_unlock_bh(new_bucket_lock); } /** @@ -314,7 +376,6 @@ int rhashtable_expand(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_head *he; - spinlock_t *old_bucket_lock; unsigned int new_hash, old_hash; bool complete = false; @@ -344,24 +405,16 @@ int rhashtable_expand(struct rhashtable *ht) */ for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { old_hash = rht_bucket_index(old_tbl, new_hash); - old_bucket_lock = bucket_lock(old_tbl, old_hash); - - spin_lock_bh(old_bucket_lock); + lock_buckets(new_tbl, old_tbl, new_hash); rht_for_each(he, old_tbl, old_hash) { if (head_hashfn(ht, new_tbl, he) == new_hash) { - link_old_to_new(new_tbl, new_hash, he); + link_old_to_new(ht, new_tbl, new_hash, he); break; } } - spin_unlock_bh(old_bucket_lock); + unlock_buckets(new_tbl, old_tbl, new_hash); } - /* Publish the new table pointer. Lookups may now traverse - * the new table, but they will not benefit from any - * additional efficiency until later steps unzip the buckets. - */ - rcu_assign_pointer(ht->tbl, new_tbl); - /* Unzip interleaved hash chains */ while (!complete && !ht->being_destroyed) { /* Wait for readers. All new readers will see the new @@ -376,21 +429,19 @@ int rhashtable_expand(struct rhashtable *ht) */ complete = true; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { - struct rhash_head *head; - - old_bucket_lock = bucket_lock(old_tbl, old_hash); - spin_lock_bh(old_bucket_lock); + lock_buckets(new_tbl, old_tbl, old_hash); - hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash); - head = rht_dereference_bucket(old_tbl->buckets[old_hash], - old_tbl, old_hash); - if (!rht_is_a_nulls(head)) + if (hashtable_chain_unzip(ht, new_tbl, old_tbl, + old_hash)) complete = false; - spin_unlock_bh(old_bucket_lock); + unlock_buckets(new_tbl, old_tbl, old_hash); } } + rcu_assign_pointer(ht->tbl, new_tbl); + synchronize_rcu(); + bucket_table_free(old_tbl); return 0; } @@ -415,7 +466,6 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); - spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2; unsigned int new_hash; ASSERT_RHT_MUTEX(ht); @@ -433,36 +483,17 @@ int rhashtable_shrink(struct rhashtable *ht) * always divide the size in half when shrinking, each bucket * in the new table maps to exactly two buckets in the old * table. - * - * As removals can occur concurrently on the old table, we need - * to lock down both matching buckets in the old table. */ for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { - old_bucket_lock1 = bucket_lock(tbl, new_hash); - old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size); - new_bucket_lock = bucket_lock(new_tbl, new_hash); - - spin_lock_bh(old_bucket_lock1); - - /* Depending on the lock per buckets mapping, the bucket in - * the lower and upper region may map to the same lock. - */ - if (old_bucket_lock1 != old_bucket_lock2) { - spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED); - spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2); - } else { - spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); - } + lock_buckets(new_tbl, tbl, new_hash); rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), tbl->buckets[new_hash]); + ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size); rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), tbl->buckets[new_hash + new_tbl->size]); - spin_unlock_bh(new_bucket_lock); - if (old_bucket_lock1 != old_bucket_lock2) - spin_unlock_bh(old_bucket_lock2); - spin_unlock_bh(old_bucket_lock1); + unlock_buckets(new_tbl, tbl, new_hash); } /* Publish the new, valid hash table */ @@ -524,6 +555,8 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + ASSERT_BUCKET_LOCK(ht, tbl, hash); + if (rht_is_a_nulls(head)) INIT_RHT_NULLS_HEAD(obj->next, ht, hash); else @@ -553,19 +586,18 @@ static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, */ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl; - spinlock_t *lock; + struct bucket_table *tbl, *old_tbl; unsigned hash; rcu_read_lock(); tbl = rht_dereference_rcu(ht->future_tbl, ht); + old_tbl = rht_dereference_rcu(ht->tbl, ht); hash = head_hashfn(ht, tbl, obj); - lock = bucket_lock(tbl, hash); - spin_lock_bh(lock); + lock_buckets(tbl, old_tbl, hash); __rhashtable_insert(ht, obj, tbl, hash); - spin_unlock_bh(lock); + unlock_buckets(tbl, old_tbl, hash); rcu_read_unlock(); } @@ -588,21 +620,20 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); */ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl; + struct bucket_table *tbl, *new_tbl, *old_tbl; struct rhash_head __rcu **pprev; - struct rhash_head *he; - spinlock_t *lock; - unsigned int hash; + struct rhash_head *he, *he2; + unsigned int hash, new_hash; bool ret = false; rcu_read_lock(); - tbl = rht_dereference_rcu(ht->tbl, ht); - hash = head_hashfn(ht, tbl, obj); - - lock = bucket_lock(tbl, hash); - spin_lock_bh(lock); + tbl = old_tbl = rht_dereference_rcu(ht->tbl, ht); + new_tbl = rht_dereference_rcu(ht->future_tbl, ht); + new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); + lock_buckets(new_tbl, old_tbl, new_hash); restart: + hash = rht_bucket_index(tbl, new_hash); pprev = &tbl->buckets[hash]; rht_for_each(he, tbl, hash) { if (he != obj) { @@ -610,8 +641,22 @@ restart: continue; } - rcu_assign_pointer(*pprev, obj->next); + ASSERT_BUCKET_LOCK(ht, tbl, hash); + + if (unlikely(new_tbl != tbl)) { + rht_for_each_continue(he2, he->next, tbl, hash) { + if (head_hashfn(ht, tbl, he2) == hash) { + rcu_assign_pointer(*pprev, he2); + goto found; + } + } + + INIT_RHT_NULLS_HEAD(*pprev, ht, hash); + } else { + rcu_assign_pointer(*pprev, obj->next); + } +found: ret = true; break; } @@ -621,18 +666,12 @@ restart: * resizing. Thus traversing both is fine and the added cost is * very rare. */ - if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) { - spin_unlock_bh(lock); - - tbl = rht_dereference_rcu(ht->future_tbl, ht); - hash = head_hashfn(ht, tbl, obj); - - lock = bucket_lock(tbl, hash); - spin_lock_bh(lock); + if (tbl != new_tbl) { + tbl = new_tbl; goto restart; } - spin_unlock_bh(lock); + unlock_buckets(new_tbl, old_tbl, new_hash); if (ret) { atomic_dec(&ht->nelems); @@ -788,24 +827,17 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, void *arg) { struct bucket_table *new_tbl, *old_tbl; - spinlock_t *new_bucket_lock, *old_bucket_lock; - u32 new_hash, old_hash; + u32 new_hash; bool success = true; BUG_ON(!ht->p.key_len); rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - old_hash = head_hashfn(ht, old_tbl, obj); - old_bucket_lock = bucket_lock(old_tbl, old_hash); - spin_lock_bh(old_bucket_lock); - new_tbl = rht_dereference_rcu(ht->future_tbl, ht); new_hash = head_hashfn(ht, new_tbl, obj); - new_bucket_lock = bucket_lock(new_tbl, new_hash); - if (unlikely(old_tbl != new_tbl)) - spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); + + lock_buckets(new_tbl, old_tbl, new_hash); if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, compare, arg)) { @@ -816,10 +848,7 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, __rhashtable_insert(ht, obj, new_tbl, new_hash); exit: - if (unlikely(old_tbl != new_tbl)) - spin_unlock_bh(new_bucket_lock); - spin_unlock_bh(old_bucket_lock); - + unlock_buckets(new_tbl, old_tbl, new_hash); rcu_read_unlock(); return success; |