From f9a61eb4e2471c56a63cd804c7474128138c38ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 11:49:09 -0500 Subject: mbcache2: reimplement mbcache Original mbcache was designed to have more features than what ext? filesystems ended up using. It supported entry being in more hashes, it had a home-grown rwlocking of each entry, and one cache could cache entries from multiple filesystems. This genericity also resulted in more complex locking, larger cache entries, and generally more code complexity. This is reimplementation of the mbcache functionality to exactly fit the purpose ext? filesystems use it for. Cache entries are now considerably smaller (7 instead of 13 longs), the code is considerably smaller as well (414 vs 913 lines of code), and IMO also simpler. The new code is also much more lightweight. I have measured the speed using artificial xattr-bench benchmark, which spawns P processes, each process sets xattr for F different files, and the value of xattr is randomly chosen from a pool of V values. Averages of runtimes for 5 runs for various combinations of parameters are below. The first value in each cell is old mbache, the second value is the new mbcache. V=10 F\P 1 2 4 8 16 32 64 10 0.158,0.157 0.208,0.196 0.500,0.277 0.798,0.400 3.258,0.584 13.807,1.047 61.339,2.803 100 0.172,0.167 0.279,0.222 0.520,0.275 0.825,0.341 2.981,0.505 12.022,1.202 44.641,2.943 1000 0.185,0.174 0.297,0.239 0.445,0.283 0.767,0.340 2.329,0.480 6.342,1.198 16.440,3.888 V=100 F\P 1 2 4 8 16 32 64 10 0.162,0.153 0.200,0.186 0.362,0.257 0.671,0.496 1.433,0.943 3.801,1.345 7.938,2.501 100 0.153,0.160 0.221,0.199 0.404,0.264 0.945,0.379 1.556,0.485 3.761,1.156 7.901,2.484 1000 0.215,0.191 0.303,0.246 0.471,0.288 0.960,0.347 1.647,0.479 3.916,1.176 8.058,3.160 V=1000 F\P 1 2 4 8 16 32 64 10 0.151,0.129 0.210,0.163 0.326,0.245 0.685,0.521 1.284,0.859 3.087,2.251 6.451,4.801 100 0.154,0.153 0.211,0.191 0.276,0.282 0.687,0.506 1.202,0.877 3.259,1.954 8.738,2.887 1000 0.145,0.179 0.202,0.222 0.449,0.319 0.899,0.333 1.577,0.524 4.221,1.240 9.782,3.579 V=10000 F\P 1 2 4 8 16 32 64 10 0.161,0.154 0.198,0.190 0.296,0.256 0.662,0.480 1.192,0.818 2.989,2.200 6.362,4.746 100 0.176,0.174 0.236,0.203 0.326,0.255 0.696,0.511 1.183,0.855 4.205,3.444 19.510,17.760 1000 0.199,0.183 0.240,0.227 1.159,1.014 2.286,2.154 6.023,6.039 ---,10.933 ---,36.620 V=100000 F\P 1 2 4 8 16 32 64 10 0.171,0.162 0.204,0.198 0.285,0.230 0.692,0.500 1.225,0.881 2.990,2.243 6.379,4.771 100 0.151,0.171 0.220,0.210 0.295,0.255 0.720,0.518 1.226,0.844 3.423,2.831 19.234,17.544 1000 0.192,0.189 0.249,0.225 1.162,1.043 2.257,2.093 5.853,4.997 ---,10.399 ---,32.198 We see that the new code is faster in pretty much all the cases and starting from 4 processes there are significant gains with the new code resulting in upto 20-times shorter runtimes. Also for large numbers of cached entries all values for the old code could not be measured as the kernel started hitting softlockups and died before the test completed. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- include/linux/mbcache2.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 include/linux/mbcache2.h (limited to 'include') diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h new file mode 100644 index 000000000000..b6f160ff2533 --- /dev/null +++ b/include/linux/mbcache2.h @@ -0,0 +1,50 @@ +#ifndef _LINUX_MB2CACHE_H +#define _LINUX_MB2CACHE_H + +#include +#include +#include +#include +#include + +struct mb2_cache; + +struct mb2_cache_entry { + /* LRU list - protected by cache->c_lru_list_lock */ + struct list_head e_lru_list; + /* Hash table list - protected by bitlock in e_hash_list_head */ + struct hlist_bl_node e_hash_list; + atomic_t e_refcnt; + /* Key in hash - stable during lifetime of the entry */ + u32 e_key; + /* Block number of hashed block - stable during lifetime of the entry */ + sector_t e_block; + /* Head of hash list (for list bit lock) - stable */ + struct hlist_bl_head *e_hash_list_head; +}; + +struct mb2_cache *mb2_cache_create(int bucket_bits); +void mb2_cache_destroy(struct mb2_cache *cache); + +int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, + sector_t block); +void __mb2_cache_entry_free(struct mb2_cache_entry *entry); +static inline int mb2_cache_entry_put(struct mb2_cache *cache, + struct mb2_cache_entry *entry) +{ + if (!atomic_dec_and_test(&entry->e_refcnt)) + return 0; + __mb2_cache_entry_free(entry); + return 1; +} + +void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, + sector_t block); +struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, + u32 key); +struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, + struct mb2_cache_entry *entry); +void mb2_cache_entry_touch(struct mb2_cache *cache, + struct mb2_cache_entry *entry); + +#endif /* _LINUX_MB2CACHE_H */ -- cgit v1.2.3 From ecd1e64412d5242b8afdef58a714bab3c5464f79 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 12:21:14 -0500 Subject: mbcache: remove mbcache Both ext2 and ext4 are now converted to mbcache2. Remove the old mbcache code. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/Makefile | 2 +- fs/mbcache.c | 858 ------------------------------------------------ include/linux/mbcache.h | 55 ---- 3 files changed, 1 insertion(+), 914 deletions(-) delete mode 100644 fs/mbcache.c delete mode 100644 include/linux/mbcache.h (limited to 'include') diff --git a/fs/Makefile b/fs/Makefile index 15b3d6c4e46a..59b844007fbc 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o -obj-$(CONFIG_FS_MBCACHE) += mbcache.o mbcache2.o +obj-$(CONFIG_FS_MBCACHE) += mbcache2.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_COREDUMP) += coredump.o diff --git a/fs/mbcache.c b/fs/mbcache.c deleted file mode 100644 index 187477ded6b3..000000000000 --- a/fs/mbcache.c +++ /dev/null @@ -1,858 +0,0 @@ -/* - * linux/fs/mbcache.c - * (C) 2001-2002 Andreas Gruenbacher, - */ - -/* - * Filesystem Meta Information Block Cache (mbcache) - * - * The mbcache caches blocks of block devices that need to be located - * by their device/block number, as well as by other criteria (such - * as the block's contents). - * - * There can only be one cache entry in a cache per device and block number. - * Additional indexes need not be unique in this sense. The number of - * additional indexes (=other criteria) can be hardwired at compile time - * or specified at cache create time. - * - * Each cache entry is of fixed size. An entry may be `valid' or `invalid' - * in the cache. A valid entry is in the main hash tables of the cache, - * and may also be in the lru list. An invalid entry is not in any hashes - * or lists. - * - * A valid cache entry is only in the lru list if no handles refer to it. - * Invalid cache entries will be freed when the last handle to the cache - * entry is released. Entries that cannot be freed immediately are put - * back on the lru list. - */ - -/* - * Lock descriptions and usage: - * - * Each hash chain of both the block and index hash tables now contains - * a built-in lock used to serialize accesses to the hash chain. - * - * Accesses to global data structures mb_cache_list and mb_cache_lru_list - * are serialized via the global spinlock mb_cache_spinlock. - * - * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize - * accesses to its local data, such as e_used and e_queued. - * - * Lock ordering: - * - * Each block hash chain's lock has the highest lock order, followed by an - * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's - * lock), and mb_cach_spinlock, with the lowest order. While holding - * either a block or index hash chain lock, a thread can acquire an - * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. - * - * Synchronization: - * - * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and - * index hash chian, it needs to lock the corresponding hash chain. For each - * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to - * prevent either any simultaneous release or free on the entry and also - * to serialize accesses to either the e_used or e_queued member of the entry. - * - * To avoid having a dangling reference to an already freed - * mb_cache_entry, an mb_cache_entry is only freed when it is not on a - * block hash chain and also no longer being referenced, both e_used, - * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is - * first removed from a block hash chain. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef MB_CACHE_DEBUG -# define mb_debug(f...) do { \ - printk(KERN_DEBUG f); \ - printk("\n"); \ - } while (0) -#define mb_assert(c) do { if (!(c)) \ - printk(KERN_ERR "assertion " #c " failed\n"); \ - } while(0) -#else -# define mb_debug(f...) do { } while(0) -# define mb_assert(c) do { } while(0) -#endif -#define mb_error(f...) do { \ - printk(KERN_ERR f); \ - printk("\n"); \ - } while(0) - -#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) - -#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) -#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ - (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) - -static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); -static struct blockgroup_lock *mb_cache_bg_lock; -static struct kmem_cache *mb_cache_kmem_cache; - -MODULE_AUTHOR("Andreas Gruenbacher "); -MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -MODULE_LICENSE("GPL"); - -EXPORT_SYMBOL(mb_cache_create); -EXPORT_SYMBOL(mb_cache_shrink); -EXPORT_SYMBOL(mb_cache_destroy); -EXPORT_SYMBOL(mb_cache_entry_alloc); -EXPORT_SYMBOL(mb_cache_entry_insert); -EXPORT_SYMBOL(mb_cache_entry_release); -EXPORT_SYMBOL(mb_cache_entry_free); -EXPORT_SYMBOL(mb_cache_entry_get); -#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -EXPORT_SYMBOL(mb_cache_entry_find_first); -EXPORT_SYMBOL(mb_cache_entry_find_next); -#endif - -/* - * Global data: list of all mbcache's, lru list, and a spinlock for - * accessing cache data structures on SMP machines. The lru list is - * global across all mbcaches. - */ - -static LIST_HEAD(mb_cache_list); -static LIST_HEAD(mb_cache_lru_list); -static DEFINE_SPINLOCK(mb_cache_spinlock); - -static inline void -__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) -{ - spin_lock(bgl_lock_ptr(mb_cache_bg_lock, - MB_CACHE_ENTRY_LOCK_INDEX(ce))); -} - -static inline void -__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) -{ - spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, - MB_CACHE_ENTRY_LOCK_INDEX(ce))); -} - -static inline int -__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) -{ - return !hlist_bl_unhashed(&ce->e_block_list); -} - - -static inline void -__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) -{ - if (__mb_cache_entry_is_block_hashed(ce)) - hlist_bl_del_init(&ce->e_block_list); -} - -static inline int -__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) -{ - return !hlist_bl_unhashed(&ce->e_index.o_list); -} - -static inline void -__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) -{ - if (__mb_cache_entry_is_index_hashed(ce)) - hlist_bl_del_init(&ce->e_index.o_list); -} - -/* - * __mb_cache_entry_unhash_unlock() - * - * This function is called to unhash both the block and index hash - * chain. - * It assumes both the block and index hash chain is locked upon entry. - * It also unlock both hash chains both exit - */ -static inline void -__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) -{ - __mb_cache_entry_unhash_index(ce); - hlist_bl_unlock(ce->e_index_hash_p); - __mb_cache_entry_unhash_block(ce); - hlist_bl_unlock(ce->e_block_hash_p); -} - -static void -__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) -{ - struct mb_cache *cache = ce->e_cache; - - mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); - kmem_cache_free(cache->c_entry_cache, ce); - atomic_dec(&cache->c_entry_count); -} - -static void -__mb_cache_entry_release(struct mb_cache_entry *ce) -{ - /* First lock the entry to serialize access to its local data. */ - __spin_lock_mb_cache_entry(ce); - /* Wake up all processes queuing for this cache entry. */ - if (ce->e_queued) - wake_up_all(&mb_cache_queue); - if (ce->e_used >= MB_CACHE_WRITER) - ce->e_used -= MB_CACHE_WRITER; - /* - * Make sure that all cache entries on lru_list have - * both e_used and e_qued of 0s. - */ - ce->e_used--; - if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { - if (!__mb_cache_entry_is_block_hashed(ce)) { - __spin_unlock_mb_cache_entry(ce); - goto forget; - } - /* - * Need access to lru list, first drop entry lock, - * then reacquire the lock in the proper order. - */ - spin_lock(&mb_cache_spinlock); - if (list_empty(&ce->e_lru_list)) - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); - spin_unlock(&mb_cache_spinlock); - } - __spin_unlock_mb_cache_entry(ce); - return; -forget: - mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_forget(ce, GFP_KERNEL); -} - -/* - * mb_cache_shrink_scan() memory pressure callback - * - * This function is called by the kernel memory management when memory - * gets low. - * - * @shrink: (ignored) - * @sc: shrink_control passed from reclaim - * - * Returns the number of objects freed. - */ -static unsigned long -mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - LIST_HEAD(free_list); - struct mb_cache_entry *entry, *tmp; - int nr_to_scan = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - unsigned long freed = 0; - - mb_debug("trying to free %d entries", nr_to_scan); - spin_lock(&mb_cache_spinlock); - while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { - struct mb_cache_entry *ce = - list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* Prevent any find or get operation on the entry */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - spin_lock(&mb_cache_spinlock); - continue; - } - __mb_cache_entry_unhash_unlock(ce); - list_add_tail(&ce->e_lru_list, &free_list); - spin_lock(&mb_cache_spinlock); - } - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(entry, gfp_mask); - freed++; - } - return freed; -} - -static unsigned long -mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -{ - struct mb_cache *cache; - unsigned long count = 0; - - spin_lock(&mb_cache_spinlock); - list_for_each_entry(cache, &mb_cache_list, c_cache_list) { - mb_debug("cache %s (%d)", cache->c_name, - atomic_read(&cache->c_entry_count)); - count += atomic_read(&cache->c_entry_count); - } - spin_unlock(&mb_cache_spinlock); - - return vfs_pressure_ratio(count); -} - -static struct shrinker mb_cache_shrinker = { - .count_objects = mb_cache_shrink_count, - .scan_objects = mb_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - -/* - * mb_cache_create() create a new cache - * - * All entries in one cache are equal size. Cache entries may be from - * multiple devices. If this is the first mbcache created, registers - * the cache with kernel memory management. Returns NULL if no more - * memory was available. - * - * @name: name of the cache (informal) - * @bucket_bits: log2(number of hash buckets) - */ -struct mb_cache * -mb_cache_create(const char *name, int bucket_bits) -{ - int n, bucket_count = 1 << bucket_bits; - struct mb_cache *cache = NULL; - - if (!mb_cache_bg_lock) { - mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), - GFP_KERNEL); - if (!mb_cache_bg_lock) - return NULL; - bgl_lock_init(mb_cache_bg_lock); - } - - cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); - if (!cache) - return NULL; - cache->c_name = name; - atomic_set(&cache->c_entry_count, 0); - cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * - sizeof(struct hlist_bl_head), GFP_KERNEL); - if (!cache->c_block_hash) - goto fail; - for (n=0; nc_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * - sizeof(struct hlist_bl_head), GFP_KERNEL); - if (!cache->c_index_hash) - goto fail; - for (n=0; nc_index_hash[n]); - if (!mb_cache_kmem_cache) { - mb_cache_kmem_cache = kmem_cache_create(name, - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!mb_cache_kmem_cache) - goto fail2; - } - cache->c_entry_cache = mb_cache_kmem_cache; - - /* - * Set an upper limit on the number of cache entries so that the hash - * chains won't grow too long. - */ - cache->c_max_entries = bucket_count << 4; - - spin_lock(&mb_cache_spinlock); - list_add(&cache->c_cache_list, &mb_cache_list); - spin_unlock(&mb_cache_spinlock); - return cache; - -fail2: - kfree(cache->c_index_hash); - -fail: - kfree(cache->c_block_hash); - kfree(cache); - return NULL; -} - - -/* - * mb_cache_shrink() - * - * Removes all cache entries of a device from the cache. All cache entries - * currently in use cannot be freed, and thus remain in the cache. All others - * are freed. - * - * @bdev: which device's cache entries to shrink - */ -void -mb_cache_shrink(struct block_device *bdev) -{ - LIST_HEAD(free_list); - struct list_head *l; - struct mb_cache_entry *ce, *tmp; - - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - while (!list_is_last(l, &mb_cache_lru_list)) { - l = l->next; - ce = list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_bdev == bdev) { - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* - * Prevent any find or get operation on the entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - continue; - } - __mb_cache_entry_unhash_unlock(ce); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - list_add_tail(&ce->e_lru_list, &free_list); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - } - } - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(ce, GFP_KERNEL); - } -} - - -/* - * mb_cache_destroy() - * - * Shrinks the cache to its minimum possible size (hopefully 0 entries), - * and then destroys it. If this was the last mbcache, un-registers the - * mbcache from kernel memory management. - */ -void -mb_cache_destroy(struct mb_cache *cache) -{ - LIST_HEAD(free_list); - struct mb_cache_entry *ce, *tmp; - - spin_lock(&mb_cache_spinlock); - list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { - if (ce->e_cache == cache) - list_move_tail(&ce->e_lru_list, &free_list); - } - list_del(&cache->c_cache_list); - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { - list_del_init(&ce->e_lru_list); - /* - * Prevent any find or get operation on the entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - __mb_cache_entry_unhash_unlock(ce); - __mb_cache_entry_forget(ce, GFP_KERNEL); - } - - if (atomic_read(&cache->c_entry_count) > 0) { - mb_error("cache %s: %d orphaned entries", - cache->c_name, - atomic_read(&cache->c_entry_count)); - } - - if (list_empty(&mb_cache_list)) { - kmem_cache_destroy(mb_cache_kmem_cache); - mb_cache_kmem_cache = NULL; - } - kfree(cache->c_index_hash); - kfree(cache->c_block_hash); - kfree(cache); -} - -/* - * mb_cache_entry_alloc() - * - * Allocates a new cache entry. The new entry will not be valid initially, - * and thus cannot be looked up yet. It should be filled with data, and - * then inserted into the cache using mb_cache_entry_insert(). Returns NULL - * if no more memory was available. - */ -struct mb_cache_entry * -mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) -{ - struct mb_cache_entry *ce; - - if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { - struct list_head *l; - - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - while (!list_is_last(l, &mb_cache_lru_list)) { - l = l->next; - ce = list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_cache == cache) { - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* - * Prevent any find or get operation on the - * entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - continue; - } - mb_assert(list_empty(&ce->e_lru_list)); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - __mb_cache_entry_unhash_unlock(ce); - goto found; - } - } - spin_unlock(&mb_cache_spinlock); - } - - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (!ce) - return NULL; - atomic_inc(&cache->c_entry_count); - INIT_LIST_HEAD(&ce->e_lru_list); - INIT_HLIST_BL_NODE(&ce->e_block_list); - INIT_HLIST_BL_NODE(&ce->e_index.o_list); - ce->e_cache = cache; - ce->e_queued = 0; - atomic_set(&ce->e_refcnt, 0); -found: - ce->e_block_hash_p = &cache->c_block_hash[0]; - ce->e_index_hash_p = &cache->c_index_hash[0]; - ce->e_used = 1 + MB_CACHE_WRITER; - return ce; -} - - -/* - * mb_cache_entry_insert() - * - * Inserts an entry that was allocated using mb_cache_entry_alloc() into - * the cache. After this, the cache entry can be looked up, but is not yet - * in the lru list as the caller still holds a handle to it. Returns 0 on - * success, or -EBUSY if a cache entry for that device + inode exists - * already (this may happen after a failed lookup, but when another process - * has inserted the same cache entry in the meantime). - * - * @bdev: device the cache entry belongs to - * @block: block number - * @key: lookup key - */ -int -mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, - sector_t block, unsigned int key) -{ - struct mb_cache *cache = ce->e_cache; - unsigned int bucket; - struct hlist_bl_node *l; - struct hlist_bl_head *block_hash_p; - struct hlist_bl_head *index_hash_p; - struct mb_cache_entry *lce; - - mb_assert(ce); - bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), - cache->c_bucket_bits); - block_hash_p = &cache->c_block_hash[bucket]; - hlist_bl_lock(block_hash_p); - hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { - if (lce->e_bdev == bdev && lce->e_block == block) { - hlist_bl_unlock(block_hash_p); - return -EBUSY; - } - } - mb_assert(!__mb_cache_entry_is_block_hashed(ce)); - __mb_cache_entry_unhash_block(ce); - __mb_cache_entry_unhash_index(ce); - ce->e_bdev = bdev; - ce->e_block = block; - ce->e_block_hash_p = block_hash_p; - ce->e_index.o_key = key; - hlist_bl_add_head(&ce->e_block_list, block_hash_p); - hlist_bl_unlock(block_hash_p); - bucket = hash_long(key, cache->c_bucket_bits); - index_hash_p = &cache->c_index_hash[bucket]; - hlist_bl_lock(index_hash_p); - ce->e_index_hash_p = index_hash_p; - hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); - hlist_bl_unlock(index_hash_p); - return 0; -} - - -/* - * mb_cache_entry_release() - * - * Release a handle to a cache entry. When the last handle to a cache entry - * is released it is either freed (if it is invalid) or otherwise inserted - * in to the lru list. - */ -void -mb_cache_entry_release(struct mb_cache_entry *ce) -{ - __mb_cache_entry_release(ce); -} - - -/* - * mb_cache_entry_free() - * - */ -void -mb_cache_entry_free(struct mb_cache_entry *ce) -{ - mb_assert(ce); - mb_assert(list_empty(&ce->e_lru_list)); - hlist_bl_lock(ce->e_index_hash_p); - __mb_cache_entry_unhash_index(ce); - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_lock(ce->e_block_hash_p); - __mb_cache_entry_unhash_block(ce); - hlist_bl_unlock(ce->e_block_hash_p); - __mb_cache_entry_release(ce); -} - - -/* - * mb_cache_entry_get() - * - * Get a cache entry by device / block number. (There can only be one entry - * in the cache per device and block.) Returns NULL if no such cache entry - * exists. The returned cache entry is locked for exclusive access ("single - * writer"). - */ -struct mb_cache_entry * -mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, - sector_t block) -{ - unsigned int bucket; - struct hlist_bl_node *l; - struct mb_cache_entry *ce; - struct hlist_bl_head *block_hash_p; - - bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), - cache->c_bucket_bits); - block_hash_p = &cache->c_block_hash[bucket]; - /* First serialize access to the block corresponding hash chain. */ - hlist_bl_lock(block_hash_p); - hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { - mb_assert(ce->e_block_hash_p == block_hash_p); - if (ce->e_bdev == bdev && ce->e_block == block) { - /* - * Prevent a free from removing the entry. - */ - atomic_inc(&ce->e_refcnt); - hlist_bl_unlock(block_hash_p); - __spin_lock_mb_cache_entry(ce); - atomic_dec(&ce->e_refcnt); - if (ce->e_used > 0) { - DEFINE_WAIT(wait); - while (ce->e_used > 0) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - __spin_unlock_mb_cache_entry(ce); - schedule(); - __spin_lock_mb_cache_entry(ce); - ce->e_queued--; - } - finish_wait(&mb_cache_queue, &wait); - } - ce->e_used += 1 + MB_CACHE_WRITER; - __spin_unlock_mb_cache_entry(ce); - - if (!list_empty(&ce->e_lru_list)) { - spin_lock(&mb_cache_spinlock); - list_del_init(&ce->e_lru_list); - spin_unlock(&mb_cache_spinlock); - } - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release(ce); - return NULL; - } - return ce; - } - } - hlist_bl_unlock(block_hash_p); - return NULL; -} - -#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) - -static struct mb_cache_entry * -__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, - struct block_device *bdev, unsigned int key) -{ - - /* The index hash chain is alredy acquire by caller. */ - while (l != NULL) { - struct mb_cache_entry *ce = - hlist_bl_entry(l, struct mb_cache_entry, - e_index.o_list); - mb_assert(ce->e_index_hash_p == head); - if (ce->e_bdev == bdev && ce->e_index.o_key == key) { - /* - * Prevent a free from removing the entry. - */ - atomic_inc(&ce->e_refcnt); - hlist_bl_unlock(head); - __spin_lock_mb_cache_entry(ce); - atomic_dec(&ce->e_refcnt); - ce->e_used++; - /* Incrementing before holding the lock gives readers - priority over writers. */ - if (ce->e_used >= MB_CACHE_WRITER) { - DEFINE_WAIT(wait); - - while (ce->e_used >= MB_CACHE_WRITER) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - __spin_unlock_mb_cache_entry(ce); - schedule(); - __spin_lock_mb_cache_entry(ce); - ce->e_queued--; - } - finish_wait(&mb_cache_queue, &wait); - } - __spin_unlock_mb_cache_entry(ce); - if (!list_empty(&ce->e_lru_list)) { - spin_lock(&mb_cache_spinlock); - list_del_init(&ce->e_lru_list); - spin_unlock(&mb_cache_spinlock); - } - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release(ce); - return ERR_PTR(-EAGAIN); - } - return ce; - } - l = l->next; - } - hlist_bl_unlock(head); - return NULL; -} - - -/* - * mb_cache_entry_find_first() - * - * Find the first cache entry on a given device with a certain key in - * an additional index. Additional matches can be found with - * mb_cache_entry_find_next(). Returns NULL if no match was found. The - * returned cache entry is locked for shared access ("multiple readers"). - * - * @cache: the cache to search - * @bdev: the device the cache entry should belong to - * @key: the key in the index - */ -struct mb_cache_entry * -mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, - unsigned int key) -{ - unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct hlist_bl_node *l; - struct mb_cache_entry *ce = NULL; - struct hlist_bl_head *index_hash_p; - - index_hash_p = &cache->c_index_hash[bucket]; - hlist_bl_lock(index_hash_p); - if (!hlist_bl_empty(index_hash_p)) { - l = hlist_bl_first(index_hash_p); - ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - } else - hlist_bl_unlock(index_hash_p); - return ce; -} - - -/* - * mb_cache_entry_find_next() - * - * Find the next cache entry on a given device with a certain key in an - * additional index. Returns NULL if no match could be found. The previous - * entry is atomatically released, so that mb_cache_entry_find_next() can - * be called like this: - * - * entry = mb_cache_entry_find_first(); - * while (entry) { - * ... - * entry = mb_cache_entry_find_next(entry, ...); - * } - * - * @prev: The previous match - * @bdev: the device the cache entry should belong to - * @key: the key in the index - */ -struct mb_cache_entry * -mb_cache_entry_find_next(struct mb_cache_entry *prev, - struct block_device *bdev, unsigned int key) -{ - struct mb_cache *cache = prev->e_cache; - unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct hlist_bl_node *l; - struct mb_cache_entry *ce; - struct hlist_bl_head *index_hash_p; - - index_hash_p = &cache->c_index_hash[bucket]; - mb_assert(prev->e_index_hash_p == index_hash_p); - hlist_bl_lock(index_hash_p); - mb_assert(!hlist_bl_empty(index_hash_p)); - l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - __mb_cache_entry_release(prev); - return ce; -} - -#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ - -static int __init init_mbcache(void) -{ - register_shrinker(&mb_cache_shrinker); - return 0; -} - -static void __exit exit_mbcache(void) -{ - unregister_shrinker(&mb_cache_shrinker); -} - -module_init(init_mbcache) -module_exit(exit_mbcache) - diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h deleted file mode 100644 index 6a392e7a723a..000000000000 --- a/include/linux/mbcache.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - File: linux/mbcache.h - - (C) 2001 by Andreas Gruenbacher, -*/ -struct mb_cache_entry { - struct list_head e_lru_list; - struct mb_cache *e_cache; - unsigned short e_used; - unsigned short e_queued; - atomic_t e_refcnt; - struct block_device *e_bdev; - sector_t e_block; - struct hlist_bl_node e_block_list; - struct { - struct hlist_bl_node o_list; - unsigned int o_key; - } e_index; - struct hlist_bl_head *e_block_hash_p; - struct hlist_bl_head *e_index_hash_p; -}; - -struct mb_cache { - struct list_head c_cache_list; - const char *c_name; - atomic_t c_entry_count; - int c_max_entries; - int c_bucket_bits; - struct kmem_cache *c_entry_cache; - struct hlist_bl_head *c_block_hash; - struct hlist_bl_head *c_index_hash; -}; - -/* Functions on caches */ - -struct mb_cache *mb_cache_create(const char *, int); -void mb_cache_shrink(struct block_device *); -void mb_cache_destroy(struct mb_cache *); - -/* Functions on cache entries */ - -struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t); -int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *, - sector_t, unsigned int); -void mb_cache_entry_release(struct mb_cache_entry *); -void mb_cache_entry_free(struct mb_cache_entry *); -struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, - struct block_device *, - sector_t); -struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, - struct block_device *, - unsigned int); -struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, - struct block_device *, - unsigned int); -- cgit v1.2.3 From f0c8b46238db9d51ef9ea0858259958d0c601cec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 18:23:47 -0500 Subject: mbcache2: Use referenced bit instead of LRU Currently we maintain perfect LRU list by moving entry to the tail of the list when it gets used. However these operations on cache-global list are relatively expensive. In this patch we switch to lazy updates of LRU list. Whenever entry gets used, we set a referenced bit in it. When reclaiming entries, we give referenced entries another round in the LRU. Since the list is not a real LRU anymore, rename it to just 'list'. In my testing this logic gives about 30% boost to workloads with mostly unique xattr blocks (e.g. xattr-bench with 10 files and 10000 unique xattr values). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/mbcache2.c | 87 +++++++++++++++++++++++++++++++----------------- include/linux/mbcache2.h | 11 +++--- 2 files changed, 63 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/fs/mbcache2.c b/fs/mbcache2.c index 3e3198d6b9d6..49f7a6feaa83 100644 --- a/fs/mbcache2.c +++ b/fs/mbcache2.c @@ -30,9 +30,9 @@ struct mb2_cache { int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ int c_max_entries; - /* Protects c_lru_list, c_entry_count */ - spinlock_t c_lru_list_lock; - struct list_head c_lru_list; + /* Protects c_list, c_entry_count */ + spinlock_t c_list_lock; + struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; struct shrinker c_shrink; @@ -45,6 +45,29 @@ static struct kmem_cache *mb2_entry_cache; static unsigned long mb2_cache_shrink(struct mb2_cache *cache, unsigned int nr_to_scan); +static inline bool mb2_cache_entry_referenced(struct mb2_cache_entry *entry) +{ + return entry->_e_hash_list_head & 1; +} + +static inline void mb2_cache_entry_set_referenced(struct mb2_cache_entry *entry) +{ + entry->_e_hash_list_head |= 1; +} + +static inline void mb2_cache_entry_clear_referenced( + struct mb2_cache_entry *entry) +{ + entry->_e_hash_list_head &= ~1; +} + +static inline struct hlist_bl_head *mb2_cache_entry_head( + struct mb2_cache_entry *entry) +{ + return (struct hlist_bl_head *) + (entry->_e_hash_list_head & ~1); +} + /* * Number of entries to reclaim synchronously when there are too many entries * in cache @@ -80,13 +103,13 @@ int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, if (!entry) return -ENOMEM; - INIT_LIST_HEAD(&entry->e_lru_list); + INIT_LIST_HEAD(&entry->e_list); /* One ref for hash, one ref returned */ atomic_set(&entry->e_refcnt, 1); entry->e_key = key; entry->e_block = block; head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - entry->e_hash_list_head = head; + entry->_e_hash_list_head = (unsigned long)head; hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_block == block) { @@ -98,12 +121,12 @@ int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, hlist_bl_add_head(&entry->e_hash_list, head); hlist_bl_unlock(head); - spin_lock(&cache->c_lru_list_lock); - list_add_tail(&entry->e_lru_list, &cache->c_lru_list); + spin_lock(&cache->c_list_lock); + list_add_tail(&entry->e_list, &cache->c_list); /* Grab ref for LRU list */ atomic_inc(&entry->e_refcnt); cache->c_entry_count++; - spin_unlock(&cache->c_lru_list_lock); + spin_unlock(&cache->c_list_lock); return 0; } @@ -124,7 +147,7 @@ static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, struct hlist_bl_head *head; if (entry) - head = entry->e_hash_list_head; + head = mb2_cache_entry_head(entry); else head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; hlist_bl_lock(head); @@ -203,13 +226,13 @@ void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, /* We keep hash list reference to keep entry alive */ hlist_bl_del_init(&entry->e_hash_list); hlist_bl_unlock(head); - spin_lock(&cache->c_lru_list_lock); - if (!list_empty(&entry->e_lru_list)) { - list_del_init(&entry->e_lru_list); + spin_lock(&cache->c_list_lock); + if (!list_empty(&entry->e_list)) { + list_del_init(&entry->e_list); cache->c_entry_count--; atomic_dec(&entry->e_refcnt); } - spin_unlock(&cache->c_lru_list_lock); + spin_unlock(&cache->c_list_lock); mb2_cache_entry_put(cache, entry); return; } @@ -222,15 +245,12 @@ EXPORT_SYMBOL(mb2_cache_entry_delete_block); * @cache - cache the entry belongs to * @entry - entry that got used * - * Move entry in lru list to reflect the fact that it was used. + * Marks entry as used to give hit higher chances of surviving in cache. */ void mb2_cache_entry_touch(struct mb2_cache *cache, struct mb2_cache_entry *entry) { - spin_lock(&cache->c_lru_list_lock); - if (!list_empty(&entry->e_lru_list)) - list_move_tail(&cache->c_lru_list, &entry->e_lru_list); - spin_unlock(&cache->c_lru_list_lock); + mb2_cache_entry_set_referenced(entry); } EXPORT_SYMBOL(mb2_cache_entry_touch); @@ -251,18 +271,23 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, struct hlist_bl_head *head; unsigned int shrunk = 0; - spin_lock(&cache->c_lru_list_lock); - while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { - entry = list_first_entry(&cache->c_lru_list, - struct mb2_cache_entry, e_lru_list); - list_del_init(&entry->e_lru_list); + spin_lock(&cache->c_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_list)) { + entry = list_first_entry(&cache->c_list, + struct mb2_cache_entry, e_list); + if (mb2_cache_entry_referenced(entry)) { + mb2_cache_entry_clear_referenced(entry); + list_move_tail(&cache->c_list, &entry->e_list); + continue; + } + list_del_init(&entry->e_list); cache->c_entry_count--; /* * We keep LRU list reference so that entry doesn't go away * from under us. */ - spin_unlock(&cache->c_lru_list_lock); - head = entry->e_hash_list_head; + spin_unlock(&cache->c_list_lock); + head = mb2_cache_entry_head(entry); hlist_bl_lock(head); if (!hlist_bl_unhashed(&entry->e_hash_list)) { hlist_bl_del_init(&entry->e_hash_list); @@ -272,9 +297,9 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, if (mb2_cache_entry_put(cache, entry)) shrunk++; cond_resched(); - spin_lock(&cache->c_lru_list_lock); + spin_lock(&cache->c_list_lock); } - spin_unlock(&cache->c_lru_list_lock); + spin_unlock(&cache->c_list_lock); return shrunk; } @@ -318,8 +343,8 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) goto err_out; cache->c_bucket_bits = bucket_bits; cache->c_max_entries = bucket_count << 4; - INIT_LIST_HEAD(&cache->c_lru_list); - spin_lock_init(&cache->c_lru_list_lock); + INIT_LIST_HEAD(&cache->c_list); + spin_lock_init(&cache->c_list_lock); cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_hash) { @@ -361,13 +386,13 @@ void mb2_cache_destroy(struct mb2_cache *cache) * We don't bother with any locking. Cache must not be used at this * point. */ - list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) { + list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { if (!hlist_bl_unhashed(&entry->e_hash_list)) { hlist_bl_del_init(&entry->e_hash_list); atomic_dec(&entry->e_refcnt); } else WARN_ON(1); - list_del(&entry->e_lru_list); + list_del(&entry->e_list); WARN_ON(atomic_read(&entry->e_refcnt) != 1); mb2_cache_entry_put(cache, entry); } diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h index b6f160ff2533..c934843a6a31 100644 --- a/include/linux/mbcache2.h +++ b/include/linux/mbcache2.h @@ -10,8 +10,8 @@ struct mb2_cache; struct mb2_cache_entry { - /* LRU list - protected by cache->c_lru_list_lock */ - struct list_head e_lru_list; + /* List of entries in cache - protected by cache->c_list_lock */ + struct list_head e_list; /* Hash table list - protected by bitlock in e_hash_list_head */ struct hlist_bl_node e_hash_list; atomic_t e_refcnt; @@ -19,8 +19,11 @@ struct mb2_cache_entry { u32 e_key; /* Block number of hashed block - stable during lifetime of the entry */ sector_t e_block; - /* Head of hash list (for list bit lock) - stable */ - struct hlist_bl_head *e_hash_list_head; + /* + * Head of hash list (for list bit lock) - stable. Combined with + * referenced bit of entry + */ + unsigned long _e_hash_list_head; }; struct mb2_cache *mb2_cache_create(int bucket_bits); -- cgit v1.2.3 From 7a2508e1b657cfc7e1371550f88c7a7bc4288f32 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 22:35:22 -0500 Subject: mbcache2: rename to mbcache Since old mbcache code is gone, let's rename new code to mbcache since number 2 is now meaningless. This is just a mechanical replacement. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/Makefile | 2 +- fs/ext2/ext2.h | 4 +- fs/ext2/xattr.c | 48 +++--- fs/ext2/xattr.h | 8 +- fs/ext4/ext4.h | 2 +- fs/ext4/xattr.c | 54 +++--- fs/ext4/xattr.h | 4 +- fs/mbcache.c | 424 +++++++++++++++++++++++++++++++++++++++++++++++ fs/mbcache2.c | 424 ----------------------------------------------- include/linux/mbcache.h | 53 ++++++ include/linux/mbcache2.h | 53 ------ 11 files changed, 538 insertions(+), 538 deletions(-) create mode 100644 fs/mbcache.c delete mode 100644 fs/mbcache2.c create mode 100644 include/linux/mbcache.h delete mode 100644 include/linux/mbcache2.h (limited to 'include') diff --git a/fs/Makefile b/fs/Makefile index 59b844007fbc..79f522575cba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o -obj-$(CONFIG_FS_MBCACHE) += mbcache2.o +obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_COREDUMP) += coredump.o diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index f98ce7e60a0f..170939f379d7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -61,7 +61,7 @@ struct ext2_block_alloc_info { #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end -struct mb2_cache; +struct mb_cache; /* * second extended-fs super-block data in memory @@ -113,7 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; - struct mb2_cache *s_mb_cache; + struct mb_cache *s_mb_cache; }; static inline spinlock_t * diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 7162b4869bc3..71d58c2d7a19 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include #include @@ -90,7 +90,7 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *, struct ext2_xattr_header *); -static int ext2_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); +static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext2_xattr_cache_find(struct inode *, struct ext2_xattr_header *); static void ext2_xattr_rehash(struct ext2_xattr_header *, @@ -150,7 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; - struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -246,7 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; - struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -493,8 +493,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set", * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect modified block */ - mb2_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, + hash, bh->b_blocknr); /* keep the buffer locked while modifying it. */ } else { @@ -627,7 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; - struct mb2_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; + struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -721,8 +721,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * This must happen under buffer lock for * ext2_xattr_set2() to reliably detect freed block */ - mb2_cache_entry_delete_block(ext2_mb_cache, - hash, old_bh->b_blocknr); + mb_cache_entry_delete_block(ext2_mb_cache, + hash, old_bh->b_blocknr); /* Free the old block. */ ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); @@ -786,8 +786,8 @@ ext2_xattr_delete_inode(struct inode *inode) * This must happen under buffer lock for ext2_xattr_set2() to * reliably detect freed block */ - mb2_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, - hash, bh->b_blocknr); + mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, + hash, bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); @@ -818,12 +818,12 @@ cleanup: * Returns 0, or a negative error number on failure. */ static int -ext2_xattr_cache_insert(struct mb2_cache *cache, struct buffer_head *bh) +ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(HDR(bh)->h_hash); int error; - error = mb2_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); + error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); if (error) { if (error == -EBUSY) { ea_bdebug(bh, "already in cache (%d cache entries)", @@ -887,14 +887,14 @@ static struct buffer_head * ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb2_cache_entry *ce; - struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + struct mb_cache_entry *ce; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb2_cache_entry_find_first(ext2_mb_cache, hash); + ce = mb_cache_entry_find_first(ext2_mb_cache, hash); while (ce) { struct buffer_head *bh; @@ -915,7 +915,7 @@ again: * entry is still hashed is reliable. */ if (hlist_bl_unhashed(&ce->e_hash_list)) { - mb2_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_put(ext2_mb_cache, ce); unlock_buffer(bh); brelse(bh); goto again; @@ -928,14 +928,14 @@ again: } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb2_cache_entry_touch(ext2_mb_cache, ce); - mb2_cache_entry_put(ext2_mb_cache, ce); + mb_cache_entry_touch(ext2_mb_cache, ce); + mb_cache_entry_put(ext2_mb_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb2_cache_entry_find_next(ext2_mb_cache, ce); + ce = mb_cache_entry_find_next(ext2_mb_cache, ce); } return NULL; } @@ -1010,13 +1010,13 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, #define HASH_BUCKET_BITS 10 -struct mb2_cache *ext2_xattr_create_cache(void) +struct mb_cache *ext2_xattr_create_cache(void) { - return mb2_cache_create(HASH_BUCKET_BITS); + return mb_cache_create(HASH_BUCKET_BITS); } -void ext2_xattr_destroy_cache(struct mb2_cache *cache) +void ext2_xattr_destroy_cache(struct mb_cache *cache) { if (cache) - mb2_cache_destroy(cache); + mb_cache_destroy(cache); } diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 6ea38aa9563a..6f82ab1b00ca 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -53,7 +53,7 @@ struct ext2_xattr_entry { #define EXT2_XATTR_SIZE(size) \ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) -struct mb2_cache; +struct mb_cache; # ifdef CONFIG_EXT2_FS_XATTR @@ -68,8 +68,8 @@ extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_ extern void ext2_xattr_delete_inode(struct inode *); -extern struct mb2_cache *ext2_xattr_create_cache(void); -extern void ext2_xattr_destroy_cache(struct mb2_cache *cache); +extern struct mb_cache *ext2_xattr_create_cache(void); +extern void ext2_xattr_destroy_cache(struct mb_cache *cache); extern const struct xattr_handler *ext2_xattr_handlers[]; @@ -94,7 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) { } -static inline void ext2_xattr_destroy_cache(struct mb2_cache *cache) +static inline void ext2_xattr_destroy_cache(struct mb_cache *cache) { } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9ac9e62569ef..157b458a69d4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1468,7 +1468,7 @@ struct ext4_sb_info { struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; - struct mb2_cache *s_mb_cache; + struct mb_cache *s_mb_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fe9f8d6ab6c9..c6af8a7a436a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include "ext4_jbd2.h" #include "ext4.h" @@ -78,10 +78,10 @@ # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, - struct mb2_cache_entry **); + struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); static int ext4_xattr_list(struct dentry *dentry, char *buffer, @@ -276,7 +276,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -428,7 +428,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -561,8 +561,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb2_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, - bh->b_blocknr); + mb_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, + bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -782,9 +782,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search *s = &bs->s; - struct mb2_cache_entry *ce = NULL; + struct mb_cache_entry *ce = NULL; int error = 0; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) @@ -805,8 +805,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * ext4_xattr_block_set() to reliably detect modified * block */ - mb2_cache_entry_delete_block(ext4_mb_cache, hash, - bs->bh->b_blocknr); + mb_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s); if (!error) { @@ -904,7 +904,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); - mb2_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); ce = NULL; new_bh = NULL; goto inserted; @@ -919,8 +919,8 @@ inserted: if (error) goto cleanup_dquot; } - mb2_cache_entry_touch(ext4_mb_cache, ce); - mb2_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_touch(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -985,7 +985,7 @@ getblk_failed: cleanup: if (ce) - mb2_cache_entry_put(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1546,13 +1546,13 @@ cleanup: * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct mb2_cache *ext4_mb_cache, struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); int error; - error = mb2_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, - bh->b_blocknr); + error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + bh->b_blocknr); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); @@ -1610,16 +1610,16 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, */ static struct buffer_head * ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb2_cache_entry **pce) + struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); - struct mb2_cache_entry *ce; - struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); - ce = mb2_cache_entry_find_first(ext4_mb_cache, hash); + ce = mb_cache_entry_find_first(ext4_mb_cache, hash); while (ce) { struct buffer_head *bh; @@ -1638,7 +1638,7 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, return bh; } brelse(bh); - ce = mb2_cache_entry_find_next(ext4_mb_cache, ce); + ce = mb_cache_entry_find_next(ext4_mb_cache, ce); } return NULL; } @@ -1713,15 +1713,15 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #define HASH_BUCKET_BITS 10 -struct mb2_cache * +struct mb_cache * ext4_xattr_create_cache(void) { - return mb2_cache_create(HASH_BUCKET_BITS); + return mb_cache_create(HASH_BUCKET_BITS); } -void ext4_xattr_destroy_cache(struct mb2_cache *cache) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { if (cache) - mb2_cache_destroy(cache); + mb_cache_destroy(cache); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 10b0f7323ed6..69dd3e6566e0 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -123,8 +123,8 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern struct mb2_cache *ext4_xattr_create_cache(void); -extern void ext4_xattr_destroy_cache(struct mb2_cache *); +extern struct mb_cache *ext4_xattr_create_cache(void); +extern void ext4_xattr_destroy_cache(struct mb_cache *); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, diff --git a/fs/mbcache.c b/fs/mbcache.c new file mode 100644 index 000000000000..4241b633f155 --- /dev/null +++ b/fs/mbcache.c @@ -0,0 +1,424 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mbcache is a simple key-value store. Keys need not be unique, however + * key-value pairs are expected to be unique (we use this fact in + * mb_cache_entry_delete_block()). + * + * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. + * They use hash of a block contents as a key and block number as a value. + * That's why keys need not be unique (different xattr blocks may end up having + * the same hash). However block number always uniquely identifies a cache + * entry. + * + * We provide functions for creation and removal of entries, search by key, + * and a special "delete entry with given key-value pair" operation. Fixed + * size hash table is used for fast key lookups. + */ + +struct mb_cache { + /* Hash table of entries */ + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; + /* Maximum entries in cache to avoid degrading hash too much */ + int c_max_entries; + /* Protects c_list, c_entry_count */ + spinlock_t c_list_lock; + struct list_head c_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; + /* Work for shrinking when the cache has too many entries */ + struct work_struct c_shrink_work; +}; + +static struct kmem_cache *mb_entry_cache; + +static unsigned long mb_cache_shrink(struct mb_cache *cache, + unsigned int nr_to_scan); + +static inline bool mb_cache_entry_referenced(struct mb_cache_entry *entry) +{ + return entry->_e_hash_list_head & 1; +} + +static inline void mb_cache_entry_set_referenced(struct mb_cache_entry *entry) +{ + entry->_e_hash_list_head |= 1; +} + +static inline void mb_cache_entry_clear_referenced( + struct mb_cache_entry *entry) +{ + entry->_e_hash_list_head &= ~1; +} + +static inline struct hlist_bl_head *mb_cache_entry_head( + struct mb_cache_entry *entry) +{ + return (struct hlist_bl_head *) + (entry->_e_hash_list_head & ~1); +} + +/* + * Number of entries to reclaim synchronously when there are too many entries + * in cache + */ +#define SYNC_SHRINK_BATCH 64 + +/* + * mb_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created + * @mask - gfp mask with which the entry should be allocated + * @key - key of the entry + * @block - block that contains data + * + * Creates entry in @cache with key @key and records that data is stored in + * block @block. The function returns -EBUSY if entry with the same key + * and for the same block already exists in cache. Otherwise 0 is returned. + */ +int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, + sector_t block) +{ + struct mb_cache_entry *entry, *dup; + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + + /* Schedule background reclaim if there are too many entries */ + if (cache->c_entry_count >= cache->c_max_entries) + schedule_work(&cache->c_shrink_work); + /* Do some sync reclaim if background reclaim cannot keep up */ + if (cache->c_entry_count >= 2*cache->c_max_entries) + mb_cache_shrink(cache, SYNC_SHRINK_BATCH); + + entry = kmem_cache_alloc(mb_entry_cache, mask); + if (!entry) + return -ENOMEM; + + INIT_LIST_HEAD(&entry->e_list); + /* One ref for hash, one ref returned */ + atomic_set(&entry->e_refcnt, 1); + entry->e_key = key; + entry->e_block = block; + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + entry->_e_hash_list_head = (unsigned long)head; + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { + hlist_bl_unlock(head); + kmem_cache_free(mb_entry_cache, entry); + return -EBUSY; + } + } + hlist_bl_add_head(&entry->e_hash_list, head); + hlist_bl_unlock(head); + + spin_lock(&cache->c_list_lock); + list_add_tail(&entry->e_list, &cache->c_list); + /* Grab ref for LRU list */ + atomic_inc(&entry->e_refcnt); + cache->c_entry_count++; + spin_unlock(&cache->c_list_lock); + + return 0; +} +EXPORT_SYMBOL(mb_cache_entry_create); + +void __mb_cache_entry_free(struct mb_cache_entry *entry) +{ + kmem_cache_free(mb_entry_cache, entry); +} +EXPORT_SYMBOL(__mb_cache_entry_free); + +static struct mb_cache_entry *__entry_find(struct mb_cache *cache, + struct mb_cache_entry *entry, + u32 key) +{ + struct mb_cache_entry *old_entry = entry; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + + if (entry) + head = mb_cache_entry_head(entry); + else + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) + node = entry->e_hash_list.next; + else + node = hlist_bl_first(head); + while (node) { + entry = hlist_bl_entry(node, struct mb_cache_entry, + e_hash_list); + if (entry->e_key == key) { + atomic_inc(&entry->e_refcnt); + goto out; + } + node = node->next; + } + entry = NULL; +out: + hlist_bl_unlock(head); + if (old_entry) + mb_cache_entry_put(cache, old_entry); + + return entry; +} + +/* + * mb_cache_entry_find_first - find the first entry in cache with given key + * @cache: cache where we should search + * @key: key to look for + * + * Search in @cache for entry with key @key. Grabs reference to the first + * entry found and returns the entry. + */ +struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, + u32 key) +{ + return __entry_find(cache, NULL, key); +} +EXPORT_SYMBOL(mb_cache_entry_find_first); + +/* + * mb_cache_entry_find_next - find next entry in cache with the same + * @cache: cache where we should search + * @entry: entry to start search from + * + * Finds next entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races + * with the search), finds the first entry in the hash chain. The function + * drops reference to @entry and returns with a reference to the found entry. + */ +struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, + struct mb_cache_entry *entry) +{ + return __entry_find(cache, entry, entry->e_key); +} +EXPORT_SYMBOL(mb_cache_entry_find_next); + +/* mb_cache_entry_delete_block - remove information about block from cache + * @cache - cache we work with + * @key - key of the entry to remove + * @block - block containing data for @key + * + * Remove entry from cache @cache with key @key with data stored in @block. + */ +void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, + sector_t block) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb_cache_entry *entry; + + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + /* We keep hash list reference to keep entry alive */ + hlist_bl_del_init(&entry->e_hash_list); + hlist_bl_unlock(head); + spin_lock(&cache->c_list_lock); + if (!list_empty(&entry->e_list)) { + list_del_init(&entry->e_list); + cache->c_entry_count--; + atomic_dec(&entry->e_refcnt); + } + spin_unlock(&cache->c_list_lock); + mb_cache_entry_put(cache, entry); + return; + } + } + hlist_bl_unlock(head); +} +EXPORT_SYMBOL(mb_cache_entry_delete_block); + +/* mb_cache_entry_touch - cache entry got used + * @cache - cache the entry belongs to + * @entry - entry that got used + * + * Marks entry as used to give hit higher chances of surviving in cache. + */ +void mb_cache_entry_touch(struct mb_cache *cache, + struct mb_cache_entry *entry) +{ + mb_cache_entry_set_referenced(entry); +} +EXPORT_SYMBOL(mb_cache_entry_touch); + +static unsigned long mb_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct mb_cache *cache = container_of(shrink, struct mb_cache, + c_shrink); + + return cache->c_entry_count; +} + +/* Shrink number of entries in cache */ +static unsigned long mb_cache_shrink(struct mb_cache *cache, + unsigned int nr_to_scan) +{ + struct mb_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; + + spin_lock(&cache->c_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_list)) { + entry = list_first_entry(&cache->c_list, + struct mb_cache_entry, e_list); + if (mb_cache_entry_referenced(entry)) { + mb_cache_entry_clear_referenced(entry); + list_move_tail(&cache->c_list, &entry->e_list); + continue; + } + list_del_init(&entry->e_list); + cache->c_entry_count--; + /* + * We keep LRU list reference so that entry doesn't go away + * from under us. + */ + spin_unlock(&cache->c_list_lock); + head = mb_cache_entry_head(entry); + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } + hlist_bl_unlock(head); + if (mb_cache_entry_put(cache, entry)) + shrunk++; + cond_resched(); + spin_lock(&cache->c_list_lock); + } + spin_unlock(&cache->c_list_lock); + + return shrunk; +} + +static unsigned long mb_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + int nr_to_scan = sc->nr_to_scan; + struct mb_cache *cache = container_of(shrink, struct mb_cache, + c_shrink); + return mb_cache_shrink(cache, nr_to_scan); +} + +/* We shrink 1/X of the cache when we have too many entries in it */ +#define SHRINK_DIVISOR 16 + +static void mb_cache_shrink_worker(struct work_struct *work) +{ + struct mb_cache *cache = container_of(work, struct mb_cache, + c_shrink_work); + mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); +} + +/* + * mb_cache_create - create cache + * @bucket_bits: log2 of the hash table size + * + * Create cache for keys with 2^bucket_bits hash entries. + */ +struct mb_cache *mb_cache_create(int bucket_bits) +{ + struct mb_cache *cache; + int bucket_count = 1 << bucket_bits; + int i; + + if (!try_module_get(THIS_MODULE)) + return NULL; + + cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; + cache->c_max_entries = bucket_count << 4; + INIT_LIST_HEAD(&cache->c_list); + spin_lock_init(&cache->c_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), + GFP_KERNEL); + if (!cache->c_hash) { + kfree(cache); + goto err_out; + } + for (i = 0; i < bucket_count; i++) + INIT_HLIST_BL_HEAD(&cache->c_hash[i]); + + cache->c_shrink.count_objects = mb_cache_count; + cache->c_shrink.scan_objects = mb_cache_scan; + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); + + INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); + + return cache; + +err_out: + module_put(THIS_MODULE); + return NULL; +} +EXPORT_SYMBOL(mb_cache_create); + +/* + * mb_cache_destroy - destroy cache + * @cache: the cache to destroy + * + * Free all entries in cache and cache itself. Caller must make sure nobody + * (except shrinker) can reach @cache when calling this. + */ +void mb_cache_destroy(struct mb_cache *cache) +{ + struct mb_cache_entry *entry, *next; + + unregister_shrinker(&cache->c_shrink); + + /* + * We don't bother with any locking. Cache must not be used at this + * point. + */ + list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } else + WARN_ON(1); + list_del(&entry->e_list); + WARN_ON(atomic_read(&entry->e_refcnt) != 1); + mb_cache_entry_put(cache, entry); + } + kfree(cache->c_hash); + kfree(cache); + module_put(THIS_MODULE); +} +EXPORT_SYMBOL(mb_cache_destroy); + +static int __init mbcache_init(void) +{ + mb_entry_cache = kmem_cache_create("mbcache", + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + BUG_ON(!mb_entry_cache); + return 0; +} + +static void __exit mbcache_exit(void) +{ + kmem_cache_destroy(mb_entry_cache); +} + +module_init(mbcache_init) +module_exit(mbcache_exit) + +MODULE_AUTHOR("Jan Kara "); +MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +MODULE_LICENSE("GPL"); diff --git a/fs/mbcache2.c b/fs/mbcache2.c deleted file mode 100644 index 49f7a6feaa83..000000000000 --- a/fs/mbcache2.c +++ /dev/null @@ -1,424 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Mbcache is a simple key-value store. Keys need not be unique, however - * key-value pairs are expected to be unique (we use this fact in - * mb2_cache_entry_delete_block()). - * - * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. - * They use hash of a block contents as a key and block number as a value. - * That's why keys need not be unique (different xattr blocks may end up having - * the same hash). However block number always uniquely identifies a cache - * entry. - * - * We provide functions for creation and removal of entries, search by key, - * and a special "delete entry with given key-value pair" operation. Fixed - * size hash table is used for fast key lookups. - */ - -struct mb2_cache { - /* Hash table of entries */ - struct hlist_bl_head *c_hash; - /* log2 of hash table size */ - int c_bucket_bits; - /* Maximum entries in cache to avoid degrading hash too much */ - int c_max_entries; - /* Protects c_list, c_entry_count */ - spinlock_t c_list_lock; - struct list_head c_list; - /* Number of entries in cache */ - unsigned long c_entry_count; - struct shrinker c_shrink; - /* Work for shrinking when the cache has too many entries */ - struct work_struct c_shrink_work; -}; - -static struct kmem_cache *mb2_entry_cache; - -static unsigned long mb2_cache_shrink(struct mb2_cache *cache, - unsigned int nr_to_scan); - -static inline bool mb2_cache_entry_referenced(struct mb2_cache_entry *entry) -{ - return entry->_e_hash_list_head & 1; -} - -static inline void mb2_cache_entry_set_referenced(struct mb2_cache_entry *entry) -{ - entry->_e_hash_list_head |= 1; -} - -static inline void mb2_cache_entry_clear_referenced( - struct mb2_cache_entry *entry) -{ - entry->_e_hash_list_head &= ~1; -} - -static inline struct hlist_bl_head *mb2_cache_entry_head( - struct mb2_cache_entry *entry) -{ - return (struct hlist_bl_head *) - (entry->_e_hash_list_head & ~1); -} - -/* - * Number of entries to reclaim synchronously when there are too many entries - * in cache - */ -#define SYNC_SHRINK_BATCH 64 - -/* - * mb2_cache_entry_create - create entry in cache - * @cache - cache where the entry should be created - * @mask - gfp mask with which the entry should be allocated - * @key - key of the entry - * @block - block that contains data - * - * Creates entry in @cache with key @key and records that data is stored in - * block @block. The function returns -EBUSY if entry with the same key - * and for the same block already exists in cache. Otherwise 0 is returned. - */ -int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, - sector_t block) -{ - struct mb2_cache_entry *entry, *dup; - struct hlist_bl_node *dup_node; - struct hlist_bl_head *head; - - /* Schedule background reclaim if there are too many entries */ - if (cache->c_entry_count >= cache->c_max_entries) - schedule_work(&cache->c_shrink_work); - /* Do some sync reclaim if background reclaim cannot keep up */ - if (cache->c_entry_count >= 2*cache->c_max_entries) - mb2_cache_shrink(cache, SYNC_SHRINK_BATCH); - - entry = kmem_cache_alloc(mb2_entry_cache, mask); - if (!entry) - return -ENOMEM; - - INIT_LIST_HEAD(&entry->e_list); - /* One ref for hash, one ref returned */ - atomic_set(&entry->e_refcnt, 1); - entry->e_key = key; - entry->e_block = block; - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - entry->_e_hash_list_head = (unsigned long)head; - hlist_bl_lock(head); - hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { - if (dup->e_key == key && dup->e_block == block) { - hlist_bl_unlock(head); - kmem_cache_free(mb2_entry_cache, entry); - return -EBUSY; - } - } - hlist_bl_add_head(&entry->e_hash_list, head); - hlist_bl_unlock(head); - - spin_lock(&cache->c_list_lock); - list_add_tail(&entry->e_list, &cache->c_list); - /* Grab ref for LRU list */ - atomic_inc(&entry->e_refcnt); - cache->c_entry_count++; - spin_unlock(&cache->c_list_lock); - - return 0; -} -EXPORT_SYMBOL(mb2_cache_entry_create); - -void __mb2_cache_entry_free(struct mb2_cache_entry *entry) -{ - kmem_cache_free(mb2_entry_cache, entry); -} -EXPORT_SYMBOL(__mb2_cache_entry_free); - -static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, - struct mb2_cache_entry *entry, - u32 key) -{ - struct mb2_cache_entry *old_entry = entry; - struct hlist_bl_node *node; - struct hlist_bl_head *head; - - if (entry) - head = mb2_cache_entry_head(entry); - else - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - hlist_bl_lock(head); - if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) - node = entry->e_hash_list.next; - else - node = hlist_bl_first(head); - while (node) { - entry = hlist_bl_entry(node, struct mb2_cache_entry, - e_hash_list); - if (entry->e_key == key) { - atomic_inc(&entry->e_refcnt); - goto out; - } - node = node->next; - } - entry = NULL; -out: - hlist_bl_unlock(head); - if (old_entry) - mb2_cache_entry_put(cache, old_entry); - - return entry; -} - -/* - * mb2_cache_entry_find_first - find the first entry in cache with given key - * @cache: cache where we should search - * @key: key to look for - * - * Search in @cache for entry with key @key. Grabs reference to the first - * entry found and returns the entry. - */ -struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, - u32 key) -{ - return __entry_find(cache, NULL, key); -} -EXPORT_SYMBOL(mb2_cache_entry_find_first); - -/* - * mb2_cache_entry_find_next - find next entry in cache with the same - * @cache: cache where we should search - * @entry: entry to start search from - * - * Finds next entry in the hash chain which has the same key as @entry. - * If @entry is unhashed (which can happen when deletion of entry races - * with the search), finds the first entry in the hash chain. The function - * drops reference to @entry and returns with a reference to the found entry. - */ -struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, - struct mb2_cache_entry *entry) -{ - return __entry_find(cache, entry, entry->e_key); -} -EXPORT_SYMBOL(mb2_cache_entry_find_next); - -/* mb2_cache_entry_delete_block - remove information about block from cache - * @cache - cache we work with - * @key - key of the entry to remove - * @block - block containing data for @key - * - * Remove entry from cache @cache with key @key with data stored in @block. - */ -void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, - sector_t block) -{ - struct hlist_bl_node *node; - struct hlist_bl_head *head; - struct mb2_cache_entry *entry; - - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - hlist_bl_lock(head); - hlist_bl_for_each_entry(entry, node, head, e_hash_list) { - if (entry->e_key == key && entry->e_block == block) { - /* We keep hash list reference to keep entry alive */ - hlist_bl_del_init(&entry->e_hash_list); - hlist_bl_unlock(head); - spin_lock(&cache->c_list_lock); - if (!list_empty(&entry->e_list)) { - list_del_init(&entry->e_list); - cache->c_entry_count--; - atomic_dec(&entry->e_refcnt); - } - spin_unlock(&cache->c_list_lock); - mb2_cache_entry_put(cache, entry); - return; - } - } - hlist_bl_unlock(head); -} -EXPORT_SYMBOL(mb2_cache_entry_delete_block); - -/* mb2_cache_entry_touch - cache entry got used - * @cache - cache the entry belongs to - * @entry - entry that got used - * - * Marks entry as used to give hit higher chances of surviving in cache. - */ -void mb2_cache_entry_touch(struct mb2_cache *cache, - struct mb2_cache_entry *entry) -{ - mb2_cache_entry_set_referenced(entry); -} -EXPORT_SYMBOL(mb2_cache_entry_touch); - -static unsigned long mb2_cache_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct mb2_cache *cache = container_of(shrink, struct mb2_cache, - c_shrink); - - return cache->c_entry_count; -} - -/* Shrink number of entries in cache */ -static unsigned long mb2_cache_shrink(struct mb2_cache *cache, - unsigned int nr_to_scan) -{ - struct mb2_cache_entry *entry; - struct hlist_bl_head *head; - unsigned int shrunk = 0; - - spin_lock(&cache->c_list_lock); - while (nr_to_scan-- && !list_empty(&cache->c_list)) { - entry = list_first_entry(&cache->c_list, - struct mb2_cache_entry, e_list); - if (mb2_cache_entry_referenced(entry)) { - mb2_cache_entry_clear_referenced(entry); - list_move_tail(&cache->c_list, &entry->e_list); - continue; - } - list_del_init(&entry->e_list); - cache->c_entry_count--; - /* - * We keep LRU list reference so that entry doesn't go away - * from under us. - */ - spin_unlock(&cache->c_list_lock); - head = mb2_cache_entry_head(entry); - hlist_bl_lock(head); - if (!hlist_bl_unhashed(&entry->e_hash_list)) { - hlist_bl_del_init(&entry->e_hash_list); - atomic_dec(&entry->e_refcnt); - } - hlist_bl_unlock(head); - if (mb2_cache_entry_put(cache, entry)) - shrunk++; - cond_resched(); - spin_lock(&cache->c_list_lock); - } - spin_unlock(&cache->c_list_lock); - - return shrunk; -} - -static unsigned long mb2_cache_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - int nr_to_scan = sc->nr_to_scan; - struct mb2_cache *cache = container_of(shrink, struct mb2_cache, - c_shrink); - return mb2_cache_shrink(cache, nr_to_scan); -} - -/* We shrink 1/X of the cache when we have too many entries in it */ -#define SHRINK_DIVISOR 16 - -static void mb2_cache_shrink_worker(struct work_struct *work) -{ - struct mb2_cache *cache = container_of(work, struct mb2_cache, - c_shrink_work); - mb2_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); -} - -/* - * mb2_cache_create - create cache - * @bucket_bits: log2 of the hash table size - * - * Create cache for keys with 2^bucket_bits hash entries. - */ -struct mb2_cache *mb2_cache_create(int bucket_bits) -{ - struct mb2_cache *cache; - int bucket_count = 1 << bucket_bits; - int i; - - if (!try_module_get(THIS_MODULE)) - return NULL; - - cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL); - if (!cache) - goto err_out; - cache->c_bucket_bits = bucket_bits; - cache->c_max_entries = bucket_count << 4; - INIT_LIST_HEAD(&cache->c_list); - spin_lock_init(&cache->c_list_lock); - cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), - GFP_KERNEL); - if (!cache->c_hash) { - kfree(cache); - goto err_out; - } - for (i = 0; i < bucket_count; i++) - INIT_HLIST_BL_HEAD(&cache->c_hash[i]); - - cache->c_shrink.count_objects = mb2_cache_count; - cache->c_shrink.scan_objects = mb2_cache_scan; - cache->c_shrink.seeks = DEFAULT_SEEKS; - register_shrinker(&cache->c_shrink); - - INIT_WORK(&cache->c_shrink_work, mb2_cache_shrink_worker); - - return cache; - -err_out: - module_put(THIS_MODULE); - return NULL; -} -EXPORT_SYMBOL(mb2_cache_create); - -/* - * mb2_cache_destroy - destroy cache - * @cache: the cache to destroy - * - * Free all entries in cache and cache itself. Caller must make sure nobody - * (except shrinker) can reach @cache when calling this. - */ -void mb2_cache_destroy(struct mb2_cache *cache) -{ - struct mb2_cache_entry *entry, *next; - - unregister_shrinker(&cache->c_shrink); - - /* - * We don't bother with any locking. Cache must not be used at this - * point. - */ - list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { - if (!hlist_bl_unhashed(&entry->e_hash_list)) { - hlist_bl_del_init(&entry->e_hash_list); - atomic_dec(&entry->e_refcnt); - } else - WARN_ON(1); - list_del(&entry->e_list); - WARN_ON(atomic_read(&entry->e_refcnt) != 1); - mb2_cache_entry_put(cache, entry); - } - kfree(cache->c_hash); - kfree(cache); - module_put(THIS_MODULE); -} -EXPORT_SYMBOL(mb2_cache_destroy); - -static int __init mb2cache_init(void) -{ - mb2_entry_cache = kmem_cache_create("mbcache", - sizeof(struct mb2_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - BUG_ON(!mb2_entry_cache); - return 0; -} - -static void __exit mb2cache_exit(void) -{ - kmem_cache_destroy(mb2_entry_cache); -} - -module_init(mb2cache_init) -module_exit(mb2cache_exit) - -MODULE_AUTHOR("Jan Kara "); -MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h new file mode 100644 index 000000000000..a74a1f3082fb --- /dev/null +++ b/include/linux/mbcache.h @@ -0,0 +1,53 @@ +#ifndef _LINUX_MBCACHE_H +#define _LINUX_MBCACHE_H + +#include +#include +#include +#include +#include + +struct mb_cache; + +struct mb_cache_entry { + /* List of entries in cache - protected by cache->c_list_lock */ + struct list_head e_list; + /* Hash table list - protected by bitlock in e_hash_list_head */ + struct hlist_bl_node e_hash_list; + atomic_t e_refcnt; + /* Key in hash - stable during lifetime of the entry */ + u32 e_key; + /* Block number of hashed block - stable during lifetime of the entry */ + sector_t e_block; + /* + * Head of hash list (for list bit lock) - stable. Combined with + * referenced bit of entry + */ + unsigned long _e_hash_list_head; +}; + +struct mb_cache *mb_cache_create(int bucket_bits); +void mb_cache_destroy(struct mb_cache *cache); + +int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, + sector_t block); +void __mb_cache_entry_free(struct mb_cache_entry *entry); +static inline int mb_cache_entry_put(struct mb_cache *cache, + struct mb_cache_entry *entry) +{ + if (!atomic_dec_and_test(&entry->e_refcnt)) + return 0; + __mb_cache_entry_free(entry); + return 1; +} + +void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, + sector_t block); +struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, + u32 key); +struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, + struct mb_cache_entry *entry); +void mb_cache_entry_touch(struct mb_cache *cache, + struct mb_cache_entry *entry); + +#endif /* _LINUX_MBCACHE_H */ diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h deleted file mode 100644 index c934843a6a31..000000000000 --- a/include/linux/mbcache2.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef _LINUX_MB2CACHE_H -#define _LINUX_MB2CACHE_H - -#include -#include -#include -#include -#include - -struct mb2_cache; - -struct mb2_cache_entry { - /* List of entries in cache - protected by cache->c_list_lock */ - struct list_head e_list; - /* Hash table list - protected by bitlock in e_hash_list_head */ - struct hlist_bl_node e_hash_list; - atomic_t e_refcnt; - /* Key in hash - stable during lifetime of the entry */ - u32 e_key; - /* Block number of hashed block - stable during lifetime of the entry */ - sector_t e_block; - /* - * Head of hash list (for list bit lock) - stable. Combined with - * referenced bit of entry - */ - unsigned long _e_hash_list_head; -}; - -struct mb2_cache *mb2_cache_create(int bucket_bits); -void mb2_cache_destroy(struct mb2_cache *cache); - -int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key, - sector_t block); -void __mb2_cache_entry_free(struct mb2_cache_entry *entry); -static inline int mb2_cache_entry_put(struct mb2_cache *cache, - struct mb2_cache_entry *entry) -{ - if (!atomic_dec_and_test(&entry->e_refcnt)) - return 0; - __mb2_cache_entry_free(entry); - return 1; -} - -void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key, - sector_t block); -struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, - u32 key); -struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, - struct mb2_cache_entry *entry); -void mb2_cache_entry_touch(struct mb2_cache *cache, - struct mb2_cache_entry *entry); - -#endif /* _LINUX_MB2CACHE_H */ -- cgit v1.2.3 From dc8d5e565f00c9442fa1cbf9acc115475628527c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 22 Feb 2016 22:42:05 -0500 Subject: mbcache: get rid of _e_hash_list_head Get rid of field _e_hash_list_head in cache entries and add bit field e_referenced instead. Signed-off-by: Andreas Gruenbacher Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/mbcache.c | 41 ++++++++++------------------------------- include/linux/mbcache.h | 8 ++------ 2 files changed, 12 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/fs/mbcache.c b/fs/mbcache.c index 4241b633f155..903be151dcfe 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -45,27 +45,10 @@ static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, unsigned int nr_to_scan); -static inline bool mb_cache_entry_referenced(struct mb_cache_entry *entry) +static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, + u32 key) { - return entry->_e_hash_list_head & 1; -} - -static inline void mb_cache_entry_set_referenced(struct mb_cache_entry *entry) -{ - entry->_e_hash_list_head |= 1; -} - -static inline void mb_cache_entry_clear_referenced( - struct mb_cache_entry *entry) -{ - entry->_e_hash_list_head &= ~1; -} - -static inline struct hlist_bl_head *mb_cache_entry_head( - struct mb_cache_entry *entry) -{ - return (struct hlist_bl_head *) - (entry->_e_hash_list_head & ~1); + return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; } /* @@ -108,8 +91,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, atomic_set(&entry->e_refcnt, 1); entry->e_key = key; entry->e_block = block; - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; - entry->_e_hash_list_head = (unsigned long)head; + head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { if (dup->e_key == key && dup->e_block == block) { @@ -146,10 +128,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, struct hlist_bl_node *node; struct hlist_bl_head *head; - if (entry) - head = mb_cache_entry_head(entry); - else - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) node = entry->e_hash_list.next; @@ -219,7 +198,7 @@ void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, struct hlist_bl_head *head; struct mb_cache_entry *entry; - head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(entry, node, head, e_hash_list) { if (entry->e_key == key && entry->e_block == block) { @@ -250,7 +229,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_block); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { - mb_cache_entry_set_referenced(entry); + entry->e_referenced = 1; } EXPORT_SYMBOL(mb_cache_entry_touch); @@ -275,8 +254,8 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, while (nr_to_scan-- && !list_empty(&cache->c_list)) { entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); - if (mb_cache_entry_referenced(entry)) { - mb_cache_entry_clear_referenced(entry); + if (entry->e_referenced) { + entry->e_referenced = 0; list_move_tail(&cache->c_list, &entry->e_list); continue; } @@ -287,7 +266,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, * from under us. */ spin_unlock(&cache->c_list_lock); - head = mb_cache_entry_head(entry); + head = mb_cache_entry_head(cache, entry->e_key); hlist_bl_lock(head); if (!hlist_bl_unhashed(&entry->e_hash_list)) { hlist_bl_del_init(&entry->e_hash_list); diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index a74a1f3082fb..607e6968542e 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -12,18 +12,14 @@ struct mb_cache; struct mb_cache_entry { /* List of entries in cache - protected by cache->c_list_lock */ struct list_head e_list; - /* Hash table list - protected by bitlock in e_hash_list_head */ + /* Hash table list - protected by hash chain bitlock */ struct hlist_bl_node e_hash_list; atomic_t e_refcnt; /* Key in hash - stable during lifetime of the entry */ u32 e_key; + u32 e_referenced:1; /* Block number of hashed block - stable during lifetime of the entry */ sector_t e_block; - /* - * Head of hash list (for list bit lock) - stable. Combined with - * referenced bit of entry - */ - unsigned long _e_hash_list_head; }; struct mb_cache *mb_cache_create(int bucket_bits); -- cgit v1.2.3 From 6048c64b26097a0ffbd966866b599f990e674e9b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 22 Feb 2016 22:44:04 -0500 Subject: mbcache: add reusable flag to cache entries To reduce amount of damage caused by single bad block, we limit number of inodes sharing an xattr block to 1024. Thus there can be more xattr blocks with the same contents when there are lots of files with the same extended attributes. These xattr blocks naturally result in hash collisions and can form long hash chains and we unnecessarily check each such block only to find out we cannot use it because it is already shared by too many inodes. Add a reusable flag to cache entries which is cleared when a cache entry has reached its maximum refcount. Cache entries which are not marked reusable are skipped by mb_cache_entry_find_{first,next}. This significantly speeds up mbcache when there are many same xattr blocks. For example for xattr-bench with 5 values and each process handling 20000 files, the run for 64 processes is 25x faster with this patch. Even for 8 processes the speedup is almost 3x. We have also verified that for situations where there is only one xattr block of each kind, the patch doesn't have a measurable cost. [JK: Remove handling of setting the same value since it is not needed anymore, check for races in e_reusable setting, improve changelog, add measurements] Signed-off-by: Andreas Gruenbacher Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext2/xattr.c | 2 +- fs/ext4/xattr.c | 66 +++++++++++++++++++++++++++++++------------------ fs/mbcache.c | 38 +++++++++++++++++++++++++--- include/linux/mbcache.h | 5 +++- 4 files changed, 81 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 71d58c2d7a19..1a5e3bff0b63 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -823,7 +823,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) __u32 hash = le32_to_cpu(HDR(bh)->h_hash); int error; - error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); + error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1); if (error) { if (error == -EBUSY) { ea_bdebug(bh, "already in cache (%d cache entries)", diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b661ae8332e3..0441e055c8e8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -545,6 +545,8 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); @@ -553,23 +555,34 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, goto out; lock_buffer(bh); - if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - + hash = le32_to_cpu(BHDR(bh)->h_hash); + ref = le32_to_cpu(BHDR(bh)->h_refcount); + if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ - mb_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, - bh->b_blocknr); + mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { - le32_add_cpu(&BHDR(bh)->h_refcount, -1); + ref--; + BHDR(bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { + struct mb_cache_entry *ce; + + ce = mb_cache_entry_get(ext4_mb_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ext4_mb_cache, ce); + } + } + /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect @@ -872,6 +885,8 @@ inserted: if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { + u32 ref; + /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -886,15 +901,18 @@ inserted: lock_buffer(new_bh); /* * We have to be careful about races with - * freeing or rehashing of xattr block. Once we - * hold buffer lock xattr block's state is - * stable so we can check whether the block got - * freed / rehashed or not. Since we unhash - * mbcache entry under buffer lock when freeing - * / rehashing xattr block, checking whether - * entry is still hashed is reliable. + * freeing, rehashing or adding references to + * xattr block. Once we hold buffer lock xattr + * block's state is stable so we can check + * whether the block got freed / rehashed or + * not. Since we unhash mbcache entry under + * buffer lock when freeing / rehashing xattr + * block, checking whether entry is still + * hashed is reliable. Same rules hold for + * e_reusable handling. */ - if (hlist_bl_unhashed(&ce->e_hash_list)) { + if (hlist_bl_unhashed(&ce->e_hash_list) || + !ce->e_reusable) { /* * Undo everything and check mbcache * again. @@ -909,9 +927,12 @@ inserted: new_bh = NULL; goto inserted; } - le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + BHDR(new_bh)->h_refcount = cpu_to_le32(ref); + if (ref >= EXT4_XATTR_REFCOUNT_MAX) + ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", - le32_to_cpu(BHDR(new_bh)->h_refcount)); + ref); unlock_buffer(new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, @@ -1566,11 +1587,14 @@ cleanup: static void ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct ext4_xattr_header *header = BHDR(bh); + __u32 hash = le32_to_cpu(header->h_hash); + int reusable = le32_to_cpu(header->h_refcount) < + EXT4_XATTR_REFCOUNT_MAX; int error; error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, - bh->b_blocknr); + bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); @@ -1645,12 +1669,6 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long) ce->e_block); - } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= - EXT4_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %lu refcount %d>=%d", - (unsigned long) ce->e_block, - le32_to_cpu(BHDR(bh)->h_refcount), - EXT4_XATTR_REFCOUNT_MAX); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; diff --git a/fs/mbcache.c b/fs/mbcache.c index 903be151dcfe..eccda3a02de6 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -63,13 +63,14 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, * @mask - gfp mask with which the entry should be allocated * @key - key of the entry * @block - block that contains data + * @reusable - is the block reusable by other inodes? * * Creates entry in @cache with key @key and records that data is stored in * block @block. The function returns -EBUSY if entry with the same key * and for the same block already exists in cache. Otherwise 0 is returned. */ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block) + sector_t block, bool reusable) { struct mb_cache_entry *entry, *dup; struct hlist_bl_node *dup_node; @@ -91,6 +92,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, atomic_set(&entry->e_refcnt, 1); entry->e_key = key; entry->e_block = block; + entry->e_reusable = reusable; head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { @@ -137,7 +139,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); - if (entry->e_key == key) { + if (entry->e_key == key && entry->e_reusable) { atomic_inc(&entry->e_refcnt); goto out; } @@ -184,10 +186,38 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, } EXPORT_SYMBOL(mb_cache_entry_find_next); +/* + * mb_cache_entry_get - get a cache entry by block number (and key) + * @cache - cache we work with + * @key - key of block number @block + * @block - block number + */ +struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, + sector_t block) +{ + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb_cache_entry *entry; + + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + atomic_inc(&entry->e_refcnt); + goto out; + } + } + entry = NULL; +out: + hlist_bl_unlock(head); + return entry; +} +EXPORT_SYMBOL(mb_cache_entry_get); + /* mb_cache_entry_delete_block - remove information about block from cache * @cache - cache we work with - * @key - key of the entry to remove - * @block - block containing data for @key + * @key - key of block @block + * @block - block number * * Remove entry from cache @cache with key @key with data stored in @block. */ diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 607e6968542e..86c9a8b480c5 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -18,6 +18,7 @@ struct mb_cache_entry { /* Key in hash - stable during lifetime of the entry */ u32 e_key; u32 e_referenced:1; + u32 e_reusable:1; /* Block number of hashed block - stable during lifetime of the entry */ sector_t e_block; }; @@ -26,7 +27,7 @@ struct mb_cache *mb_cache_create(int bucket_bits); void mb_cache_destroy(struct mb_cache *cache); int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, - sector_t block); + sector_t block, bool reusable); void __mb_cache_entry_free(struct mb_cache_entry *entry); static inline int mb_cache_entry_put(struct mb_cache *cache, struct mb_cache_entry *entry) @@ -39,6 +40,8 @@ static inline int mb_cache_entry_put(struct mb_cache *cache, void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, sector_t block); +struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, + sector_t block); struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key); struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, -- cgit v1.2.3 From 9bcf976cb8b86eb40e0c0b495a14e4cb967b9c6e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 23:07:30 -0500 Subject: jbd2: remove unnecessary arguments of jbd2_journal_write_revoke_records jbd2_journal_write_revoke_records() takes journal pointer and write_op, although journal can be obtained from the passed transaction and write_op is always WRITE_SYNC. Remove these superfluous arguments. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 3 +-- fs/jbd2/revoke.c | 33 +++++++++++++++------------------ include/linux/jbd2.h | 6 ++---- 3 files changed, 18 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 36345fefa3ff..ae4402d15d46 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -554,8 +554,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_abort(journal, err); blk_start_plug(&plug); - jbd2_journal_write_revoke_records(journal, commit_transaction, - &log_bufs, WRITE_SYNC); + jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); jbd_debug(3, "JBD2: commit phase 2b\n"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 705ae577882b..c839332be56b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -122,11 +122,11 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ -static void write_one_revoke_record(journal_t *, transaction_t *, +static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, - struct jbd2_revoke_record_s *, int); -static void flush_descriptor(journal_t *, struct buffer_head *, int, int); + struct jbd2_revoke_record_s *); +static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ @@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ -void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction, - struct list_head *log_bufs, - int write_op) +void jbd2_journal_write_revoke_records(transaction_t *transaction, + struct list_head *log_bufs) { + journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; @@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal, while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; - write_one_revoke_record(journal, transaction, log_bufs, - &descriptor, &offset, - record, write_op); + write_one_revoke_record(transaction, log_bufs, + &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset, write_op); + flush_descriptor(journal, descriptor, offset); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -562,14 +560,13 @@ void jbd2_journal_write_revoke_records(journal_t *journal, * block if the old one is full or if we have not already created one. */ -static void write_one_revoke_record(journal_t *journal, - transaction_t *transaction, +static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, - struct jbd2_revoke_record_s *record, - int write_op) + struct jbd2_revoke_record_s *record) { + journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; @@ -597,7 +594,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { - flush_descriptor(journal, descriptor, offset, write_op); + flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } @@ -654,7 +651,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, - int offset, int write_op) + int offset) { jbd2_journal_revoke_header_t *header; @@ -670,7 +667,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, write_op); + write_dirty_buffer(descriptor, WRITE_SYNC); } #endif diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 65407f6c9120..5716d9554e77 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1327,10 +1327,8 @@ extern int jbd2_journal_init_revoke_caches(void); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); -extern void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction, - struct list_head *log_bufs, - int write_op); +extern void jbd2_journal_write_revoke_records(transaction_t *transaction, + struct list_head *log_bufs); /* Recovery revoke support */ extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); -- cgit v1.2.3 From 32ab671599a89534f37e97d146c27690e371b661 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 23:17:15 -0500 Subject: jbd2: factor out common descriptor block initialization Descriptor block header is initialized in several places. Factor out the common code into jbd2_journal_get_descriptor_buffer(). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 16 +++++----------- fs/jbd2/journal.c | 9 ++++++++- fs/jbd2/revoke.c | 8 ++------ include/linux/jbd2.h | 2 +- 4 files changed, 16 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ae4402d15d46..cf221f3d955a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal, if (is_journal_aborted(journal)) return 0; - bh = jbd2_journal_get_descriptor_buffer(journal); + bh = jbd2_journal_get_descriptor_buffer(commit_transaction, + JBD2_COMMIT_BLOCK); if (!bh) return 1; tmp = (struct commit_header *)bh->b_data; - tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); tmp->h_commit_sec = cpu_to_be64(now.tv_sec); tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); @@ -379,7 +377,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) ktime_t start_time; u64 commit_time; char *tagp = NULL; - journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; @@ -615,7 +612,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(4, "JBD2: get descriptor\n"); - descriptor = jbd2_journal_get_descriptor_buffer(journal); + descriptor = jbd2_journal_get_descriptor_buffer( + commit_transaction, + JBD2_DESCRIPTOR_BLOCK); if (!descriptor) { jbd2_journal_abort(journal, -EIO); continue; @@ -624,11 +623,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)descriptor->b_blocknr, descriptor->b_data); - header = (journal_header_t *)descriptor->b_data; - header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - tagp = &descriptor->b_data[sizeof(journal_header_t)]; space_left = descriptor->b_size - sizeof(journal_header_t); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 81e622681c82..28d05bd9a588 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ -struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +struct buffer_head * +jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) { + journal_t *journal = transaction->t_journal; struct buffer_head *bh; unsigned long long blocknr; + journal_header_t *header; int err; err = jbd2_journal_next_log_block(journal, &blocknr); @@ -821,6 +824,10 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); + header = (journal_header_t *)bh->b_data; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(type); + header->h_sequence = cpu_to_be32(transaction->t_tid); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "return this buffer"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index c839332be56b..d1ebb1d41d17 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -570,7 +570,6 @@ static void write_one_revoke_record(transaction_t *transaction, int csum_size = 0; struct buffer_head *descriptor; int sz, offset; - journal_header_t *header; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in @@ -600,13 +599,10 @@ static void write_one_revoke_record(transaction_t *transaction, } if (!descriptor) { - descriptor = jbd2_journal_get_descriptor_buffer(journal); + descriptor = jbd2_journal_get_descriptor_buffer(transaction, + JBD2_REVOKE_BLOCK); if (!descriptor) return; - header = (journal_header_t *)descriptor->b_data; - header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); - header->h_sequence = cpu_to_be32(transaction->t_tid); /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5716d9554e77..3649cb8d3a41 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1137,7 +1137,7 @@ static inline void jbd2_unfile_log_bh(struct buffer_head *bh) } /* Log buffer allocation */ -struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); +struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -- cgit v1.2.3 From 1101cd4d13ba2f42c5da4c1b9081f35a73b5df31 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2016 23:19:09 -0500 Subject: jbd2: unify revoke and tag block checksum handling Revoke and tag descriptor blocks are just different kinds of descriptor blocks and thus have checksum in the same place. Unify computation and checking of checksums for these. Reviewed-by: Darrick J. Wong Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 18 +----------------- fs/jbd2/journal.c | 15 +++++++++++++++ fs/jbd2/recovery.c | 31 +++++-------------------------- fs/jbd2/revoke.c | 19 ++----------------- include/linux/jbd2.h | 8 ++------ 5 files changed, 25 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index cf221f3d955a..ae832cd44dd8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -317,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag, tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } -static void jbd2_descr_block_csum_set(journal_t *j, - struct buffer_head *bh) -{ - struct jbd2_journal_block_tail *tail; - __u32 csum; - - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - - sizeof(struct jbd2_journal_block_tail)); - tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); - tail->t_checksum = cpu_to_be32(csum); -} - static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { @@ -714,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - jbd2_descr_block_csum_set(journal, descriptor); + jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 28d05bd9a588..defa962a8e15 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -834,6 +834,21 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) return bh; } +void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) +{ + struct jbd2_journal_block_tail *tail; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + tail->t_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + tail->t_checksum = cpu_to_be32(csum); +} + /* * Return tid of the oldest transaction in the journal and block in the journal * where the transaction starts. diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 7f277e49fe88..08a456b96e4e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return 0; } -static int jbd2_descr_block_csum_verify(journal_t *j, - void *buf) +static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) { struct jbd2_journal_block_tail *tail; __be32 provided; @@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal, descr_csum_size = sizeof(struct jbd2_journal_block_tail); if (descr_csum_size > 0 && - !jbd2_descr_block_csum_verify(journal, - bh->b_data)) { + !jbd2_descriptor_block_csum_verify(journal, + bh->b_data)) { printk(KERN_ERR "JBD2: Invalid checksum " "recovering block %lu in log\n", next_log_block); @@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal, return err; } -static int jbd2_revoke_block_csum_verify(journal_t *j, - void *buf) -{ - struct jbd2_journal_revoke_tail *tail; - __be32 provided; - __u32 calculated; - - if (!jbd2_journal_has_csum_v2or3(j)) - return 1; - - tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - - sizeof(struct jbd2_journal_revoke_tail)); - provided = tail->r_checksum; - tail->r_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); - tail->r_checksum = provided; - - return provided == cpu_to_be32(calculated); -} - /* Scan a revoke record, marking all blocks mentioned as revoked. */ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, @@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, offset = sizeof(jbd2_journal_revoke_header_t); rcount = be32_to_cpu(header->r_count); - if (!jbd2_revoke_block_csum_verify(journal, header)) + if (!jbd2_descriptor_block_csum_verify(journal, header)) return -EFSBADCRC; if (jbd2_journal_has_csum_v2or3(journal)) - csum_size = sizeof(struct jbd2_journal_revoke_tail); + csum_size = sizeof(struct jbd2_journal_block_tail); if (rcount > journal->j_blocksize - csum_size) return -EINVAL; max = rcount; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index d1ebb1d41d17..91171dc352cb 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -583,7 +583,7 @@ static void write_one_revoke_record(transaction_t *transaction, /* Do we need to leave space at the end for a checksum? */ if (jbd2_journal_has_csum_v2or3(journal)) - csum_size = sizeof(struct jbd2_journal_revoke_tail); + csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; @@ -623,21 +623,6 @@ static void write_one_revoke_record(transaction_t *transaction, *offsetp = offset; } -static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) -{ - struct jbd2_journal_revoke_tail *tail; - __u32 csum; - - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - - sizeof(struct jbd2_journal_revoke_tail)); - tail->r_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); - tail->r_checksum = cpu_to_be32(csum); -} - /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to @@ -658,7 +643,7 @@ static void flush_descriptor(journal_t *journal, header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); - jbd2_revoke_csum_set(journal, descriptor); + jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 3649cb8d3a41..fd1083c46c61 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -200,7 +200,7 @@ typedef struct journal_block_tag_s __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; -/* Tail of descriptor block, for checksumming */ +/* Tail of descriptor or revoke block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ }; @@ -215,11 +215,6 @@ typedef struct jbd2_journal_revoke_header_s __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; -/* Tail of revoke block, for checksumming */ -struct jbd2_journal_revoke_tail { - __be32 r_checksum; /* crc32c(uuid+revoke_block) */ -}; - /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ @@ -1138,6 +1133,7 @@ static inline void jbd2_unfile_log_bh(struct buffer_head *bh) /* Log buffer allocation */ struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); +void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -- cgit v1.2.3