diff options
Diffstat (limited to 'mm/zswap.c')
-rw-r--r-- | mm/zswap.c | 410 |
1 files changed, 163 insertions, 247 deletions
diff --git a/mm/zswap.c b/mm/zswap.c index caed028945b0..a50e2986cd2f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -20,7 +20,6 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> -#include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> #include <linux/scatterlist.h> @@ -43,8 +42,6 @@ /********************************* * statistics **********************************/ -/* Total bytes used by the compressed storage */ -u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ @@ -126,19 +123,6 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* - * Enable/disable handling same-value filled pages (enabled by default). - * If disabled every page is considered non-same-value filled. - */ -static bool zswap_same_filled_pages_enabled = true; -module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, - bool, 0644); - -/* Enable/disable handling non-same-value filled pages (enabled by default) */ -static bool zswap_non_same_filled_pages_enabled = true; -module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, - bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -183,8 +167,6 @@ struct zswap_pool { /* Global LRU lists shared by all zswap pools. */ static struct list_lru zswap_list_lru; -/* counter of pages stored in all zswap pools. */ -static atomic_t zswap_nr_stored = ATOMIC_INIT(0); /* The lock protects zswap_next_shrink updates. */ static DEFINE_SPINLOCK(zswap_shrink_lock); @@ -198,7 +180,6 @@ static struct shrinker *zswap_shrinker; * This structure contains the metadata for tracking a single compressed * page within zswap. * - * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both @@ -210,7 +191,6 @@ static struct shrinker *zswap_shrinker; * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { - struct rb_node rbnode; swp_entry_t swpentry; unsigned int length; struct zswap_pool *pool; @@ -222,12 +202,7 @@ struct zswap_entry { struct list_head lru; }; -struct zswap_tree { - struct rb_root rbroot; - spinlock_t lock; -}; - -static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static struct xarray *zswap_trees[MAX_SWAPFILES]; static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ @@ -255,7 +230,7 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ -static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { return &zswap_trees[swp_type(swp)][swp_offset(swp) >> SWAP_ADDRESS_SPACE_SHIFT]; @@ -265,45 +240,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static bool zswap_is_full(void) -{ - return totalram_pages() * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static bool zswap_can_accept(void) -{ - return totalram_pages() * zswap_accept_thr_percent / 100 * - zswap_max_pool_percent / 100 > - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static u64 get_zswap_pool_size(struct zswap_pool *pool) -{ - u64 pool_size = 0; - int i; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - pool_size += zpool_get_total_size(pool->zpools[i]); - - return pool_size; -} - -static void zswap_update_total_size(void) -{ - struct zswap_pool *pool; - u64 total = 0; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - total += get_zswap_pool_size(pool); - - rcu_read_unlock(); - - zswap_pool_total_size = total; -} - /********************************* * pool functions **********************************/ @@ -541,6 +477,48 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +static unsigned long zswap_max_pages(void) +{ + return totalram_pages() * zswap_max_pool_percent / 100; +} + +static unsigned long zswap_accept_thr_pages(void) +{ + return zswap_max_pages() * zswap_accept_thr_percent / 100; +} + +unsigned long zswap_total_pages(void) +{ + struct zswap_pool *pool; + unsigned long total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &zswap_pools, list) { + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + total += zpool_get_total_pages(pool->zpools[i]); + } + rcu_read_unlock(); + + return total; +} + +static bool zswap_check_limits(void) +{ + unsigned long cur_pages = zswap_total_pages(); + unsigned long max_pages = zswap_max_pages(); + + if (cur_pages >= max_pages) { + zswap_pool_limit_hit++; + zswap_pool_reached_full = true; + } else if (zswap_pool_reached_full && + cur_pages <= zswap_accept_thr_pages()) { + zswap_pool_reached_full = false; + } + return zswap_pool_reached_full; +} + /********************************* * param callbacks **********************************/ @@ -807,63 +785,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) } /********************************* -* rbtree functions -**********************************/ -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) -{ - struct rb_node *node = root->rb_node; - struct zswap_entry *entry; - pgoff_t entry_offset; - - while (node) { - entry = rb_entry(node, struct zswap_entry, rbnode); - entry_offset = swp_offset(entry->swpentry); - if (entry_offset > offset) - node = node->rb_left; - else if (entry_offset < offset) - node = node->rb_right; - else - return entry; - } - return NULL; -} - -/* - * In the case that a entry with the same offset is found, a pointer to - * the existing entry is stored in dupentry and the function returns -EEXIST - */ -static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, - struct zswap_entry **dupentry) -{ - struct rb_node **link = &root->rb_node, *parent = NULL; - struct zswap_entry *myentry; - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); - - while (*link) { - parent = *link; - myentry = rb_entry(parent, struct zswap_entry, rbnode); - myentry_offset = swp_offset(myentry->swpentry); - if (myentry_offset > entry_offset) - link = &(*link)->rb_left; - else if (myentry_offset < entry_offset) - link = &(*link)->rb_right; - else { - *dupentry = myentry; - return -EEXIST; - } - } - rb_link_node(&entry->rbnode, parent, link); - rb_insert_color(&entry->rbnode, root); - return 0; -} - -static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) -{ - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); -} - -/********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; @@ -874,7 +795,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; - RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -885,12 +805,7 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { - int i = 0; - - if (ZSWAP_NR_ZPOOLS > 1) - i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); - - return entry->pool->zpools[i]; + return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))]; } /* @@ -904,7 +819,6 @@ static void zswap_entry_free(struct zswap_entry *entry) else { zswap_lru_del(&zswap_list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&zswap_nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -913,18 +827,6 @@ static void zswap_entry_free(struct zswap_entry *entry) } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_update_total_size(); -} - -/* - * The caller hold the tree lock and search the entry from the tree, - * so it must be on the tree, remove it from the tree and free it. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - zswap_rb_erase(&tree->rbroot, entry); - zswap_entry_free(entry); } /********************************* @@ -1126,7 +1028,8 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry) { - struct zswap_tree *tree; + struct xarray *tree; + pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1163,19 +1066,13 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * be dereferenced. */ tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); + if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } - /* Safe to deref entry after the entry is verified above. */ - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); - zswap_decompress(entry, &folio->page); count_vm_event(ZSWPWB); @@ -1331,15 +1228,22 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!gfp_has_io_fs(sc->gfp_mask)) return 0; -#ifdef CONFIG_MEMCG_KMEM - mem_cgroup_flush_stats(memcg); - nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; - nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); -#else - /* use pool stats instead of memcg stats */ - nr_backing = zswap_pool_total_size >> PAGE_SHIFT; - nr_stored = atomic_read(&zswap_nr_stored); -#endif + /* + * For memcg, use the cgroup-wide ZSWAP stats since we don't + * have them per-node and thus per-lruvec. Careful if memcg is + * runtime-disabled: we can get sc->memcg == NULL, which is ok + * for the lruvec, but not for memcg_page_state(). + * + * Without memcg, use the zswap pool-wide metrics. + */ + if (!mem_cgroup_disabled()) { + mem_cgroup_flush_stats(memcg); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); + } else { + nr_backing = zswap_total_pages(); + nr_stored = atomic_read(&zswap_stored_pages); + } if (!nr_stored) return 0; @@ -1358,6 +1262,11 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). + * + * The memory saving factor calculated here takes same-filled pages into + * account, but those are not freeable since they almost occupy no + * space. Hence, we may scale nr_freeable down a little bit more than we + * should if we have a lot of same-filled pages. */ return mult_frac(nr_freeable, nr_backing, nr_stored); } @@ -1405,6 +1314,10 @@ static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; int ret, failures = 0; + unsigned long thr; + + /* Reclaim down to the accept threshold */ + thr = zswap_accept_thr_pages(); /* global reclaim will select cgroup in a round-robin fashion. */ do { @@ -1452,32 +1365,37 @@ static void shrink_worker(struct work_struct *w) break; if (ret && ++failures == MAX_RECLAIM_RETRIES) break; - resched: cond_resched(); - } while (!zswap_can_accept()); + } while (zswap_total_pages() > thr); } -static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +/********************************* +* same-filled functions +**********************************/ +static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) { unsigned long *page; unsigned long val; unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; + bool ret = false; - page = (unsigned long *)ptr; + page = kmap_local_folio(folio, 0); val = page[0]; if (val != page[last_pos]) - return 0; + goto out; for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) - return 0; + goto out; } *value = val; - - return 1; + ret = true; +out: + kunmap_local(page); + return ret; } static void zswap_fill_page(void *ptr, unsigned long value) @@ -1488,14 +1406,18 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } +/********************************* +* main API +**********************************/ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); - struct zswap_entry *entry, *dupentry; + struct xarray *tree = swap_zswap_tree(swp); + struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; + unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1507,6 +1429,7 @@ bool zswap_store(struct folio *folio) if (!zswap_enabled) goto check_old; + /* Check cgroup limits */ objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); @@ -1517,19 +1440,8 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* reclaim space if needed */ - if (zswap_is_full()) { - zswap_pool_limit_hit++; - zswap_pool_reached_full = true; - goto shrink; - } - - if (zswap_pool_reached_full) { - if (!zswap_can_accept()) - goto shrink; - else - zswap_pool_reached_full = false; - } + if (zswap_check_limits()) + goto reject; /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); @@ -1538,24 +1450,13 @@ bool zswap_store(struct folio *folio) goto reject; } - if (zswap_same_filled_pages_enabled) { - unsigned long value; - u8 *src; - - src = kmap_local_folio(folio, 0); - if (zswap_is_page_same_filled(src, &value)) { - kunmap_local(src); - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto insert_entry; - } - kunmap_local(src); + if (zswap_is_folio_same_filled(folio, &value)) { + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto store_entry; } - if (!zswap_non_same_filled_pages_enabled) - goto freepage; - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) @@ -1573,62 +1474,77 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; -insert_entry: +store_entry: entry->swpentry = swp; entry->objcg = objcg; + + old = xa_store(tree, offset, entry, GFP_KERNEL); + if (xa_is_err(old)) { + int err = xa_err(old); + + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err); + zswap_reject_alloc_fail++; + goto store_failed; + } + + /* + * We may have had an existing entry that became stale when + * the folio was redirtied and now the new version is being + * swapped out. Get rid of the old. + */ + if (old) + zswap_entry_free(old); + if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); - /* Account before objcg ref is moved to tree */ count_objcg_event(objcg, ZSWPOUT); } - /* map */ - spin_lock(&tree->lock); /* - * The folio may have been dirtied again, invalidate the - * possibly stale entry before inserting the new entry. + * We finish initializing the entry while it's already in xarray. + * This is safe because: + * + * 1. Concurrent stores and invalidations are excluded by folio lock. + * + * 2. Writeback is excluded by the entry not being on the LRU yet. + * The publishing order matters to prevent writeback from seeing + * an incoherent entry. */ - if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - zswap_invalidate_entry(tree, dupentry); - WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); - } if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&zswap_list_lru, entry); - atomic_inc(&zswap_nr_stored); } - spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_update_total_size(); count_vm_event(ZSWPOUT); return true; +store_failed: + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(zswap_find_zpool(entry), entry->handle); put_pool: - zswap_pool_put(entry->pool); + zswap_pool_put(entry->pool); + } freepage: zswap_entry_cache_free(entry); reject: - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); + if (zswap_pool_reached_full) + queue_work(shrink_wq, &zswap_shrink_work); check_old: /* * If the zswap store fails or zswap is disabled, we must invalidate the * possibly stale entry which was previously stored at this offset. * Otherwise, writeback could overwrite the new data in the swapfile. */ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = xa_erase(tree, offset); if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); return false; - -shrink: - queue_work(shrink_wq, &zswap_shrink_work); - goto reject; } bool zswap_load(struct folio *folio) @@ -1637,18 +1553,12 @@ bool zswap_load(struct folio *folio) pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; bool swapcache = folio_test_swapcache(folio); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - spin_unlock(&tree->lock); - return false; - } /* * When reading into the swapcache, invalidate our entry. The * swapcache can be the authoritative owner of the page and @@ -1662,8 +1572,12 @@ bool zswap_load(struct folio *folio) * the fault fails. We remain the primary owner of the entry.) */ if (swapcache) - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); + entry = xa_erase(tree, offset); + else + entry = xa_load(tree, offset); + + if (!entry) + return false; if (entry->length) zswap_decompress(entry, page); @@ -1688,19 +1602,17 @@ bool zswap_load(struct folio *folio) void zswap_invalidate(swp_entry_t swp) { pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = xa_erase(tree, offset); if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); } int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *trees, *tree; + struct xarray *trees, *tree; unsigned int nr, i; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); @@ -1710,11 +1622,8 @@ int zswap_swapon(int type, unsigned long nr_pages) return -ENOMEM; } - for (i = 0; i < nr; i++) { - tree = trees + i; - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - } + for (i = 0; i < nr; i++) + xa_init(trees + i); nr_zswap_trees[type] = nr; zswap_trees[type] = trees; @@ -1723,7 +1632,7 @@ int zswap_swapon(int type, unsigned long nr_pages) void zswap_swapoff(int type) { - struct zswap_tree *trees = zswap_trees[type]; + struct xarray *trees = zswap_trees[type]; unsigned int i; if (!trees) @@ -1731,7 +1640,7 @@ void zswap_swapoff(int type) /* try_to_unuse() invalidated all the entries already */ for (i = 0; i < nr_zswap_trees[type]; i++) - WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); + WARN_ON_ONCE(!xa_empty(trees + i)); kvfree(trees); nr_zswap_trees[type] = 0; @@ -1746,6 +1655,13 @@ void zswap_swapoff(int type) static struct dentry *zswap_debugfs_root; +static int debugfs_get_total_size(void *data, u64 *val) +{ + *val = zswap_total_pages() * PAGE_SIZE; + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1767,8 +1683,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("pool_total_size", 0444, - zswap_debugfs_root, &zswap_pool_total_size); + debugfs_create_file("pool_total_size", 0444, + zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, |