summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/swap.h6
-rw-r--r--mm/swap.h15
-rw-r--r--mm/swap_state.c105
-rw-r--r--mm/swapfile.c39
-rw-r--r--mm/vmscan.c3
5 files changed, 97 insertions, 71 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bf72b548a96d..74df3004c850 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -458,7 +458,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern int swap_duplicate_nr(swp_entry_t entry, int nr);
-extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
@@ -517,11 +516,6 @@ static inline int swap_duplicate_nr(swp_entry_t swp, int nr_pages)
return 0;
}
-static inline int swapcache_prepare(swp_entry_t swp, int nr)
-{
- return 0;
-}
-
static inline void swap_free_nr(swp_entry_t entry, int nr_pages)
{
}
diff --git a/mm/swap.h b/mm/swap.h
index 2f79458b37f3..e427240073e9 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -234,6 +234,14 @@ static inline bool folio_matches_swap_entry(const struct folio *folio,
return folio_entry.val == round_down(entry.val, nr_pages);
}
+/* Temporary internal helpers */
+void __swapcache_set_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry);
+void __swapcache_clear_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr);
+
/*
* All swap cache helpers below require the caller to ensure the swap entries
* used are valid and stabilize the device by any of the following ways:
@@ -247,7 +255,8 @@ static inline bool folio_matches_swap_entry(const struct folio *folio,
*/
struct folio *swap_cache_get_folio(swp_entry_t entry);
void *swap_cache_get_shadow(swp_entry_t entry);
-void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow);
+int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
+ void **shadow, bool alloc);
void swap_cache_del_folio(struct folio *folio);
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags,
struct mempolicy *mpol, pgoff_t ilx,
@@ -413,8 +422,10 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry)
return NULL;
}
-static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow)
+static inline int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
+ void **shadow, bool alloc)
{
+ return -ENOENT;
}
static inline void swap_cache_del_folio(struct folio *folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d58bce532d95..22990c5259cc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -127,34 +127,64 @@ void *swap_cache_get_shadow(swp_entry_t entry)
* @entry: The swap entry corresponding to the folio.
* @gfp: gfp_mask for XArray node allocation.
* @shadowp: If a shadow is found, return the shadow.
+ * @alloc: Whether it is the allocator that is inserting the folio. The
+ * allocator sets SWAP_HAS_CACHE to pin slots before insert, so the map
+ * update is skipped in that case.
*
* Context: Caller must ensure @entry is valid and protect the swap device
* with reference count or locks.
- * The caller also needs to update the corresponding swap_map slots with
- * SWAP_HAS_CACHE bit to avoid race or conflict.
*/
-void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
+int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
+ void **shadowp, bool alloc)
{
+ int err;
void *shadow = NULL;
+ struct swap_info_struct *si;
unsigned long old_tb, new_tb;
struct swap_cluster_info *ci;
- unsigned int ci_start, ci_off, ci_end;
+ unsigned int ci_start, ci_off, ci_end, offset;
unsigned long nr_pages = folio_nr_pages(folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+ si = __swap_entry_to_info(entry);
new_tb = folio_to_swp_tb(folio);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
ci_off = ci_start;
- ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ offset = swp_offset(entry);
+ ci = swap_cluster_lock(si, swp_offset(entry));
+ if (unlikely(!ci->table)) {
+ err = -ENOENT;
+ goto failed;
+ }
do {
- old_tb = __swap_table_xchg(ci, ci_off, new_tb);
- WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+ old_tb = __swap_table_get(ci, ci_off);
+ if (unlikely(swp_tb_is_folio(old_tb))) {
+ err = -EEXIST;
+ goto failed;
+ }
+ if (!alloc && unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) {
+ err = -ENOENT;
+ goto failed;
+ }
if (swp_tb_is_shadow(old_tb))
shadow = swp_tb_to_shadow(old_tb);
+ offset++;
+ } while (++ci_off < ci_end);
+
+ ci_off = ci_start;
+ offset = swp_offset(entry);
+ do {
+ /*
+ * Still need to pin the slots with SWAP_HAS_CACHE since
+ * swap allocator depends on that.
+ */
+ if (!alloc)
+ __swapcache_set_cached(si, ci, swp_entry(swp_type(entry), offset));
+ __swap_table_set(ci, ci_off, new_tb);
+ offset++;
} while (++ci_off < ci_end);
folio_ref_add(folio, nr_pages);
@@ -167,6 +197,11 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp
if (shadowp)
*shadowp = shadow;
+ return 0;
+
+failed:
+ swap_cluster_unlock(ci);
+ return err;
}
/**
@@ -185,6 +220,7 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp
void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
swp_entry_t entry, void *shadow)
{
+ struct swap_info_struct *si;
unsigned long old_tb, new_tb;
unsigned int ci_start, ci_off, ci_end;
unsigned long nr_pages = folio_nr_pages(folio);
@@ -194,6 +230,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+ si = __swap_entry_to_info(entry);
new_tb = shadow_swp_to_tb(shadow);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
@@ -209,6 +246,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
folio_clear_swapcache(folio);
node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
+ __swapcache_clear_cached(si, ci, entry, nr_pages);
}
/**
@@ -230,7 +268,6 @@ void swap_cache_del_folio(struct folio *folio)
__swap_cache_del_folio(ci, folio, entry, NULL);
swap_cluster_unlock(ci);
- put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
@@ -422,67 +459,37 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
gfp_t gfp, bool charged,
bool skip_if_exists)
{
- struct folio *swapcache;
+ struct folio *swapcache = NULL;
void *shadow;
int ret;
- /*
- * Check and pin the swap map with SWAP_HAS_CACHE, then add the folio
- * into the swap cache. Loop with a schedule delay if raced with
- * another process setting SWAP_HAS_CACHE. This hackish loop will
- * be fixed very soon.
- */
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
for (;;) {
- ret = swapcache_prepare(entry, folio_nr_pages(folio));
+ ret = swap_cache_add_folio(folio, entry, &shadow, false);
if (!ret)
break;
/*
- * The skip_if_exists is for protecting against a recursive
- * call to this helper on the same entry waiting forever
- * here because SWAP_HAS_CACHE is set but the folio is not
- * in the swap cache yet. This can happen today if
- * mem_cgroup_swapin_charge_folio() below triggers reclaim
- * through zswap, which may call this helper again in the
- * writeback path.
- *
- * Large order allocation also needs special handling on
+ * Large order allocation needs special handling on
* race: if a smaller folio exists in cache, swapin needs
* to fallback to order 0, and doing a swap cache lookup
* might return a folio that is irrelevant to the faulting
* entry because @entry is aligned down. Just return NULL.
*/
if (ret != -EEXIST || skip_if_exists || folio_test_large(folio))
- return NULL;
+ goto failed;
- /*
- * Check the swap cache again, we can only arrive
- * here because swapcache_prepare returns -EEXIST.
- */
swapcache = swap_cache_get_folio(entry);
if (swapcache)
- return swapcache;
-
- /*
- * We might race against __swap_cache_del_folio(), and
- * stumble across a swap_map entry whose SWAP_HAS_CACHE
- * has not yet been cleared. Or race against another
- * swap_cache_alloc_folio(), which has set SWAP_HAS_CACHE
- * in swap_map, but not yet added its folio to swap cache.
- */
- schedule_timeout_uninterruptible(1);
+ goto failed;
}
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
- put_swap_folio(folio, entry);
- folio_unlock(folio);
- return NULL;
+ swap_cache_del_folio(folio);
+ goto failed;
}
- swap_cache_add_folio(folio, entry, &shadow);
memcg1_swapin(entry, folio_nr_pages(folio));
if (shadow)
workingset_refault(folio, shadow);
@@ -490,6 +497,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
/* Caller will initiate read into locked folio */
folio_add_lru(folio);
return folio;
+
+failed:
+ folio_unlock(folio);
+ return swapcache;
}
/**
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ced53aba3f4c..64970ee11fcf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1476,7 +1476,11 @@ again:
if (!entry.val)
return -ENOMEM;
- swap_cache_add_folio(folio, entry, NULL);
+ /*
+ * The allocator has pinned the slots with SWAP_HAS_CACHE,
+ * so this insertion should never fail.
+ */
+ WARN_ON_ONCE(swap_cache_add_folio(folio, entry, NULL, true));
return 0;
@@ -1582,9 +1586,8 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
* do_swap_page()
* ... swapoff+swapon
* swap_cache_alloc_folio()
- * swapcache_prepare()
- * __swap_duplicate()
- * // check swap_map
+ * swap_cache_add_folio()
+ * // check swap_map
* // verify PTE not changed
*
* In __swap_duplicate(), the swap_map need to be checked before
@@ -3769,17 +3772,25 @@ int swap_duplicate_nr(swp_entry_t entry, int nr)
return err;
}
-/*
- * @entry: first swap entry from which we allocate nr swap cache.
- *
- * Called when allocating swap cache for existing swap entries,
- * This can return error codes. Returns 0 at success.
- * -EEXIST means there is a swap cache.
- * Note: return code is different from swap_duplicate().
- */
-int swapcache_prepare(swp_entry_t entry, int nr)
+/* Mark the swap map as HAS_CACHE; the caller needs to hold the cluster lock */
+void __swapcache_set_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry)
+{
+ WARN_ON(swap_dup_entries(si, ci, swp_offset(entry), SWAP_HAS_CACHE, 1));
+}
+
+/* Clear the swap map as !HAS_CACHE; the caller needs to hold the cluster lock */
+void __swapcache_clear_cached(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr)
{
- return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
+ if (swap_only_has_cache(si, swp_offset(entry), nr)) {
+ swap_entries_free(si, ci, entry, nr);
+ } else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
}
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1d281174164e..973ffb9813ea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -757,10 +757,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __swap_cache_del_folio(ci, folio, swap, shadow);
memcg1_swapout(folio, swap);
+ __swap_cache_del_folio(ci, folio, swap, shadow);
swap_cluster_unlock_irq(ci);
- put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);