Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c	487
1 file changed, 193 insertions, 294 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbac1d49469d..2b8d9c3fbb47 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
 
 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 
+static struct swap_info_struct *swap_type_to_swap_info(int type)
+{
+        if (type >= READ_ONCE(nr_swapfiles))
+                return NULL;
+
+        smp_rmb();      /* Pairs with smp_wmb in alloc_swap_info. */
+        return READ_ONCE(swap_info[type]);
+}
+
 static inline unsigned char swap_count(unsigned char ent)
 {
         return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
@@ -1044,12 +1053,14 @@ noswap:
 /* The only caller of this function is now suspend routine */
 swp_entry_t get_swap_page_of_type(int type)
 {
-        struct swap_info_struct *si;
+        struct swap_info_struct *si = swap_type_to_swap_info(type);
         pgoff_t offset;
 
-        si = swap_info[type];
+        if (!si)
+                goto fail;
+
         spin_lock(&si->lock);
-        if (si && (si->flags & SWP_WRITEOK)) {
+        if (si->flags & SWP_WRITEOK) {
                 atomic_long_dec(&nr_swap_pages);
                 /* This is called for allocating swap entry, not cache */
                 offset = scan_swap_map(si, 1);
@@ -1060,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type)
                 atomic_long_inc(&nr_swap_pages);
         }
         spin_unlock(&si->lock);
+fail:
         return (swp_entry_t) {0};
 }
 
@@ -1071,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
         if (!entry.val)
                 goto out;
         type = swp_type(entry);
-        if (type >= nr_swapfiles)
+        p = swap_type_to_swap_info(type);
+        if (!p)
                 goto bad_nofile;
-        p = swap_info[type];
         if (!(p->flags & SWP_USED))
                 goto bad_device;
         offset = swp_offset(entry);
@@ -1697,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 sector_t swapdev_block(int type, pgoff_t offset)
 {
         struct block_device *bdev;
+        struct swap_info_struct *si = swap_type_to_swap_info(type);
 
-        if ((unsigned int)type >= nr_swapfiles)
-                return 0;
-        if (!(swap_info[type]->flags & SWP_WRITEOK))
+        if (!si || !(si->flags & SWP_WRITEOK))
                 return 0;
         return map_swap_entry(swp_entry(type, offset), &bdev);
 }
@@ -1799,44 +1810,77 @@ out_nolock:
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-                                unsigned long addr, unsigned long end,
-                                swp_entry_t entry, struct page *page)
+                        unsigned long addr, unsigned long end,
+                        unsigned int type, bool frontswap,
+                        unsigned long *fs_pages_to_unuse)
 {
-        pte_t swp_pte = swp_entry_to_pte(entry);
+        struct page *page;
+        swp_entry_t entry;
         pte_t *pte;
+        struct swap_info_struct *si;
+        unsigned long offset;
         int ret = 0;
+        volatile unsigned char *swap_map;
 
-        /*
-         * We don't actually need pte lock while scanning for swp_pte: since
-         * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
-         * page table while we're scanning; though it could get zapped, and on
-         * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
-         * of unmatched parts which look like swp_pte, so unuse_pte must
-         * recheck under pte lock. Scanning without pte lock lets it be
-         * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
-         */
+        si = swap_info[type];
         pte = pte_offset_map(pmd, addr);
         do {
-                /*
-                 * swapoff spends a _lot_ of time in this loop!
-                 * Test inline before going to call unuse_pte.
-                 */
-                if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
-                        pte_unmap(pte);
-                        ret = unuse_pte(vma, pmd, addr, entry, page);
-                        if (ret)
-                                goto out;
-                        pte = pte_offset_map(pmd, addr);
+                struct vm_fault vmf;
+
+                if (!is_swap_pte(*pte))
+                        continue;
+
+                entry = pte_to_swp_entry(*pte);
+                if (swp_type(entry) != type)
+                        continue;
+
+                offset = swp_offset(entry);
+                if (frontswap && !frontswap_test(si, offset))
+                        continue;
+
+                pte_unmap(pte);
+                swap_map = &si->swap_map[offset];
+                vmf.vma = vma;
+                vmf.address = addr;
+                vmf.pmd = pmd;
+                page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+                if (!page) {
+                        if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+                                goto try_next;
+                        return -ENOMEM;
+                }
+
+                lock_page(page);
+                wait_on_page_writeback(page);
+                ret = unuse_pte(vma, pmd, addr, entry, page);
+                if (ret < 0) {
+                        unlock_page(page);
+                        put_page(page);
+                        goto out;
                 }
+
+                try_to_free_swap(page);
+                unlock_page(page);
+                put_page(page);
+
+                if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
+                        ret = FRONTSWAP_PAGES_UNUSED;
+                        goto out;
+                }
+try_next:
+                pte = pte_offset_map(pmd, addr);
         } while (pte++, addr += PAGE_SIZE, addr != end);
         pte_unmap(pte - 1);
+
+        ret = 0;
 out:
         return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                                 unsigned long addr, unsigned long end,
-                                swp_entry_t entry, struct page *page)
+                                unsigned int type, bool frontswap,
+                                unsigned long *fs_pages_to_unuse)
 {
         pmd_t *pmd;
         unsigned long next;
@@ -1848,7 +1892,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                         continue;
-                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+                ret = unuse_pte_range(vma, pmd, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (pmd++, addr = next, addr != end);
@@ -1857,7 +1902,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 
 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                                 unsigned long addr, unsigned long end,
-                                swp_entry_t entry, struct page *page)
+                                unsigned int type, bool frontswap,
+                                unsigned long *fs_pages_to_unuse)
 {
         pud_t *pud;
         unsigned long next;
@@ -1868,7 +1914,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-                ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+                ret = unuse_pmd_range(vma, pud, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (pud++, addr = next, addr != end);
@@ -1877,7 +1924,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
 
 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                                 unsigned long addr, unsigned long end,
-                                swp_entry_t entry, struct page *page)
+                                unsigned int type, bool frontswap,
+                                unsigned long *fs_pages_to_unuse)
 {
         p4d_t *p4d;
         unsigned long next;
@@ -1888,78 +1936,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                 next = p4d_addr_end(addr, end);
                 if (p4d_none_or_clear_bad(p4d))
                         continue;
-                ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+                ret = unuse_pud_range(vma, p4d, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (p4d++, addr = next, addr != end);
         return 0;
 }
 
-static int unuse_vma(struct vm_area_struct *vma,
-                                swp_entry_t entry, struct page *page)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
+                     bool frontswap, unsigned long *fs_pages_to_unuse)
 {
         pgd_t *pgd;
         unsigned long addr, end, next;
         int ret;
 
-        if (page_anon_vma(page)) {
-                addr = page_address_in_vma(page, vma);
-                if (addr == -EFAULT)
-                        return 0;
-                else
-                        end = addr + PAGE_SIZE;
-        } else {
-                addr = vma->vm_start;
-                end = vma->vm_end;
-        }
+        addr = vma->vm_start;
+        end = vma->vm_end;
 
         pgd = pgd_offset(vma->vm_mm, addr);
         do {
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-                ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
+                ret = unuse_p4d_range(vma, pgd, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (pgd++, addr = next, addr != end);
         return 0;
 }
 
-static int unuse_mm(struct mm_struct *mm,
-                                swp_entry_t entry, struct page *page)
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
+                    bool frontswap, unsigned long *fs_pages_to_unuse)
 {
         struct vm_area_struct *vma;
         int ret = 0;
 
-        if (!down_read_trylock(&mm->mmap_sem)) {
-                /*
-                 * Activate page so shrink_inactive_list is unlikely to unmap
-                 * its ptes while lock is dropped, so swapoff can make progress.
-                 */
-                activate_page(page);
-                unlock_page(page);
-                down_read(&mm->mmap_sem);
-                lock_page(page);
-        }
+        down_read(&mm->mmap_sem);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
-                        break;
+                if (vma->anon_vma) {
+                        ret = unuse_vma(vma, type, frontswap,
+                                        fs_pages_to_unuse);
+                        if (ret)
+                                break;
+                }
                 cond_resched();
         }
         up_read(&mm->mmap_sem);
-        return (ret < 0)? ret: 0;
+        return ret;
 }
 
 /*
  * Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
+ * from current position to next entry still in use. Return 0
+ * if there are no inuse entries after prev till end of the map.
  */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                                         unsigned int prev, bool frontswap)
 {
-        unsigned int max = si->max;
-        unsigned int i = prev;
+        unsigned int i;
         unsigned char count;
 
         /*
@@ -1968,20 +2004,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
          * hits are okay, and sys_swapoff() has already prevented new
          * allocations from this area (while holding swap_lock).
          */
-        for (;;) {
-                if (++i >= max) {
-                        if (!prev) {
-                                i = 0;
-                                break;
-                        }
-                        /*
-                         * No entries in use at top of swap_map,
-                         * loop back to start and recheck there.
-                         */
-                        max = prev + 1;
-                        prev = 0;
-                        i = 1;
-                }
+        for (i = prev + 1; i < si->max; i++) {
                 count = READ_ONCE(si->swap_map[i]);
                 if (count && swap_count(count) != SWAP_MAP_BAD)
                         if (!frontswap || frontswap_test(si, i))
@@ -1989,240 +2012,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                 if ((i % LATENCY_LIMIT) == 0)
                         cond_resched();
         }
+
+        if (i == si->max)
+                i = 0;
+
         return i;
 }
 
 /*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it. All the necessary
- * page table adjustments can then be made atomically.
- *
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
  * pages_to_unuse==0 means all pages; ignored if frontswap is false
  */
+#define SWAP_UNUSE_MAX_TRIES 3
 int try_to_unuse(unsigned int type, bool frontswap,
                  unsigned long pages_to_unuse)
 {
+        struct mm_struct *prev_mm;
+        struct mm_struct *mm;
+        struct list_head *p;
+        int retval = 0;
         struct swap_info_struct *si = swap_info[type];
-        struct mm_struct *start_mm;
-        volatile unsigned char *swap_map; /* swap_map is accessed without
-                                           * locking. Mark it as volatile
-                                           * to prevent compiler doing
-                                           * something odd.
-                                           */
-        unsigned char swcount;
         struct page *page;
         swp_entry_t entry;
-        unsigned int i = 0;
-        int retval = 0;
+        unsigned int i;
+        int retries = 0;
 
-        /*
-         * When searching mms for an entry, a good strategy is to
-         * start at the first mm we freed the previous entry from
-         * (though actually we don't notice whether we or coincidence
-         * freed the entry). Initialize this start_mm with a hold.
-         *
-         * A simpler strategy would be to start at the last mm we
-         * freed the previous entry from; but that would take less
-         * advantage of mmlist ordering, which clusters forked mms
-         * together, child after parent. If we race with dup_mmap(), we
-         * prefer to resolve parent before child, lest we miss entries
-         * duplicated after we scanned child: using last mm would invert
-         * that.
-         */
-        start_mm = &init_mm;
-        mmget(&init_mm);
+        if (!si->inuse_pages)
+                return 0;
 
-        /*
-         * Keep on scanning until all entries have gone. Usually,
-         * one pass through swap_map is enough, but not necessarily:
-         * there are races when an instance of an entry might be missed.
-         */
-        while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
+        if (!frontswap)
+                pages_to_unuse = 0;
+
+retry:
+        retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+        if (retval)
+                goto out;
+
+        prev_mm = &init_mm;
+        mmget(prev_mm);
+
+        spin_lock(&mmlist_lock);
+        p = &init_mm.mmlist;
+        while ((p = p->next) != &init_mm.mmlist) {
                 if (signal_pending(current)) {
                         retval = -EINTR;
                         break;
                 }
 
-                /*
-                 * Get a page for the entry, using the existing swap
-                 * cache page if there is one. Otherwise, get a clean
-                 * page and read the swap into it.
-                 */
-                swap_map = &si->swap_map[i];
-                entry = swp_entry(type, i);
-                page = read_swap_cache_async(entry,
-                                        GFP_HIGHUSER_MOVABLE, NULL, 0, false);
-                if (!page) {
-                        /*
-                         * Either swap_duplicate() failed because entry
-                         * has been freed independently, and will not be
-                         * reused since sys_swapoff() already disabled
-                         * allocation from here, or alloc_page() failed.
-                         */
-                        swcount = *swap_map;
-                        /*
-                         * We don't hold lock here, so the swap entry could be
-                         * SWAP_MAP_BAD (when the cluster is discarding).
-                         * Instead of fail out, We can just skip the swap
-                         * entry because swapoff will wait for discarding
-                         * finish anyway.
-                         */
-                        if (!swcount || swcount == SWAP_MAP_BAD)
-                                continue;
-                        retval = -ENOMEM;
-                        break;
-                }
+                mm = list_entry(p, struct mm_struct, mmlist);
+                if (!mmget_not_zero(mm))
+                        continue;
+                spin_unlock(&mmlist_lock);
+                mmput(prev_mm);
+                prev_mm = mm;
+                retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
 
-                /*
-                 * Don't hold on to start_mm if it looks like exiting.
-                 */
-                if (atomic_read(&start_mm->mm_users) == 1) {
-                        mmput(start_mm);
-                        start_mm = &init_mm;
-                        mmget(&init_mm);
+                if (retval) {
+                        mmput(prev_mm);
+                        goto out;
                 }
 
                 /*
-                 * Wait for and lock page. When do_swap_page races with
-                 * try_to_unuse, do_swap_page can handle the fault much
-                 * faster than try_to_unuse can locate the entry. This
-                 * apparently redundant "wait_on_page_locked" lets try_to_unuse
-                 * defer to do_swap_page in such a case - in some tests,
-                 * do_swap_page and try_to_unuse repeatedly compete.
-                 */
-                wait_on_page_locked(page);
-                wait_on_page_writeback(page);
-                lock_page(page);
-                wait_on_page_writeback(page);
-
-                /*
-                 * Remove all references to entry.
+                 * Make sure that we aren't completely killing
+                 * interactive performance.
                  */
-                swcount = *swap_map;
-                if (swap_count(swcount) == SWAP_MAP_SHMEM) {
-                        retval = shmem_unuse(entry, page);
-                        /* page has already been unlocked and released */
-                        if (retval < 0)
-                                break;
-                        continue;
-                }
-                if (swap_count(swcount) && start_mm != &init_mm)
-                        retval = unuse_mm(start_mm, entry, page);
-
-                if (swap_count(*swap_map)) {
-                        int set_start_mm = (*swap_map >= swcount);
-                        struct list_head *p = &start_mm->mmlist;
-                        struct mm_struct *new_start_mm = start_mm;
-                        struct mm_struct *prev_mm = start_mm;
-                        struct mm_struct *mm;
-
-                        mmget(new_start_mm);
-                        mmget(prev_mm);
-                        spin_lock(&mmlist_lock);
-                        while (swap_count(*swap_map) && !retval &&
-                                        (p = p->next) != &start_mm->mmlist) {
-                                mm = list_entry(p, struct mm_struct, mmlist);
-                                if (!mmget_not_zero(mm))
-                                        continue;
-                                spin_unlock(&mmlist_lock);
-                                mmput(prev_mm);
-                                prev_mm = mm;
+                cond_resched();
+                spin_lock(&mmlist_lock);
+        }
+        spin_unlock(&mmlist_lock);
 
-                                cond_resched();
+        mmput(prev_mm);
 
-                                swcount = *swap_map;
-                                if (!swap_count(swcount)) /* any usage ? */
-                                        ;
-                                else if (mm == &init_mm)
-                                        set_start_mm = 1;
-                                else
-                                        retval = unuse_mm(mm, entry, page);
-
-                                if (set_start_mm && *swap_map < swcount) {
-                                        mmput(new_start_mm);
-                                        mmget(mm);
-                                        new_start_mm = mm;
-                                        set_start_mm = 0;
-                                }
-                                spin_lock(&mmlist_lock);
-                        }
-                        spin_unlock(&mmlist_lock);
-                        mmput(prev_mm);
-                        mmput(start_mm);
-                        start_mm = new_start_mm;
-                }
-                if (retval) {
-                        unlock_page(page);
-                        put_page(page);
-                        break;
-                }
+        i = 0;
+        while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
 
-                /*
-                 * If a reference remains (rare), we would like to leave
-                 * the page in the swap cache; but try_to_unmap could
-                 * then re-duplicate the entry once we drop page lock,
-                 * so we might loop indefinitely; also, that page could
-                 * not be swapped out to other storage meanwhile. So:
-                 * delete from cache even if there's another reference,
-                 * after ensuring that the data has been saved to disk -
-                 * since if the reference remains (rarer), it will be
-                 * read from disk into another page. Splitting into two
-                 * pages would be incorrect if swap supported "shared
-                 * private" pages, but they are handled by tmpfs files.
-                 *
-                 * Given how unuse_vma() targets one particular offset
-                 * in an anon_vma, once the anon_vma has been determined,
-                 * this splitting happens to be just what is needed to
-                 * handle where KSM pages have been swapped out: re-reading
-                 * is unnecessarily slow, but we can fix that later on.
-                 */
-                if (swap_count(*swap_map) &&
-                     PageDirty(page) && PageSwapCache(page)) {
-                        struct writeback_control wbc = {
-                                .sync_mode = WB_SYNC_NONE,
-                        };
-
-                        swap_writepage(compound_head(page), &wbc);
-                        lock_page(page);
-                        wait_on_page_writeback(page);
-                }
+                entry = swp_entry(type, i);
+                page = find_get_page(swap_address_space(entry), i);
+                if (!page)
+                        continue;
 
                 /*
                  * It is conceivable that a racing task removed this page from
-                 * swap cache just before we acquired the page lock at the top,
-                 * or while we dropped it in unuse_mm(). The page might even
-                 * be back in swap cache on another swap area: that we must not
-                 * delete, since it may not have been written out to swap yet.
-                 */
-                if (PageSwapCache(page) &&
-                    likely(page_private(page) == entry.val) &&
-                    (!PageTransCompound(page) ||
-                     !swap_page_trans_huge_swapped(si, entry)))
-                        delete_from_swap_cache(compound_head(page));
-
-                /*
-                 * So we could skip searching mms once swap count went
-                 * to 1, we did not mark any present ptes as dirty: must
-                 * mark page dirty so shrink_page_list will preserve it.
+                 * swap cache just before we acquired the page lock. The page
+                 * might even be back in swap cache on another swap area. But
+                 * that is okay, try_to_free_swap() only removes stale pages.
                  */
-                SetPageDirty(page);
+                lock_page(page);
+                wait_on_page_writeback(page);
+                try_to_free_swap(page);
                 unlock_page(page);
                 put_page(page);
 
                 /*
-                 * Make sure that we aren't completely killing
-                 * interactive performance.
+                 * For frontswap, we just need to unuse pages_to_unuse, if
+                 * it was specified. Need not check frontswap again here as
+                 * we already zeroed out pages_to_unuse if not frontswap.
                  */
-                cond_resched();
-                if (frontswap && pages_to_unuse > 0) {
-                        if (!--pages_to_unuse)
-                                break;
-                }
+                if (pages_to_unuse && --pages_to_unuse == 0)
+                        goto out;
         }
 
-        mmput(start_mm);
-        return retval;
+        /*
+         * Lets check again to see if there are still swap entries in the map.
+         * If yes, we would need to do retry the unuse logic again.
+         * Under global memory pressure, swap entries can be reinserted back
+         * into process space after the mmlist loop above passes over them.
+         * Its not worth continuosuly retrying to unuse the swap in this case.
+         * So we try SWAP_UNUSE_MAX_TRIES times.
+         */
+        if (++retries >= SWAP_UNUSE_MAX_TRIES)
+                retval = -EBUSY;
+        else if (si->inuse_pages)
+                goto retry;
+
+out:
+        return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
 }
 
 /*
@@ -2258,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
         struct swap_extent *se;
         pgoff_t offset;
 
-        sis = swap_info[swp_type(entry)];
+        sis = swp_swap_info(entry);
         *bdev = sis->bdev;
 
         offset = swp_offset(entry);
@@ -2700,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
         if (!l)
                 return SEQ_START_TOKEN;
 
-        for (type = 0; type < nr_swapfiles; type++) {
-                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
-                si = swap_info[type];
+        for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
                 if (!(si->flags & SWP_USED) || !si->swap_map)
                         continue;
                 if (!--l)
@@ -2722,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
         else
                 type = si->type + 1;
 
-        for (; type < nr_swapfiles; type++) {
-                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
-                si = swap_info[type];
+        for (; (si = swap_type_to_swap_info(type)); type++) {
                 if (!(si->flags & SWP_USED) || !si->swap_map)
                         continue;
                 ++*pos;
@@ -2813,9 +2713,8 @@ static struct swap_info_struct *alloc_swap_info(void)
         struct swap_info_struct *p;
         unsigned int type;
         int i;
-        int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
 
-        p = kvzalloc(size, GFP_KERNEL);
+        p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
         if (!p)
                 return ERR_PTR(-ENOMEM);
 
@@ -2831,14 +2730,14 @@ static struct swap_info_struct *alloc_swap_info(void)
         }
         if (type >= nr_swapfiles) {
                 p->type = type;
-                swap_info[type] = p;
+                WRITE_ONCE(swap_info[type], p);
                 /*
                  * Write swap_info[type] before nr_swapfiles, in case a
                  * racing procfs swap_start() or swap_next() is reading them.
                  * (We never shrink nr_swapfiles, we never free this entry.)
                  */
                 smp_wmb();
-                nr_swapfiles++;
+                WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
         } else {
                 kvfree(p);
                 p = swap_info[type];
@@ -3358,7 +3257,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
         struct swap_info_struct *p;
         struct swap_cluster_info *ci;
-        unsigned long offset, type;
+        unsigned long offset;
         unsigned char count;
         unsigned char has_cache;
         int err = -EINVAL;
@@ -3366,10 +3265,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
         if (non_swap_entry(entry))
                 goto out;
 
-        type = swp_type(entry);
-        if (type >= nr_swapfiles)
+        p = swp_swap_info(entry);
+        if (!p)
                 goto bad_file;
-        p = swap_info[type];
 
+        offset = swp_offset(entry);
         if (unlikely(offset >= p->max))
                 goto out;
 
@@ -3466,7 +3365,7 @@ int swapcache_prepare(swp_entry_t entry)
 
 struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 {
-        return swap_info[swp_type(entry)];
+        return swap_type_to_swap_info(swp_type(entry));
 }
 
 struct swap_info_struct *page_swap_info(struct page *page)