Diffstat (limited to 'mm/swapfile.c')
-rw-r--r-- | mm/swapfile.c | 184
1 file changed, 119 insertions(+), 65 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5871a2aa86a5..63ac67208453 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 {
 	struct percpu_cluster *cluster;
 	struct swap_cluster_info *ci;
-	bool found_free;
 	unsigned long tmp, max;
 
 new_cluster:
@@ -614,17 +613,17 @@ new_cluster:
 		} else if (!cluster_list_empty(&si->discard_clusters)) {
 			/*
 			 * we don't have free cluster but have some clusters in
-			 * discarding, do discard now and reclaim them
+			 * discarding, do discard now and reclaim them, then
+			 * reread cluster_next_cpu since we dropped si->lock
 			 */
 			swap_do_scheduled_discard(si);
-			*scan_base = *offset = si->cluster_next;
+			*scan_base = this_cpu_read(*si->cluster_next_cpu);
+			*offset = *scan_base;
 			goto new_cluster;
 		} else
 			return false;
 	}
 
-	found_free = false;
-
 	/*
 	 * Other CPUs can use our cluster if they can't find a free cluster,
 	 * check if there is still free entry in the cluster
@@ -632,27 +631,23 @@ new_cluster:
 	tmp = cluster->next;
 	max = min_t(unsigned long, si->max,
 		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
-	if (tmp >= max) {
-		cluster_set_null(&cluster->index);
-		goto new_cluster;
-	}
-	ci = lock_cluster(si, tmp);
-	while (tmp < max) {
-		if (!si->swap_map[tmp]) {
-			found_free = true;
-			break;
+	if (tmp < max) {
+		ci = lock_cluster(si, tmp);
+		while (tmp < max) {
+			if (!si->swap_map[tmp])
+				break;
+			tmp++;
 		}
-		tmp++;
+		unlock_cluster(ci);
 	}
-	unlock_cluster(ci);
-	if (!found_free) {
+	if (tmp >= max) {
 		cluster_set_null(&cluster->index);
 		goto new_cluster;
 	}
 	cluster->next = tmp + 1;
 	*offset = tmp;
 	*scan_base = tmp;
-	return found_free;
+	return true;
 }
 
 static void __del_from_avail_list(struct swap_info_struct *p)
@@ -729,6 +724,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 	}
 }
 
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+	unsigned long prev;
+
+	if (!(si->flags & SWP_SOLIDSTATE)) {
+		si->cluster_next = next;
+		return;
+	}
+
+	prev = this_cpu_read(*si->cluster_next_cpu);
+	/*
+	 * Cross the swap address space size aligned trunk, choose
+	 * another trunk randomly to avoid lock contention on swap
+	 * address space if possible.
+	 */
+	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+		/* No free swap slots available */
+		if (si->highest_bit <= si->lowest_bit)
+			return;
+		next = si->lowest_bit +
+			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+		next = max_t(unsigned int, next, si->lowest_bit);
+	}
+	this_cpu_write(*si->cluster_next_cpu, next);
+}
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
 			       unsigned char usage, int nr,
 			       swp_entry_t slots[])
@@ -739,9 +762,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
 	int n_ret = 0;
-
-	if (nr > SWAP_BATCH)
-		nr = SWAP_BATCH;
+	bool scanned_many = false;
 
 	/*
 	 * We try to cluster swap pages by allocating them sequentially
@@ -755,17 +776,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 	 */
 
 	si->flags += SWP_SCANNING;
-	scan_base = offset = si->cluster_next;
+	/*
+	 * Use percpu scan base for SSD to reduce lock contention on
+	 * cluster and swap cache. For HDD, sequential access is more
+	 * important.
+	 */
+	if (si->flags & SWP_SOLIDSTATE)
+		scan_base = this_cpu_read(*si->cluster_next_cpu);
+	else
+		scan_base = si->cluster_next;
+	offset = scan_base;
 
 	/* SSD algorithm */
 	if (si->cluster_info) {
-		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
-			goto checks;
-		else
+		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
 			goto scan;
-	}
-
-	if (unlikely(!si->cluster_nr--)) {
+	} else if (unlikely(!si->cluster_nr--)) {
 		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
 			goto checks;
@@ -848,7 +874,6 @@ checks:
 	unlock_cluster(ci);
 
 	swap_range_alloc(si, offset, 1);
-	si->cluster_next = offset + 1;
 	slots[n_ret++] = swp_entry(si->type, offset);
 
 	/* got enough slots or reach max slots? */
@@ -871,19 +896,33 @@ checks:
 	if (si->cluster_info) {
 		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
 			goto checks;
-		else
-			goto done;
-	}
-	/* non-ssd case */
-	++offset;
-
-	/* non-ssd case, still more slots in cluster? */
-	if (si->cluster_nr && !si->swap_map[offset]) {
+	} else if (si->cluster_nr && !si->swap_map[++offset]) {
+		/* non-ssd case, still more slots in cluster? */
 		--si->cluster_nr;
 		goto checks;
 	}
 
+	/*
+	 * Even if there's no free clusters available (fragmented),
+	 * try to scan a little more quickly with lock held unless we
+	 * have scanned too many slots already.
+	 */
+	if (!scanned_many) {
+		unsigned long scan_limit;
+
+		if (offset < scan_base)
+			scan_limit = scan_base;
+		else
+			scan_limit = si->highest_bit;
+		for (; offset <= scan_limit && --latency_ration > 0;
+		     offset++) {
+			if (!si->swap_map[offset])
+				goto checks;
+		}
+	}
+
 done:
+	set_cluster_next(si, offset + 1);
 	si->flags -= SWP_SCANNING;
 	return n_ret;
 
@@ -901,6 +940,7 @@ scan:
 		if (unlikely(--latency_ration < 0)) {
 			cond_resched();
 			latency_ration = LATENCY_LIMIT;
+			scanned_many = true;
 		}
 	}
 	offset = si->lowest_bit;
@@ -916,6 +956,7 @@ scan:
 		if (unlikely(--latency_ration < 0)) {
 			cond_resched();
 			latency_ration = LATENCY_LIMIT;
+			scanned_many = true;
 		}
 		offset++;
 	}
@@ -1004,11 +1045,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 	if (avail_pgs <= 0)
 		goto noswap;
 
-	if (n_goal > SWAP_BATCH)
-		n_goal = SWAP_BATCH;
-
-	if (n_goal > avail_pgs)
-		n_goal = avail_pgs;
+	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
 
 	atomic_long_sub(n_goal * size, &nr_swap_pages);
 
@@ -1275,13 +1312,14 @@ unlock_out:
 }
 
 static unsigned char __swap_entry_free(struct swap_info_struct *p,
-				       swp_entry_t entry, unsigned char usage)
+				       swp_entry_t entry)
 {
 	struct swap_cluster_info *ci;
 	unsigned long offset = swp_offset(entry);
+	unsigned char usage;
 
 	ci = lock_cluster_or_swap_info(p, offset);
-	usage = __swap_entry_free_locked(p, offset, usage);
+	usage = __swap_entry_free_locked(p, offset, 1);
 	unlock_cluster_or_swap_info(p, ci);
 	if (!usage)
 		free_swap_slot(entry);
@@ -1316,7 +1354,7 @@ void swap_free(swp_entry_t entry)
 
 	p = _swap_info_get(entry);
 	if (p)
-		__swap_entry_free(p, entry, 1);
+		__swap_entry_free(p, entry);
 }
 
 /*
@@ -1739,7 +1777,7 @@ int free_swap_and_cache(swp_entry_t entry)
 
 	p = _swap_info_get(entry);
 	if (p) {
-		count = __swap_entry_free(p, entry, 1);
+		count = __swap_entry_free(p, entry);
 		if (count == SWAP_HAS_CACHE &&
 		    !swap_page_trans_huge_swapped(p, entry))
 			__try_to_reclaim_swap(p, swp_offset(entry),
@@ -1937,10 +1975,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 		pte_unmap(pte);
 		swap_map = &si->swap_map[offset];
-		vmf.vma = vma;
-		vmf.address = addr;
-		vmf.pmd = pmd;
-		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+		page = lookup_swap_cache(entry, vma, addr);
+		if (!page) {
+			vmf.vma = vma;
+			vmf.address = addr;
+			vmf.pmd = pmd;
+			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+						&vmf);
+		}
 		if (!page) {
 			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
 				goto try_next;
@@ -2650,6 +2692,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	mutex_unlock(&swapon_mutex);
 	free_percpu(p->percpu_cluster);
 	p->percpu_cluster = NULL;
+	free_percpu(p->cluster_next_cpu);
+	p->cluster_next_cpu = NULL;
 	vfree(swap_map);
 	kvfree(cluster_info);
 	kvfree(frontswap_map);
@@ -2757,20 +2801,24 @@ static int swap_show(struct seq_file *swap, void *v)
 	struct swap_info_struct *si = v;
 	struct file *file;
 	int len;
+	unsigned int bytes, inuse;
 
 	if (si == SEQ_START_TOKEN) {
-		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
 		return 0;
 	}
 
+	bytes = si->pages << (PAGE_SHIFT - 10);
+	inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+
 	file = si->swap_file;
 	len = seq_file_path(swap, file, " \t\n\\");
-	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+	seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
 			len < 40 ? 40 - len : 1, " ",
 			S_ISBLK(file_inode(file)->i_mode) ?
 				"partition" : "file\t",
-			si->pages << (PAGE_SHIFT - 10),
-			si->inuse_pages << (PAGE_SHIFT - 10),
+			bytes, bytes < 10000000 ? "\t" : "",
+			inuse, inuse < 10000000 ? "\t" : "",
 			si->prio);
 	return 0;
 }
@@ -3202,11 +3250,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		unsigned long ci, nr_cluster;
 
 		p->flags |= SWP_SOLIDSTATE;
+		p->cluster_next_cpu = alloc_percpu(unsigned int);
+		if (!p->cluster_next_cpu) {
+			error = -ENOMEM;
+			goto bad_swap_unlock_inode;
+		}
 		/*
 		 * select a random position to start with to help wear leveling
 		 * SSD
 		 */
-		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+		for_each_possible_cpu(cpu) {
+			per_cpu(*p->cluster_next_cpu, cpu) =
+				1 + prandom_u32_max(p->highest_bit);
+		}
 		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
 		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
@@ -3322,6 +3378,8 @@ bad_swap_unlock_inode:
 bad_swap:
 	free_percpu(p->percpu_cluster);
 	p->percpu_cluster = NULL;
+	free_percpu(p->cluster_next_cpu);
+	p->cluster_next_cpu = NULL;
 	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
 		set_blocksize(p->bdev, p->old_block_size);
 		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -3654,7 +3712,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 
 	spin_lock(&si->cont_lock);
 	offset &= ~PAGE_MASK;
-	page = list_entry(head->lru.next, struct page, lru);
+	page = list_next_entry(head, lru);
 	map = kmap_atomic(page) + offset;
 
 	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
@@ -3666,13 +3724,13 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		 */
 		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
 			kunmap_atomic(map);
-			page = list_entry(page->lru.next, struct page, lru);
+			page = list_next_entry(page, lru);
 			BUG_ON(page == head);
 			map = kmap_atomic(page) + offset;
 		}
 		if (*map == SWAP_CONT_MAX) {
 			kunmap_atomic(map);
-			page = list_entry(page->lru.next, struct page, lru);
+			page = list_next_entry(page, lru);
 			if (page == head) {
 				ret = false;	/* add count continuation */
 				goto out;
@@ -3682,12 +3740,10 @@ init_map: *map = 0; /* we didn't zero the page */
 		}
 		*map += 1;
 		kunmap_atomic(map);
-		page = list_entry(page->lru.prev, struct page, lru);
-		while (page != head) {
+		while ((page = list_prev_entry(page, lru)) != head) {
 			map = kmap_atomic(page) + offset;
 			*map = COUNT_CONTINUED;
 			kunmap_atomic(map);
-			page = list_entry(page->lru.prev, struct page, lru);
 		}
 		ret = true;			/* incremented */
 
@@ -3698,7 +3754,7 @@ init_map: *map = 0; /* we didn't zero the page */
 		BUG_ON(count != COUNT_CONTINUED);
 		while (*map == COUNT_CONTINUED) {
 			kunmap_atomic(map);
-			page = list_entry(page->lru.next, struct page, lru);
+			page = list_next_entry(page, lru);
 			BUG_ON(page == head);
 			map = kmap_atomic(page) + offset;
 		}
@@ -3707,13 +3763,11 @@ init_map: *map = 0; /* we didn't zero the page */
 		if (*map == 0)
 			count = 0;
 		kunmap_atomic(map);
-		page = list_entry(page->lru.prev, struct page, lru);
-		while (page != head) {
+		while ((page = list_prev_entry(page, lru)) != head) {
 			map = kmap_atomic(page) + offset;
 			*map = SWAP_CONT_MAX | count;
 			count = COUNT_CONTINUED;
 			kunmap_atomic(map);
-			page = list_entry(page->lru.prev, struct page, lru);
 		}
 		ret = count == COUNT_CONTINUED;
 	}
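For illustration only, the trunk-switching idea behind set_cluster_next() in the hunk above can be read as a small standalone C sketch. This is not part of the patch: pick_next_slot(), the rand()-based randomization, and the demo slot range are made-up stand-ins for the kernel's per-CPU cluster_next_cpu handling and prandom_u32_max(); only the boundary check and the align-down step mirror the patched code.

/* trunk_demo.c - standalone sketch of the trunk-switching idea. */
#include <stdio.h>
#include <stdlib.h>

#define SWAP_ADDRESS_SPACE_SHIFT	14	/* 2^14 slots per trunk for the demo */
#define SWAP_ADDRESS_SPACE_PAGES	(1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define ALIGN_DOWN(x, a)		((x) & ~((a) - 1))

static unsigned long lowest_bit = 1;		/* usable slot range, made up for the demo */
static unsigned long highest_bit = 1000000;

/*
 * Advance a per-CPU style cursor from prev to next.  If the step stays
 * inside the current trunk, keep it; if it crosses a trunk boundary,
 * restart at the beginning of a randomly chosen trunk instead, so that
 * concurrent allocators tend to spread over different trunks.
 */
static unsigned long pick_next_slot(unsigned long prev, unsigned long next)
{
	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) ==
	    (next >> SWAP_ADDRESS_SPACE_SHIFT))
		return next;

	next = lowest_bit + rand() % (highest_bit - lowest_bit + 1);
	next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
	if (next < lowest_bit)
		next = lowest_bit;
	return next;
}

int main(void)
{
	srand(42);
	/* Same trunk: the cursor simply advances. */
	printf("%lu\n", pick_next_slot(100, 101));
	/* Boundary crossed: the cursor jumps to a random trunk start. */
	printf("%lu\n", pick_next_slot(SWAP_ADDRESS_SPACE_PAGES - 1,
				       SWAP_ADDRESS_SPACE_PAGES));
	return 0;
}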