From 51726b1222863852c46ca21ed0115b85d1edfd89 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:25 -0800 Subject: mm: replace some BUG_ONs by VM_BUG_ONs The swap code is over-provisioned with BUG_ONs on assorted page flags, mostly dating back to 2.3. They're good documentation, and guard against developer error, but a waste of space on most systems: change them to VM_BUG_ONs, conditional on CONFIG_DEBUG_VM. Just delete the PagePrivate ones: they're later, from 2.5.69, but even less interesting now. Signed-off-by: Hugh Dickins Reviewed-by: Christoph Lameter Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a9f87e5162..214e90b94946 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -333,7 +333,7 @@ int can_share_swap_page(struct page *page) { int count; - BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageLocked(page)); count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) count += page_swapcount(page); @@ -350,8 +350,7 @@ static int remove_exclusive_swap_page_count(struct page *page, int count) struct swap_info_struct * p; swp_entry_t entry; - BUG_ON(PagePrivate(page)); - BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageLocked(page)); if (!PageSwapCache(page)) return 0; @@ -432,7 +431,6 @@ void free_swap_and_cache(swp_entry_t entry) if (page) { int one_user; - BUG_ON(PagePrivate(page)); one_user = (page_count(page) == 2); /* Only cache user (+us), or swap space full? Free it! */ /* Also recheck PageSwapCache after page is locked (above) */ @@ -1209,7 +1207,7 @@ int page_queue_congested(struct page *page) { struct backing_dev_info *bdi; - BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ + VM_BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ if (PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; -- cgit v1.2.3 From 7b1fe59793e61f826bef053107b57b23954833bb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:34 -0800 Subject: mm: reuse_swap_page replaces can_share_swap_page A good place to free up old swap is where do_wp_page(), or do_swap_page(), is about to redirty the page: the data on disk is then stale and won't be read again; and if we do decide to write the page out later, using the previous swap location makes an unnecessary disk seek very likely. So give can_share_swap_page() the side-effect of delete_from_swap_cache() when it safely can. And can_share_swap_page() was always a misleading name, the more so if it has a side-effect: rename it reuse_swap_page(). Irrelevant cleanup nearby: remove swap_token_default_timeout definition from swap.h: it's used nowhere. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Acked-by: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++---- mm/memory.c | 4 ++-- mm/swapfile.c | 15 +++++++++++---- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 48f309dc5a0c..366556c5b148 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -304,7 +304,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); extern sector_t swapdev_block(int, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); -extern int can_share_swap_page(struct page *); +extern int reuse_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); extern int remove_exclusive_swap_page_ref(struct page *); struct backing_dev_info; @@ -372,8 +372,6 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp) return NULL; } -#define can_share_swap_page(p) (page_mapcount(p) == 1) - static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { @@ -388,7 +386,7 @@ static inline void delete_from_swap_cache(struct page *page) { } -#define swap_token_default_timeout 0 +#define reuse_swap_page(page) (page_mapcount(page) == 1) static inline int remove_exclusive_swap_page(struct page *p) { diff --git a/mm/memory.c b/mm/memory.c index 3922ffcf3dff..8f471edcb985 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1861,7 +1861,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } - reuse = can_share_swap_page(old_page); + reuse = reuse_swap_page(old_page); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2392,7 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) { + if (write_access && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); write_access = 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 214e90b94946..bfd4ee59cb88 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -326,17 +326,24 @@ static inline int page_swapcount(struct page *page) } /* - * We can use this swap cache entry directly - * if there are no other references to it. + * We can write to an anon page without COW if there are no other references + * to it. And as a side-effect, free up its swap: because the old content + * on disk will never be read, and seeking back there to write new content + * later would only waste time away from clustering. */ -int can_share_swap_page(struct page *page) +int reuse_swap_page(struct page *page) { int count; VM_BUG_ON(!PageLocked(page)); count = page_mapcount(page); - if (count <= 1 && PageSwapCache(page)) + if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); + if (count == 1 && !PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + } return count == 1; } -- cgit v1.2.3 From a2c43eed8334e878702fca713b212ae2a11d84b9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:36 -0800 Subject: mm: try_to_free_swap replaces remove_exclusive_swap_page remove_exclusive_swap_page(): its problem is in living up to its name. It doesn't matter if someone else has a reference to the page (raised page_count); it doesn't matter if the page is mapped into userspace (raised page_mapcount - though that hints it may be worth keeping the swap): all that matters is that there be no more references to the swap (and no writeback in progress). swapoff (try_to_unuse) has been removing pages from swapcache for years, with no concern for page count or page mapcount, and we used to have a comment in lookup_swap_cache() recognizing that: if you go for a page of swapcache, you'll get the right page, but it could have been removed from swapcache by the time you get page lock. So, give up asking for exclusivity: get rid of remove_exclusive_swap_page(), and remove_exclusive_swap_page_ref() and remove_exclusive_swap_page_count() which were spawned for the recent LRU work: replace them by the simpler try_to_free_swap() which just checks page_swapcount(). Similarly, remove the page_count limitation from free_swap_and_count(), but assume that it's worth holding on to the swap if page is mapped and swap nowhere near full. Add a vm_swap_full() test in free_swap_cache()? It would be consistent, but I think we probably have enough for now. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 10 ++------ mm/memory.c | 2 +- mm/page_io.c | 2 +- mm/swap.c | 3 +-- mm/swap_state.c | 8 +++--- mm/swapfile.c | 70 +++++++++------------------------------------------- mm/vmscan.c | 2 +- 7 files changed, 22 insertions(+), 75 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 366556c5b148..c3ecd478840e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -305,8 +305,7 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); extern sector_t swapdev_block(int, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int reuse_swap_page(struct page *); -extern int remove_exclusive_swap_page(struct page *); -extern int remove_exclusive_swap_page_ref(struct page *); +extern int try_to_free_swap(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ @@ -388,12 +387,7 @@ static inline void delete_from_swap_cache(struct page *page) #define reuse_swap_page(page) (page_mapcount(page) == 1) -static inline int remove_exclusive_swap_page(struct page *p) -{ - return 0; -} - -static inline int remove_exclusive_swap_page_ref(struct page *page) +static inline int try_to_free_swap(struct page *page) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index 8f471edcb985..1a83fe5339a9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2403,7 +2403,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - remove_exclusive_swap_page(page); + try_to_free_swap(page); unlock_page(page); if (write_access) { diff --git a/mm/page_io.c b/mm/page_io.c index d277a80efa71..dc6ce0afbded 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) struct bio *bio; int ret = 0, rw = WRITE; - if (remove_exclusive_swap_page(page)) { + if (try_to_free_swap(page)) { unlock_page(page); goto out; } diff --git a/mm/swap.c b/mm/swap.c index ff0b290475fd..ba2c0e8b8b54 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -454,8 +454,7 @@ void pagevec_swap_free(struct pagevec *pvec) struct page *page = pvec->pages[i]; if (PageSwapCache(page) && trylock_page(page)) { - if (PageSwapCache(page)) - remove_exclusive_swap_page_ref(page); + try_to_free_swap(page); unlock_page(page); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index e793fdea275d..bcb472769299 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -195,14 +195,14 @@ void delete_from_swap_cache(struct page *page) * If we are the only user, then try to free up the swap cache. * * Its ok to check for PageSwapCache without the page lock - * here because we are going to recheck again inside - * exclusive_swap_page() _with_ the lock. + * here because we are going to recheck again inside + * try_to_free_swap() _with_ the lock. * - Marcelo */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && trylock_page(page)) { - remove_exclusive_swap_page(page); + if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { + try_to_free_swap(page); unlock_page(page); } } diff --git a/mm/swapfile.c b/mm/swapfile.c index bfd4ee59cb88..f43601827607 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -348,68 +348,23 @@ int reuse_swap_page(struct page *page) } /* - * Work out if there are any other processes sharing this - * swap cache page. Free it if you can. Return success. + * If swap is getting full, or if there are no more mappings of this page, + * then try_to_free_swap is called to free its swap space. */ -static int remove_exclusive_swap_page_count(struct page *page, int count) +int try_to_free_swap(struct page *page) { - int retval; - struct swap_info_struct * p; - swp_entry_t entry; - VM_BUG_ON(!PageLocked(page)); if (!PageSwapCache(page)) return 0; if (PageWriteback(page)) return 0; - if (page_count(page) != count) /* us + cache + ptes */ - return 0; - - entry.val = page_private(page); - p = swap_info_get(entry); - if (!p) + if (page_swapcount(page)) return 0; - /* Is the only swap cache user the cache itself? */ - retval = 0; - if (p->swap_map[swp_offset(entry)] == 1) { - /* Recheck the page count with the swapcache lock held.. */ - spin_lock_irq(&swapper_space.tree_lock); - if ((page_count(page) == count) && !PageWriteback(page)) { - __delete_from_swap_cache(page); - SetPageDirty(page); - retval = 1; - } - spin_unlock_irq(&swapper_space.tree_lock); - } - spin_unlock(&swap_lock); - - if (retval) { - swap_free(entry); - page_cache_release(page); - } - - return retval; -} - -/* - * Most of the time the page should have two references: one for the - * process and one for the swap cache. - */ -int remove_exclusive_swap_page(struct page *page) -{ - return remove_exclusive_swap_page_count(page, 2); -} - -/* - * The pageout code holds an extra reference to the page. That raises - * the reference count to test for to 2 for a page that is only in the - * swap cache plus 1 for each process that maps the page. - */ -int remove_exclusive_swap_page_ref(struct page *page) -{ - return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); + delete_from_swap_cache(page); + SetPageDirty(page); + return 1; } /* @@ -436,13 +391,12 @@ void free_swap_and_cache(swp_entry_t entry) spin_unlock(&swap_lock); } if (page) { - int one_user; - - one_user = (page_count(page) == 2); - /* Only cache user (+us), or swap space full? Free it! */ - /* Also recheck PageSwapCache after page is locked (above) */ + /* + * Not mapped elsewhere, or swap space full? Free it! + * Also recheck PageSwapCache now page is locked (above). + */ if (PageSwapCache(page) && !PageWriteback(page) && - (one_user || vm_swap_full())) { + (!page_mapped(page) || vm_swap_full())) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d196f46c8808..c8601dd36603 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -759,7 +759,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) - remove_exclusive_swap_page_ref(page); + try_to_free_swap(page); VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; -- cgit v1.2.3 From 68bdc8d64742ccc5e340c5d122ebbab3f0cf2a74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:37 -0800 Subject: mm: try_to_unuse check removing right swap There's a possible race in try_to_unuse() which Nick Piggin led me to two years ago. Where it does lock_page() after read_swap_cache_async(), what if another task removed that page from swapcache just before we locked it? It would sail though the (*swap_map > 1) tests doing nothing (because it could not have been removed from swapcache before its swap references were gone), until it reaches the delete_from_swap_cache(page) near the bottom. Now imagine that this page has been allocated to swap on a different swap area while we dropped page lock (perhaps at the top, perhaps in unuse_mm): we could wrongly remove from swap cache before the page has been written to swap, so a subsequent do_swap_page() would read in stale data from swap. I think this case could not happen before: remove_exclusive_swap_page() refused while page count was raised. But now with reuse_swap_page() and try_to_free_swap() removing from swap cache without minding page count, I think it could happen - the previous patch argued that it was safe because try_to_unuse() already ignored page count, but overlooked that it might be breaking the assumptions in try_to_unuse() itself. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index f43601827607..9ce7f81c8abc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -889,7 +889,16 @@ static int try_to_unuse(unsigned int type) lock_page(page); wait_on_page_writeback(page); } - if (PageSwapCache(page)) + + /* + * It is conceivable that a racing task removed this page from + * swap cache just before we acquired the page lock at the top, + * or while we dropped it in unuse_mm(). The page might even + * be back in swap cache on another swap area: that we must not + * delete, since it may not have been written out to swap yet. + */ + if (PageSwapCache(page) && + likely(page_private(page) == entry.val)) delete_from_swap_cache(page); /* -- cgit v1.2.3 From b962716b459505a8d83aea313fea0abe76749f42 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:41 -0800 Subject: mm: optimize get_scan_ratio for no swap Rik suggests a simplified get_scan_ratio() for !CONFIG_SWAP. Yes, the gcc optimizer gives us that, when nr_swap_pages is #defined as 0L. Move usual declaration to swapfile.c: it never belonged in page_alloc.c. Signed-off-by: Hugh Dickins Cc: Lee Schermerhorn Acked-by: Rik van Riel Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++-- mm/page_alloc.c | 1 - mm/swapfile.c | 1 + mm/vmscan.c | 12 ++++++------ 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index c0d23ac710d5..3a31cc25bd2c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -163,7 +163,6 @@ struct swap_list_t { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; -extern long nr_swap_pages; extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); @@ -291,6 +290,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); /* linux/mm/swapfile.c */ +extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); @@ -331,7 +331,8 @@ static inline void disable_swap_token(void) #else /* CONFIG_SWAP */ -#define total_swap_pages 0 +#define nr_swap_pages 0L +#define total_swap_pages 0L #define total_swapcache_pages 0UL #define si_swapinfo(val) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce2a026219bc..65133436fe3d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,7 +69,6 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -long nr_swap_pages; int percpu_pagelist_fraction; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE diff --git a/mm/swapfile.c b/mm/swapfile.c index 9ce7f81c8abc..725e56c362de 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -35,6 +35,7 @@ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; +long nr_swap_pages; long total_swap_pages; static int swap_overflow; static int least_priority; diff --git a/mm/vmscan.c b/mm/vmscan.c index f350523a8eee..d500b906746c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1327,12 +1327,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long anon_prio, file_prio; unsigned long ap, fp; - anon = zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - file = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - free = zone_page_state(zone, NR_FREE_PAGES); - /* If we have no swap space, do not bother scanning anon pages. */ if (nr_swap_pages <= 0) { percent[0] = 0; @@ -1340,6 +1334,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, return; } + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + /* If we have very few page cache pages, force-scan anon pages. */ if (unlikely(file + free <= zone->pages_high)) { percent[0] = 100; -- cgit v1.2.3 From 73fd8748ab0b9b3ddd178bea1d7ae03372033d96 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:47 -0800 Subject: swapfile: swapon needs larger size type sys_swapon()'s swapfilesize (better renamed swapfilepages) is declared as an int, but should be an unsigned long like the maxpages it's compared against: on 64-bit (with 4kB pages) a swapfile of 2^44 bytes was rejected with "Swap area shorter than signature indicates". Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 725e56c362de..e2adc8eb9317 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1461,7 +1461,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) int nr_extents = 0; sector_t span; unsigned long maxpages = 1; - int swapfilesize; + unsigned long swapfilepages; unsigned short *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -1539,7 +1539,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } - swapfilesize = i_size_read(inode) >> PAGE_SHIFT; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; /* * Read the swap header. @@ -1616,7 +1616,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = -EINVAL; if (!maxpages) goto bad_swap; - if (swapfilesize && maxpages > swapfilesize) { + if (swapfilepages && maxpages > swapfilepages) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); goto bad_swap; -- cgit v1.2.3 From 22c6f8fdb31993cf49bdd4a47b64a7002391e1c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:48 -0800 Subject: swapfile: remove SWP_ACTIVE mask Remove the SWP_ACTIVE mask: it just obscures the SWP_WRITEOK flag. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - mm/swapfile.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3a31cc25bd2c..410c8e473727 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -120,7 +120,6 @@ struct swap_extent { enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; diff --git a/mm/swapfile.c b/mm/swapfile.c index e2adc8eb9317..915cb3fc43d7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1222,7 +1222,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) spin_lock(&swap_lock); for (type = swap_list.head; type >= 0; type = swap_info[type].next) { p = swap_info + type; - if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { + if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) break; } @@ -1674,7 +1674,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) else p->prio = --least_priority; p->swap_map = swap_map; - p->flags = SWP_ACTIVE; + p->flags |= SWP_WRITEOK; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; -- cgit v1.2.3 From 886bb7e9c3ed0bb3e4a2b1f336d8c6a6e5a4b782 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:48 -0800 Subject: swapfile: remove surplus whitespace Remove trailing whitespace from swapfile.c, and odd swap_show() alignment. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 915cb3fc43d7..c46c83d6aab0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -92,7 +92,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) unsigned long offset, last_in_cluster; int latency_ration = LATENCY_LIMIT; - /* + /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting @@ -269,7 +269,7 @@ bad_nofile: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; -} +} static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { @@ -736,10 +736,10 @@ static int try_to_unuse(unsigned int type) break; } - /* + /* * Get a page for the entry, using the existing swap * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. + * page and read the swap into it. */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); @@ -1202,7 +1202,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) char * pathname; int i, type, prev; int err; - + if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1395,12 +1395,12 @@ static int swap_show(struct seq_file *swap, void *v) file = ptr->swap_file; len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", - len < 40 ? 40 - len : 1, " ", - S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? + len < 40 ? 40 - len : 1, " ", + S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? "partition" : "file\t", - ptr->pages << (PAGE_SHIFT - 10), - ptr->inuse_pages << (PAGE_SHIFT - 10), - ptr->prio); + ptr->pages << (PAGE_SHIFT - 10), + ptr->inuse_pages << (PAGE_SHIFT - 10), + ptr->prio); return 0; } @@ -1565,7 +1565,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = -EINVAL; goto bad_swap; } - + switch (swap_header_version) { case 1: printk(KERN_ERR "version 0 swap is no longer supported. " -- cgit v1.2.3 From 81e33971271ec8603fe696731ff9967afb99e729 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:49 -0800 Subject: swapfile: remove v0 SWAP-SPACE message The kernel has not supported v0 SWAP-SPACE since 2.5.22: I think we can now safely drop its "version 0 swap is no longer supported" message - just say "Unable to find swap-space signature" as usual. This removes one level of indentation from a stretch of sys_swapon(). I'd have liked to be specific, saying "Unable to find SWAPSPACE2 signature", but it's just too confusing that the version 1 signature shows the number 2. Irrelevant nearby cleanup: kmap(page) already gives page_address(page). Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 146 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 65 insertions(+), 81 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index c46c83d6aab0..85ff603385c3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1456,7 +1456,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) int i, prev; int error; union swap_header *swap_header = NULL; - int swap_header_version; unsigned int nr_good_pages = 0; int nr_extents = 0; sector_t span; @@ -1553,101 +1552,86 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = PTR_ERR(page); goto bad_swap; } - kmap(page); - swap_header = page_address(page); + swap_header = kmap(page); - if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) - swap_header_version = 1; - else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) - swap_header_version = 2; - else { + if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); error = -EINVAL; goto bad_swap; } - switch (swap_header_version) { - case 1: - printk(KERN_ERR "version 0 swap is no longer supported. " - "Use mkswap -v1 %s\n", name); + /* swap partition endianess hack... */ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } + /* Check the swap header's sub-version */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); error = -EINVAL; goto bad_swap; - case 2: - /* swap partition endianess hack... */ - if (swab32(swap_header->info.version) == 1) { - swab32s(&swap_header->info.version); - swab32s(&swap_header->info.last_page); - swab32s(&swap_header->info.nr_badpages); - for (i = 0; i < swap_header->info.nr_badpages; i++) - swab32s(&swap_header->info.badpages[i]); - } - /* Check the swap header's sub-version and the size of - the swap file and bad block lists */ - if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); - error = -EINVAL; - goto bad_swap; - } + } - p->lowest_bit = 1; - p->cluster_next = 1; + p->lowest_bit = 1; + p->cluster_next = 1; - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number of - * bits for the swap offset in the swp_entry_t type and - * 2) the number of bits in the a swap pte as defined by - * the different architectures. In order to find the - * largest possible bit mask a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again and finally the swap - * offset is extracted. This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. - */ - maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number of + * bits for the swap offset in the swp_entry_t type and + * 2) the number of bits in the a swap pte as defined by + * the different architectures. In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; - error = -EINVAL; - if (!maxpages) - goto bad_swap; - if (swapfilepages && maxpages > swapfilepages) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - goto bad_swap; - } - if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; - if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + error = -EINVAL; + if (!maxpages) + goto bad_swap; + if (swapfilepages && maxpages > swapfilepages) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + goto bad_swap; + } + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap; - } + /* OK, set up the swap map and apply the bad block list */ + swap_map = vmalloc(maxpages * sizeof(short)); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } - error = 0; - memset(swap_map, 0, maxpages * sizeof(short)); - for (i = 0; i < swap_header->info.nr_badpages; i++) { - int page_nr = swap_header->info.badpages[i]; - if (page_nr <= 0 || page_nr >= swap_header->info.last_page) - error = -EINVAL; - else - swap_map[page_nr] = SWAP_MAP_BAD; - } - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (error) + memset(swap_map, 0, maxpages * sizeof(short)); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { + error = -EINVAL; goto bad_swap; + } + swap_map[page_nr] = SWAP_MAP_BAD; } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; -- cgit v1.2.3 From ebebbbe904634b0ca1c674457b399f68db5e05b1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:50 -0800 Subject: swapfile: rearrange scan and swap_info Before making functional changes, rearrange scan_swap_map() to simplify subsequent diffs. Actually, there is one functional change in there: leave cluster_nr negative while scanning for a new cluster - resetting it early increased the likelihood that when we have difficulty finding a free cluster, another task may come in and try doing exactly the same - just a waste of cpu. Before making functional changes, rearrange struct swap_info_struct slightly: flags will be needed as an unsigned long (for wait_on_bit), next is a good int to pair with prio, old_block_size is uninteresting so shift it to the end. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 8 +++---- mm/swapfile.c | 66 +++++++++++++++++++++++++++++----------------------- 2 files changed, 41 insertions(+), 33 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 410c8e473727..9cabb8b21aba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -133,14 +133,14 @@ enum { * The in-memory structure used to track swap areas. */ struct swap_info_struct { - unsigned int flags; + unsigned long flags; int prio; /* swap priority */ + int next; /* next entry on swap list */ struct file *swap_file; struct block_device *bdev; struct list_head extent_list; struct swap_extent *curr_swap_extent; - unsigned old_block_size; - unsigned short * swap_map; + unsigned short *swap_map; unsigned int lowest_bit; unsigned int highest_bit; unsigned int cluster_next; @@ -148,7 +148,7 @@ struct swap_info_struct { unsigned int pages; unsigned int max; unsigned int inuse_pages; - int next; /* next entry on swap list */ + unsigned int old_block_size; }; struct swap_list_t { diff --git a/mm/swapfile.c b/mm/swapfile.c index 85ff603385c3..4d9855f86e7d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -89,7 +89,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) static inline unsigned long scan_swap_map(struct swap_info_struct *si) { - unsigned long offset, last_in_cluster; + unsigned long offset; + unsigned long last_in_cluster; int latency_ration = LATENCY_LIMIT; /* @@ -103,10 +104,13 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) */ si->flags += SWP_SCANNING; - if (unlikely(!si->cluster_nr)) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) - goto lowest; + offset = si->cluster_next; + + if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; + } spin_unlock(&swap_lock); offset = si->lowest_bit; @@ -118,43 +122,47 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER+1; - goto cluster; + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } } + + offset = si->lowest_bit; spin_lock(&swap_lock); - goto lowest; + si->cluster_nr = SWAPFILE_CLUSTER - 1; } - si->cluster_nr--; -cluster: - offset = si->cluster_next; - if (offset > si->highest_bit) -lowest: offset = si->lowest_bit; -checks: if (!(si->flags & SWP_WRITEOK)) +checks: + if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) goto no_page; - if (!si->swap_map[offset]) { - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - } - si->swap_map[offset] = 1; - si->cluster_next = offset + 1; - si->flags -= SWP_SCANNING; - return offset; + if (offset > si->highest_bit) + offset = si->lowest_bit; + if (si->swap_map[offset]) + goto scan; + + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + si->inuse_pages++; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; } + si->swap_map[offset] = 1; + si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; + return offset; +scan: spin_unlock(&swap_lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { @@ -167,7 +175,7 @@ checks: if (!(si->flags & SWP_WRITEOK)) } } spin_lock(&swap_lock); - goto lowest; + goto checks; no_page: si->flags -= SWP_SCANNING; -- cgit v1.2.3 From 6a6ba83175c029c7820765bae44692266b29e67a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:51 -0800 Subject: swapfile: swapon use discard (trim) When adding swap, all the old data on swap can be forgotten: sys_swapon() discard all but the header page of the swap partition (or every extent but the header of the swap file), to give a solidstate swap device the opportunity to optimize its wear-levelling. If that succeeds, note SWP_DISCARDABLE for later use, and report it with a "D" at the right end of the kernel's "Adding ... swap" message. Perhaps something should be shown in /proc/swaps (swapon -s), but we have to be more cautious before making any addition to that format. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: David Woodhouse Cc: Jens Axboe Cc: Matthew Wilcox Cc: Joern Engel Cc: James Bottomley Cc: Donjun Shin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 9cabb8b21aba..0b9210ea96c7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -120,6 +120,7 @@ struct swap_extent { enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ + SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; diff --git a/mm/swapfile.c b/mm/swapfile.c index 4d9855f86e7d..fbeb4bb8eb50 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -84,6 +84,37 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) up_read(&swap_unplug_sem); } +/* + * swapon tell device that all the old swap contents can be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static int discard_swap(struct swap_info_struct *si) +{ + struct swap_extent *se; + int err = 0; + + list_for_each_entry(se, &si->extent_list, list) { + sector_t start_block = se->start_block << (PAGE_SHIFT - 9); + pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9); + + if (se->start_page == 0) { + /* Do not discard the swap header page! */ + start_block += 1 << (PAGE_SHIFT - 9); + nr_blocks -= 1 << (PAGE_SHIFT - 9); + if (!nr_blocks) + continue; + } + + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL); + if (err) + break; + + cond_resched(); + } + return err; /* That will often be -EOPNOTSUPP */ +} + #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 @@ -1658,6 +1689,9 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); if (swap_flags & SWAP_FLAG_PREFER) @@ -1671,9 +1705,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) total_swap_pages += nr_good_pages; printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk\n", + "Priority:%d extents:%d across:%lluk%s\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + (p->flags & SWP_DISCARDABLE) ? " D" : ""); /* insert swap space into swap_list: */ prev = -1; -- cgit v1.2.3 From 7992fde72ce06c73280a1939b7a1e903bc95ef85 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:53 -0800 Subject: swapfile: swap allocation use discard When scan_swap_map() finds a free cluster of swap pages to allocate, discard the old contents of the cluster if the device supports discard. But don't bother when swap is so fragmented that we allocate single pages. Be careful about racing allocations made while we're scanning for a cluster; and hold up allocations made while we're discarding. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: David Woodhouse Cc: Jens Axboe Cc: Matthew Wilcox Cc: Joern Engel Cc: James Bottomley Cc: Donjun Shin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++ mm/swapfile.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 121 insertions(+), 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0b9210ea96c7..fe79f44c858e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -121,6 +121,7 @@ enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ + SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; @@ -144,6 +145,8 @@ struct swap_info_struct { unsigned short *swap_map; unsigned int lowest_bit; unsigned int highest_bit; + unsigned int lowest_alloc; /* while preparing discard cluster */ + unsigned int highest_alloc; /* while preparing discard cluster */ unsigned int cluster_next; unsigned int cluster_nr; unsigned int pages; diff --git a/mm/swapfile.c b/mm/swapfile.c index fbeb4bb8eb50..ca75b9e7c09f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si) return err; /* That will often be -EOPNOTSUPP */ } +/* + * swap allocation tell device that a cluster of swap can now be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static void discard_swap_cluster(struct swap_info_struct *si, + pgoff_t start_page, pgoff_t nr_pages) +{ + struct swap_extent *se = si->curr_swap_extent; + int found_extent = 0; + + while (nr_pages) { + struct list_head *lh; + + if (se->start_page <= start_page && + start_page < se->start_page + se->nr_pages) { + pgoff_t offset = start_page - se->start_page; + sector_t start_block = se->start_block + offset; + pgoff_t nr_blocks = se->nr_pages - offset; + + if (nr_blocks > nr_pages) + nr_blocks = nr_pages; + start_page += nr_blocks; + nr_pages -= nr_blocks; + + if (!found_extent++) + si->curr_swap_extent = se; + + start_block <<= PAGE_SHIFT - 9; + nr_blocks <<= PAGE_SHIFT - 9; + if (blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_NOIO)) + break; + } + + lh = se->list.next; + if (lh == &si->extent_list) + lh = lh->next; + se = list_entry(lh, struct swap_extent, list); + } +} + +static int wait_for_discard(void *word) +{ + schedule(); + return 0; +} + #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si) { unsigned long offset; - unsigned long last_in_cluster; + unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + int found_free_cluster = 0; /* * We try to cluster swap pages by allocating them sequentially @@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } + if (si->flags & SWP_DISCARDABLE) { + /* + * Start range check on racing allocations, in case + * they overlap the cluster we eventually decide on + * (we scan without swap_lock to allow preemption). + * It's hardly conceivable that cluster_nr could be + * wrapped during our scan, but don't depend on it. + */ + if (si->lowest_alloc) + goto checks; + si->lowest_alloc = si->max; + si->highest_alloc = 0; + } spin_unlock(&swap_lock); offset = si->lowest_bit; @@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) offset = si->lowest_bit; spin_lock(&swap_lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; + si->lowest_alloc = 0; } checks: @@ -191,6 +254,60 @@ checks: si->swap_map[offset] = 1; si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; + + if (si->lowest_alloc) { + /* + * Only set when SWP_DISCARDABLE, and there's a scan + * for a free cluster in progress or just completed. + */ + if (found_free_cluster) { + /* + * To optimize wear-levelling, discard the + * old data of the cluster, taking care not to + * discard any of its pages that have already + * been allocated by racing tasks (offset has + * already stepped over any at the beginning). + */ + if (offset < si->highest_alloc && + si->lowest_alloc <= last_in_cluster) + last_in_cluster = si->lowest_alloc - 1; + si->flags |= SWP_DISCARDING; + spin_unlock(&swap_lock); + + if (offset < last_in_cluster) + discard_swap_cluster(si, offset, + last_in_cluster - offset + 1); + + spin_lock(&swap_lock); + si->lowest_alloc = 0; + si->flags &= ~SWP_DISCARDING; + + smp_mb(); /* wake_up_bit advises this */ + wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); + + } else if (si->flags & SWP_DISCARDING) { + /* + * Delay using pages allocated by racing tasks + * until the whole discard has been issued. We + * could defer that delay until swap_writepage, + * but it's easier to keep this self-contained. + */ + spin_unlock(&swap_lock); + wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), + wait_for_discard, TASK_UNINTERRUPTIBLE); + spin_lock(&swap_lock); + } else { + /* + * Note pages allocated by racing tasks while + * scan for a free cluster is in progress, so + * that its final discard can exclude them. + */ + if (offset < si->lowest_alloc) + si->lowest_alloc = offset; + if (offset > si->highest_alloc) + si->highest_alloc = offset; + } + } return offset; scan: -- cgit v1.2.3 From 20137a490f397d9c01fc9fadd83a8d198bda4477 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:54 -0800 Subject: swapfile: swapon randomize if nonrot Swap allocation has always started from the beginning of the swap area; but if we're dealing with a solidstate swap device which can only remap blocks within limited zones, that would sooner wear out the first zone. Therefore sys_swapon() test whether blk_queue is non-rotational, and if so randomize the cluster_next starting position for allocation. If blk_queue is nonrot, note SWP_SOLIDSTATE for later use, and report it with an "SS" at the right end of the kernel's "Adding ... swap" message (so that if it's both nonrot and discardable, "SSD" will be shown there). Perhaps something should be shown in /proc/swaps (swapon -s), but we have to be more cautious before making any addition to that format. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: David Woodhouse Cc: Jens Axboe Cc: Matthew Wilcox Cc: Joern Engel Cc: James Bottomley Cc: Donjun Shin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index fe79f44c858e..cbf7fbed3dfd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -122,6 +122,7 @@ enum { SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ + SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; diff --git a/mm/swapfile.c b/mm/swapfile.c index ca75b9e7c09f..b0f56603b9be 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1806,6 +1807,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + srandom32((u32)get_seconds()); + p->cluster_next = 1 + (random32() % p->highest_bit); + } if (discard_swap(p) == 0) p->flags |= SWP_DISCARDABLE; @@ -1822,10 +1828,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) total_swap_pages += nr_good_pages; printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk%s\n", + "Priority:%d extents:%d across:%lluk %s%s\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), - (p->flags & SWP_DISCARDABLE) ? " D" : ""); + (p->flags & SWP_SOLIDSTATE) ? "SS" : "", + (p->flags & SWP_DISCARDABLE) ? "D" : ""); /* insert swap space into swap_list: */ prev = -1; -- cgit v1.2.3 From c60aa176c6de82703f064082b909496fc4fee956 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:55 -0800 Subject: swapfile: swap allocation cycle if nonrot Though attempting to find free clusters (Andrea), swap allocation has always restarted its searches from the beginning of the swap area (sct), to reduce seek times between swap pages, by not scattering them all over the partition. But on a solidstate swap device, seeks are cheap, and block remapping to level the wear may be limited by zones: in that case it's better to cycle around the whole partition. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: David Woodhouse Cc: Jens Axboe Cc: Matthew Wilcox Cc: Joern Engel Cc: James Bottomley Cc: Donjun Shin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index b0f56603b9be..763210732b5f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -169,6 +169,7 @@ static int wait_for_discard(void *word) static inline unsigned long scan_swap_map(struct swap_info_struct *si) { unsigned long offset; + unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; int found_free_cluster = 0; @@ -181,10 +182,11 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea + * And we let swap pages go all over an SSD partition. Hugh */ si->flags += SWP_SCANNING; - offset = si->cluster_next; + scan_base = offset = si->cluster_next; if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { @@ -206,7 +208,16 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) } spin_unlock(&swap_lock); - offset = si->lowest_bit; + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. + * But if seek is cheap, search from our current position, so + * that swap is allocated from all over the partition: if the + * Flash Translation Layer only remaps within limited zones, + * we don't want to wear out the first zone too quickly. + */ + if (!(si->flags & SWP_SOLIDSTATE)) + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -228,6 +239,27 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) } offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster < scan_base; offset++) { + if (si->swap_map[offset]) + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + spin_lock(&swap_lock); + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + + offset = scan_base; spin_lock(&swap_lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; si->lowest_alloc = 0; @@ -239,7 +271,7 @@ checks: if (!si->highest_bit) goto no_page; if (offset > si->highest_bit) - offset = si->lowest_bit; + scan_base = offset = si->lowest_bit; if (si->swap_map[offset]) goto scan; @@ -323,8 +355,18 @@ scan: latency_ration = LATENCY_LIMIT; } } + offset = si->lowest_bit; + while (++offset < scan_base) { + if (!si->swap_map[offset]) { + spin_lock(&swap_lock); + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } spin_lock(&swap_lock); - goto checks; no_page: si->flags -= SWP_SCANNING; -- cgit v1.2.3 From 858a29900ea2d639759e697be901a60b759cdcfb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:56 -0800 Subject: swapfile: change discard pgoff_t to sector_t Change pgoff_t nr_blocks in discard_swap() and discard_swap_cluster() to sector_t: given the constraints on swap offsets (in particular, the 5 bits of swap type accommodated in the same unsigned long), pgoff_t was actually safe as is, but it certainly looked worrying when shifted left. [akpm@linux-foundation.org: fix shift overflow] Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Nick Piggin Cc: David Woodhouse Cc: Jens Axboe Cc: Matthew Wilcox Cc: Joern Engel Cc: James Bottomley Cc: Donjun Shin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 763210732b5f..6a078557306a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -96,7 +96,7 @@ static int discard_swap(struct swap_info_struct *si) list_for_each_entry(se, &si->extent_list, list) { sector_t start_block = se->start_block << (PAGE_SHIFT - 9); - pgoff_t nr_blocks = se->nr_pages << (PAGE_SHIFT - 9); + sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); if (se->start_page == 0) { /* Do not discard the swap header page! */ @@ -133,7 +133,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_page < se->start_page + se->nr_pages) { pgoff_t offset = start_page - se->start_page; sector_t start_block = se->start_block + offset; - pgoff_t nr_blocks = se->nr_pages - offset; + sector_t nr_blocks = se->nr_pages - offset; if (nr_blocks > nr_pages) nr_blocks = nr_pages; -- cgit v1.2.3 From f0d7a4b3ed46816f5097d521850a8ab7a0d40f3c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:57 -0800 Subject: swapfile: let others seed random Remove the srandom32((u32)get_seconds()) from non-rotational swapon: there's been a coincidental discussion of earlier randomization, assume that goes ahead, let swapon be a client rather than stirring for itself. Signed-off-by: Hugh Dickins Cc: David Woodhouse Cc: Donjun Shin Cc: James Bottomley Cc: Jens Axboe Cc: Joern Engel Cc: KAMEZAWA Hiroyuki Cc: Matthew Wilcox Cc: Nick Piggin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6a078557306a..d00523601913 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1851,7 +1851,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { p->flags |= SWP_SOLIDSTATE; - srandom32((u32)get_seconds()); p->cluster_next = 1 + (random32() % p->highest_bit); } if (discard_swap(p) == 0) -- cgit v1.2.3 From 2509ef26db4699a5d9fa876e90ddfc107afcab84 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:40:10 -0800 Subject: badpage: zap print_bad_pte on swap and file Complete zap_pte_range()'s coverage of bad pagetable entries by calling print_bad_pte() on a pte_file in a linear vma and on a bad swap entry. That needs free_swap_and_cache() to tell it, which will also have shown one of those "swap_free" errors (but with much less information). Similar checks in fork's copy_one_pte()? No, that would be more noisy than helpful: we'll see them when parent and child exec or exit. Where do_nonlinear_fault() calls print_bad_pte(): omit !VM_CAN_NONLINEAR case, that could only be a bug in sys_remap_file_pages(), not a bad pte. VM_FAULT_OOM rather than VM_FAULT_SIGBUS? Well, okay, that is consistent with what happens if do_swap_page() operates a bad swap entry; but don't we have patches to be more careful about killing when VM_FAULT_OOM? Signed-off-by: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 12 +++--------- mm/memory.c | 11 +++++++---- mm/swapfile.c | 7 ++++--- 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index cbf7fbed3dfd..91dee50fe260 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -302,7 +302,7 @@ extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); -extern void free_swap_and_cache(swp_entry_t); +extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); @@ -352,14 +352,8 @@ static inline void show_swap_cache_info(void) { } -static inline void free_swap_and_cache(swp_entry_t swp) -{ -} - -static inline int swap_duplicate(swp_entry_t swp) -{ - return 0; -} +#define free_swap_and_cache(swp) is_migration_entry(swp) +#define swap_duplicate(swp) is_migration_entry(swp) static inline void swap_free(swp_entry_t swp) { diff --git a/mm/memory.c b/mm/memory.c index 890095f5f36d..b273cc12b15d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -810,8 +810,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, */ if (unlikely(details)) continue; - if (!pte_file(ptent)) - free_swap_and_cache(pte_to_swp_entry(ptent)); + if (pte_file(ptent)) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) + print_bad_pte(vma, addr, ptent, NULL); + } else if + (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -2707,8 +2711,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; - if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || - !(vma->vm_flags & VM_CAN_NONLINEAR))) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { /* * Page table corrupted: show pte and kill process. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index d00523601913..f28745855772 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -571,13 +571,13 @@ int try_to_free_swap(struct page *page) * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ -void free_swap_and_cache(swp_entry_t entry) +int free_swap_and_cache(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; struct page *page = NULL; if (is_migration_entry(entry)) - return; + return 1; p = swap_info_get(entry); if (p) { @@ -603,6 +603,7 @@ void free_swap_and_cache(swp_entry_t entry) unlock_page(page); page_cache_release(page); } + return p != NULL; } #ifdef CONFIG_HIBERNATION -- cgit v1.2.3 From 084f71ae5ceeb16734d8ac47559d3c718456a865 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 6 Jan 2009 14:40:30 -0800 Subject: mm: kill page_queue_congested() page_queue_congested() was introduced in 2002, but it was never used Signed-off-by: KOSAKI Motohiro Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index f28745855772..eec5ca758a23 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1372,26 +1372,6 @@ out: return ret; } -#if 0 /* We don't need this yet */ -#include -int page_queue_congested(struct page *page) -{ - struct backing_dev_info *bdi; - - VM_BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ - - if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page_private(page) }; - struct swap_info_struct *sis; - - sis = get_swap_info_struct(swp_type(entry)); - bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; - } else - bdi = page->mapping->backing_dev_info; - return bdi_write_congested(bdi); -} -#endif - asmlinkage long sys_swapoff(const char __user * specialfile) { struct swap_info_struct * p = NULL; -- cgit v1.2.3