diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/backing-dev.c | 6 | ||||
-rw-r--r-- | mm/bootmem.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 30 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/fremap.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 46 | ||||
-rw-r--r-- | mm/internal.h | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 3 | ||||
-rw-r--r-- | mm/memory.c | 176 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 20 | ||||
-rw-r--r-- | mm/migrate.c | 89 | ||||
-rw-r--r-- | mm/mlock.c | 9 | ||||
-rw-r--r-- | mm/mmap.c | 22 | ||||
-rw-r--r-- | mm/mprotect.c | 6 | ||||
-rw-r--r-- | mm/oom_kill.c | 109 | ||||
-rw-r--r-- | mm/page-writeback.c | 245 | ||||
-rw-r--r-- | mm/page_alloc.c | 135 | ||||
-rw-r--r-- | mm/page_cgroup.c | 2 | ||||
-rw-r--r-- | mm/page_io.c | 6 | ||||
-rw-r--r-- | mm/rmap.c | 60 | ||||
-rw-r--r-- | mm/shmem.c | 82 | ||||
-rw-r--r-- | mm/swap.c | 44 | ||||
-rw-r--r-- | mm/swap_state.c | 31 | ||||
-rw-r--r-- | mm/swapfile.c | 576 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 134 | ||||
-rw-r--r-- | mm/vmalloc.c | 50 | ||||
-rw-r--r-- | mm/vmscan.c | 143 |
29 files changed, 1183 insertions, 865 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 5b5790f8a816..a5b77811fdf2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -181,12 +181,6 @@ config MIGRATION example on NUMA systems to put pages nearer to the processors accessing the page. -config RESOURCES_64BIT - bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) - default 64BIT - help - This option allows memory and IO resources to be 64 bit. - config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT diff --git a/mm/Makefile b/mm/Makefile index 51c27709cc7c..72255be57f89 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o pdflush.o \ - readahead.o swap.o truncate.o vmscan.o \ + readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) @@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o -obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o -obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_SLAB) += slab.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a7c6c5613ec9..8e8587444132 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -24,9 +24,9 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); diff --git a/mm/bootmem.c b/mm/bootmem.c index ac5a891f142a..51a0ccf61e0e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; + bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", + bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, + align, goal, limit); + BUG_ON(!size); BUG_ON(align & (align - 1)); BUG_ON(limit && goal + size > limit); @@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, if (!bdata->node_bootmem_map) return NULL; - bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", - bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, - align, goal, limit); - min = bdata->node_min_pfn; max = bdata->node_low_pfn; diff --git a/mm/filemap.c b/mm/filemap.c index f5769b4dc075..2f55a1e2baf7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, int ret; struct writeback_control wbc = { .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, + .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; @@ -741,7 +741,14 @@ repeat: page = __page_cache_alloc(gfp_mask); if (!page) return NULL; - err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + /* + * We want a regular kernel memory (not highmem or DMA etc) + * allocation for the radix tree nodes, but we need to honour + * the context-specific requirements the caller has asked for. + * GFP_RECLAIM_MASK collects those requirements. + */ + err = add_to_page_cache_lru(page, mapping, index, + (gfp_mask & GFP_RECLAIM_MASK)); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) return NULL; } page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { + if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { page_cache_release(page); page = NULL; } @@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, goto out; /* skip atime */ size = i_size_read(inode); if (pos < size) { - retval = filemap_write_and_wait(mapping); + retval = filemap_write_and_wait_range(mapping, pos, + pos + iov_length(iov, nr_segs) - 1); if (!retval) { retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); @@ -1530,7 +1538,6 @@ retry_find: /* * Found the page and have a reference on it. */ - mark_page_accessed(page); ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (count != ocount) *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - /* - * Unmap all mmappings of the file up-front. - * - * This will cause any pte dirty bits to be propagated into the - * pageframes for the subsequent filemap_write_and_wait(). - */ write_len = iov_length(iov, *nr_segs); end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, pos, write_len, 0); - written = filemap_write_and_wait(mapping); + written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); if (written) goto out; @@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * the file data here, to try to honour O_DIRECT expectations. */ if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait(mapping); + status = filemap_write_and_wait_range(mapping, + pos, pos + written - 1); return written ? written : status; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b5167dfb2f2d..0c04615651b7 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -193,7 +193,7 @@ retry: /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); - page_remove_rmap(page, vma); + page_remove_rmap(page); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); diff --git a/mm/fremap.c b/mm/fremap.c index 7d12ca70ef7b..62d5bbda921a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (page) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, vma); + page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6058b53dcb89..618e98304080 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, } /* + * Return the size of the pages allocated when backing a VMA. In the majority + * cases this will be same size as used by the page table entries. + */ +unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate; + + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + hstate = hstate_vma(vma); + + return 1UL << (hstate->order + PAGE_SHIFT); +} + +/* + * Return the page size being used by the MMU to back a VMA. In the majority + * of cases, the page size used by the kernel matches the MMU size. On + * architectures where it differs, an architecture-specific version of this + * function is required. + */ +#ifndef vma_mmu_pagesize +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} +#endif + +/* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. @@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) - return clear_gigantic_page(page, addr, sz); + if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, sz); + return; + } might_sleep(); for (i = 0; i < sz/PAGE_SIZE; i++) { @@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src, int i; struct hstate *h = hstate_vma(vma); - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) - return copy_gigantic_page(dst, src, addr, vma); + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src, addr, vma); + return; + } might_sleep(); for (i = 0; i < pages_per_huge_page(h); i++) { @@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) +int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; int nr_nodes = nodes_weight(node_online_map); @@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) * puts them into the mem_map). */ m = addr; - if (m) - goto found; + goto found; } hstate_next_node(h); nr_nodes--; diff --git a/mm/internal.h b/mm/internal.h index 13333bc2eb68..478223b73a2a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ +extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); /* @@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define GUP_FLAGS_WRITE 0x1 #define GUP_FLAGS_FORCE 0x2 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 +#define GUP_FLAGS_IGNORE_SIGKILL 0x8 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 866dcc7eeb0c..51ee96545579 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -779,7 +779,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } -int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) +static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, + unsigned long long val) { int retry_count = MEM_CGROUP_RECLAIM_RETRIES; diff --git a/mm/memory.c b/mm/memory.c index 7b9db658aca2..3f8fa06b963b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,9 @@ #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> +#include <linux/kallsyms.h> +#include <linux/swapops.h> +#include <linux/elf.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> @@ -59,9 +62,6 @@ #include <asm/tlbflush.h> #include <asm/pgtable.h> -#include <linux/swapops.h> -#include <linux/elf.h> - #include "internal.h" #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, - unsigned long vaddr) -{ - printk(KERN_ERR "Bad pte = %08llx, process = %s, " - "vm_flags = %lx, vaddr = %lx\n", - (long long)pte_val(pte), - (vma->vm_mm == current->mm ? current->comm : "???"), - vma->vm_flags, vaddr); +static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, struct page *page) +{ + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + struct address_space *mapping; + pgoff_t index; + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + return; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; + index = linear_page_index(vma, addr); + + printk(KERN_ALERT + "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); + if (page) { + printk(KERN_ALERT + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); + } + printk(KERN_ALERT + "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + /* + * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y + */ + if (vma->vm_ops) + print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", + (unsigned long)vma->vm_ops->fault); + if (vma->vm_file && vma->vm_file->f_op) + print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", + (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); + add_taint(TAINT_BAD_PAGE); } static inline int is_cow_mapping(unsigned int flags) @@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags) struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long pfn; + unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) { - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - return pte_page(pte); - } - VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + if (likely(!pte_special(pte))) + goto check_pfn; + if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !HAVE_PTE_SPECIAL case follows: */ - pfn = pte_pfn(pte); - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - VM_BUG_ON(!pfn_valid(pfn)); +check_pfn: + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } /* * NOTE! We still have PageReserved() pages in the page tables. - * * eg. VDSO mappings can cause them to exist. */ out: @@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, else { if (pte_dirty(ptent)) set_page_dirty(page); - if (pte_young(ptent)) - SetPageReferenced(page); + if (pte_young(ptent) && + likely(!VM_SequentialReadHint(vma))) + mark_page_accessed(page); file_rss--; } - page_remove_rmap(page, vma); + page_remove_rmap(page); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); tlb_remove_page(tlb, page); continue; } @@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, */ if (unlikely(details)) continue; - if (!pte_file(ptent)) - free_swap_and_cache(pte_to_swp_entry(ptent)); + if (pte_file(ptent)) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) + print_bad_pte(vma, addr, ptent, NULL); + } else if + (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int write = !!(flags & GUP_FLAGS_WRITE); int force = !!(flags & GUP_FLAGS_FORCE); int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); + int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); if (len <= 0) return 0; @@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page *page; /* - * If tsk is ooming, cut off its access to large memory - * allocations. It has a pending SIGKILL, but it can't - * be processed until returning to user space. + * If we have a pending SIGKILL, don't keep faulting + * pages and potentially allocating memory, unless + * current is handling munlock--e.g., on exit. In + * that case, we are not allocating memory. Rather, + * we're only unlocking already resident/mapped pages. */ - if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) - return i ? i : -ENOMEM; + if (unlikely(!ignore_sigkill && + fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; if (write) foll_flags |= FOLL_WRITE; @@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * do_wp_page has broken COW when necessary, * even if maybe_mkwrite decided not to set * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. + * page lookups as if they were reads. But only + * do so when looping for pte_write is futile: + * in some cases userspace may also be wanting + * to write to the gotten user page, which a + * read fault here might prevent (a readonly + * page might get reCOWed by userspace write). */ - if (ret & VM_FAULT_WRITE) + if ((ret & VM_FAULT_WRITE) && + !(vma->vm_flags & VM_WRITE)) foll_flags &= ~FOLL_WRITE; cond_resched(); @@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(pmd_huge(*pmd)); + arch_enter_lazy_mmu_mode(); + token = pmd_pgtable(*pmd); do { @@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, break; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; @@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page)) { - if (trylock_page(old_page)) { - reuse = can_share_swap_page(old_page); - unlock_page(old_page); + if (!trylock_page(old_page)) { + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + lock_page(old_page); + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); + goto unlock; + } + page_cache_release(old_page); } + reuse = reuse_swap_page(old_page); + unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { /* @@ -1943,11 +2025,7 @@ gotten: * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - SetPageSwapBacked(new_page); - lru_cache_add_active_or_unevictable(new_page, vma); page_add_new_anon_rmap(new_page, vma, address); - -//TODO: is this safe? do_anonymous_page() does it this way. set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { @@ -1973,7 +2051,7 @@ gotten: * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, vma); + page_remove_rmap(old_page); } /* Free the old page.. */ @@ -2374,7 +2452,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) { + if (write_access && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); write_access = 0; } @@ -2385,7 +2463,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - remove_exclusive_swap_page(page); + try_to_free_swap(page); unlock_page(page); if (write_access) { @@ -2442,8 +2520,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - SetPageSwapBacked(page); - lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2591,8 +2667,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter(mm, anon_rss); - SetPageSwapBacked(page); - lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); @@ -2602,7 +2676,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } -//TODO: is this safe? do_anonymous_page() does it this way. set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ @@ -2666,12 +2739,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; - if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || - !(vma->vm_flags & VM_CAN_NONLINEAR))) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { /* * Page table corrupted: show pte and kill process. */ - print_bad_pte(vma, orig_pte, address); + print_bad_pte(vma, address, orig_pte, NULL); return VM_FAULT_OOM; } @@ -2953,7 +3025,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, { resource_size_t phys_addr; unsigned long prot = 0; - void *maddr; + void __iomem *maddr; int offset = addr & (PAGE_SIZE-1); if (follow_phys(vma, addr, write, &prot, &phys_addr)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b17371185468..c083cf5fd6df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_section(int nid, struct zone *zone, + unsigned long phys_start_pfn) { int nr_pages = PAGES_PER_SECTION; int ret; @@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p if (ret < 0) return ret; - return register_new_memory(__pfn_to_section(phys_start_pfn)); + return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) { unsigned long i; int err = 0; @@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(zone, i << PFN_SECTION_SHIFT); + err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); /* * EEXIST is finally dealt with by ioresource collision @@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end) } static struct page * -hotremove_migrate_alloc(struct page *page, - unsigned long private, - int **x) +hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) { - /* This should be improoooooved!! */ - return alloc_page(GFP_HIGHUSER_PAGECACHE); + /* This should be improooooved!! */ + return alloc_page(GFP_HIGHUSER_MOVABLE); } - #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) diff --git a/mm/migrate.c b/mm/migrate.c index 21631ab8c08b..55373983c9c6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -300,12 +300,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page. */ get_page(newpage); /* add cache reference */ -#ifdef CONFIG_SWAP if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); } -#endif radix_tree_replace_slot(pslot, newpage); @@ -373,9 +371,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page) mlock_migrate_page(newpage, page); -#ifdef CONFIG_SWAP ClearPageSwapCache(page); -#endif ClearPagePrivate(page); set_page_private(page, 0); /* page->mapping contains a flag for PageAnon() */ @@ -848,12 +844,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct vm_area_struct *vma; struct page *page; - /* - * A valid page pointer that will not match any of the - * pages that will be moved. - */ - pp->page = ZERO_PAGE(0); - err = -EFAULT; vma = find_vma(mm, pp->addr); if (!vma || !vma_migratable(vma)) @@ -919,41 +909,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm = NULL; + struct page_to_node *pm; nodemask_t task_nodes; - int err = 0; - int i; + unsigned long chunk_nr_pages; + unsigned long chunk_start; + int err; task_nodes = cpuset_mems_allowed(task); - /* Limit nr_pages so that the multiplication may not overflow */ - if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { - err = -E2BIG; - goto out; - } - - pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); - if (!pm) { - err = -ENOMEM; + err = -ENOMEM; + pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); + if (!pm) goto out; - } - /* - * Get parameters from user space and initialize the pm - * array. Return various errors if the user did something wrong. + * Store a chunk of page_to_node array in a page, + * but keep the last one as a marker */ - for (i = 0; i < nr_pages; i++) { - const void __user *p; + chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - err = -EFAULT; - if (get_user(p, pages + i)) - goto out_pm; + for (chunk_start = 0; + chunk_start < nr_pages; + chunk_start += chunk_nr_pages) { + int j; - pm[i].addr = (unsigned long)p; - if (nodes) { + if (chunk_start + chunk_nr_pages > nr_pages) + chunk_nr_pages = nr_pages - chunk_start; + + /* fill the chunk pm with addrs and nodes from user-space */ + for (j = 0; j < chunk_nr_pages; j++) { + const void __user *p; int node; - if (get_user(node, nodes + i)) + err = -EFAULT; + if (get_user(p, pages + j + chunk_start)) + goto out_pm; + pm[j].addr = (unsigned long) p; + + if (get_user(node, nodes + j + chunk_start)) goto out_pm; err = -ENODEV; @@ -964,22 +956,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, if (!node_isset(node, task_nodes)) goto out_pm; - pm[i].node = node; - } else - pm[i].node = 0; /* anything to not match MAX_NUMNODES */ - } - /* End marker */ - pm[nr_pages].node = MAX_NUMNODES; + pm[j].node = node; + } + + /* End marker for this chunk */ + pm[chunk_nr_pages].node = MAX_NUMNODES; + + /* Migrate this chunk */ + err = do_move_page_to_node_array(mm, pm, + flags & MPOL_MF_MOVE_ALL); + if (err < 0) + goto out_pm; - err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); - if (err >= 0) /* Return status information */ - for (i = 0; i < nr_pages; i++) - if (put_user(pm[i].status, status + i)) + for (j = 0; j < chunk_nr_pages; j++) + if (put_user(pm[j].status, status + j + chunk_start)) { err = -EFAULT; + goto out_pm; + } + } + err = 0; out_pm: - vfree(pm); + free_page((unsigned long)pm); out: return err; } diff --git a/mm/mlock.c b/mm/mlock.c index 3035a56e7616..e125156c664e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, (atomic_read(&mm->mm_users) != 0)); /* - * mlock: don't page populate if page has PROT_NONE permission. - * munlock: the pages always do munlock althrough - * its has PROT_NONE permission. + * mlock: don't page populate if vma has PROT_NONE permission. + * munlock: always do munlock although the vma has PROT_NONE + * permission, or SIGKILL is pending. */ if (!mlock) - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; + gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | + GUP_FLAGS_IGNORE_SIGKILL; if (vma->vm_flags & VM_WRITE) gup_flags |= GUP_FLAGS_WRITE; diff --git a/mm/mmap.c b/mm/mmap.c index 2c778fcfd9bd..a910c045cfd4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, static void __vma_link_file(struct vm_area_struct *vma) { - struct file * file; + struct file *file; file = vma->vm_file; if (file) { @@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, * insert vm structure into list and rbtree and anon_vma, * but it has already been inserted into prio_tree earlier. */ -static void -__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *__vma, *prev; + struct rb_node **rb_link, *rb_parent; __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); BUG_ON(__vma && __vma->vm_start < vma->vm_end); @@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, * The caller must hold down_write(current->mm->mmap_sem). */ -unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff) { @@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = NULL; @@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node * rb_node; + struct rb_node *rb_node; if (!mm) goto out; @@ -1541,7 +1540,7 @@ out: * update accounting. This is shared with both the * grow-up and grow-down cases. */ -static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; struct rlimit *rlim = current->signal->rlim; @@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); mmu_notifier_release(mm); + if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ + return; + if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); - /* Don't update_hiwater_rss(mm) here, do_exit already did */ + /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); diff --git a/mm/mprotect.c b/mm/mprotect.c index cfb4c4852062..d0f6e7ce09f1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -22,6 +22,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> +#include <linux/migrate.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); -#ifdef CONFIG_MIGRATION - } else if (!pte_file(oldpte)) { + } else if (PAGE_MIGRATION && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { @@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); } -#endif } - } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 558f9afe6e4e..6b9e758c98a5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -31,7 +31,7 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; -static DEFINE_SPINLOCK(zone_scan_mutex); +static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ /** @@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, printk(KERN_WARNING "%s invoked oom-killer: " "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); + cpuset_print_task_mems_allowed(current); + task_unlock(current); dump_stack(); show_mem(); if (sysctl_oom_dump_tasks) @@ -470,7 +473,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) struct zone *zone; int ret = 1; - spin_lock(&zone_scan_mutex); + spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { if (zone_is_oom_locked(zone)) { ret = 0; @@ -480,7 +483,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* - * Lock each zone in the zonelist under zone_scan_mutex so a + * Lock each zone in the zonelist under zone_scan_lock so a * parallel invocation of try_set_zone_oom() doesn't succeed * when it shouldn't. */ @@ -488,7 +491,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) } out: - spin_unlock(&zone_scan_mutex); + spin_unlock(&zone_scan_lock); return ret; } @@ -502,11 +505,74 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) struct zoneref *z; struct zone *zone; - spin_lock(&zone_scan_mutex); + spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { zone_clear_flag(zone, ZONE_OOM_LOCKED); } - spin_unlock(&zone_scan_mutex); + spin_unlock(&zone_scan_lock); +} + +/* + * Must be called with tasklist_lock held for read. + */ +static void __out_of_memory(gfp_t gfp_mask, int order) +{ + if (sysctl_oom_kill_allocating_task) { + oom_kill_process(current, gfp_mask, order, 0, NULL, + "Out of memory (oom_kill_allocating_task)"); + + } else { + unsigned long points; + struct task_struct *p; + +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); + + if (PTR_ERR(p) == -1UL) + return; + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); + } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; + } +} + +/* + * pagefault handler calls into here because it is out of memory but + * doesn't know exactly how or why. + */ +void pagefault_out_of_memory(void) +{ + unsigned long freed = 0; + + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return; + + if (sysctl_panic_on_oom) + panic("out of memory from page fault. panic_on_oom is selected.\n"); + + read_lock(&tasklist_lock); + __out_of_memory(0, 0); /* unknown gfp_mask and order */ + read_unlock(&tasklist_lock); + + /* + * Give "p" a good chance of killing itself before we + * retry to allocate memory. + */ + if (!test_thread_flag(TIF_MEMDIE)) + schedule_timeout_uninterruptible(1); } /** @@ -522,8 +588,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - struct task_struct *p; - unsigned long points = 0; unsigned long freed = 0; enum oom_constraint constraint; @@ -544,7 +608,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, points, NULL, + oom_kill_process(current, gfp_mask, order, 0, NULL, "No available memory (MPOL_BIND)"); break; @@ -553,35 +617,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) panic("out of memory. panic_on_oom is selected\n"); /* Fall-through */ case CONSTRAINT_CPUSET: - if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, points, NULL, - "Out of memory (oom_kill_allocating_task)"); - break; - } -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); - - if (PTR_ERR(p) == -1UL) - goto out; - - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } - - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; - + __out_of_memory(gfp_mask, order); break; } -out: read_unlock(&tasklist_lock); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2970e35fd03f..b493db7841dc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) int dirty_background_ratio = 5; /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of + * dirty_background_ratio * the amount of dirtyable memory + */ +unsigned long dirty_background_bytes; + +/* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ @@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable; int vm_dirty_ratio = 10; /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of + * vm_dirty_ratio * the amount of dirtyable memory + */ +unsigned long vm_dirty_bytes; + +/* * The interval between `kupdate'-style writebacks, in jiffies */ int dirty_writeback_interval = 5 * HZ; @@ -135,23 +147,75 @@ static int calc_period_shift(void) { unsigned long dirty_total; - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; + if (vm_dirty_bytes) + dirty_total = vm_dirty_bytes / PAGE_SIZE; + else + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + 100; return 2 + ilog2(dirty_total - 1); } /* - * update the period when the dirty ratio changes. + * update the period when the dirty threshold changes. */ +static void update_completion_period(void) +{ + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); +} + +int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} + +int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} + int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); + update_completion_period(); + vm_dirty_bytes = 0; + } + return ret; +} + + +int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_bytes = vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + update_completion_period(); + vm_dirty_ratio = 0; } return ret; } @@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void) } void -get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, - struct backing_dev_info *bdi) +get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned long *pbdi_dirty, struct backing_dev_info *bdi) { - int background_ratio; /* Percentages */ - int dirty_ratio; - long background; - long dirty; + unsigned long background; + unsigned long dirty; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else { + int dirty_ratio; - background_ratio = dirty_background_ratio; - if (background_ratio >= dirty_ratio) - background_ratio = dirty_ratio / 2; + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio < 5) + dirty_ratio = 5; + dirty = (dirty_ratio * available_memory) / 100; + } + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; - background = (background_ratio * available_memory) / 100; - dirty = (dirty_ratio * available_memory) / 100; + if (background >= dirty) + background = dirty / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { background += background / 4; @@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long write_chunk = sync_writeback_pages(); @@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(gfp_t gfp_mask) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; for ( ; ; ) { get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); @@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages) }; for ( ; ; ) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + @@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping, int done = 0; struct pagevec pvec; int nr_pages; + pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ - int scanned = 0; + pgoff_t done_index; + int cycled; int range_whole = 0; long nr_to_write = wbc->nr_to_write; @@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - scanned = 1; + cycled = 1; /* ignore range_cyclic tests */ } retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + done = 1; + break; + } + + done_index = page->index + 1; + lock_page(page); + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; } - ret = (*writepage)(page, wbc, data); + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done = 1; + break; + } + } + + if (wbc->sync_mode == WB_SYNC_NONE) { + wbc->nr_to_write--; + if (wbc->nr_to_write <= 0) { + done = 1; + break; + } } - if (ret || (--nr_to_write <= 0)) - done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; + break; } } pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!cycled) { /* + * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ - scanned = 1; + cycled = 1; index = 0; + end = writeback_index - 1; goto retry; } if (!wbc->no_nrwrite_index_update) { if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) - mapping->writeback_index = index; + mapping->writeback_index = done_index; wbc->nr_to_write = nr_to_write; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8ac01474563..7bf22e045318 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -long nr_swap_pages; +unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG - "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - current->comm, page, (int)(2*sizeof(unsigned long)), - (unsigned long)page->flags, page->mapping, - page_mapcount(page), page_count(page)); + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + goto out; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page state: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + printk(KERN_ALERT + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" - KERN_EMERG "Backtrace:\n"); dump_stack(); - page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; - set_page_count(page, 0); - reset_page_mapcount(page); - page->mapping = NULL; +out: + /* Leave bad fields for debug, except PageBuddy could make trouble */ + __ClearPageBuddy(page); add_taint(TAINT_BAD_PAGE); } @@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order) } #endif -static void destroy_compound_page(struct page *page, unsigned long order) +static int destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + int bad = 0; - if (unlikely(compound_order(page) != order)) + if (unlikely(compound_order(page) != order) || + unlikely(!PageHead(page))) { bad_page(page); + bad++; + } - if (unlikely(!PageHead(page))) - bad_page(page); __ClearPageHead(page); + for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageTail(p) | - (p->first_page != page))) + if (unlikely(!PageTail(p) | (p->first_page != page))) { bad_page(page); + bad++; + } __ClearPageTail(p); } + + return bad; } static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) @@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page, int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) - destroy_compound_page(page, order); + if (unlikely(destroy_compound_page(page, order))) + return; page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); - if (PageDirty(page)) - __ClearPageDirty(page); - if (PageSwapBacked(page)) - __ClearPageSwapBacked(page); - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not free the page. But we shall soon need - * to do more, for when the ZERO_PAGE count wraps negative. - */ - return PageReserved(page); + return 1; + } + if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; } /* @@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int i; - int reserved = 0; + int bad = 0; for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(page + i); - if (reserved) + bad += free_pages_check(page + i); + if (bad) return; if (!PageHighMem(page)) { @@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); - - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not allocate the page: as a safety net. - */ - if (PageReserved(page)) return 1; + } - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk -#ifdef CONFIG_UNEVICTABLE_LRU - | 1 << PG_mlocked -#endif - ); set_page_private(page, 0); set_page_refcounted(page); @@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long pfn; struct zone *z; + if (highest_memmap_pfn < end_pfn - 1) + highest_memmap_pfn = end_pfn - 1; + z = &NODE_DATA(nid)->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* @@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, { unsigned long usemapsize = usemap_size(zonesize); zone->pageblock_flags = NULL; - if (usemapsize) { + if (usemapsize) zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); - memset(zone->pageblock_flags, 0, usemapsize); - } } #else static void inline setup_usemap(struct pglist_data *pgdat, @@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); } else printk(KERN_WARNING " %s zone: %lu pages exceeds realsize %lu\n", @@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -void setup_per_zone_inactive_ratio(void) +static void setup_per_zone_inactive_ratio(void) { struct zone *zone; @@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -struct page *pfn_to_page(unsigned long pfn) -{ - return __pfn_to_page(pfn); -} -unsigned long page_to_pfn(struct page *page) -{ - return __page_to_pfn(page); -} -EXPORT_SYMBOL(pfn_to_page); -EXPORT_SYMBOL(page_to_pfn); -#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ - /* Return a pointer to the bitmap storing bits affecting a block of pages */ static inline unsigned long *get_pageblock_bitmap(struct zone *zone, unsigned long pfn) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ab27ff750519..d6507a660ed6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -101,7 +101,7 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) } /* __alloc_bootmem...() is protected by !slab_available() */ -int __init_refok init_section_page_cgroup(unsigned long pfn) +static int __init_refok init_section_page_cgroup(unsigned long pfn) { struct mem_section *section; struct page_cgroup *base, *pc; diff --git a/mm/page_io.c b/mm/page_io.c index 065c4480eaf0..dc6ce0afbded 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) struct bio *bio; int ret = 0, rw = WRITE; - if (remove_exclusive_swap_page(page)) { + if (try_to_free_swap(page)) { unlock_page(page); goto out; } @@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page) struct bio *bio; int ret = 0; - BUG_ON(!PageLocked(page)); - BUG_ON(PageUptodate(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageUptodate(page)); bio = get_swap_bio(GFP_KERNEL, page_private(page), page, end_swap_bio_read); if (bio == NULL) { diff --git a/mm/rmap.c b/mm/rmap.c index 10993942d6c9..ac4af8cffbf9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -47,9 +47,9 @@ #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/module.h> -#include <linux/kallsyms.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> +#include <linux/migrate.h> #include <asm/tlbflush.h> @@ -191,7 +191,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +static struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -211,7 +211,7 @@ out: return NULL; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +static void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page, goto out_unmap; } - if (ptep_clear_flush_young_notify(vma, address, pte)) - referenced++; + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. + */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. */ @@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page, void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - BUG_ON(address < vma->vm_start || address >= vma->vm_end); - atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + SetPageSwapBacked(page); + atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ __page_set_anon_rmap(page, vma, address); + if (page_evictable(page, vma)) + lru_cache_add_lru(page, LRU_ACTIVE_ANON); + else + add_page_to_unevictable_list(page); } /** @@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page) */ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - BUG_ON(page_mapcount(page) == 0); if (PageAnon(page)) __page_check_anon_rmap(page, vma, address); atomic_inc(&page->_mapcount); @@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from - * @vma: the vm area in which the mapping is removed * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, struct vm_area_struct *vma) +void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (unlikely(page_mapcount(page) < 0)) { - printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); - printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); - printk (KERN_EMERG " page->flags = %lx\n", page->flags); - printk (KERN_EMERG " page->count = %x\n", page_count(page)); - printk (KERN_EMERG " page->mapping = %p\n", page->mapping); - print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); - if (vma->vm_ops) { - print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); - } - if (vma->vm_file && vma->vm_file->f_op) - print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); - BUG(); - } - /* * Now that the last pte has gone, s390 must transfer dirty * flag from storage key to struct page. We can usually skip @@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, anon_rss); -#ifdef CONFIG_MIGRATION - } else { + } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration @@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ BUG_ON(!migration); entry = make_migration_entry(page, pte_write(pteval)); -#endif } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else -#ifdef CONFIG_MIGRATION - if (migration) { + } else if (PAGE_MIGRATION && migration) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else -#endif dec_mm_counter(mm, file_rss); - page_remove_rmap(page, vma); + page_remove_rmap(page); page_cache_release(page); out_unmap: @@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (pte_dirty(pteval)) set_page_dirty(page); - page_remove_rmap(page, vma); + page_remove_rmap(page); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff --git a/mm/shmem.c b/mm/shmem.c index f1b0d4871f3a..5941f9801363 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -14,31 +14,39 @@ * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * + * tiny-shmem: + * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> + * * This file is released under the GPL. */ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/vfs.h> +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/swap.h> + +static struct vfsmount *shm_mnt; + +#ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem. */ -#include <linux/module.h> -#include <linux/init.h> -#include <linux/fs.h> #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/generic_acl.h> -#include <linux/mm.h> #include <linux/mman.h> -#include <linux/file.h> -#include <linux/swap.h> #include <linux/pagemap.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/shmem_fs.h> -#include <linux/mount.h> #include <linux/writeback.h> #include <linux/vfs.h> #include <linux/blkdev.h> @@ -1444,7 +1452,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - mark_page_accessed(vmf->page); return ret | VM_FAULT_LOCKED; } @@ -2486,7 +2493,6 @@ static struct file_system_type tmpfs_fs_type = { .get_sb = shmem_get_sb, .kill_sb = kill_litter_super, }; -static struct vfsmount *shm_mnt; static int __init init_tmpfs(void) { @@ -2525,7 +2531,51 @@ out4: shm_mnt = ERR_PTR(error); return error; } -module_init(init_tmpfs) + +#else /* !CONFIG_SHMEM */ + +/* + * tiny-shmem: simple shmemfs and tmpfs using ramfs code + * + * This is intended for small system where the benefits of the full + * shmem code (swap-backed and resource-limited) are outweighed by + * their complexity. On systems without swap this code should be + * effectively equivalent, but much lighter weight. + */ + +#include <linux/ramfs.h> + +static struct file_system_type tmpfs_fs_type = { + .name = "tmpfs", + .get_sb = ramfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static int __init init_tmpfs(void) +{ + BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); + + shm_mnt = kern_mount(&tmpfs_fs_type); + BUG_ON(IS_ERR(shm_mnt)); + + return 0; +} + +int shmem_unuse(swp_entry_t entry, struct page *page) +{ + return 0; +} + +#define shmem_file_operations ramfs_file_operations +#define shmem_vm_ops generic_file_vm_ops +#define shmem_get_inode ramfs_get_inode +#define shmem_acct_size(a, b) 0 +#define shmem_unacct_size(a, b) do {} while (0) +#define SHMEM_MAX_BYTES LLONG_MAX + +#endif /* CONFIG_SHMEM */ + +/* common code */ /** * shmem_file_setup - get an unlinked file living in tmpfs @@ -2569,12 +2619,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (!inode) goto close_file; +#ifdef CONFIG_SHMEM SHMEM_I(inode)->flags = flags & VM_ACCOUNT; +#endif d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); + &shmem_file_operations); + +#ifndef CONFIG_MMU + error = ramfs_nommu_expand_for_mapping(inode, size); + if (error) + goto close_file; +#endif return file; close_file: @@ -2606,3 +2664,5 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_ops = &shmem_vm_ops; return 0; } + +module_init(init_tmpfs) diff --git a/mm/swap.c b/mm/swap.c index b135ec90cdeb..ba2c0e8b8b54 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -246,25 +246,6 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } -/** - * lru_cache_add_active_or_unevictable - * @page: the page to be added to LRU - * @vma: vma in which page is mapped for determining reclaimability - * - * place @page on active or unevictable LRU list, depending on - * page_evictable(). Note that if the page is not evictable, - * it goes directly back onto it's zone's unevictable list. It does - * NOT use a per cpu pagevec. - */ -void lru_cache_add_active_or_unevictable(struct page *page, - struct vm_area_struct *vma) -{ - if (page_evictable(page, vma)) - lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); - else - add_page_to_unevictable_list(page); -} - /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -398,28 +379,6 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); /* - * pagevec_release() for pages which are known to not be on the LRU - * - * This function reinitialises the caller's pagevec. - */ -void __pagevec_release_nonlru(struct pagevec *pvec) -{ - int i; - struct pagevec pages_to_free; - - pagevec_init(&pages_to_free, pvec->cold); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - VM_BUG_ON(PageLRU(page)); - if (put_page_testzero(page)) - pagevec_add(&pages_to_free, page); - } - pagevec_free(&pages_to_free); - pagevec_reinit(pvec); -} - -/* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ @@ -495,8 +454,7 @@ void pagevec_swap_free(struct pagevec *pvec) struct page *page = pvec->pages[i]; if (PageSwapCache(page) && trylock_page(page)) { - if (PageSwapCache(page)) - remove_exclusive_swap_page_ref(page); + try_to_free_swap(page); unlock_page(page); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 3353c9029cef..81c825f67a7f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -72,10 +72,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - BUG_ON(!PageLocked(page)); - BUG_ON(PageSwapCache(page)); - BUG_ON(PagePrivate(page)); - BUG_ON(!PageSwapBacked(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageSwapCache(page)); + VM_BUG_ON(!PageSwapBacked(page)); + error = radix_tree_preload(gfp_mask); if (!error) { page_cache_get(page); @@ -108,10 +108,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - BUG_ON(!PageLocked(page)); - BUG_ON(!PageSwapCache(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageSwapCache(page)); + VM_BUG_ON(PageWriteback(page)); radix_tree_delete(&swapper_space.page_tree, page_private(page)); set_page_private(page, 0); @@ -129,13 +128,13 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page * page, gfp_t gfp_mask) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; - BUG_ON(!PageLocked(page)); - BUG_ON(!PageUptodate(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageUptodate(page)); for (;;) { entry = get_swap_page(); @@ -154,7 +153,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) * Add it to the swap cache and mark it dirty */ err = add_to_swap_cache(page, entry, - gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); switch (err) { case 0: /* Success */ @@ -196,14 +195,14 @@ void delete_from_swap_cache(struct page *page) * If we are the only user, then try to free up the swap cache. * * Its ok to check for PageSwapCache without the page lock - * here because we are going to recheck again inside - * exclusive_swap_page() _with_ the lock. + * here because we are going to recheck again inside + * try_to_free_swap() _with_ the lock. * - Marcelo */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && trylock_page(page)) { - remove_exclusive_swap_page(page); + if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { + try_to_free_swap(page); unlock_page(page); } } diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a9f87e5162..eec5ca758a23 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -16,6 +16,7 @@ #include <linux/namei.h> #include <linux/shm.h> #include <linux/blkdev.h> +#include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -35,6 +36,7 @@ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; +long nr_swap_pages; long total_swap_pages; static int swap_overflow; static int least_priority; @@ -83,15 +85,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) up_read(&swap_unplug_sem); } +/* + * swapon tell device that all the old swap contents can be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static int discard_swap(struct swap_info_struct *si) +{ + struct swap_extent *se; + int err = 0; + + list_for_each_entry(se, &si->extent_list, list) { + sector_t start_block = se->start_block << (PAGE_SHIFT - 9); + sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + + if (se->start_page == 0) { + /* Do not discard the swap header page! */ + start_block += 1 << (PAGE_SHIFT - 9); + nr_blocks -= 1 << (PAGE_SHIFT - 9); + if (!nr_blocks) + continue; + } + + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL); + if (err) + break; + + cond_resched(); + } + return err; /* That will often be -EOPNOTSUPP */ +} + +/* + * swap allocation tell device that a cluster of swap can now be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static void discard_swap_cluster(struct swap_info_struct *si, + pgoff_t start_page, pgoff_t nr_pages) +{ + struct swap_extent *se = si->curr_swap_extent; + int found_extent = 0; + + while (nr_pages) { + struct list_head *lh; + + if (se->start_page <= start_page && + start_page < se->start_page + se->nr_pages) { + pgoff_t offset = start_page - se->start_page; + sector_t start_block = se->start_block + offset; + sector_t nr_blocks = se->nr_pages - offset; + + if (nr_blocks > nr_pages) + nr_blocks = nr_pages; + start_page += nr_blocks; + nr_pages -= nr_blocks; + + if (!found_extent++) + si->curr_swap_extent = se; + + start_block <<= PAGE_SHIFT - 9; + nr_blocks <<= PAGE_SHIFT - 9; + if (blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_NOIO)) + break; + } + + lh = se->list.next; + if (lh == &si->extent_list) + lh = lh->next; + se = list_entry(lh, struct swap_extent, list); + } +} + +static int wait_for_discard(void *word) +{ + schedule(); + return 0; +} + #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si) { - unsigned long offset, last_in_cluster; + unsigned long offset; + unsigned long scan_base; + unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + int found_free_cluster = 0; - /* + /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting @@ -99,16 +182,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea + * And we let swap pages go all over an SSD partition. Hugh */ si->flags += SWP_SCANNING; - if (unlikely(!si->cluster_nr)) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) - goto lowest; + scan_base = offset = si->cluster_next; + + if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; + } + if (si->flags & SWP_DISCARDABLE) { + /* + * Start range check on racing allocations, in case + * they overlap the cluster we eventually decide on + * (we scan without swap_lock to allow preemption). + * It's hardly conceivable that cluster_nr could be + * wrapped during our scan, but don't depend on it. + */ + if (si->lowest_alloc) + goto checks; + si->lowest_alloc = si->max; + si->highest_alloc = 0; + } spin_unlock(&swap_lock); - offset = si->lowest_bit; + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. + * But if seek is cheap, search from our current position, so + * that swap is allocated from all over the partition: if the + * Flash Translation Layer only remaps within limited zones, + * we don't want to wear out the first zone too quickly. + */ + if (!(si->flags & SWP_SOLIDSTATE)) + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -117,43 +226,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER+1; - goto cluster; + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } } + + offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster < scan_base; offset++) { + if (si->swap_map[offset]) + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + spin_lock(&swap_lock); + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + + offset = scan_base; spin_lock(&swap_lock); - goto lowest; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + si->lowest_alloc = 0; } - si->cluster_nr--; -cluster: - offset = si->cluster_next; - if (offset > si->highest_bit) -lowest: offset = si->lowest_bit; -checks: if (!(si->flags & SWP_WRITEOK)) +checks: + if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) goto no_page; - if (!si->swap_map[offset]) { - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + if (si->swap_map[offset]) + goto scan; + + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + si->inuse_pages++; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; + + if (si->lowest_alloc) { + /* + * Only set when SWP_DISCARDABLE, and there's a scan + * for a free cluster in progress or just completed. + */ + if (found_free_cluster) { + /* + * To optimize wear-levelling, discard the + * old data of the cluster, taking care not to + * discard any of its pages that have already + * been allocated by racing tasks (offset has + * already stepped over any at the beginning). + */ + if (offset < si->highest_alloc && + si->lowest_alloc <= last_in_cluster) + last_in_cluster = si->lowest_alloc - 1; + si->flags |= SWP_DISCARDING; + spin_unlock(&swap_lock); + + if (offset < last_in_cluster) + discard_swap_cluster(si, offset, + last_in_cluster - offset + 1); + + spin_lock(&swap_lock); + si->lowest_alloc = 0; + si->flags &= ~SWP_DISCARDING; + + smp_mb(); /* wake_up_bit advises this */ + wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); + + } else if (si->flags & SWP_DISCARDING) { + /* + * Delay using pages allocated by racing tasks + * until the whole discard has been issued. We + * could defer that delay until swap_writepage, + * but it's easier to keep this self-contained. + */ + spin_unlock(&swap_lock); + wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), + wait_for_discard, TASK_UNINTERRUPTIBLE); + spin_lock(&swap_lock); + } else { + /* + * Note pages allocated by racing tasks while + * scan for a free cluster is in progress, so + * that its final discard can exclude them. + */ + if (offset < si->lowest_alloc) + si->lowest_alloc = offset; + if (offset > si->highest_alloc) + si->highest_alloc = offset; } - si->swap_map[offset] = 1; - si->cluster_next = offset + 1; - si->flags -= SWP_SCANNING; - return offset; } + return offset; +scan: spin_unlock(&swap_lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { @@ -165,8 +355,18 @@ checks: if (!(si->flags & SWP_WRITEOK)) latency_ration = LATENCY_LIMIT; } } + offset = si->lowest_bit; + while (++offset < scan_base) { + if (!si->swap_map[offset]) { + spin_lock(&swap_lock); + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } spin_lock(&swap_lock); - goto lowest; no_page: si->flags -= SWP_SCANNING; @@ -268,7 +468,7 @@ bad_nofile: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; -} +} static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { @@ -326,97 +526,58 @@ static inline int page_swapcount(struct page *page) } /* - * We can use this swap cache entry directly - * if there are no other references to it. + * We can write to an anon page without COW if there are no other references + * to it. And as a side-effect, free up its swap: because the old content + * on disk will never be read, and seeking back there to write new content + * later would only waste time away from clustering. */ -int can_share_swap_page(struct page *page) +int reuse_swap_page(struct page *page) { int count; - BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageLocked(page)); count = page_mapcount(page); - if (count <= 1 && PageSwapCache(page)) + if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); + if (count == 1 && !PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + } return count == 1; } /* - * Work out if there are any other processes sharing this - * swap cache page. Free it if you can. Return success. + * If swap is getting full, or if there are no more mappings of this page, + * then try_to_free_swap is called to free its swap space. */ -static int remove_exclusive_swap_page_count(struct page *page, int count) +int try_to_free_swap(struct page *page) { - int retval; - struct swap_info_struct * p; - swp_entry_t entry; - - BUG_ON(PagePrivate(page)); - BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageLocked(page)); if (!PageSwapCache(page)) return 0; if (PageWriteback(page)) return 0; - if (page_count(page) != count) /* us + cache + ptes */ - return 0; - - entry.val = page_private(page); - p = swap_info_get(entry); - if (!p) + if (page_swapcount(page)) return 0; - /* Is the only swap cache user the cache itself? */ - retval = 0; - if (p->swap_map[swp_offset(entry)] == 1) { - /* Recheck the page count with the swapcache lock held.. */ - spin_lock_irq(&swapper_space.tree_lock); - if ((page_count(page) == count) && !PageWriteback(page)) { - __delete_from_swap_cache(page); - SetPageDirty(page); - retval = 1; - } - spin_unlock_irq(&swapper_space.tree_lock); - } - spin_unlock(&swap_lock); - - if (retval) { - swap_free(entry); - page_cache_release(page); - } - - return retval; -} - -/* - * Most of the time the page should have two references: one for the - * process and one for the swap cache. - */ -int remove_exclusive_swap_page(struct page *page) -{ - return remove_exclusive_swap_page_count(page, 2); -} - -/* - * The pageout code holds an extra reference to the page. That raises - * the reference count to test for to 2 for a page that is only in the - * swap cache plus 1 for each process that maps the page. - */ -int remove_exclusive_swap_page_ref(struct page *page) -{ - return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); + delete_from_swap_cache(page); + SetPageDirty(page); + return 1; } /* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ -void free_swap_and_cache(swp_entry_t entry) +int free_swap_and_cache(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; struct page *page = NULL; if (is_migration_entry(entry)) - return; + return 1; p = swap_info_get(entry); if (p) { @@ -430,20 +591,19 @@ void free_swap_and_cache(swp_entry_t entry) spin_unlock(&swap_lock); } if (page) { - int one_user; - - BUG_ON(PagePrivate(page)); - one_user = (page_count(page) == 2); - /* Only cache user (+us), or swap space full? Free it! */ - /* Also recheck PageSwapCache after page is locked (above) */ + /* + * Not mapped elsewhere, or swap space full? Free it! + * Also recheck PageSwapCache now page is locked (above). + */ if (PageSwapCache(page) && !PageWriteback(page) && - (one_user || vm_swap_full())) { + (!page_mapped(page) || vm_swap_full())) { delete_from_swap_cache(page); SetPageDirty(page); } unlock_page(page); page_cache_release(page); } + return p != NULL; } #ifdef CONFIG_HIBERNATION @@ -776,10 +936,10 @@ static int try_to_unuse(unsigned int type) break; } - /* + /* * Get a page for the entry, using the existing swap * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. + * page and read the swap into it. */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); @@ -930,7 +1090,16 @@ static int try_to_unuse(unsigned int type) lock_page(page); wait_on_page_writeback(page); } - if (PageSwapCache(page)) + + /* + * It is conceivable that a racing task removed this page from + * swap cache just before we acquired the page lock at the top, + * or while we dropped it in unuse_mm(). The page might even + * be back in swap cache on another swap area: that we must not + * delete, since it may not have been written out to swap yet. + */ + if (PageSwapCache(page) && + likely(page_private(page) == entry.val)) delete_from_swap_cache(page); /* @@ -1203,26 +1372,6 @@ out: return ret; } -#if 0 /* We don't need this yet */ -#include <linux/backing-dev.h> -int page_queue_congested(struct page *page) -{ - struct backing_dev_info *bdi; - - BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ - - if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page_private(page) }; - struct swap_info_struct *sis; - - sis = get_swap_info_struct(swp_type(entry)); - bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; - } else - bdi = page->mapping->backing_dev_info; - return bdi_write_congested(bdi); -} -#endif - asmlinkage long sys_swapoff(const char __user * specialfile) { struct swap_info_struct * p = NULL; @@ -1233,7 +1382,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) char * pathname; int i, type, prev; int err; - + if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1253,7 +1402,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) spin_lock(&swap_lock); for (type = swap_list.head; type >= 0; type = swap_info[type].next) { p = swap_info + type; - if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { + if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) break; } @@ -1426,12 +1575,12 @@ static int swap_show(struct seq_file *swap, void *v) file = ptr->swap_file; len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", - len < 40 ? 40 - len : 1, " ", - S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? + len < 40 ? 40 - len : 1, " ", + S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? "partition" : "file\t", - ptr->pages << (PAGE_SHIFT - 10), - ptr->inuse_pages << (PAGE_SHIFT - 10), - ptr->prio); + ptr->pages << (PAGE_SHIFT - 10), + ptr->inuse_pages << (PAGE_SHIFT - 10), + ptr->prio); return 0; } @@ -1487,12 +1636,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) int i, prev; int error; union swap_header *swap_header = NULL; - int swap_header_version; unsigned int nr_good_pages = 0; int nr_extents = 0; sector_t span; unsigned long maxpages = 1; - int swapfilesize; + unsigned long swapfilepages; unsigned short *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -1570,7 +1718,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } - swapfilesize = i_size_read(inode) >> PAGE_SHIFT; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; /* * Read the swap header. @@ -1584,101 +1732,86 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = PTR_ERR(page); goto bad_swap; } - kmap(page); - swap_header = page_address(page); + swap_header = kmap(page); - if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) - swap_header_version = 1; - else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) - swap_header_version = 2; - else { + if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); error = -EINVAL; goto bad_swap; } - - switch (swap_header_version) { - case 1: - printk(KERN_ERR "version 0 swap is no longer supported. " - "Use mkswap -v1 %s\n", name); + + /* swap partition endianess hack... */ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } + /* Check the swap header's sub-version */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); error = -EINVAL; goto bad_swap; - case 2: - /* swap partition endianess hack... */ - if (swab32(swap_header->info.version) == 1) { - swab32s(&swap_header->info.version); - swab32s(&swap_header->info.last_page); - swab32s(&swap_header->info.nr_badpages); - for (i = 0; i < swap_header->info.nr_badpages; i++) - swab32s(&swap_header->info.badpages[i]); - } - /* Check the swap header's sub-version and the size of - the swap file and bad block lists */ - if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); - error = -EINVAL; - goto bad_swap; - } + } - p->lowest_bit = 1; - p->cluster_next = 1; + p->lowest_bit = 1; + p->cluster_next = 1; - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number of - * bits for the swap offset in the swp_entry_t type and - * 2) the number of bits in the a swap pte as defined by - * the different architectures. In order to find the - * largest possible bit mask a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again and finally the swap - * offset is extracted. This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. - */ - maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number of + * bits for the swap offset in the swp_entry_t type and + * 2) the number of bits in the a swap pte as defined by + * the different architectures. In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; - error = -EINVAL; - if (!maxpages) - goto bad_swap; - if (swapfilesize && maxpages > swapfilesize) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - goto bad_swap; - } - if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; - if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + error = -EINVAL; + if (!maxpages) + goto bad_swap; + if (swapfilepages && maxpages > swapfilepages) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + goto bad_swap; + } + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap; - } + /* OK, set up the swap map and apply the bad block list */ + swap_map = vmalloc(maxpages * sizeof(short)); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } - error = 0; - memset(swap_map, 0, maxpages * sizeof(short)); - for (i = 0; i < swap_header->info.nr_badpages; i++) { - int page_nr = swap_header->info.badpages[i]; - if (page_nr <= 0 || page_nr >= swap_header->info.last_page) - error = -EINVAL; - else - swap_map[page_nr] = SWAP_MAP_BAD; - } - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (error) + memset(swap_map, 0, maxpages * sizeof(short)); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { + error = -EINVAL; goto bad_swap; + } + swap_map[page_nr] = SWAP_MAP_BAD; } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; @@ -1697,6 +1830,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); if (swap_flags & SWAP_FLAG_PREFER) @@ -1705,14 +1845,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) else p->prio = --least_priority; p->swap_map = swap_map; - p->flags = SWP_ACTIVE; + p->flags |= SWP_WRITEOK; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk\n", + "Priority:%d extents:%d across:%lluk %s%s\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + (p->flags & SWP_SOLIDSTATE) ? "SS" : "", + (p->flags & SWP_DISCARDABLE) ? "D" : ""); /* insert swap space into swap_list: */ prev = -1; diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c deleted file mode 100644 index 3e67d575ee6e..000000000000 --- a/mm/tiny-shmem.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code - * - * Matt Mackall <mpm@selenic.com> January, 2004 - * derived from mm/shmem.c and fs/ramfs/inode.c - * - * This is intended for small system where the benefits of the full - * shmem code (swap-backed and resource-limited) are outweighed by - * their complexity. On systems without swap this code should be - * effectively equivalent, but much lighter weight. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/vfs.h> -#include <linux/mount.h> -#include <linux/file.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/swap.h> -#include <linux/ramfs.h> - -static struct file_system_type tmpfs_fs_type = { - .name = "tmpfs", - .get_sb = ramfs_get_sb, - .kill_sb = kill_litter_super, -}; - -static struct vfsmount *shm_mnt; - -static int __init init_tmpfs(void) -{ - BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); - - shm_mnt = kern_mount(&tmpfs_fs_type); - BUG_ON(IS_ERR(shm_mnt)); - - return 0; -} -module_init(init_tmpfs) - -/** - * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps - * @size: size to be set for the file - * @flags: vm_flags - */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) -{ - int error; - struct file *file; - struct inode *inode; - struct dentry *dentry, *root; - struct qstr this; - - if (IS_ERR(shm_mnt)) - return (void *)shm_mnt; - - error = -ENOMEM; - this.name = name; - this.len = strlen(name); - this.hash = 0; /* will go */ - root = shm_mnt->mnt_root; - dentry = d_alloc(root, &this); - if (!dentry) - goto put_memory; - - error = -ENFILE; - file = get_empty_filp(); - if (!file) - goto put_dentry; - - error = -ENOSPC; - inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); - if (!inode) - goto close_file; - - d_instantiate(dentry, inode); - inode->i_size = size; - inode->i_nlink = 0; /* It is unlinked */ - init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &ramfs_file_operations); - -#ifndef CONFIG_MMU - error = ramfs_nommu_expand_for_mapping(inode, size); - if (error) - goto close_file; -#endif - return file; - -close_file: - put_filp(file); -put_dentry: - dput(dentry); -put_memory: - return ERR_PTR(error); -} -EXPORT_SYMBOL_GPL(shmem_file_setup); - -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap_pgoff - */ -int shmem_zero_setup(struct vm_area_struct *vma) -{ - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; - - file = shmem_file_setup("dev/zero", size, vma->vm_flags); - if (IS_ERR(file)) - return PTR_ERR(file); - - if (vma->vm_file) - fput(vma->vm_file); - vma->vm_file = file; - vma->vm_ops = &generic_file_vm_ops; - return 0; -} - -int shmem_unuse(swp_entry_t entry, struct page *page) -{ - return 0; -} - -#ifndef CONFIG_MMU -unsigned long shmem_get_unmapped_area(struct file *file, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); -} -#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7465f22fec0c..c5db9a7264d9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,6 +14,7 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -381,8 +382,9 @@ found: goto retry; } if (printk_ratelimit()) - printk(KERN_WARNING "vmap allocation failed: " - "use vmalloc=<size> to increase size.\n"); + printk(KERN_WARNING + "vmap allocation for size %lu failed: " + "use vmalloc=<size> to increase size.\n", size); return ERR_PTR(-EBUSY); } @@ -432,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } +static void vmap_debug_free_range(unsigned long start, unsigned long end) +{ + /* + * Unmap page tables and force a TLB flush immediately if + * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free + * bugs similarly to those in linear kernel virtual address + * space after a page has been freed. + * + * All the lazy freeing logic is still retained, in order to + * minimise intrusiveness of this debugging feature. + * + * This is going to be *slow* (linear kernel virtual address + * debugging doesn't do a broadcast TLB flush so it is a lot + * faster). + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); +#endif +} + /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -472,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { - static DEFINE_SPINLOCK(purge_lock); + static DEFINE_MUTEX(purge_lock); LIST_HEAD(valist); struct vmap_area *va; int nr = 0; @@ -483,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, * the case that isn't actually used at the moment anyway. */ if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) + if (!mutex_trylock(&purge_lock)) return; } else - spin_lock(&purge_lock); + mutex_lock(&purge_lock); rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { @@ -518,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, __free_vmap_area(va); spin_unlock(&vmap_area_lock); } - spin_unlock(&purge_lock); + mutex_unlock(&purge_lock); } /* @@ -912,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr & (PAGE_SIZE-1)); debug_check_no_locks_freed(mem, size); + vmap_debug_free_range(addr, addr+size); if (likely(count <= VMAP_MAX_ALLOC)) vb_free(mem, size); @@ -1128,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; struct vm_struct *tmp, **p; + + vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; @@ -1375,7 +1401,8 @@ void *vmalloc_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; @@ -1420,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) @@ -1440,7 +1468,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); + return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, + -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1456,7 +1485,8 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; diff --git a/mm/vmscan.c b/mm/vmscan.c index d196f46c8808..b07c48b09a93 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -52,6 +52,9 @@ struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -617,7 +620,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced && page_mapping_inuse(page)) goto activate_locked; -#ifdef CONFIG_SWAP /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. @@ -625,20 +627,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageAnon(page) && !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - switch (try_to_munlock(page)) { - case SWAP_FAIL: /* shouldn't happen */ - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_SUCCESS: - ; /* fall thru'; add to swap cache */ - } - if (!add_to_swap(page, GFP_ATOMIC)) + if (!add_to_swap(page)) goto activate_locked; may_enter_fs = 1; } -#endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -752,6 +744,8 @@ free_it: continue; cull_mlocked: + if (PageSwapCache(page)) + try_to_free_swap(page); unlock_page(page); putback_lru_page(page); continue; @@ -759,7 +753,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) - remove_exclusive_swap_page_ref(page); + try_to_free_swap(page); VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; @@ -1173,11 +1167,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) zone->prev_priority = priority; } -static inline int zone_is_near_oom(struct zone *zone) -{ - return zone->pages_scanned >= (zone_lru_pages(zone) * 3); -} - /* * This moves pages from the active list to the inactive list. * @@ -1248,6 +1237,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, list_add(&page->lru, &l_inactive); } + /* + * Move the pages to the [file or anon] inactive list. + */ + pagevec_init(&pvec, 1); + pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; + spin_lock_irq(&zone->lru_lock); /* * Count referenced pages from currently used mappings as @@ -1255,15 +1251,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * This helps balance scan pressure between file and anonymous * pages in get_scan_ratio. */ - zone->recent_rotated[!!file] += pgmoved; - - /* - * Move the pages to the [file or anon] inactive list. - */ - pagevec_init(&pvec, 1); + if (scan_global_lru(sc)) + zone->recent_rotated[!!file] += pgmoved; - pgmoved = 0; - lru = LRU_BASE + file * LRU_FILE; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1336,12 +1326,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long anon_prio, file_prio; unsigned long ap, fp; - anon = zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - file = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - free = zone_page_state(zone, NR_FREE_PAGES); - /* If we have no swap space, do not bother scanning anon pages. */ if (nr_swap_pages <= 0) { percent[0] = 0; @@ -1349,6 +1333,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, return; } + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + /* If we have very few page cache pages, force-scan anon pages. */ if (unlikely(file + free <= zone->pages_high)) { percent[0] = 100; @@ -1408,14 +1398,15 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static unsigned long shrink_zone(int priority, struct zone *zone, +static void shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - unsigned long nr_reclaimed = 0; unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; + unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long swap_cluster_max = sc->swap_cluster_max; get_scan_ratio(zone, sc, percent); @@ -1431,7 +1422,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone, } zone->lru[l].nr_scan += scan; nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= sc->swap_cluster_max) + if (nr[l] >= swap_cluster_max) zone->lru[l].nr_scan = 0; else nr[l] = 0; @@ -1450,16 +1441,28 @@ static unsigned long shrink_zone(int priority, struct zone *zone, nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], - (unsigned long)sc->swap_cluster_max); + nr_to_scan = min(nr[l], swap_cluster_max); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + zone, sc, priority); } } + /* + * On large memory systems, scan >> priority can become + * really large. This is fine for the starting priority; + * we want to put equal scanning pressure on each zone. + * However, if the VM has a harder time of freeing pages, + * with multiple processes reclaiming pages, the total + * freeing target can get unreasonably large. + */ + if (nr_reclaimed > swap_cluster_max && + priority < DEF_PRIORITY && !current_is_kswapd()) + break; } + sc->nr_reclaimed = nr_reclaimed; + /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. @@ -1470,7 +1473,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); - return nr_reclaimed; } /* @@ -1484,16 +1486,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone, * b) The zones may be over pages_high but they must go *over* pages_high to * satisfy the `incremental min' zone defense algorithm. * - * Returns the number of reclaimed pages. - * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static unsigned long shrink_zones(int priority, struct zonelist *zonelist, +static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); - unsigned long nr_reclaimed = 0; struct zoneref *z; struct zone *zone; @@ -1524,10 +1523,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, priority); } - nr_reclaimed += shrink_zone(priority, zone, sc); + shrink_zone(priority, zone, sc); } - - return nr_reclaimed; } /* @@ -1552,7 +1549,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int priority; unsigned long ret = 0; unsigned long total_scanned = 0; - unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; struct zoneref *z; @@ -1580,7 +1576,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zonelist, sc); + shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -1588,13 +1584,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (scan_global_lru(sc)) { shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { - nr_reclaimed += reclaim_state->reclaimed_slab; + sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } } total_scanned += sc->nr_scanned; - if (nr_reclaimed >= sc->swap_cluster_max) { - ret = nr_reclaimed; + if (sc->nr_reclaimed >= sc->swap_cluster_max) { + ret = sc->nr_reclaimed; goto out; } @@ -1617,7 +1613,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scan_global_lru(sc)) - ret = nr_reclaimed; + ret = sc->nr_reclaimed; out: /* * Now that we've scanned all the zones at this priority level, note @@ -1712,7 +1708,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) int priority; int i; unsigned long total_scanned; - unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -1731,7 +1726,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) loop_again: total_scanned = 0; - nr_reclaimed = 0; + sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); @@ -1817,11 +1812,11 @@ loop_again: */ if (!zone_watermark_ok(zone, order, 8*zone->pages_high, end_zone, 0)) - nr_reclaimed += shrink_zone(priority, zone, &sc); + shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - nr_reclaimed += reclaim_state->reclaimed_slab; + sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone_is_all_unreclaimable(zone)) continue; @@ -1835,7 +1830,7 @@ loop_again: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > nr_reclaimed + nr_reclaimed / 2) + total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; } if (all_zones_ok) @@ -1853,7 +1848,7 @@ loop_again: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if (nr_reclaimed >= SWAP_CLUSTER_MAX) + if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) break; } out: @@ -1872,10 +1867,27 @@ out: try_to_freeze(); + /* + * Fragmentation may mean that the system cannot be + * rebalanced for high-order allocations in all zones. + * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, + * it means the zones have been fully scanned and are still + * not balanced. For high-order allocations, there is + * little point trying all over again as kswapd may + * infinite loop. + * + * Instead, recheck all watermarks at order-0 as they + * are the most important. If watermarks are ok, kswapd will go + * back to sleep. High-order users can still perform direct + * reclaim if they wish. + */ + if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) + order = sc.order = 0; + goto loop_again; } - return nr_reclaimed; + return sc.nr_reclaimed; } /* @@ -2227,7 +2239,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; int priority; - unsigned long nr_reclaimed = 0; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), @@ -2260,9 +2271,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) priority = ZONE_RECLAIM_PRIORITY; do { note_zone_scanning_priority(zone, priority); - nr_reclaimed += shrink_zone(priority, zone, &sc); + shrink_zone(priority, zone, &sc); priority--; - } while (priority >= 0 && nr_reclaimed < nr_pages); + } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); @@ -2286,13 +2297,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. */ - nr_reclaimed += slab_reclaimable - + sc.nr_reclaimed += slab_reclaimable - zone_page_state(zone, NR_SLAB_RECLAIMABLE); } p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - return nr_reclaimed >= nr_pages; + return sc.nr_reclaimed >= nr_pages; } int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) @@ -2472,7 +2483,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) * back onto @zone's unevictable list. */ #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -void scan_zone_unevictable_pages(struct zone *zone) +static void scan_zone_unevictable_pages(struct zone *zone) { struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; unsigned long scan; @@ -2514,7 +2525,7 @@ void scan_zone_unevictable_pages(struct zone *zone) * that has possibly/probably made some previously unevictable pages * evictable. */ -void scan_all_zones_unevictable_pages(void) +static void scan_all_zones_unevictable_pages(void) { struct zone *zone; |