diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 10 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/allocpercpu.c | 9 | ||||
-rw-r--r-- | mm/backing-dev.c | 16 | ||||
-rw-r--r-- | mm/filemap.c | 404 | ||||
-rw-r--r-- | mm/filemap_xip.c | 41 | ||||
-rw-r--r-- | mm/fremap.c | 179 | ||||
-rw-r--r-- | mm/highmem.c | 7 | ||||
-rw-r--r-- | mm/hugetlb.c | 80 | ||||
-rw-r--r-- | mm/madvise.c | 6 | ||||
-rw-r--r-- | mm/memory.c | 341 | ||||
-rw-r--r-- | mm/mempolicy.c | 65 | ||||
-rw-r--r-- | mm/mempool.c | 6 | ||||
-rw-r--r-- | mm/migrate.c | 3 | ||||
-rw-r--r-- | mm/mlock.c | 5 | ||||
-rw-r--r-- | mm/mmap.c | 98 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/mremap.c | 15 | ||||
-rw-r--r-- | mm/nommu.c | 57 | ||||
-rw-r--r-- | mm/page-writeback.c | 33 | ||||
-rw-r--r-- | mm/page_alloc.c | 631 | ||||
-rw-r--r-- | mm/pdflush.c | 1 | ||||
-rw-r--r-- | mm/readahead.c | 516 | ||||
-rw-r--r-- | mm/rmap.c | 6 | ||||
-rw-r--r-- | mm/shmem.c | 94 | ||||
-rw-r--r-- | mm/slab.c | 141 | ||||
-rw-r--r-- | mm/slob.c | 600 | ||||
-rw-r--r-- | mm/slub.c | 753 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/swap_state.c | 5 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 59 | ||||
-rw-r--r-- | mm/util.c | 74 | ||||
-rw-r--r-- | mm/vmalloc.c | 78 | ||||
-rw-r--r-- | mm/vmscan.c | 212 | ||||
-rw-r--r-- | mm/vmstat.c | 2 |
36 files changed, 2673 insertions, 1884 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 8ac412b45f18..86187221e78f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -117,7 +117,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC64) + depends on (IA64 || X86 || PPC64 || SUPERH) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND @@ -163,8 +163,16 @@ config ZONE_DMA_FLAG default "0" if !ZONE_DMA default "1" +config BOUNCE + def_bool y + depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + config NR_QUICK int depends on QUICKLIST default "2" if (SUPERH && !SUPERH64) default "1" + +config VIRT_TO_BUS + def_bool y + depends on !ARCH_NO_VIRT_TO_BUS diff --git a/mm/Makefile b/mm/Makefile index a9148ea329aa..245e33ab00c4 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -13,9 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ $(mmu-y) -ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) -obj-y += bounce.o -endif +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index b2486cf887a0..00b02623f008 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -53,12 +53,9 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) int node = cpu_to_node(cpu); BUG_ON(pdata->ptrs[cpu]); - if (node_online(node)) { - /* FIXME: kzalloc_node(size, gfp, node) */ - pdata->ptrs[cpu] = kmalloc_node(size, gfp, node); - if (pdata->ptrs[cpu]) - memset(pdata->ptrs[cpu], 0, size); - } else + if (node_online(node)) + pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); + else pdata->ptrs[cpu] = kzalloc(size, gfp); return pdata->ptrs[cpu]; } diff --git a/mm/backing-dev.c b/mm/backing-dev.c 
index e5de3781d3fe..f50a2811f9dc 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout) } EXPORT_SYMBOL(congestion_wait); -long congestion_wait_interruptible(int rw, long timeout) -{ - long ret; - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; - - prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE); - if (signal_pending(current)) - ret = -ERESTARTSYS; - else - ret = io_schedule_timeout(timeout); - finish_wait(wqh, &wait); - return ret; -} -EXPORT_SYMBOL(congestion_wait_interruptible); - /** * congestion_end - wake up sleepers on a congested backing_dev_info * @rw: READ or WRITE diff --git a/mm/filemap.c b/mm/filemap.c index c6ebd9f912ab..49a6fe375d01 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + BUG_ON(page_mapped(page)); } void remove_from_page_cache(struct page *page) @@ -866,13 +867,11 @@ void do_generic_mapping_read(struct address_space *mapping, { struct inode *inode = mapping->host; unsigned long index; - unsigned long end_index; unsigned long offset; unsigned long last_index; unsigned long next_index; unsigned long prev_index; unsigned int prev_offset; - loff_t isize; struct page *cached_page; int error; struct file_ra_state ra = *_ra; @@ -885,42 +884,58 @@ void do_generic_mapping_read(struct address_space *mapping, last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; - isize = i_size_read(inode); - if (!isize) - goto out; - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; + unsigned long end_index; + loff_t isize; unsigned long nr, ret; + cond_resched(); +find_page: + page = find_get_page(mapping, index); + if (!page) { + page_cache_sync_readahead(mapping, + &ra, filp, + index, last_index - index); + page = find_get_page(mapping, index); + if 
(unlikely(page == NULL)) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, + &ra, filp, page, + index, last_index - index); + } + if (!PageUptodate(page)) + goto page_not_up_to_date; +page_ok: + /* + * i_size must be checked after we know the page is Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + goto out; + } + /* nr is the maximum number of bytes to copy from this page */ nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; + if (index == end_index) { nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (nr <= offset) { + page_cache_release(page); goto out; } } nr = nr - offset; - cond_resched(); - if (index == next_index) - next_index = page_cache_readahead(mapping, &ra, filp, - index, last_index - index); - -find_page: - page = find_get_page(mapping, index); - if (unlikely(page == NULL)) { - handle_ra_miss(mapping, &ra, index); - goto no_cached_page; - } - if (!PageUptodate(page)) - goto page_not_up_to_date; -page_ok: - /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. @@ -1006,31 +1021,6 @@ readpage: unlock_page(page); } - /* - * i_size must be checked after we have done ->readpage. - * - * Checking i_size after the readpage allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). 
- */ - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - page_cache_release(page); - goto out; - } - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - page_cache_release(page); - goto out; - } - } - nr = nr - offset; goto page_ok; readpage_error: @@ -1066,6 +1056,7 @@ no_cached_page: out: *_ra = ra; + _ra->prev_index = prev_index; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) @@ -1218,6 +1209,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = retval ?: desc.error; break; } + if (desc.count > 0) + break; } } out: @@ -1314,62 +1307,62 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) #define MMAP_LOTSAMISS (100) /** - * filemap_nopage - read in file data for page fault handling - * @area: the applicable vm_area - * @address: target address to read in - * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL + * filemap_fault - read in file data for page fault handling + * @vma: vma in which the fault was taken + * @vmf: struct vm_fault containing details of the fault * - * filemap_nopage() is invoked via the vma operations vector for a + * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. 
*/ -struct page *filemap_nopage(struct vm_area_struct *area, - unsigned long address, int *type) +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int error; - struct file *file = area->vm_file; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; struct page *page; - unsigned long size, pgoff; - int did_readaround = 0, majmin = VM_FAULT_MINOR; - - pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + unsigned long size; + int did_readaround = 0; + int ret = 0; -retry_all: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff >= size) + if (vmf->pgoff >= size) goto outside_data_content; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(area)) + if (VM_RandomReadHint(vma)) goto no_cached_page; /* - * The readahead code wants to be told about each and every page - * so it can build and shrink its windows appropriately - * - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(area)) - page_cache_readahead(mapping, ra, file, pgoff, 1); - - /* * Do we have something in the page cache already? */ retry_find: - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, vmf->pgoff); + /* + * For sequential accesses, we use the generic readahead logic. + */ + if (VM_SequentialReadHint(vma)) { + if (!page) { + page_cache_sync_readahead(mapping, ra, file, + vmf->pgoff, 1); + page = find_lock_page(mapping, vmf->pgoff); + if (!page) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, ra, file, page, + vmf->pgoff, 1); + } + } + if (!page) { unsigned long ra_pages; - if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); - goto no_cached_page; - } ra->mmap_miss++; /* @@ -1384,7 +1377,7 @@ retry_find: * check did_readaround, as this is an inner loop. 
*/ if (!did_readaround) { - majmin = VM_FAULT_MAJOR; + ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } did_readaround = 1; @@ -1392,11 +1385,11 @@ retry_find: if (ra_pages) { pgoff_t start = 0; - if (pgoff > ra_pages / 2) - start = pgoff - ra_pages / 2; + if (vmf->pgoff > ra_pages / 2) + start = vmf->pgoff - ra_pages / 2; do_page_cache_readahead(mapping, file, start, ra_pages); } - page = find_get_page(mapping, pgoff); + page = find_lock_page(mapping, vmf->pgoff); if (!page) goto no_cached_page; } @@ -1405,35 +1398,42 @@ retry_find: ra->mmap_hit++; /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. + * We have a locked page in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error. */ - if (!PageUptodate(page)) + if (unlikely(!PageUptodate(page))) goto page_not_uptodate; -success: + /* Must recheck i_size under page lock */ + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + unlock_page(page); + goto outside_data_content; + } + /* * Found the page and have a reference on it. */ mark_page_accessed(page); - if (type) - *type = majmin; - return page; + ra->prev_index = page->index; + vmf->page = page; + return ret | VM_FAULT_LOCKED; outside_data_content: /* * An external ptracer can access pages that normally aren't * accessible.. */ - if (area->vm_mm == current->mm) - return NOPAGE_SIGBUS; + if (vma->vm_mm == current->mm) + return VM_FAULT_SIGBUS; + /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, pgoff); + error = page_cache_read(file, vmf->pgoff); /* * The page we want has now been added to the page cache. @@ -1449,12 +1449,13 @@ no_cached_page: * to schedule I/O. 
*/ if (error == -ENOMEM) - return NOPAGE_OOM; - return NOPAGE_SIGBUS; + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; page_not_uptodate: + /* IO error path */ if (!did_readaround) { - majmin = VM_FAULT_MAJOR; + ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } @@ -1464,217 +1465,21 @@ page_not_uptodate: * because there really aren't any performance issues here * and we need to check for errors. */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } ClearPageError(page); error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ - shrink_readahead_size_eio(file, ra); page_cache_release(page); - return NOPAGE_SIGBUS; -} -EXPORT_SYMBOL(filemap_nopage); -static struct page * filemap_getpage(struct file *file, unsigned long pgoff, - int nonblock) -{ - struct address_space *mapping = file->f_mapping; - struct page *page; - int error; - - /* - * Do we have something in the page cache already? - */ -retry_find: - page = find_get_page(mapping, pgoff); - if (!page) { - if (nonblock) - return NULL; - goto no_cached_page; - } - - /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. - */ - if (!PageUptodate(page)) { - if (nonblock) { - page_cache_release(page); - return NULL; - } - goto page_not_uptodate; - } - -success: - /* - * Found the page and have a reference on it. 
- */ - mark_page_accessed(page); - return page; - -no_cached_page: - error = page_cache_read(file, pgoff); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) + if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - return NULL; - -page_not_uptodate: - lock_page(page); - - /* Did it get truncated while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Umm, take care of errors if the page isn't up-to-date. - * Try to re-read it _once_. We do this synchronously, - * because there really aren't any performance issues here - * and we need to check for errors. - */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - ClearPageError(page); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. 
- */ -err: - page_cache_release(page); - - return NULL; -} - -int filemap_populate(struct vm_area_struct *vma, unsigned long addr, - unsigned long len, pgprot_t prot, unsigned long pgoff, - int nonblock) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long size; - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int err; - - if (!nonblock) - force_page_cache_readahead(mapping, vma->vm_file, - pgoff, len >> PAGE_CACHE_SHIFT); - -repeat: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) - return -EINVAL; - - page = filemap_getpage(file, pgoff, nonblock); - - /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as - * done in shmem_populate calling shmem_getpage */ - if (!page && !nonblock) - return -ENOMEM; - - if (page) { - err = install_page(mm, vma, addr, page, prot); - if (err) { - page_cache_release(page); - return err; - } - } else if (vma->vm_flags & VM_NONLINEAR) { - /* No page was found just because we can't read it in now (being - * here implies nonblock != 0), but the page may exist, so set - * the PTE to fault it in later. */ - err = install_file_pte(mm, vma, addr, pgoff, prot); - if (err) - return err; - } - - len -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - if (len) - goto repeat; - - return 0; + /* Things didn't work out. Return zero to tell the mm layer so. 
*/ + shrink_readahead_size_eio(file, ra); + return VM_FAULT_SIGBUS; } -EXPORT_SYMBOL(filemap_populate); +EXPORT_SYMBOL(filemap_fault); struct vm_operations_struct generic_file_vm_ops = { - .nopage = filemap_nopage, - .populate = filemap_populate, + .fault = filemap_fault, }; /* This is used for a general mmap of a disk file */ @@ -1687,6 +1492,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -1964,7 +1770,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (unlikely(*pos + *count > MAX_NON_LFS && !(file->f_flags & O_LARGEFILE))) { if (*pos >= MAX_NON_LFS) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } if (*count > MAX_NON_LFS - (unsigned long)*pos) { @@ -1982,7 +1787,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (likely(!isblk)) { if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { if (*count || *pos > inode->i_sb->s_maxbytes) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } /* zero-length writes at ->s_maxbytes are OK */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 65ffc321f0c0..53ee6a299635 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -205,62 +205,58 @@ __xip_unmap (struct address_space * mapping, } /* - * xip_nopage() is invoked via the vma operations vector for a + * xip_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. 
* - * This function is derived from filemap_nopage, but used for execute in place + * This function is derived from filemap_fault, but used for execute in place */ -static struct page * -xip_file_nopage(struct vm_area_struct * area, - unsigned long address, - int *type) +static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) { struct file *file = area->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct page *page; - unsigned long size, pgoff, endoff; + pgoff_t size; - pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) - + area->vm_pgoff; - endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) - + area->vm_pgoff; + /* XXX: are VM_FAULT_ codes OK? */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff >= size) - return NOPAGE_SIGBUS; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; - page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); + page = mapping->a_ops->get_xip_page(mapping, + vmf->pgoff*(PAGE_SIZE/512), 0); if (!IS_ERR(page)) goto out; if (PTR_ERR(page) != -ENODATA) - return NOPAGE_SIGBUS; + return VM_FAULT_OOM; /* sparse block */ if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { /* maybe shared writable, allocate new block */ - page = mapping->a_ops->get_xip_page (mapping, - pgoff*(PAGE_SIZE/512), 1); + page = mapping->a_ops->get_xip_page(mapping, + vmf->pgoff*(PAGE_SIZE/512), 1); if (IS_ERR(page)) - return NOPAGE_SIGBUS; + return VM_FAULT_SIGBUS; /* unmap page at pgoff from all other vmas */ - __xip_unmap(mapping, pgoff); + __xip_unmap(mapping, vmf->pgoff); } else { /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) - return NOPAGE_OOM; + return VM_FAULT_OOM; } out: page_cache_get(page); - return page; + vmf->page = page; + return 0; } static struct vm_operations_struct xip_file_vm_ops 
= { - .nopage = xip_file_nopage, + .fault = xip_file_fault, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -269,6 +265,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) file_accessed(file); vma->vm_ops = &xip_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); diff --git a/mm/fremap.c b/mm/fremap.c index 4e3f53dd5fd4..c395b1abf082 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -20,13 +20,14 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> -static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, +static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; - struct page *page = NULL; if (pte_present(pte)) { + struct page *page; + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); @@ -35,68 +36,21 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_page_dirty(page); page_remove_rmap(page, vma); page_cache_release(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, file_rss); } } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } - return !!page; } /* - * Install a file page to a given virtual memory address, release any - * previously existing mapping. - */ -int install_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, struct page *page, pgprot_t prot) -{ - struct inode *inode; - pgoff_t size; - int err = -ENOMEM; - pte_t *pte; - pte_t pte_val; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - /* - * This page may have been truncated. Tell the - * caller about it. 
- */ - err = -EINVAL; - inode = vma->vm_file->f_mapping->host; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (!page->mapping || page->index >= size) - goto unlock; - err = -ENOMEM; - if (page_mapcount(page) > INT_MAX/2) - goto unlock; - - if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) - inc_mm_counter(mm, file_rss); - - flush_icache_page(vma, page); - pte_val = mk_pte(page, prot); - set_pte_at(mm, addr, pte, pte_val); - page_add_file_rmap(page); - update_mmu_cache(vma, addr, pte_val); - lazy_mmu_prot_update(pte_val); - err = 0; -unlock: - pte_unmap_unlock(pte, ptl); -out: - return err; -} -EXPORT_SYMBOL(install_page); - -/* * Install a file pte to a given virtual memory address, release any * previously existing mapping. */ -int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, +static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; @@ -107,10 +61,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto out; - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { - update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); - } + if (!pte_none(*pte)) + zap_pte(mm, vma, addr, pte); set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); /* @@ -126,6 +78,25 @@ out: return err; } +static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + int err; + + do { + err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); + if (err) + return err; + + size -= PAGE_SIZE; + addr += PAGE_SIZE; + pgoff++; + } while (size); + + return 0; + +} + /*** * sys_remap_file_pages - remap arbitrary pages of a shared backing store * file within an existing vma. @@ -183,41 +154,77 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, * the single existing vma. 
vm_private_data is used as a * swapout cursor in a VM_NONLINEAR vma. */ - if (vma && (vma->vm_flags & VM_SHARED) && - (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) && - vma->vm_ops && vma->vm_ops->populate && - end > start && start >= vma->vm_start && - end <= vma->vm_end) { - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (pgoff != linear_page_index(vma, start) && - !(vma->vm_flags & VM_NONLINEAR)) { - if (!has_write_lock) { - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - spin_lock(&mapping->i_mmap_lock); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); - } + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; - err = vma->vm_ops->populate(vma, start, size, - vma->vm_page_prot, - pgoff, flags & MAP_NONBLOCK); + if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) + goto out; + + if (!vma->vm_flags & VM_CAN_NONLINEAR) + goto out; + if (end <= start || start < vma->vm_start || end > vma->vm_end) + goto out; + + /* Must set VM_NONLINEAR before any pages are populated. */ + if (!(vma->vm_flags & VM_NONLINEAR)) { + /* Don't need a nonlinear mapping, exit success */ + if (pgoff == linear_page_index(vma, start)) { + err = 0; + goto out; + } + + if (!has_write_lock) { + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + has_write_lock = 1; + goto retry; + } + mapping = vma->vm_file->f_mapping; /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). + * page_mkclean doesn't work on nonlinear vmas, so if + * dirty pages need to be accounted, emulate with linear + * vmas. 
*/ + if (mapping_cap_account_dirty(mapping)) { + unsigned long addr; + + flags &= MAP_NONBLOCK; + addr = mmap_region(vma->vm_file, start, size, + flags, vma->vm_flags, pgoff, 1); + if (IS_ERR_VALUE(addr)) { + err = addr; + } else { + BUG_ON(addr != start); + err = 0; + } + goto out; + } + spin_lock(&mapping->i_mmap_lock); + flush_dcache_mmap_lock(mapping); + vma->vm_flags |= VM_NONLINEAR; + vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); + flush_dcache_mmap_unlock(mapping); + spin_unlock(&mapping->i_mmap_lock); + } + + err = populate_range(mm, vma, start, size, pgoff); + if (!err && !(flags & MAP_NONBLOCK)) { + if (unlikely(has_write_lock)) { + downgrade_write(&mm->mmap_sem); + has_write_lock = 0; + } + make_pages_present(start, start+size); } + + /* + * We can't clear VM_NONLINEAR because we'd have to do + * it after ->populate completes, and that would prevent + * downgrading the lock. (Locks can't be upgraded). + */ + +out: if (likely(!has_write_lock)) up_read(&mm->mmap_sem); else diff --git a/mm/highmem.c b/mm/highmem.c index be8f8d36a8b9..7a967bc35152 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -46,9 +46,14 @@ unsigned int nr_free_highpages (void) pg_data_t *pgdat; unsigned int pages = 0; - for_each_online_pgdat(pgdat) + for_each_online_pgdat(pgdat) { pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], NR_FREE_PAGES); + if (zone_movable_is_highmem()) + pages += zone_page_state( + &pgdat->node_zones[ZONE_MOVABLE], + NR_FREE_PAGES); + } return pages; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a45d1f0691ce..f127940ec24f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,9 @@ unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +static gfp_t htlb_alloc_mask = GFP_HIGHUSER; +unsigned long hugepages_treat_as_movable; + /* * Protects updates to 
hugepage_freelists, nr_huge_pages, and free_huge_pages */ @@ -66,24 +69,22 @@ static void enqueue_huge_page(struct page *page) static struct page *dequeue_huge_page(struct vm_area_struct *vma, unsigned long address) { - int nid = numa_node_id(); + int nid; struct page *page = NULL; - struct zonelist *zonelist = huge_zonelist(vma, address); + struct zonelist *zonelist = huge_zonelist(vma, address, + htlb_alloc_mask); struct zone **z; for (z = zonelist->zones; *z; z++) { nid = zone_to_nid(*z); - if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) && - !list_empty(&hugepage_freelists[nid])) - break; - } - - if (*z) { - page = list_entry(hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && + !list_empty(&hugepage_freelists[nid])) { + page = list_entry(hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + free_huge_pages--; + free_huge_pages_node[nid]--; + } } return page; } @@ -101,13 +102,24 @@ static void free_huge_page(struct page *page) static int alloc_fresh_huge_page(void) { - static int nid = 0; + static int prev_nid; struct page *page; - page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); - nid = next_node(nid, node_online_map); + int nid; + + /* + * Copy static prev_nid to local nid, work on that, then copy it + * back to prev_nid afterwards: otherwise there's a window in which + * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really doesn't + * matter if occasionally a racer chooses the same nid as we do. 
+ */ + nid = next_node(prev_nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); + prev_nid = nid; + + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, + HUGETLB_PAGE_ORDER); if (page) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); @@ -196,7 +208,7 @@ static void update_and_free_page(struct page *page) 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); } - page[1].lru.next = NULL; + set_compound_page_dtor(page, NULL); set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -256,6 +268,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, max_huge_pages = set_max_huge_pages(max_huge_pages); return 0; } + +int hugetlb_treat_movable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + if (hugepages_treat_as_movable) + htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; + else + htlb_alloc_mask = GFP_HIGHUSER; + return 0; +} + #endif /* CONFIG_SYSCTL */ int hugetlb_report_meminfo(char *buf) @@ -292,15 +317,14 @@ unsigned long hugetlb_total_pages(void) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. 
*/ -static struct page *hugetlb_nopage(struct vm_area_struct *vma, - unsigned long address, int *unused) +static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); - return NULL; + return 0; } struct vm_operations_struct hugetlb_vm_ops = { - .nopage = hugetlb_nopage, + .fault = hugetlb_vm_op_fault, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -446,7 +470,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, avoidcopy = (page_count(old_page) == 1); if (avoidcopy) { set_huge_ptep_writable(vma, address, ptep); - return VM_FAULT_MINOR; + return 0; } page_cache_get(old_page); @@ -471,10 +495,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(new_page); page_cache_release(old_page); - return VM_FAULT_MINOR; + return 0; } -int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; @@ -528,7 +552,7 @@ retry: if (idx >= size) goto backout; - ret = VM_FAULT_MINOR; + ret = 0; if (!pte_none(*ptep)) goto backout; @@ -579,7 +603,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - ret = VM_FAULT_MINOR; + ret = 0; spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ @@ -618,7 +642,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); ret = hugetlb_fault(mm, vma, vaddr, 0); spin_lock(&mm->page_table_lock); - if (ret == VM_FAULT_MINOR) + if (!(ret & VM_FAULT_MAJOR)) continue; remainder = 0; diff --git a/mm/madvise.c b/mm/madvise.c index 60542d006ec1..93ee375b38e7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -287,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) struct vm_area_struct * vma, *prev; int 
unmapped_error = 0; int error = -EINVAL; + int write; size_t len; - if (madvise_need_mmap_write(behavior)) + write = madvise_need_mmap_write(behavior); + if (write) down_write(&current->mm->mmap_sem); else down_read(&current->mm->mmap_sem); @@ -354,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) vma = find_vma(current->mm, start); } out: - if (madvise_need_mmap_write(behavior)) + if (write) up_write(&current->mm->mmap_sem); else up_read(&current->mm->mmap_sem); diff --git a/mm/memory.c b/mm/memory.c index f64cbf9baa36..ca8cac11bd2c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -78,11 +78,9 @@ unsigned long num_physpages; * and ZONE_HIGHMEM. */ void * high_memory; -unsigned long vmalloc_earlyreserve; EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); -EXPORT_SYMBOL(vmalloc_earlyreserve); int randomize_va_space __read_mostly = 1; @@ -1049,43 +1047,51 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) foll_flags |= FOLL_GET; if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->nopage)) + (!vma->vm_ops || (!vma->vm_ops->nopage && + !vma->vm_ops->fault))) foll_flags |= FOLL_ANON; do { struct page *page; + /* + * If tsk is ooming, cut off its access to large memory + * allocations. It has a pending SIGKILL, but it can't + * be processed until returning to user space. + */ + if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) + return -ENOMEM; + if (write) foll_flags |= FOLL_WRITE; cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; - ret = __handle_mm_fault(mm, vma, start, + ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + else if (ret & VM_FAULT_SIGBUS) + return i ? 
i : -EFAULT; + BUG(); + } + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + /* - * The VM_FAULT_WRITE bit tells us that do_wp_page has - * broken COW when necessary, even if maybe_mkwrite - * decided not to set pte_write. We can thus safely do - * subsequent page lookups as if they were reads. + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. */ if (ret & VM_FAULT_WRITE) foll_flags &= ~FOLL_WRITE; - - switch (ret & ~VM_FAULT_WRITE) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - return i ? i : -EFAULT; - case VM_FAULT_OOM: - return i ? i : -ENOMEM; - default: - BUG(); - } + cond_resched(); } if (pages) { @@ -1632,7 +1638,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = VM_FAULT_MINOR; + int reuse = 0, ret = 0; struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); @@ -1709,11 +1715,11 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; if (old_page == ZERO_PAGE(address)) { - new_page = alloc_zeroed_user_highpage(vma, address); + new_page = alloc_zeroed_user_highpage_movable(vma, address); if (!new_page) goto oom; } else { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; cow_user_page(new_page, old_page, address, vma); @@ -1759,6 +1765,15 @@ gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. 
+ * + * do_no_page is protected similarly. + */ + wait_on_page_locked(dirty_page); set_page_dirty_balance(dirty_page); put_page(dirty_page); } @@ -1825,6 +1840,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long restart_addr; int need_break; + /* + * files that support invalidating or truncating portions of the + * file from under mmaped areas must have their ->fault function + * return a locked page (and set VM_FAULT_LOCKED in the return). + * This provides synchronisation against concurrent unmapping here. + */ + again: restart_addr = vma->vm_truncate_count; if (is_restart_addr(restart_addr) && start_addr < restart_addr) { @@ -1953,17 +1975,8 @@ void unmap_mapping_range(struct address_space *mapping, spin_lock(&mapping->i_mmap_lock); - /* serialize i_size write against truncate_count write */ - smp_wmb(); - /* Protect against page faults, and endless unmapping loops */ + /* Protect against endless unmapping loops */ mapping->truncate_count++; - /* - * For archs where spin_lock has inclusive semantics like ia64 - * this smp_mb() will prevent to read pagetable contents - * before the truncate_count increment is visible to - * other cpus. - */ - smp_mb(); if (unlikely(is_restart_addr(mapping->truncate_count))) { if (mapping->truncate_count == 0) reset_vma_truncate_counts(mapping); @@ -2002,8 +2015,18 @@ int vmtruncate(struct inode * inode, loff_t offset) if (IS_SWAPFILE(inode)) goto out_busy; i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for efficiency + * so that truncate_inode_pages does fewer single-page unmaps. However + * after this first call, and before truncate_inode_pages finishes, + * it is possible for private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second unmap_mapping_range + * call must be made for correctness. 
+ */ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); goto out_truncate; do_expand: @@ -2043,6 +2066,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); + unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); @@ -2124,7 +2148,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; swp_entry_t entry; pte_t pte; - int ret = VM_FAULT_MINOR; + int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; @@ -2192,15 +2216,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(page); if (write_access) { + /* XXX: We could OR the do_wp_page code with this one? */ if (do_wp_page(mm, vma, address, - page_table, pmd, ptl, pte) == VM_FAULT_OOM) + page_table, pmd, ptl, pte) & VM_FAULT_OOM) ret = VM_FAULT_OOM; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); - lazy_mmu_prot_update(pte); unlock: pte_unmap_unlock(page_table, ptl); out: @@ -2231,7 +2255,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage(vma, address); + page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; @@ -2265,7 +2289,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); - return VM_FAULT_MINOR; + return 0; release: page_cache_release(page); goto unlock; @@ -2274,10 +2298,10 @@ oom: } /* - * do_no_page() tries to create a new page mapping. 
It aggressively + * __do_fault() tries to create a new page mapping. It aggressively * tries to share with existing pages, but makes a separate copy if - * the "write_access" parameter is true in order to avoid the next - * page fault. + * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid + * the next page fault. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. @@ -2286,89 +2310,100 @@ oom: * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *new_page; - struct address_space *mapping = NULL; + struct page *page; pte_t entry; - unsigned int sequence = 0; - int ret = VM_FAULT_MINOR; int anon = 0; struct page *dirty_page = NULL; + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = NULL; pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - sequence = mapping->truncate_count; - smp_rmb(); /* serializes i_size against truncate_count */ + if (likely(vma->vm_ops->fault)) { + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + } else { + /* Legacy ->nopage path */ + ret = 0; + vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* no page was available -- either SIGBUS or OOM */ + if (unlikely(vmf.page == NOPAGE_SIGBUS)) + return VM_FAULT_SIGBUS; + else if (unlikely(vmf.page == NOPAGE_OOM)) + return VM_FAULT_OOM; } -retry: - new_page = vma->vm_ops->nopage(vma, 
address & PAGE_MASK, &ret); + /* - * No smp_rmb is needed here as long as there's a full - * spin_lock/unlock sequence inside the ->nopage callback - * (for the pagecache lookup) that acts as an implicit - * smp_mb() and prevents the i_size read to happen - * after the next truncate_count read. + * For consistency in subsequent calls, make the faulted page always + * locked. */ - - /* no page was available -- either SIGBUS, OOM or REFAULT */ - if (unlikely(new_page == NOPAGE_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(new_page == NOPAGE_OOM)) - return VM_FAULT_OOM; - else if (unlikely(new_page == NOPAGE_REFAULT)) - return VM_FAULT_MINOR; + if (unlikely(!(ret & VM_FAULT_LOCKED))) + lock_page(vmf.page); + else + VM_BUG_ON(!PageLocked(vmf.page)); /* * Should we do an early C-O-W break? */ - if (write_access) { + page = vmf.page; + if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { - struct page *page; - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address, vma); - page_cache_release(new_page); - new_page = page; anon = 1; - + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out; + } + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + copy_user_highpage(page, vmf.page, address, vma); } else { - /* if the page will be shareable, see if the backing + /* + * If the page will be shareable, see if the backing * address space wants to know that the page is about - * to become writable */ - if (vma->vm_ops->page_mkwrite && - vma->vm_ops->page_mkwrite(vma, new_page) < 0 - ) { - page_cache_release(new_page); - return VM_FAULT_SIGBUS; + * to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(page); + if (vma->vm_ops->page_mkwrite(vma, page) < 0) { + ret = VM_FAULT_SIGBUS; + anon = 1; /* no anon but release vmf.page */ + goto 
out_unlocked; + } + lock_page(page); + /* + * XXX: this is not quite right (racy vs + * invalidate) to unlock and relock the page + * like this, however a better fix requires + * reworking page_mkwrite locking API, which + * is better done later. + */ + if (!page->mapping) { + ret = 0; + anon = 1; /* no anon but release vmf.page */ + goto out; + } } } + } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - /* - * For a file-backed vma, someone could have truncated or otherwise - * invalidated this page. If unmap_mapping_range got called, - * retry getting the page. - */ - if (mapping && unlikely(sequence != mapping->truncate_count)) { - pte_unmap_unlock(page_table, ptl); - page_cache_release(new_page); - cond_resched(); - sequence = mapping->truncate_count; - smp_rmb(); - goto retry; - } /* * This silly early PAGE_DIRTY setting removes a race @@ -2381,45 +2416,63 @@ retry: * handle that later. */ /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); - if (write_access) + if (likely(pte_same(*page_table, orig_pte))) { + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(new_page); - page_add_new_anon_rmap(new_page, vma, address); + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); - page_add_file_rmap(new_page); - if (write_access) { - dirty_page = new_page; + page_add_file_rmap(page); + if (flags & FAULT_FLAG_WRITE) { + dirty_page = page; get_page(dirty_page); } } + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); } else { - /* One of our sibling 
threads was faster, back out. */ - page_cache_release(new_page); - goto unlock; + if (anon) + page_cache_release(page); + else + anon = 1; /* no anon but release faulted_page */ } - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); -unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { + +out: + unlock_page(vmf.page); +out_unlocked: + if (anon) + page_cache_release(vmf.page); + else if (dirty_page) { set_page_dirty_balance(dirty_page); put_page(dirty_page); } + return ret; -oom: - page_cache_release(new_page); - return VM_FAULT_OOM; } +static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + pgoff_t pgoff = (((address & PAGE_MASK) + - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; + unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); + + return __do_fault(mm, vma, address, page_table, pmd, pgoff, + flags, orig_pte); +} + + /* * do_no_pfn() tries to create a new page mapping for a page without * a struct_page backing it @@ -2443,7 +2496,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; unsigned long pfn; - int ret = VM_FAULT_MINOR; pte_unmap(page_table); BUG_ON(!(vma->vm_flags & VM_PFNMAP)); @@ -2455,7 +2507,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, else if (unlikely(pfn == NOPFN_SIGBUS)) return VM_FAULT_SIGBUS; else if (unlikely(pfn == NOPFN_REFAULT)) - return VM_FAULT_MINOR; + return 0; page_table = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2467,7 +2519,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, entry); } pte_unmap_unlock(page_table, ptl); - return ret; + return 0; } /* @@ -2479,33 +2531,30 @@ static noinline int do_no_pfn(struct mm_struct *mm, 
struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access, pte_t orig_pte) { + unsigned int flags = FAULT_FLAG_NONLINEAR | + (write_access ? FAULT_FLAG_WRITE : 0); pgoff_t pgoff; - int err; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return VM_FAULT_MINOR; + return 0; - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || + !(vma->vm_flags & VM_CAN_NONLINEAR))) { /* * Page table corrupted: show pte and kill process. */ print_bad_pte(vma, orig_pte, address); return VM_FAULT_OOM; } - /* We can then assume vm->vm_ops && vma->vm_ops->populate */ pgoff = pte_to_pgoff(orig_pte); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, - vma->vm_page_prot, pgoff, 0); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err) - return VM_FAULT_SIGBUS; - return VM_FAULT_MAJOR; + + return __do_fault(mm, vma, address, page_table, pmd, pgoff, + flags, orig_pte); } /* @@ -2532,10 +2581,9 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { - if (vma->vm_ops->nopage) - return do_no_page(mm, vma, address, - pte, pmd, - write_access); + if (vma->vm_ops->fault || vma->vm_ops->nopage) + return do_linear_fault(mm, vma, address, + pte, pmd, write_access, entry); if (unlikely(vma->vm_ops->nopfn)) return do_no_pfn(mm, vma, address, pte, pmd, write_access); @@ -2544,7 +2592,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte, pmd, write_access); } if (pte_file(entry)) - return do_file_page(mm, vma, address, + return do_nonlinear_fault(mm, vma, address, pte, pmd, write_access, entry); return do_swap_page(mm, 
vma, address, pte, pmd, write_access, entry); @@ -2576,13 +2624,13 @@ static inline int handle_pte_fault(struct mm_struct *mm, } unlock: pte_unmap_unlock(pte, ptl); - return VM_FAULT_MINOR; + return 0; } /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2611,8 +2659,6 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } -EXPORT_SYMBOL_GPL(__handle_mm_fault); - #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. @@ -2673,7 +2719,7 @@ int make_pages_present(unsigned long addr, unsigned long end) write = (vma->vm_flags & VM_WRITE) != 0; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); if (ret < 0) @@ -2817,3 +2863,4 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in return buf - old_buf; } +EXPORT_SYMBOL_GPL(access_process_vm); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d76e8eb342d0..71b84b45154a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -101,8 +101,6 @@ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; -#define PDprintk(fmt...) - /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; @@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) { struct mempolicy *policy; - PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); + pr_debug("setting mode %d nodes[0] %lx\n", + mode, nodes ? 
nodes_addr(*nodes)[0] : -1); + if (mode == MPOL_DEFAULT) return NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) int err = 0; struct mempolicy *old = vma->vm_policy; - PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); @@ -594,7 +594,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER, 0); + return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -710,7 +710,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * { struct vm_area_struct *vma = (struct vm_area_struct *)private; - return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); + return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + page_address_in_vma(page, vma)); } #else @@ -776,8 +777,8 @@ long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); + pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode, nmask ? nodes_addr(*nmask)[0] : -1); down_write(&mm->mmap_sem); vma = check_range(mm, start, end, nmask, @@ -1202,7 +1203,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, #ifdef CONFIG_HUGETLBFS /* Return a zonelist suitable for a huge page allocation. 
*/ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, + gfp_t gfp_flags) { struct mempolicy *pol = get_vma_policy(current, vma, addr); @@ -1210,7 +1212,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); - return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); } return zonelist_policy(GFP_HIGHUSER, pol); } @@ -1434,7 +1436,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, + pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, new->policy ? new->policy->policy : 0); } @@ -1459,7 +1461,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - PDprintk("deleting %lx-l%x\n", n->start, n->end); + pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); mpol_free(n->policy); kmem_cache_free(sn_cache, n); @@ -1558,10 +1560,10 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - PDprintk("set_shared_policy %lx sz %lu %d %lx\n", + pr_debug("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? nodes_addr(npol->v.nodes)[0] : -1); + npol ? 
nodes_addr(npol->v.nodes)[0] : -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -1597,18 +1599,43 @@ void mpol_free_shared_policy(struct shared_policy *p) /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { + nodemask_t interleave_nodes; + unsigned long largest = 0; + int nid, prefer = 0; + policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); + + /* + * Set interleaving policy for system init. Interleaving is only + * enabled across suitably sized nodes (default is >= 16MB), or + * fall back to the largest node if they're all smaller. + */ + nodes_clear(interleave_nodes); + for_each_online_node(nid) { + unsigned long total_pages = node_present_pages(nid); + + /* Preserve the largest node */ + if (largest < total_pages) { + largest = total_pages; + prefer = nid; + } + + /* Interleave this node? */ + if ((total_pages << PAGE_SHIFT) >= (16 << 20)) + node_set(nid, interleave_nodes); + } - /* Set interleaving policy for system init. This way not all - the data structures allocated at system boot end up in node zero. 
*/ + /* All too small, use the largest */ + if (unlikely(nodes_empty(interleave_nodes))) + node_set(prefer, interleave_nodes); - if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) + if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); } diff --git a/mm/mempool.c b/mm/mempool.c index cc1ca86dfc24..02d5ec3feabc 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -62,10 +62,9 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, int node_id) { mempool_t *pool; - pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id); + pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); if (!pool) return NULL; - memset(pool, 0, sizeof(*pool)); pool->elements = kmalloc_node(min_nr * sizeof(void *), GFP_KERNEL, node_id); if (!pool->elements) { @@ -263,6 +262,9 @@ void mempool_free(void *element, mempool_t *pool) { unsigned long flags; + if (unlikely(element == NULL)) + return; + smp_mb(); if (pool->curr_nr < pool->min_nr) { spin_lock_irqsave(&pool->lock, flags); diff --git a/mm/migrate.c b/mm/migrate.c index a91ca00abebe..34d8ada053e4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -761,7 +761,8 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0); + return alloc_pages_node(pm->node, + GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } /* diff --git a/mm/mlock.c b/mm/mlock.c index 4d3fea267e0d..7b2656055d6a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -244,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user) locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + if (lock_limit == RLIM_INFINITY) + allowed = 1; lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); - if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + if (!allowed && + locked + 
user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) goto out; get_uid(user); user->locked_shm += locked; diff --git a/mm/mmap.c b/mm/mmap.c index 906ed402f7ca..7afc7a7cec6f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long flags, unsigned long pgoff) { struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; struct inode *inode; unsigned int vm_flags; - int correct_wcount = 0; int error; - struct rb_node ** rb_link, * rb_parent; int accountable = 1; - unsigned long charged = 0, reqprot = prot; + unsigned long reqprot = prot; /* * Does the application expect PROT_READ to imply PROT_EXEC? @@ -1023,10 +1020,28 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } } - error = security_file_mmap(file, reqprot, prot, flags); + error = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (error) return error; - + + return mmap_region(file, addr, len, flags, vm_flags, pgoff, + accountable); +} +EXPORT_SYMBOL(do_mmap_pgoff); + +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, unsigned long flags, + unsigned int vm_flags, unsigned long pgoff, + int accountable) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + int correct_wcount = 0; + int error; + struct rb_node **rb_link, *rb_parent; + unsigned long charged = 0; + struct inode *inode = file ? 
file->f_path.dentry->d_inode : NULL; + /* Clear old maps */ error = -ENOMEM; munmap_back: @@ -1150,12 +1165,8 @@ out: mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); } - if (flags & MAP_POPULATE) { - up_write(&mm->mmap_sem); - sys_remap_file_pages(addr, len, 0, - pgoff, flags & MAP_NONBLOCK); - down_write(&mm->mmap_sem); - } + if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + make_pages_present(addr, addr + len); return addr; unmap_and_free_vma: @@ -1175,8 +1186,6 @@ unacct_error: return error; } -EXPORT_SYMBOL(do_mmap_pgoff); - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * @@ -1562,33 +1571,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ -#ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return expand_upwards(vma, address); -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma, *prev; - - addr &= PAGE_MASK; - vma = find_vma_prev(mm, addr, &prev); - if (vma && (vma->vm_start <= addr)) - return vma; - if (!prev || expand_stack(prev, addr)) - return NULL; - if (prev->vm_flags & VM_LOCKED) { - make_pages_present(addr, prev->vm_end); - } - return prev; -} -#else /* * vma is the first one with address < vma->vm_start. Have to extend vma. 
*/ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +static inline int expand_downwards(struct vm_area_struct *vma, + unsigned long address) { int error; @@ -1625,6 +1612,38 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) return error; } +int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + +#ifdef CONFIG_STACK_GROWSUP +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_upwards(vma, address); +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) + make_pages_present(addr, prev->vm_end); + return prev; +} +#else +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) { @@ -1642,9 +1661,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) { + if (vma->vm_flags & VM_LOCKED) make_pages_present(addr, start); - } return vma; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 3b8f3c0c63f3..e8346c30abec 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -128,7 +128,7 @@ static void change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } -static int +int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) { diff --git a/mm/mremap.c b/mm/mremap.c index 5d4bd4f95b8e..8ea5c2412c6e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -120,7 +120,7 @@ static void move_ptes(struct vm_area_struct 
*vma, pmd_t *old_pmd, #define LATENCY_LIMIT (64 * PAGE_SIZE) -static unsigned long move_page_tables(struct vm_area_struct *vma, +unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len) { @@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr, if ((addr <= new_addr) && (addr+old_len) > new_addr) goto out; + ret = security_file_mmap(0, 0, 0, 0, new_addr, 1); + if (ret) + goto out; + ret = do_munmap(mm, new_addr, new_len); if (ret) goto out; @@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr, new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); - ret = new_addr; - if (new_addr & ~PAGE_MASK) + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; + } + + ret = security_file_mmap(0, 0, 0, 0, new_addr, 1); + if (ret) goto out; } ret = move_vma(vma, addr, old_len, new_len, new_addr); diff --git a/mm/nommu.c b/mm/nommu.c index 2b16b00a5b11..9eef6a398555 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -54,12 +54,6 @@ DECLARE_RWSEM(nommu_vma_sem); struct vm_operations_struct generic_file_vm_ops = { }; -EXPORT_SYMBOL(vfree); -EXPORT_SYMBOL(vmalloc_to_page); -EXPORT_SYMBOL(vmalloc_32); -EXPORT_SYMBOL(vmap); -EXPORT_SYMBOL(vunmap); - /* * Handle all mappings that got truncated by a "truncate()" * system call. @@ -168,7 +162,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, finish_or_fault: return i ? 
: -EFAULT; } - EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); @@ -178,6 +171,7 @@ void vfree(void *addr) { kfree(addr); } +EXPORT_SYMBOL(vfree); void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { @@ -186,17 +180,19 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) */ return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } +EXPORT_SYMBOL(__vmalloc); struct page * vmalloc_to_page(void *addr) { return virt_to_page(addr); } +EXPORT_SYMBOL(vmalloc_to_page); unsigned long vmalloc_to_pfn(void *addr) { return page_to_pfn(virt_to_page(addr)); } - +EXPORT_SYMBOL(vmalloc_to_pfn); long vread(char *buf, char *addr, unsigned long count) { @@ -237,9 +233,8 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); -/* - * vmalloc_32 - allocate virtually continguos memory (32bit addressable) - * +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the @@ -249,17 +244,33 @@ void *vmalloc_32(unsigned long size) { return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); } +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. 
+ */ +void *vmalloc_32_user(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); +} +EXPORT_SYMBOL(vmalloc_32_user); void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { BUG(); return NULL; } +EXPORT_SYMBOL(vmap); void vunmap(void *addr) { BUG(); } +EXPORT_SYMBOL(vunmap); /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to @@ -269,6 +280,13 @@ void __attribute__((weak)) vmalloc_sync_all(void) { } +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_page); + /* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty @@ -367,6 +385,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) return find_vma(mm, addr); } +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + /* * look up the first VMA exactly that exactly matches addr * - should be called with mm->mmap_sem at least held readlocked @@ -639,7 +662,7 @@ static int validate_mmap_request(struct file *file, } /* allow the security API to have its say */ - ret = security_file_mmap(file, reqprot, prot, flags); + ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (ret < 0) return ret; @@ -989,6 +1012,7 @@ unsigned long do_mmap_pgoff(struct file *file, show_free_areas(); return -ENOMEM; } +EXPORT_SYMBOL(do_mmap_pgoff); /* * handle mapping disposal for uClinux @@ -1069,6 +1093,7 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) return 0; } +EXPORT_SYMBOL(do_munmap); asmlinkage long sys_munmap(unsigned long addr, size_t len) { @@ -1159,6 +1184,7 @@ unsigned long do_mremap(unsigned long addr, return vma->vm_start; } +EXPORT_SYMBOL(do_mremap); asmlinkage unsigned long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, @@ -1226,7 +1252,6 @@ 
unsigned long get_unmapped_area(struct file *file, unsigned long addr, return get_area(file, addr, len, pgoff, flags); } - EXPORT_SYMBOL(get_unmapped_area); /* @@ -1336,12 +1361,12 @@ int in_gate_area_no_task(unsigned long addr) return 0; } -struct page *filemap_nopage(struct vm_area_struct *area, - unsigned long address, int *type) +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); - return NULL; + return 0; } +EXPORT_SYMBOL(filemap_fault); /* * Access another process' address space. diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eec1481ba44f..63512a9ed57e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) { - mod_timer(&wb_timer, - jiffies + dirty_writeback_interval); - } else { + if (dirty_writeback_interval) + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + else del_timer(&wb_timer); - } return 0; } @@ -826,6 +824,7 @@ int __set_page_dirty_nobuffers(struct page *page) mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); task_io_account_write(PAGE_CACHE_SIZE); @@ -919,6 +918,9 @@ int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); + BUG_ON(!PageLocked(page)); + + ClearPageReclaim(page); if (mapping && mapping_cap_account_dirty(mapping)) { /* * Yes, Virginia, this is indeed insane. 
@@ -944,14 +946,19 @@ int clear_page_dirty_for_io(struct page *page) * We basically use the page "master dirty bit" * as a serialization point for all the different * threads doing their things. - * - * FIXME! We still have a race here: if somebody - * adds the page back to the page tables in - * between the "page_mkclean()" and the "TestClearPageDirty()", - * we might have it mapped without the dirty bit set. */ if (page_mkclean(page)) set_page_dirty(page); + /* + * We carefully synchronise fault handlers against + * installing a dirty pte and marking the page dirty + * at this point. We do this by having them hold the + * page lock at some point after installing their + * pte, but before marking the page dirty. + * Pages are always locked coming in here, so we get + * the desired exclusion. See mm/memory.c:do_wp_page() + * for more comments. + */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); return 1; @@ -980,6 +987,8 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + if (ret) + dec_zone_page_state(page, NR_WRITEBACK); return ret; } @@ -1005,6 +1014,8 @@ int test_set_page_writeback(struct page *page) } else { ret = TestSetPageWriteback(page); } + if (!ret) + inc_zone_page_state(page, NR_WRITEBACK); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05ace44852eb..40954fb81598 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -80,8 +80,9 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, #endif #ifdef CONFIG_HIGHMEM - 32 + 32, #endif + 32, }; EXPORT_SYMBOL(totalram_pages); @@ -95,8 +96,9 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif "Normal", #ifdef CONFIG_HIGHMEM - "HighMem" + "HighMem", #endif + "Movable", }; int min_free_kbytes = 1024; @@ -126,14 +128,21 @@ static unsigned long __meminitdata dma_reserve; #endif #endif - struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; - int __meminitdata nr_nodemap_entries; - unsigned 
long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; - unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; + static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; + static int __meminitdata nr_nodemap_entries; + static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; - unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; + static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; + static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ + unsigned long __initdata required_kernelcore; + unsigned long __initdata required_movablecore; + unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ + int movable_zone; + EXPORT_SYMBOL(movable_zone); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ #if MAX_NUMNODES > 1 @@ -444,12 +453,6 @@ static inline int free_pages_check(struct page *page) 1 << PG_reserved | 1 << PG_buddy )))) bad_page(page); - /* - * PageReclaim == PageTail. It is only an error - * for PageReclaim to be set if PageCompound is clear. 
- */ - if (unlikely(!PageCompound(page) && PageReclaim(page))) - bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -593,7 +596,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_locked | 1 << PG_active | 1 << PG_dirty | - 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | @@ -608,7 +610,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); set_page_private(page, 0); @@ -900,11 +902,13 @@ static struct fail_page_alloc_attr { u32 ignore_gfp_highmem; u32 ignore_gfp_wait; + u32 min_order; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *ignore_gfp_highmem_file; struct dentry *ignore_gfp_wait_file; + struct dentry *min_order_file; #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ @@ -912,6 +916,7 @@ static struct fail_page_alloc_attr { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, .ignore_gfp_highmem = 1, + .min_order = 1, }; static int __init setup_fail_page_alloc(char *str) @@ -922,6 +927,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc); static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { + if (order < fail_page_alloc.min_order) + return 0; if (gfp_mask & __GFP_NOFAIL) return 0; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) @@ -953,12 +960,17 @@ static int __init fail_page_alloc_debugfs(void) fail_page_alloc.ignore_gfp_highmem_file = debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem); + fail_page_alloc.min_order_file = + debugfs_create_u32("min-order", mode, dir, + &fail_page_alloc.min_order); if (!fail_page_alloc.ignore_gfp_wait_file || - !fail_page_alloc.ignore_gfp_highmem_file) { + !fail_page_alloc.ignore_gfp_highmem_file || + 
!fail_page_alloc.min_order_file) { err = -ENOMEM; debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); + debugfs_remove(fail_page_alloc.min_order_file); cleanup_fault_attr_dentries(&fail_page_alloc.attr); } @@ -1314,7 +1326,7 @@ nofail_alloc: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); + did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -1351,7 +1363,8 @@ nofail_alloc: */ do_retry = 0; if (!(gfp_mask & __GFP_NORETRY)) { - if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + if ((order <= PAGE_ALLOC_COSTLY_ORDER) || + (gfp_mask & __GFP_REPEAT)) do_retry = 1; if (gfp_mask & __GFP_NOFAIL) do_retry = 1; @@ -1464,13 +1477,14 @@ unsigned int nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } +EXPORT_SYMBOL_GPL(nr_free_buffer_pages); /* * Amount of free RAM allocatable within all zones */ unsigned int nr_free_pagecache_pages(void) { - return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); } static inline void show_node(struct zone *zone) @@ -1621,8 +1635,8 @@ void show_free_areas(void) * * Add all populated zones of a node to the zonelist. */ -static int __meminit build_zonelists_node(pg_data_t *pgdat, - struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) +static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, + int nr_zones, enum zone_type zone_type) { struct zone *zone; @@ -1641,9 +1655,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat, return nr_zones; } + +/* + * zonelist_order: + * 0 = automatic detection of better ordering. + * 1 = order by ([node] distance, -zonetype) + * 2 = order by (-zonetype, [node] distance) + * + * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create + * the same zonelist. 
So only NUMA can configure this param. + */ +#define ZONELIST_ORDER_DEFAULT 0 +#define ZONELIST_ORDER_NODE 1 +#define ZONELIST_ORDER_ZONE 2 + +/* zonelist order in the kernel. + * set_zonelist_order() will set this to NODE or ZONE. + */ +static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; +static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; + + #ifdef CONFIG_NUMA +/* The value user specified ....changed by config */ +static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; +/* string for sysctl */ +#define NUMA_ZONELIST_ORDER_LEN 16 +char numa_zonelist_order[16] = "default"; + +/* + * interface for configure zonelist ordering. + * command line option "numa_zonelist_order" + * = "[dD]efault - default, automatic configuration. + * = "[nN]ode - order by node locality, then by zone within node + * = "[zZ]one - order by zone, then by locality within zone + */ + +static int __parse_numa_zonelist_order(char *s) +{ + if (*s == 'd' || *s == 'D') { + user_zonelist_order = ZONELIST_ORDER_DEFAULT; + } else if (*s == 'n' || *s == 'N') { + user_zonelist_order = ZONELIST_ORDER_NODE; + } else if (*s == 'z' || *s == 'Z') { + user_zonelist_order = ZONELIST_ORDER_ZONE; + } else { + printk(KERN_WARNING + "Ignoring invalid numa_zonelist_order value: " + "%s\n", s); + return -EINVAL; + } + return 0; +} + +static __init int setup_numa_zonelist_order(char *s) +{ + if (s) + return __parse_numa_zonelist_order(s); + return 0; +} +early_param("numa_zonelist_order", setup_numa_zonelist_order); + +/* + * sysctl handler for numa_zonelist_order + */ +int numa_zonelist_order_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, + loff_t *ppos) +{ + char saved_string[NUMA_ZONELIST_ORDER_LEN]; + int ret; + + if (write) + strncpy(saved_string, (char*)table->data, + NUMA_ZONELIST_ORDER_LEN); + ret = proc_dostring(table, write, file, buffer, length, ppos); + if (ret) + return ret; + if (write) { + int oldval = user_zonelist_order; + if 
(__parse_numa_zonelist_order((char*)table->data)) { + /* + * bogus value. restore saved string + */ + strncpy((char*)table->data, saved_string, + NUMA_ZONELIST_ORDER_LEN); + user_zonelist_order = oldval; + } else if (oldval != user_zonelist_order) + build_all_zonelists(); + } + return 0; +} + + #define MAX_NODE_LOAD (num_online_nodes()) -static int __meminitdata node_load[MAX_NUMNODES]; +static int node_load[MAX_NUMNODES]; + /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1658,7 +1765,7 @@ static int __meminitdata node_load[MAX_NUMNODES]; * on them otherwise. * It returns -1 if no node is found. */ -static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) +static int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; @@ -1704,13 +1811,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) return best_node; } -static void __meminit build_zonelists(pg_data_t *pgdat) + +/* + * Build zonelists ordered by node and zones within node. + * This results in maximum locality--normal zone overflows into local + * DMA zone, if any--but risks exhausting DMA zone. + */ +static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) { - int j, node, local_node; enum zone_type i; - int prev_node, load; + int j; struct zonelist *zonelist; + + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++) + ; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); + zonelist->zones[j] = NULL; + } +} + +/* + * Build zonelists ordered by zone and nodes within zones. + * This results in conserving DMA zone[s] until all Normal memory is + * exhausted, but results in overflowing to remote node while memory + * may still exist in local DMA zone. 
+ */ +static int node_order[MAX_NUMNODES]; + +static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) +{ + enum zone_type i; + int pos, j, node; + int zone_type; /* needs to be signed */ + struct zone *z; + struct zonelist *zonelist; + + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + pos = 0; + for (zone_type = i; zone_type >= 0; zone_type--) { + for (j = 0; j < nr_nodes; j++) { + node = node_order[j]; + z = &NODE_DATA(node)->node_zones[zone_type]; + if (populated_zone(z)) { + zonelist->zones[pos++] = z; + check_highest_zone(zone_type); + } + } + } + zonelist->zones[pos] = NULL; + } +} + +static int default_zonelist_order(void) +{ + int nid, zone_type; + unsigned long low_kmem_size,total_size; + struct zone *z; + int average_size; + /* + * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. + * If they are really small and used heavily, the system can fall + * into OOM very easily. + * This function detect ZONE_DMA/DMA32 size and confgigures zone order. + */ + /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ + low_kmem_size = 0; + total_size = 0; + for_each_online_node(nid) { + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + z = &NODE_DATA(nid)->node_zones[zone_type]; + if (populated_zone(z)) { + if (zone_type < ZONE_NORMAL) + low_kmem_size += z->present_pages; + total_size += z->present_pages; + } + } + } + if (!low_kmem_size || /* there are no DMA area. */ + low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ + return ZONELIST_ORDER_NODE; + /* + * look into each node's config. + * If there is a node whose DMA/DMA32 memory is very big area on + * local memory, NODE_ORDER may be suitable. 
+ */ + average_size = total_size / (num_online_nodes() + 1); + for_each_online_node(nid) { + low_kmem_size = 0; + total_size = 0; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + z = &NODE_DATA(nid)->node_zones[zone_type]; + if (populated_zone(z)) { + if (zone_type < ZONE_NORMAL) + low_kmem_size += z->present_pages; + total_size += z->present_pages; + } + } + if (low_kmem_size && + total_size > average_size && /* ignore small node */ + low_kmem_size > total_size * 70/100) + return ZONELIST_ORDER_NODE; + } + return ZONELIST_ORDER_ZONE; +} + +static void set_zonelist_order(void) +{ + if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) + current_zonelist_order = default_zonelist_order(); + else + current_zonelist_order = user_zonelist_order; +} + +static void build_zonelists(pg_data_t *pgdat) +{ + int j, node, load; + enum zone_type i; nodemask_t used_mask; + int local_node, prev_node; + struct zonelist *zonelist; + int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_NR_ZONES; i++) { @@ -1723,6 +1946,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat) load = num_online_nodes(); prev_node = local_node; nodes_clear(used_mask); + + memset(node_load, 0, sizeof(node_load)); + memset(node_order, 0, sizeof(node_order)); + j = 0; + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { int distance = node_distance(local_node, node); @@ -1738,23 +1966,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat) * So adding penalty to the first node in same * distance group to make it round-robin. 
*/ - if (distance != node_distance(local_node, prev_node)) - node_load[node] += load; + node_load[node] = load; + prev_node = node; load--; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - for (j = 0; zonelist->zones[j] != NULL; j++); + if (order == ZONELIST_ORDER_NODE) + build_zonelists_in_node_order(pgdat, node); + else + node_order[j++] = node; /* remember order */ + } - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - zonelist->zones[j] = NULL; - } + if (order == ZONELIST_ORDER_ZONE) { + /* calculate node order -- i.e., DMA last! */ + build_zonelists_in_zone_order(pgdat, j); } } /* Construct the zonelist performance cache - see further mmzone.h */ -static void __meminit build_zonelist_cache(pg_data_t *pgdat) +static void build_zonelist_cache(pg_data_t *pgdat) { int i; @@ -1771,9 +2001,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) } } + #else /* CONFIG_NUMA */ -static void __meminit build_zonelists(pg_data_t *pgdat) +static void set_zonelist_order(void) +{ + current_zonelist_order = ZONELIST_ORDER_ZONE; +} + +static void build_zonelists(pg_data_t *pgdat) { int node, local_node; enum zone_type i,j; @@ -1809,7 +2045,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat) } /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ -static void __meminit build_zonelist_cache(pg_data_t *pgdat) +static void build_zonelist_cache(pg_data_t *pgdat) { int i; @@ -1820,7 +2056,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ /* return values int ....just for stop_machine_run() */ -static int __meminit __build_all_zonelists(void *dummy) +static int __build_all_zonelists(void *dummy) { int nid; @@ -1831,8 +2067,10 @@ static int __meminit __build_all_zonelists(void *dummy) return 0; } -void __meminit build_all_zonelists(void) +void build_all_zonelists(void) { + set_zonelist_order(); + if (system_state == SYSTEM_BOOTING) { 
__build_all_zonelists(NULL); cpuset_init_current_mems_allowed(); @@ -1843,8 +2081,13 @@ void __meminit build_all_zonelists(void) /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); - printk("Built %i zonelists. Total pages: %ld\n", - num_online_nodes(), vm_total_pages); + printk("Built %i zonelists in %s order. Total pages: %ld\n", + num_online_nodes(), + zonelist_order_name[current_zonelist_order], + vm_total_pages); +#ifdef CONFIG_NUMA + printk("Policy zone: %s\n", zone_names[policy_zone]); +#endif } /* @@ -1953,8 +2196,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } } -void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, - unsigned long size) +static void __meminit zone_init_free_lists(struct pglist_data *pgdat, + struct zone *zone, unsigned long size) { int order; for (order = 0; order < MAX_ORDER ; order++) { @@ -2370,7 +2613,7 @@ void __init push_node_boundaries(unsigned int nid, } /* If necessary, push the node boundary out for reserve hotadd */ -static void __init account_node_boundary(unsigned int nid, +static void __meminit account_node_boundary(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", @@ -2390,7 +2633,7 @@ static void __init account_node_boundary(unsigned int nid, void __init push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn) {} -static void __init account_node_boundary(unsigned int nid, +static void __meminit account_node_boundary(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) {} #endif @@ -2428,10 +2671,67 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, } /* + * This finds a zone that can be used for ZONE_MOVABLE pages. 
The + * assumption is made that zones within a node are ordered in monotonic + * increasing memory addresses so that the "highest" populated zone is used + */ +void __init find_usable_zone_for_movable(void) +{ + int zone_index; + for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { + if (zone_index == ZONE_MOVABLE) + continue; + + if (arch_zone_highest_possible_pfn[zone_index] > + arch_zone_lowest_possible_pfn[zone_index]) + break; + } + + VM_BUG_ON(zone_index == -1); + movable_zone = zone_index; +} + +/* + * The zone ranges provided by the architecture do not include ZONE_MOVABLE + * because it is sized independant of architecture. Unlike the other zones, + * the starting point for ZONE_MOVABLE is not fixed. It may be different + * in each node depending on the size of each node and how evenly kernelcore + * is distributed. This helper function adjusts the zone ranges + * provided by the architecture for a given node by using the end of the + * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that + * zones within a node are in order of monotonic increases memory addresses + */ +void __meminit adjust_zone_range_for_zone_movable(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn) +{ + /* Only adjust if ZONE_MOVABLE is on this node */ + if (zone_movable_pfn[nid]) { + /* Size ZONE_MOVABLE */ + if (zone_type == ZONE_MOVABLE) { + *zone_start_pfn = zone_movable_pfn[nid]; + *zone_end_pfn = min(node_end_pfn, + arch_zone_highest_possible_pfn[movable_zone]); + + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (*zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + + /* Check if this whole range is within ZONE_MOVABLE */ + } else if (*zone_start_pfn >= zone_movable_pfn[nid]) + *zone_start_pfn = *zone_end_pfn; + } +} + +/* * Return the number of pages a zone spans in a node, including holes * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() */ -unsigned long __meminit zone_spanned_pages_in_node(int nid, +static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { @@ -2442,6 +2742,9 @@ unsigned long __meminit zone_spanned_pages_in_node(int nid, get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); /* Check that this node has pages within the zone's required range */ if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) @@ -2519,7 +2822,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, } /* Return the number of page frames in holes in a zone on a node */ -unsigned long __meminit 
zone_absent_pages_in_node(int nid, +static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { @@ -2532,18 +2835,21 @@ unsigned long __meminit zone_absent_pages_in_node(int nid, zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], node_end_pfn); + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } #else -static inline unsigned long zone_spanned_pages_in_node(int nid, +static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *zones_size) { return zones_size[zone_type]; } -static inline unsigned long zone_absent_pages_in_node(int nid, +static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *zholes_size) { @@ -2909,6 +3215,157 @@ unsigned long __init find_max_pfn_with_active_regions(void) return max_pfn; } +unsigned long __init early_calculate_totalpages(void) +{ + int i; + unsigned long totalpages = 0; + + for (i = 0; i < nr_nodemap_entries; i++) + totalpages += early_node_map[i].end_pfn - + early_node_map[i].start_pfn; + + return totalpages; +} + +/* + * Find the PFN the Movable zone begins in each node. Kernel memory + * is spread evenly between nodes as long as the nodes have enough + * memory. When they don't, some nodes will have more kernelcore than + * others + */ +void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +{ + int i, nid; + unsigned long usable_startpfn; + unsigned long kernelcore_node, kernelcore_remaining; + int usable_nodes = num_online_nodes(); + + /* + * If movablecore was specified, calculate what size of + * kernelcore that corresponds so that memory usable for + * any allocation type is evenly spread. 
If both kernelcore + * and movablecore are specified, then the value of kernelcore + * will be used for required_kernelcore if it's greater than + * what movablecore would have allowed. + */ + if (required_movablecore) { + unsigned long totalpages = early_calculate_totalpages(); + unsigned long corepages; + + /* + * Round-up so that ZONE_MOVABLE is at least as large as what + * was requested by the user + */ + required_movablecore = + roundup(required_movablecore, MAX_ORDER_NR_PAGES); + corepages = totalpages - required_movablecore; + + required_kernelcore = max(required_kernelcore, corepages); + } + + /* If kernelcore was not specified, there is no ZONE_MOVABLE */ + if (!required_kernelcore) + return; + + /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ + find_usable_zone_for_movable(); + usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; + +restart: + /* Spread kernelcore memory as evenly as possible throughout nodes */ + kernelcore_node = required_kernelcore / usable_nodes; + for_each_online_node(nid) { + /* + * Recalculate kernelcore_node if the division per node + * now exceeds what is necessary to satisfy the requested + * amount of memory for the kernel + */ + if (required_kernelcore < kernelcore_node) + kernelcore_node = required_kernelcore / usable_nodes; + + /* + * As the map is walked, we track how much memory is usable + * by the kernel using kernelcore_remaining. 
When it is + * 0, the rest of the node is usable by ZONE_MOVABLE + */ + kernelcore_remaining = kernelcore_node; + + /* Go through each range of PFNs within this node */ + for_each_active_range_index_in_nid(i, nid) { + unsigned long start_pfn, end_pfn; + unsigned long size_pages; + + start_pfn = max(early_node_map[i].start_pfn, + zone_movable_pfn[nid]); + end_pfn = early_node_map[i].end_pfn; + if (start_pfn >= end_pfn) + continue; + + /* Account for what is only usable for kernelcore */ + if (start_pfn < usable_startpfn) { + unsigned long kernel_pages; + kernel_pages = min(end_pfn, usable_startpfn) + - start_pfn; + + kernelcore_remaining -= min(kernel_pages, + kernelcore_remaining); + required_kernelcore -= min(kernel_pages, + required_kernelcore); + + /* Continue if range is now fully accounted */ + if (end_pfn <= usable_startpfn) { + + /* + * Push zone_movable_pfn to the end so + * that if we have to rebalance + * kernelcore across nodes, we will + * not double account here + */ + zone_movable_pfn[nid] = end_pfn; + continue; + } + start_pfn = usable_startpfn; + } + + /* + * The usable PFN range for ZONE_MOVABLE is from + * start_pfn->end_pfn. Calculate size_pages as the + * number of pages used as kernelcore + */ + size_pages = end_pfn - start_pfn; + if (size_pages > kernelcore_remaining) + size_pages = kernelcore_remaining; + zone_movable_pfn[nid] = start_pfn + size_pages; + + /* + * Some kernelcore has been met, update counts and + * break if the kernelcore for this node has been + * satisified + */ + required_kernelcore -= min(required_kernelcore, + size_pages); + kernelcore_remaining -= size_pages; + if (!kernelcore_remaining) + break; + } + } + + /* + * If there is still required_kernelcore, we do another pass with one + * less node in the count. 
This will push zone_movable_pfn[nid] further + * along on the nodes that still have memory until kernelcore is + * satisified + */ + usable_nodes--; + if (usable_nodes && required_kernelcore > usable_nodes) + goto restart; + + /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + zone_movable_pfn[nid] = + roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); +} + /** * free_area_init_nodes - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -2938,19 +3395,37 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; for (i = 1; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; arch_zone_lowest_possible_pfn[i] = arch_zone_highest_possible_pfn[i-1]; arch_zone_highest_possible_pfn[i] = max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); } + arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; + arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; + + /* Find the PFNs that ZONE_MOVABLE begins at in each node */ + memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); + find_zone_movable_pfns_for_nodes(zone_movable_pfn); /* Print out the zone ranges */ printk("Zone PFN ranges:\n"); - for (i = 0; i < MAX_NR_ZONES; i++) + for (i = 0; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; printk(" %-8s %8lu -> %8lu\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); + } + + /* Print out the PFNs ZONE_MOVABLE begins at in each node */ + printk("Movable zone start PFN for each node\n"); + for (i = 0; i < MAX_NUMNODES; i++) { + if (zone_movable_pfn[i]) + printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); + } /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); @@ -2967,6 +3442,43 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) 
find_min_pfn_for_node(nid), NULL); } } + +static int __init cmdline_parse_core(char *p, unsigned long *core) +{ + unsigned long long coremem; + if (!p) + return -EINVAL; + + coremem = memparse(p, &p); + *core = coremem >> PAGE_SHIFT; + + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + + return 0; +} + +/* + * kernelcore=size sets the amount of memory for use for allocations that + * cannot be reclaimed or migrated. + */ +static int __init cmdline_parse_kernelcore(char *p) +{ + return cmdline_parse_core(p, &required_kernelcore); +} + +/* + * movablecore=size sets the amount of memory for use for allocations that + * can be reclaimed or migrated. + */ +static int __init cmdline_parse_movablecore(char *p) +{ + return cmdline_parse_core(p, &required_movablecore); +} + +early_param("kernelcore", cmdline_parse_kernelcore); +early_param("movablecore", cmdline_parse_movablecore); + #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ /** @@ -3355,13 +3867,28 @@ void *__init alloc_large_system_hash(const char *tablename, for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) ; table = (void*) __get_free_pages(GFP_ATOMIC, order); + /* + * If bucketsize is not a power-of-two, we may free + * some pages at the end of hash table. 
+ */ + if (table) { + unsigned long alloc_end = (unsigned long)table + + (PAGE_SIZE << order); + unsigned long used = (unsigned long)table + + PAGE_ALIGN(size); + split_page(virt_to_page(table), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } } } while (!table && size > PAGE_SIZE && --log2qty); if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", tablename, (1U << log2qty), ilog2(size) - PAGE_SHIFT, diff --git a/mm/pdflush.c b/mm/pdflush.c index 8ce0900dc95c..8f6ee073c0e3 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -92,6 +92,7 @@ struct pdflush_work { static int __pdflush(struct pdflush_work *my_work) { current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); my_work->fn = NULL; my_work->who = current; INIT_LIST_HEAD(&my_work->list); diff --git a/mm/readahead.c b/mm/readahead.c index 9861e883fe57..39bf45d43320 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -21,8 +21,16 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) } EXPORT_SYMBOL(default_unplug_io_fn); +/* + * Convienent macros for min/max read-ahead pages. + * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up. + * The latter is necessary for systems with large page size(i.e. 64k). + */ +#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE) +#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE) + struct backing_dev_info default_backing_dev_info = { - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .ra_pages = MAX_RA_PAGES, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -41,82 +49,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); -/* - * Return max readahead size for this inode in number-of-pages. 
- */ -static inline unsigned long get_max_readahead(struct file_ra_state *ra) -{ - return ra->ra_pages; -} - -static inline unsigned long get_min_readahead(struct file_ra_state *ra) -{ - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; -} - -static inline void reset_ahead_window(struct file_ra_state *ra) -{ - /* - * ... but preserve ahead_start + ahead_size value, - * see 'recheck:' label in page_cache_readahead(). - * Note: We never use ->ahead_size as rvalue without - * checking ->ahead_start != 0 first. - */ - ra->ahead_size += ra->ahead_start; - ra->ahead_start = 0; -} - -static inline void ra_off(struct file_ra_state *ra) -{ - ra->start = 0; - ra->flags = 0; - ra->size = 0; - reset_ahead_window(ra); - return; -} - -/* - * Set the initial window size, round to next power of 2 and square - * for small size, x 4 for medium, and x 2 for large - * for 128k (32 page) max ra - * 1-8 page = 32k initial, > 8 page = 128k initial - */ -static unsigned long get_init_ra_size(unsigned long size, unsigned long max) -{ - unsigned long newsize = roundup_pow_of_two(size); - - if (newsize <= max / 32) - newsize = newsize * 4; - else if (newsize <= max / 4) - newsize = newsize * 2; - else - newsize = max; - return newsize; -} - -/* - * Set the new window size, this is called only when I/O is to be submitted, - * not for each call to readahead. If a cache miss occured, reduce next I/O - * size, else increase depending on how close to max we are. 
- */ -static inline unsigned long get_next_ra_size(struct file_ra_state *ra) -{ - unsigned long max = get_max_readahead(ra); - unsigned long min = get_min_readahead(ra); - unsigned long cur = ra->size; - unsigned long newsize; - - if (ra->flags & RA_FLAG_MISS) { - ra->flags &= ~RA_FLAG_MISS; - newsize = max((cur - 2), min); - } else if (cur < max / 16) { - newsize = 4 * cur; - } else { - newsize = 2 * cur; - } - return min(newsize, max); -} - #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /** @@ -193,66 +125,6 @@ out: } /* - * Readahead design. - * - * The fields in struct file_ra_state represent the most-recently-executed - * readahead attempt: - * - * start: Page index at which we started the readahead - * size: Number of pages in that read - * Together, these form the "current window". - * Together, start and size represent the `readahead window'. - * prev_index: The page which the readahead algorithm most-recently inspected. - * It is mainly used to detect sequential file reading. - * If page_cache_readahead sees that it is again being called for - * a page which it just looked at, it can return immediately without - * making any state changes. - * offset: Offset in the prev_index where the last read ended - used for - * detection of sequential file reading. - * ahead_start, - * ahead_size: Together, these form the "ahead window". - * ra_pages: The externally controlled max readahead for this fd. - * - * When readahead is in the off state (size == 0), readahead is disabled. - * In this state, prev_index is used to detect the resumption of sequential I/O. - * - * The readahead code manages two windows - the "current" and the "ahead" - * windows. The intent is that while the application is walking the pages - * in the current window, I/O is underway on the ahead window. When the - * current window is fully traversed, it is replaced by the ahead window - * and the ahead window is invalidated. 
When this copying happens, the - * new current window's pages are probably still locked. So - * we submit a new batch of I/O immediately, creating a new ahead window. - * - * So: - * - * ----|----------------|----------------|----- - * ^start ^start+size - * ^ahead_start ^ahead_start+ahead_size - * - * ^ When this page is read, we submit I/O for the - * ahead window. - * - * A `readahead hit' occurs when a read request is made against a page which is - * the next sequential page. Ahead window calculations are done only when it - * is time to submit a new IO. The code ramps up the size agressively at first, - * but slow down as it approaches max_readhead. - * - * Any seek/ramdom IO will result in readahead being turned off. It will resume - * at the first sequential access. - * - * There is a special-case: if the first page which the application tries to - * read happens to be the first page of the file, it is assumed that a linear - * read is about to happen and the window is immediately set to the initial size - * based on I/O request size and the max_readahead. - * - * This function is to be called for every read request, rather than when - * it is time to perform readahead. It is called only once for the entire I/O - * regardless of size unless readahead is unable to start enough I/O to satisfy - * the request (I/O request > max_readahead). - */ - -/* * do_page_cache_readahead actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. 
@@ -265,7 +137,8 @@ out: */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) { struct inode *inode = mapping->host; struct page *page; @@ -278,7 +151,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (isize == 0) goto out; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); /* * Preallocate as many pages as we will need. @@ -286,7 +159,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { pgoff_t page_offset = offset + page_idx; - + if (page_offset > end_index) break; @@ -301,6 +174,8 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, break; page->index = page_offset; list_add(&page->lru, &page_pool); + if (page_idx == nr_to_read - lookahead_size) + SetPageReadahead(page); ret++; } read_unlock_irq(&mapping->tree_lock); @@ -337,7 +212,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (this_chunk > nr_to_read) this_chunk = nr_to_read; err = __do_page_cache_readahead(mapping, filp, - offset, this_chunk); + offset, this_chunk, 0); if (err < 0) { ret = err; break; @@ -350,28 +225,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, } /* - * Check how effective readahead is being. If the amount of started IO is - * less than expected then the file is partly or fully in pagecache and - * readahead isn't helping. 
- * - */ -static inline int check_ra_success(struct file_ra_state *ra, - unsigned long nr_to_read, unsigned long actual) -{ - if (actual == 0) { - ra->cache_hit += nr_to_read; - if (ra->cache_hit >= VM_MAX_CACHE_HIT) { - ra_off(ra); - ra->flags |= RA_FLAG_INCACHE; - return 0; - } - } else { - ra->cache_hit=0; - } - return 1; -} - -/* * This version skips the IO if the queue is read-congested, and will tell the * block layer to abandon the readahead if request allocation would block. * @@ -384,200 +237,237 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (bdi_read_congested(mapping->backing_dev_info)) return -1; - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); } /* - * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' - * is set wait till the read completes. Otherwise attempt to read without - * blocking. - * Returns 1 meaning 'success' if read is successful without switching off - * readahead mode. Otherwise return failure. + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a + * sensible upper limit. */ -static int -blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read, - struct file_ra_state *ra, int block) +unsigned long max_sane_readahead(unsigned long nr) +{ + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) + + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); +} + +/* + * Submit IO for the read-ahead request in file_ra_state. 
+ */ +static unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) { int actual; - if (!block && bdi_read_congested(mapping->backing_dev_info)) - return 0; + actual = __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); + + return actual; +} - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); +/* + * Set the initial window size, round to next power of 2 and square + * for small size, x 4 for medium, and x 2 for large + * for 128k (32 page) max ra + * 1-8 page = 32k initial, > 8 page = 128k initial + */ +static unsigned long get_init_ra_size(unsigned long size, unsigned long max) +{ + unsigned long newsize = roundup_pow_of_two(size); - return check_ra_success(ra, nr_to_read, actual); + if (newsize <= max / 32) + newsize = newsize * 4; + else if (newsize <= max / 4) + newsize = newsize * 2; + else + newsize = max; + + return newsize; } -static int make_ahead_window(struct address_space *mapping, struct file *filp, - struct file_ra_state *ra, int force) +/* + * Get the previous window size, ramp it up, and + * return it as the new window size. + */ +static unsigned long get_next_ra_size(struct file_ra_state *ra, + unsigned long max) { - int block, ret; - - ra->ahead_size = get_next_ra_size(ra); - ra->ahead_start = ra->start + ra->size; - - block = force || (ra->prev_index >= ra->ahead_start); - ret = blockable_page_cache_readahead(mapping, filp, - ra->ahead_start, ra->ahead_size, ra, block); - - if (!ret && !force) { - /* A read failure in blocking mode, implies pages are - * all cached. So we can safely assume we have taken - * care of all the pages requested in this call. - * A read failure in non-blocking mode, implies we are - * reading more pages than requested in this call. So - * we safely assume we have taken care of all the pages - * requested in this call. - * - * Just reset the ahead window in case we failed due to - * congestion. 
The ahead window will any way be closed - * in case we failed due to excessive page cache hits. - */ - reset_ahead_window(ra); - } + unsigned long cur = ra->size; + unsigned long newsize; - return ret; + if (cur < max / 16) + newsize = 4 * cur; + else + newsize = 2 * cur; + + return min(newsize, max); } -/** - * page_cache_readahead - generic adaptive readahead - * @mapping: address_space which holds the pagecache and I/O vectors - * @ra: file_ra_state which holds the readahead state - * @filp: passed on to ->readpage() and ->readpages() - * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units - * @req_size: hint: total size of the read which the caller is performing in - * PAGE_CACHE_SIZE units +/* + * On-demand readahead design. + * + * The fields in struct file_ra_state represent the most-recently-executed + * readahead attempt: + * + * |<----- async_size ---------| + * |------------------- size -------------------->| + * |==================#===========================| + * ^start ^page marked with PG_readahead * - * page_cache_readahead() is the main function. If performs the adaptive - * readahead window size management and submits the readahead I/O. + * To overlap application thinking time and disk I/O time, we do + * `readahead pipelining': Do not wait until the application consumed all + * readahead pages and stalled on the missing page at readahead_index; + * Instead, submit an asynchronous readahead I/O as soon as there are + * only async_size pages left in the readahead window. Normally async_size + * will be equal to size, for maximum pipelining. * - * Note that @filp is purely used for passing on to the ->readpage[s]() - * handler: it may refer to a different file from @mapping (so we may not use - * @filp->f_mapping or @filp->f_path.dentry->d_inode here). - * Also, @ra may not be equal to &@filp->f_ra. + * In interleaved sequential reads, concurrent streams on the same fd can + * be invalidating each other's readahead state. 
So we flag the new readahead + * page at (start+size-async_size) with PG_readahead, and use it as readahead + * indicator. The flag won't be set on already cached pages, to avoid the + * readahead-for-nothing fuss, saving pointless page cache lookups. + * + * prev_index tracks the last visited page in the _previous_ read request. + * It should be maintained by the caller, and will be used for detecting + * small random reads. Note that the readahead algorithm checks loosely + * for sequential patterns. Hence interleaved reads might be served as + * sequential ones. + * + * There is a special-case: if the first page which the application tries to + * read happens to be the first page of the file, it is assumed that a linear + * read is about to happen and the window is immediately set to the initial size + * based on I/O request size and the max_readahead. * + * The code ramps up the readahead size aggressively at first, but slow down as + * it approaches max_readhead. + */ + +/* + * A minimal readahead algorithm for trivial sequential/random reads. */ -unsigned long -page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, - struct file *filp, pgoff_t offset, unsigned long req_size) +static unsigned long +ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t offset, + unsigned long req_size) { - unsigned long max, newsize; + unsigned long max; /* max readahead pages */ int sequential; - /* - * We avoid doing extra work and bogusly perturbing the readahead - * window expansion logic. 
- */ - if (offset == ra->prev_index && --req_size) - ++offset; - - /* Note that prev_index == -1 if it is a first read */ - sequential = (offset == ra->prev_index + 1); - ra->prev_index = offset; - ra->prev_offset = 0; - - max = get_max_readahead(ra); - newsize = min(req_size, max); - - /* No readahead or sub-page sized read or file already in cache */ - if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) - goto out; - - ra->prev_index += newsize - 1; + max = ra->ra_pages; + sequential = (offset - ra->prev_index <= 1UL) || (req_size > max); /* - * Special case - first read at start of file. We'll assume it's - * a whole-file read and grow the window fast. Or detect first - * sequential access + * It's the expected callback offset, assume sequential access. + * Ramp up sizes, and push forward the readahead window. */ - if (sequential && ra->size == 0) { - ra->size = get_init_ra_size(newsize, max); - ra->start = offset; - if (!blockable_page_cache_readahead(mapping, filp, offset, - ra->size, ra, 1)) - goto out; - - /* - * If the request size is larger than our max readahead, we - * at least want to be sure that we get 2 IOs in flight and - * we know that we will definitly need the new I/O. - * once we do this, subsequent calls should be able to overlap - * IOs,* thus preventing stalls. so issue the ahead window - * immediately. - */ - if (req_size >= max) - make_ahead_window(mapping, filp, ra, 1); - - goto out; + if (offset && (offset == (ra->start + ra->size - ra->async_size) || + offset == (ra->start + ra->size))) { + ra->start += ra->size; + ra->size = get_next_ra_size(ra, max); + ra->async_size = ra->size; + goto readit; } /* - * Now handle the random case: - * partial page reads and first access were handled above, - * so this must be the next page otherwise it is random + * Standalone, small read. + * Read as is, and do not pollute the readahead state. 
*/ - if (!sequential) { - ra_off(ra); - blockable_page_cache_readahead(mapping, filp, offset, - newsize, ra, 1); - goto out; + if (!hit_readahead_marker && !sequential) { + return __do_page_cache_readahead(mapping, filp, + offset, req_size, 0); } /* - * If we get here we are doing sequential IO and this was not the first - * occurence (ie we have an existing window) + * It may be one of + * - first read on start of file + * - sequential cache miss + * - oversize random read + * Start readahead for it. */ - if (ra->ahead_start == 0) { /* no ahead window yet */ - if (!make_ahead_window(mapping, filp, ra, 0)) - goto recheck; - } + ra->start = offset; + ra->size = get_init_ra_size(req_size, max); + ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; /* - * Already have an ahead window, check if we crossed into it. - * If so, shift windows and issue a new ahead window. - * Only return the #pages that are in the current window, so that - * we get called back on the first page of the ahead window which - * will allow us to submit more IO. + * Hit on a marked page without valid readahead state. + * E.g. interleaved reads. + * Not knowing its readahead pos/size, bet on the minimal possible one. */ - if (ra->prev_index >= ra->ahead_start) { - ra->start = ra->ahead_start; - ra->size = ra->ahead_size; - make_ahead_window(mapping, filp, ra, 0); -recheck: - /* prev_index shouldn't overrun the ahead window */ - ra->prev_index = min(ra->prev_index, - ra->ahead_start + ra->ahead_size - 1); + if (hit_readahead_marker) { + ra->start++; + ra->size = get_next_ra_size(ra, max); } -out: - return ra->prev_index + 1; +readit: + return ra_submit(ra, mapping, filp); } -EXPORT_SYMBOL_GPL(page_cache_readahead); -/* - * handle_ra_miss() is called when it is known that a page which should have - * been present in the pagecache (we just did some readahead there) was in fact - * not found. 
This will happen if it was evicted by the VM (readahead - * thrashing) +/** + * page_cache_sync_readahead - generic file readahead + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @filp: passed on to ->readpage() and ->readpages() + * @offset: start offset into @mapping, in pagecache page-sized units + * @req_size: hint: total size of the read which the caller is performing in + * pagecache pages * - * Turn on the cache miss flag in the RA struct, this will cause the RA code - * to reduce the RA size on the next read. + * page_cache_sync_readahead() should be called when a cache miss happened: + * it will submit the read. The readahead logic may decide to piggyback more + * pages onto the read request if access patterns suggest it will improve + * performance. */ -void handle_ra_miss(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset) +void page_cache_sync_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + pgoff_t offset, unsigned long req_size) { - ra->flags |= RA_FLAG_MISS; - ra->flags &= ~RA_FLAG_INCACHE; - ra->cache_hit = 0; + /* no read-ahead */ + if (!ra->ra_pages) + return; + + /* do read-ahead */ + ondemand_readahead(mapping, ra, filp, false, offset, req_size); } +EXPORT_SYMBOL_GPL(page_cache_sync_readahead); -/* - * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a - * sensible upper limit. 
- */ -unsigned long max_sane_readahead(unsigned long nr) +/** + * page_cache_async_readahead - file readahead for marked pages + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @filp: passed on to ->readpage() and ->readpages() + * @page: the page at @offset which has the PG_readahead flag set + * @offset: start offset into @mapping, in pagecache page-sized units + * @req_size: hint: total size of the read which the caller is performing in + * pagecache pages + * + * page_cache_async_ondemand() should be called when a page is used which + * has the PG_readahead flag: this is a marker to suggest that the application + * has used up enough of the readahead window that we should start pulling in + * more pages. */ +void +page_cache_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *page, pgoff_t offset, + unsigned long req_size) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); + /* no read-ahead */ + if (!ra->ra_pages) + return; + + /* + * Same bit is used for PG_readahead and PG_reclaim. + */ + if (PageWriteback(page)) + return; + + ClearPageReadahead(page); + + /* + * Defer asynchronous read-ahead on IO congestion. 
+ */ + if (bdi_read_congested(mapping->backing_dev_info)) + return; + + /* do read-ahead */ + ondemand_readahead(mapping, ra, filp, true, offset, req_size); } +EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index 61e492597a0b..41ac39749ef4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -149,7 +149,7 @@ static void anon_vma_ctor(void *data, struct kmem_cache *cachep, void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); } /* @@ -621,8 +621,10 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); - if (vma->vm_ops) + if (vma->vm_ops) { print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); + print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); + } if (vma->vm_file && vma->vm_file->f_op) print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); BUG(); diff --git a/mm/shmem.c b/mm/shmem.c index 0493e4d0bcaa..fcd19d323f9f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -27,6 +27,7 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/xattr.h> +#include <linux/exportfs.h> #include <linux/generic_acl.h> #include <linux/mm.h> #include <linux/mman.h> @@ -82,6 +83,7 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_FAULT, /* same as SGP_CACHE, return with page locked */ }; static int shmem_getpage(struct inode *inode, unsigned long idx, @@ -93,8 +95,11 @@ static inline struct page *shmem_dir_alloc(gfp_t 
gfp_mask) * The above definition of ENTRIES_PER_PAGE, and the use of * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: * might be reconsidered if it ever diverges from PAGE_SIZE. + * + * __GFP_MOVABLE is masked out as swap vectors cannot move */ - return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); + return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, + PAGE_CACHE_SHIFT-PAGE_SHIFT); } static inline void shmem_dir_free(struct page *page) @@ -372,7 +377,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); + page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); if (page) set_page_private(page, 0); spin_lock(&info->lock); @@ -1096,6 +1101,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, if (idx >= SHMEM_MAX_INDEX) return -EFBIG; + + if (type) + *type = 0; + /* * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read @@ -1129,9 +1138,9 @@ repeat: if (!swappage) { shmem_swp_unmap(entry); /* here we actually do the io */ - if (type && *type == VM_FAULT_MINOR) { + if (type && !(*type & VM_FAULT_MAJOR)) { __count_vm_event(PGMAJFAULT); - *type = VM_FAULT_MAJOR; + *type |= VM_FAULT_MAJOR; } spin_unlock(&info->lock); swappage = shmem_swapin(info, swap, idx); @@ -1285,8 +1294,10 @@ repeat: } done: if (*pagep != filepage) { - unlock_page(filepage); *pagep = filepage; + if (sgp != SGP_FAULT) + unlock_page(filepage); + } return 0; @@ -1298,72 +1309,21 @@ failed: return error; } -static struct page *shmem_nopage(struct vm_area_struct *vma, - unsigned long address, int *type) +static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct page *page = NULL; - unsigned long idx; int error; + int ret; - idx = (address - vma->vm_start) >> PAGE_SHIFT; - idx += 
vma->vm_pgoff; - idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return NOPAGE_SIGBUS; + if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return VM_FAULT_SIGBUS; - error = shmem_getpage(inode, idx, &page, SGP_CACHE, type); + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); if (error) - return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; - - mark_page_accessed(page); - return page; -} - -static int shmem_populate(struct vm_area_struct *vma, - unsigned long addr, unsigned long len, - pgprot_t prot, unsigned long pgoff, int nonblock) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct mm_struct *mm = vma->vm_mm; - enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; - unsigned long size; + return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size) - return -EINVAL; - - while ((long) len > 0) { - struct page *page = NULL; - int err; - /* - * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE - */ - err = shmem_getpage(inode, pgoff, &page, sgp, NULL); - if (err) - return err; - /* Page may still be null, but only if nonblock was set. */ - if (page) { - mark_page_accessed(page); - err = install_page(mm, vma, addr, page, prot); - if (err) { - page_cache_release(page); - return err; - } - } else if (vma->vm_flags & VM_NONLINEAR) { - /* No page was found just because we can't read it in - * now (being here implies nonblock != 0), but the page - * may exist, so set the PTE to fault it in later. 
*/ - err = install_file_pte(mm, vma, addr, pgoff, prot); - if (err) - return err; - } - - len -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } - return 0; + mark_page_accessed(vmf->page); + return ret | VM_FAULT_LOCKED; } #ifdef CONFIG_NUMA @@ -1410,6 +1370,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -2361,7 +2322,7 @@ static int init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, 0, init_once, NULL); + 0, 0, init_once); if (shmem_inode_cachep == NULL) return -ENOMEM; return 0; @@ -2455,8 +2416,7 @@ static const struct super_operations shmem_ops = { }; static struct vm_operations_struct shmem_vm_ops = { - .nopage = shmem_nopage, - .populate = shmem_populate, + .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, diff --git a/mm/slab.c b/mm/slab.c index b344e6707128..bde271c001ba 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -775,6 +775,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, */ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); #endif + if (!size) + return ZERO_SIZE_PTR; + while (size > csizep->cs_size) csizep++; @@ -929,7 +932,7 @@ static void next_reap_node(void) * the CPUs getting into lockstep and contending for the global cache chain * lock. 
*/ -static void __devinit start_cpu_timer(int cpu) +static void __cpuinit start_cpu_timer(int cpu) { struct delayed_work *reap_work = &per_cpu(reap_work, cpu); @@ -1160,7 +1163,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - int memsize = sizeof(struct kmem_list3); + const int memsize = sizeof(struct kmem_list3); switch (action) { case CPU_LOCK_ACQUIRE: @@ -1481,7 +1484,7 @@ void __init kmem_cache_init(void) sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = @@ -1489,7 +1492,7 @@ void __init kmem_cache_init(void) sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } slab_early_init = 0; @@ -1507,7 +1510,7 @@ void __init kmem_cache_init(void) sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } #ifdef CONFIG_ZONE_DMA sizes->cs_dmacachep = kmem_cache_create( @@ -1516,7 +1519,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC, - NULL, NULL); + NULL); #endif sizes++; names++; @@ -2098,12 +2101,10 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) * @align: The required alignment for the objects. * @flags: SLAB flags * @ctor: A constructor for the objects. - * @dtor: A destructor for the objects (not implemented anymore). * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache - * and the @dtor is run before the pages are handed back. + * The @ctor is run when new pages are allocated by the cache. * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. 
@@ -2123,8 +2124,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2133,7 +2133,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * Sanity checks... these are all serious usage bugs. */ if (!name || in_interrupt() || (size < BYTES_PER_WORD) || - size > KMALLOC_MAX_SIZE || dtor) { + size > KMALLOC_MAX_SIZE) { printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, name); BUG(); @@ -2351,7 +2351,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(!cachep->slabp_cache); + BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } cachep->ctor = ctor; cachep->name = name; @@ -2743,7 +2743,7 @@ static int cache_grow(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). 
*/ - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); + BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); local_flags = (flags & GFP_LEVEL_MASK); /* Take the l3 list lock to change the colour_next on this node */ @@ -3389,6 +3389,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + if (unlikely((flags & __GFP_ZERO) && ptr)) + memset(ptr, 0, obj_size(cachep)); + return ptr; } @@ -3440,6 +3443,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); + if (unlikely((flags & __GFP_ZERO) && objp)) + memset(objp, 0, obj_size(cachep)); + return objp; } @@ -3581,23 +3587,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_zalloc - Allocate an object. The memory is set to zero. - * @cache: The cache to allocate from. - * @flags: See kmalloc(). - * - * Allocate an object from this cache and set the allocated memory to zero. - * The flags are only relevant if the cache has no available objects. - */ -void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) -{ - void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); - if (ret) - memset(ret, 0, obj_size(cache)); - return ret; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - -/** * kmem_ptr_validate - check if an untrusted pointer might * be a slab entry. * @cachep: the cache we're checking against @@ -3653,8 +3642,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) struct kmem_cache *cachep; cachep = kmem_find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return kmem_cache_alloc_node(cachep, flags, node); } @@ -3698,8 +3687,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, * functions. 
*/ cachep = __find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return __cache_alloc(cachep, flags, caller); } @@ -3726,52 +3715,6 @@ EXPORT_SYMBOL(__kmalloc); #endif /** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - struct kmem_cache *cache, *new_cache; - void *ret; - - if (unlikely(!p)) - return kmalloc_track_caller(new_size, flags); - - if (unlikely(!new_size)) { - kfree(p); - return NULL; - } - - cache = virt_to_cache(p); - new_cache = __find_general_cachep(new_size, flags); - - /* - * If new size fits in the current cache, bail out. - */ - if (likely(cache == new_cache)) - return (void *)p; - - /* - * We are on the slow-path here so do not use __cache_alloc - * because it bloats kernel text. - */ - ret = kmalloc_track_caller(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ksize(p))); - kfree(p); - } - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. * @objp: The previously allocated object. 
@@ -3806,7 +3749,7 @@ void kfree(const void *objp) struct kmem_cache *c; unsigned long flags; - if (unlikely(!objp)) + if (unlikely(ZERO_OR_NULL_PTR(objp))) return; local_irq_save(flags); kfree_debugcheck(objp); @@ -4157,26 +4100,17 @@ static void print_slabinfo_header(struct seq_file *m) static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; - struct list_head *p; mutex_lock(&cache_chain_mutex); if (!n) print_slabinfo_header(m); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + + return seq_list_start(&cache_chain, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct kmem_cache *cachep = p; - ++*pos; - return cachep->next.next == &cache_chain ? - NULL : list_entry(cachep->next.next, struct kmem_cache, next); + return seq_list_next(p, &cache_chain, pos); } static void s_stop(struct seq_file *m, void *p) @@ -4186,7 +4120,7 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4355,17 +4289,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, static void *leaks_start(struct seq_file *m, loff_t *pos) { - loff_t n = *pos; - struct list_head *p; - mutex_lock(&cache_chain_mutex); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + return seq_list_start(&cache_chain, *pos); } static inline int add_caller(unsigned long *n, unsigned long v) @@ -4416,7 +4341,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) { #ifdef CONFIG_KALLSYMS unsigned long offset, size; - char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1]; + char modname[MODULE_NAME_LEN], 
name[KSYM_NAME_LEN]; if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { seq_printf(m, "%s+%#lx/%#lx", name, offset, size); @@ -4430,7 +4355,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; struct kmem_list3 *l3; const char *name; @@ -4511,7 +4436,7 @@ const struct seq_operations slabstats_op = { */ size_t ksize(const void *objp) { - if (unlikely(objp == NULL)) + if (unlikely(ZERO_OR_NULL_PTR(objp))) return 0; return obj_size(virt_to_cache(objp)); diff --git a/mm/slob.c b/mm/slob.c index 71976c5d40d3..ec33fcdc852e 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -3,57 +3,159 @@ * * Matt Mackall <mpm@selenic.com> 12/30/03 * + * NUMA support by Paul Mundt, 2007. + * * How SLOB works: * * The core of SLOB is a traditional K&R style heap allocator, with * support for returning aligned objects. The granularity of this - * allocator is 8 bytes on x86, though it's perhaps possible to reduce - * this to 4 if it's deemed worth the effort. The slob heap is a - * singly-linked list of pages from __get_free_page, grown on demand - * and allocation from the heap is currently first-fit. + * allocator is as little as 2 bytes, however typically most architectures + * will require 4 bytes on 32-bit and 8 bytes on 64-bit. + * + * The slob heap is a linked list of pages from alloc_pages(), and + * within each page, there is a singly-linked list of free blocks (slob_t). + * The heap is grown on demand and allocation from the heap is currently + * first-fit. * * Above this is an implementation of kmalloc/kfree. Blocks returned - * from kmalloc are 8-byte aligned and prepended with a 8-byte header. + * from kmalloc are prepended with a 4-byte header with the kmalloc size. 
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls - * __get_free_pages directly so that it can return page-aligned blocks - * and keeps a linked list of such pages and their orders. These - * objects are detected in kfree() by their page alignment. + * alloc_pages() directly, allocating compound pages so the page order + * does not have to be separately tracked, and also stores the exact + * allocation size in page->private so that it can be used to accurately + * provide ksize(). These objects are detected in kfree() because slob_page() + * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and - * destructors for every SLAB allocation. Objects are returned with - * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is - * set, in which case the low-level allocator will fragment blocks to - * create the proper alignment. Again, objects of page-size or greater - * are allocated by calling __get_free_pages. As SLAB objects know - * their size, no separate size bookkeeping is necessary and there is - * essentially no allocation space overhead. + * destructors for every SLAB allocation. Objects are returned with the + * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which + * case the low-level allocator will fragment blocks to create the proper + * alignment. Again, objects of page-size or greater are allocated by + * calling alloc_pages(). As SLAB objects know their size, no separate + * size bookkeeping is necessary and there is essentially no allocation + * space overhead, and compound pages aren't needed for multi-page + * allocations. + * + * NUMA support in SLOB is fairly simplistic, pushing most of the real + * logic down to the page allocator, and simply doing the node accounting + * on the upper levels. In the event that a node id is explicitly + * provided, alloc_pages_node() with the specified node id is used + * instead. 
The common case (or when the node id isn't explicitly provided) + * will default to the current node, as per numa_node_id(). + * + * Node aware pages are still inserted in to the global freelist, and + * these are scanned for by matching against the node id encoded in the + * page flags. As a result, block allocations that can be satisfied from + * the freelist will only be done so on pages residing on the same node, + * in order to prevent random node placement. */ +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/timer.h> #include <linux/rcupdate.h> +#include <linux/list.h> +#include <asm/atomic.h> + +/* + * slob_block has a field 'units', which indicates size of block if +ve, + * or offset of next block if -ve (in SLOB_UNITs). + * + * Free blocks of size 1 unit simply contain the offset of the next block. + * Those with larger size contain their size in the first SLOB_UNIT of + * memory, and the offset of the next free block in the second SLOB_UNIT. + */ +#if PAGE_SIZE <= (32767 * 2) +typedef s16 slobidx_t; +#else +typedef s32 slobidx_t; +#endif struct slob_block { - int units; - struct slob_block *next; + slobidx_t units; }; typedef struct slob_block slob_t; +/* + * We use struct page fields to manage some slob allocation aspects, + * however to avoid the horrible mess in include/linux/mm_types.h, we'll + * just define our own struct page type variant here. 
+ */ +struct slob_page { + union { + struct { + unsigned long flags; /* mandatory */ + atomic_t _count; /* mandatory */ + slobidx_t units; /* free units left in page */ + unsigned long pad[2]; + slob_t *free; /* first free slob_t in page */ + struct list_head list; /* linked list of free pages */ + }; + struct page page; + }; +}; +static inline void struct_slob_page_wrong_size(void) +{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } + +/* + * free_slob_page: call before a slob_page is returned to the page allocator. + */ +static inline void free_slob_page(struct slob_page *sp) +{ + reset_page_mapcount(&sp->page); + sp->page.mapping = NULL; +} + +/* + * All (partially) free slob pages go on this list. + */ +static LIST_HEAD(free_slob_pages); + +/* + * slob_page: True for all slob pages (false for bigblock pages) + */ +static inline int slob_page(struct slob_page *sp) +{ + return test_bit(PG_active, &sp->flags); +} + +static inline void set_slob_page(struct slob_page *sp) +{ + __set_bit(PG_active, &sp->flags); +} + +static inline void clear_slob_page(struct slob_page *sp) +{ + __clear_bit(PG_active, &sp->flags); +} + +/* + * slob_page_free: true for pages on free_slob_pages list. + */ +static inline int slob_page_free(struct slob_page *sp) +{ + return test_bit(PG_private, &sp->flags); +} + +static inline void set_slob_page_free(struct slob_page *sp) +{ + list_add(&sp->list, &free_slob_pages); + __set_bit(PG_private, &sp->flags); +} + +static inline void clear_slob_page_free(struct slob_page *sp) +{ + list_del(&sp->list); + __clear_bit(PG_private, &sp->flags); +} + #define SLOB_UNIT sizeof(slob_t) #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) #define SLOB_ALIGN L1_CACHE_BYTES -struct bigblock { - int order; - void *pages; - struct bigblock *next; -}; -typedef struct bigblock bigblock_t; - /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which * were created with a SLAB_DESTROY_BY_RCU slab. 
slob_rcu is used to free @@ -64,215 +166,332 @@ struct slob_rcu { int size; }; -static slob_t arena = { .next = &arena, .units = 1 }; -static slob_t *slobfree = &arena; -static bigblock_t *bigblocks; +/* + * slob_lock protects all slob allocator structures. + */ static DEFINE_SPINLOCK(slob_lock); -static DEFINE_SPINLOCK(block_lock); -static void slob_free(void *b, int size); -static void slob_timer_cbk(void); +/* + * Encode the given size and next info into a free slob block s. + */ +static void set_slob(slob_t *s, slobidx_t size, slob_t *next) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t offset = next - base; + + if (size > 1) { + s[0].units = size; + s[1].units = offset; + } else + s[0].units = -offset; +} + +/* + * Return the size of a slob block. + */ +static slobidx_t slob_units(slob_t *s) +{ + if (s->units > 0) + return s->units; + return 1; +} + +/* + * Return the next free slob block pointer after this one. + */ +static slob_t *slob_next(slob_t *s) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t next; + + if (s[0].units < 0) + next = -s[0].units; + else + next = s[1].units; + return base+next; +} + +/* + * Returns true if s is the last free block in its page. + */ +static int slob_last(slob_t *s) +{ + return !((unsigned long)slob_next(s) & ~PAGE_MASK); +} + +static void *slob_new_page(gfp_t gfp, int order, int node) +{ + void *page; + +#ifdef CONFIG_NUMA + if (node != -1) + page = alloc_pages_node(node, gfp, order); + else +#endif + page = alloc_pages(gfp, order); + if (!page) + return NULL; -static void *slob_alloc(size_t size, gfp_t gfp, int align) + return page_address(page); +} + +/* + * Allocate a slob block within a given slob_page sp. 
+ */ +static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) { slob_t *prev, *cur, *aligned = 0; int delta = 0, units = SLOB_UNITS(size); - unsigned long flags; - spin_lock_irqsave(&slob_lock, flags); - prev = slobfree; - for (cur = prev->next; ; prev = cur, cur = cur->next) { + for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { + slobidx_t avail = slob_units(cur); + if (align) { aligned = (slob_t *)ALIGN((unsigned long)cur, align); delta = aligned - cur; } - if (cur->units >= units + delta) { /* room enough? */ + if (avail >= units + delta) { /* room enough? */ + slob_t *next; + if (delta) { /* need to fragment head to align? */ - aligned->units = cur->units - delta; - aligned->next = cur->next; - cur->next = aligned; - cur->units = delta; + next = slob_next(cur); + set_slob(aligned, avail - delta, next); + set_slob(cur, delta, aligned); prev = cur; cur = aligned; + avail = slob_units(cur); } - if (cur->units == units) /* exact fit? */ - prev->next = cur->next; /* unlink */ - else { /* fragment */ - prev->next = cur + units; - prev->next->units = cur->units - units; - prev->next->next = cur->next; - cur->units = units; + next = slob_next(cur); + if (avail == units) { /* exact fit? unlink. */ + if (prev) + set_slob(prev, slob_units(prev), next); + else + sp->free = next; + } else { /* fragment */ + if (prev) + set_slob(prev, slob_units(prev), cur + units); + else + sp->free = cur + units; + set_slob(cur + units, avail - units, next); } - slobfree = prev; - spin_unlock_irqrestore(&slob_lock, flags); + sp->units -= units; + if (!sp->units) + clear_slob_page_free(sp); return cur; } - if (cur == slobfree) { - spin_unlock_irqrestore(&slob_lock, flags); + if (slob_last(cur)) + return NULL; + } +} - if (size == PAGE_SIZE) /* trying to shrink arena? */ - return 0; +/* + * slob_alloc: entry point into the slob allocator. 
+ */ +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +{ + struct slob_page *sp; + struct list_head *prev; + slob_t *b = NULL; + unsigned long flags; - cur = (slob_t *)__get_free_page(gfp); - if (!cur) - return 0; + spin_lock_irqsave(&slob_lock, flags); + /* Iterate through each partially free page, try to find room */ + list_for_each_entry(sp, &free_slob_pages, list) { +#ifdef CONFIG_NUMA + /* + * If there's a node specification, search for a partial + * page with a matching node id in the freelist. + */ + if (node != -1 && page_to_nid(&sp->page) != node) + continue; +#endif + /* Enough room on this page? */ + if (sp->units < SLOB_UNITS(size)) + continue; + + /* Attempt to alloc */ + prev = sp->list.prev; + b = slob_page_alloc(sp, size, align); + if (!b) + continue; + + /* Improve fragment distribution and reduce our average + * search time by starting our next search here. (see + * Knuth vol 1, sec 2.5, pg 449) */ + if (free_slob_pages.next != prev->next) + list_move_tail(&free_slob_pages, prev->next); + break; + } + spin_unlock_irqrestore(&slob_lock, flags); - slob_free(cur, PAGE_SIZE); - spin_lock_irqsave(&slob_lock, flags); - cur = slobfree; - } + /* Not enough space: must allocate a new page */ + if (!b) { + b = slob_new_page(gfp, 0, node); + if (!b) + return 0; + sp = (struct slob_page *)virt_to_page(b); + set_slob_page(sp); + + spin_lock_irqsave(&slob_lock, flags); + sp->units = SLOB_UNITS(PAGE_SIZE); + sp->free = b; + INIT_LIST_HEAD(&sp->list); + set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); + set_slob_page_free(sp); + b = slob_page_alloc(sp, size, align); + BUG_ON(!b); + spin_unlock_irqrestore(&slob_lock, flags); } + if (unlikely((gfp & __GFP_ZERO) && b)) + memset(b, 0, size); + return b; } +/* + * slob_free: entry point into the slob allocator. 
+ */ static void slob_free(void *block, int size) { - slob_t *cur, *b = (slob_t *)block; + struct slob_page *sp; + slob_t *prev, *next, *b = (slob_t *)block; + slobidx_t units; unsigned long flags; - if (!block) + if (ZERO_OR_NULL_PTR(block)) return; + BUG_ON(!size); - if (size) - b->units = SLOB_UNITS(size); + sp = (struct slob_page *)virt_to_page(block); + units = SLOB_UNITS(size); - /* Find reinsertion point */ spin_lock_irqsave(&slob_lock, flags); - for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) - if (cur >= cur->next && (b > cur || b < cur->next)) - break; - - if (b + b->units == cur->next) { - b->units += cur->next->units; - b->next = cur->next->next; - } else - b->next = cur->next; - - if (cur + cur->units == b) { - cur->units += b->units; - cur->next = b->next; - } else - cur->next = b; - slobfree = cur; - - spin_unlock_irqrestore(&slob_lock, flags); -} - -void *__kmalloc(size_t size, gfp_t gfp) -{ - slob_t *m; - bigblock_t *bb; - unsigned long flags; + if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) { + /* Go directly to page allocator. Do not pass slob allocator */ + if (slob_page_free(sp)) + clear_slob_page_free(sp); + clear_slob_page(sp); + free_slob_page(sp); + free_page((unsigned long)b); + goto out; + } - if (size < PAGE_SIZE - SLOB_UNIT) { - m = slob_alloc(size + SLOB_UNIT, gfp, 0); - return m ? (void *)(m + 1) : 0; + if (!slob_page_free(sp)) { + /* This slob page is about to become partially free. Easy! */ + sp->units = units; + sp->free = b; + set_slob(b, units, + (void *)((unsigned long)(b + + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); + set_slob_page_free(sp); + goto out; } - bb = slob_alloc(sizeof(bigblock_t), gfp, 0); - if (!bb) - return 0; + /* + * Otherwise the page is already partially free, so find reinsertion + * point. 
+ */ + sp->units += units; - bb->order = get_order(size); - bb->pages = (void *)__get_free_pages(gfp, bb->order); + if (b < sp->free) { + set_slob(b, units, sp->free); + sp->free = b; + } else { + prev = sp->free; + next = slob_next(prev); + while (b > next) { + prev = next; + next = slob_next(prev); + } - if (bb->pages) { - spin_lock_irqsave(&block_lock, flags); - bb->next = bigblocks; - bigblocks = bb; - spin_unlock_irqrestore(&block_lock, flags); - return bb->pages; + if (!slob_last(prev) && b + units == next) { + units += slob_units(next); + set_slob(b, units, slob_next(next)); + } else + set_slob(b, units, next); + + if (prev + slob_units(prev) == b) { + units = slob_units(b) + slob_units(prev); + set_slob(prev, units, slob_next(b)); + } else + set_slob(prev, slob_units(prev), b); } - - slob_free(bb, sizeof(bigblock_t)); - return 0; +out: + spin_unlock_irqrestore(&slob_lock, flags); } -EXPORT_SYMBOL(__kmalloc); -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. +/* + * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. 
*/ -void *krealloc(const void *p, size_t new_size, gfp_t flags) + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) +#endif + +void *__kmalloc_node(size_t size, gfp_t gfp, int node) { - void *ret; + unsigned int *m; + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - if (unlikely(!p)) - return kmalloc_track_caller(new_size, flags); + if (size < PAGE_SIZE - align) { + if (!size) + return ZERO_SIZE_PTR; - if (unlikely(!new_size)) { - kfree(p); - return NULL; - } + m = slob_alloc(size + align, gfp, align, node); + if (m) + *m = size; + return (void *)m + align; + } else { + void *ret; - ret = kmalloc_track_caller(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ksize(p))); - kfree(p); + ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + if (ret) { + struct page *page; + page = virt_to_page(ret); + page->private = size; + } + return ret; } - return ret; } -EXPORT_SYMBOL(krealloc); +EXPORT_SYMBOL(__kmalloc_node); void kfree(const void *block) { - bigblock_t *bb, **last = &bigblocks; - unsigned long flags; + struct slob_page *sp; - if (!block) + if (ZERO_OR_NULL_PTR(block)) return; - if (!((unsigned long)block & (PAGE_SIZE-1))) { - /* might be on the big block list */ - spin_lock_irqsave(&block_lock, flags); - for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { - if (bb->pages == block) { - *last = bb->next; - spin_unlock_irqrestore(&block_lock, flags); - free_pages((unsigned long)block, bb->order); - slob_free(bb, sizeof(bigblock_t)); - return; - } - } - spin_unlock_irqrestore(&block_lock, flags); - } - - slob_free((slob_t *)block - 1, 0); - return; + sp = (struct slob_page *)virt_to_page(block); + if (slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + slob_free(m, *m + align); + } else + put_page(&sp->page); } - 
EXPORT_SYMBOL(kfree); +/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t ksize(const void *block) { - bigblock_t *bb; - unsigned long flags; + struct slob_page *sp; - if (!block) + if (ZERO_OR_NULL_PTR(block)) return 0; - if (!((unsigned long)block & (PAGE_SIZE-1))) { - spin_lock_irqsave(&block_lock, flags); - for (bb = bigblocks; bb; bb = bb->next) - if (bb->pages == block) { - spin_unlock_irqrestore(&slob_lock, flags); - return PAGE_SIZE << bb->order; - } - spin_unlock_irqrestore(&block_lock, flags); - } - - return ((slob_t *)block - 1)->units * SLOB_UNIT; + sp = (struct slob_page *)virt_to_page(block); + if (slob_page(sp)) + return ((slob_t *)block - 1)->units + SLOB_UNIT; + else + return sp->page.private; } struct kmem_cache { @@ -284,12 +503,11 @@ struct kmem_cache { struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(void*, struct kmem_cache *, unsigned long)) { struct kmem_cache *c; - c = slob_alloc(sizeof(struct kmem_cache), flags, 0); + c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1); if (c) { c->name = name; @@ -302,6 +520,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, c->ctor = ctor; /* ignore alignment unless it's forced */ c->align = (flags & SLAB_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; + if (c->align < ARCH_SLAB_MINALIGN) + c->align = ARCH_SLAB_MINALIGN; if (c->align < align) c->align = align; } else if (flags & SLAB_PANIC) @@ -317,31 +537,21 @@ void kmem_cache_destroy(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_destroy); -void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) +void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; if (c->size < PAGE_SIZE) - b = slob_alloc(c->size, flags, c->align); + b = slob_alloc(c->size, flags, c->align, node); else - b = (void *)__get_free_pages(flags, get_order(c->size)); + b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) c->ctor(b, c, 0); return b; } -EXPORT_SYMBOL(kmem_cache_alloc); - -void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) -{ - void *ret = kmem_cache_alloc(c, flags); - if (ret) - memset(ret, 0, c->size); - - return ret; -} -EXPORT_SYMBOL(kmem_cache_zalloc); +EXPORT_SYMBOL(kmem_cache_alloc_node); static void __kmem_cache_free(void *b, int size) { @@ -385,9 +595,6 @@ const char *kmem_cache_name(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_name); -static struct timer_list slob_timer = TIMER_INITIALIZER( - (void (*)(unsigned long))slob_timer_cbk, 0, 0); - int kmem_cache_shrink(struct kmem_cache *d) { return 0; @@ -399,17 +606,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b) return 0; } -void __init kmem_cache_init(void) +static unsigned int slob_ready __read_mostly; + +int slab_is_available(void) { - slob_timer_cbk(); + return slob_ready; } -static void slob_timer_cbk(void) +void __init kmem_cache_init(void) { - void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); - - if (p) - free_page((unsigned long)p); - - mod_timer(&slob_timer, jiffies + HZ); + slob_ready = 1; } diff --git a/mm/slub.c b/mm/slub.c index e0cf6213abc0..9b2d6178d06c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -205,6 +205,11 @@ static inline void ClearSlabDebug(struct page *page) #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 
#endif +/* + * The page->inuse field is 16 bit thus we have this limitation + */ +#define MAX_OBJECTS_PER_SLAB 65535 + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ @@ -228,7 +233,7 @@ static enum { /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); -LIST_HEAD(slab_caches); +static LIST_HEAD(slab_caches); /* * Tracking user of a slab. @@ -247,9 +252,10 @@ static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); #else -static int sysfs_slab_add(struct kmem_cache *s) { return 0; } -static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static void sysfs_slab_remove(struct kmem_cache *s) {} +static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) + { return 0; } +static inline void sysfs_slab_remove(struct kmem_cache *s) {} #endif /******************************************************************** @@ -323,7 +329,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) /* * Debug settings: */ +#ifdef CONFIG_SLUB_DEBUG_ON +static int slub_debug = DEBUG_DEFAULT_FLAGS; +#else static int slub_debug; +#endif static char *slub_debug_slabs; @@ -340,7 +350,7 @@ static void print_section(char *text, u8 *addr, unsigned int length) for (i = 0; i < length; i++) { if (newline) { - printk(KERN_ERR "%10s 0x%p: ", text, addr + i); + printk(KERN_ERR "%8s 0x%p: ", text, addr + i); newline = 0; } printk(" %02x", addr[i]); @@ -397,10 +407,11 @@ static void set_track(struct kmem_cache *s, void *object, static void init_tracking(struct kmem_cache *s, void *object) { - if (s->flags & SLAB_STORE_USER) { - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); - } + if (!(s->flags & SLAB_STORE_USER)) + return; + + set_track(s, object, TRACK_FREE, NULL); + set_track(s, 
object, TRACK_ALLOC, NULL); } static void print_track(const char *s, struct track *t) @@ -408,65 +419,106 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "%s: ", s); + printk(KERN_ERR "INFO: %s in ", s); __print_symbol("%s", (unsigned long)t->addr); - printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); +} + +static void print_tracking(struct kmem_cache *s, void *object) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + print_track("Allocated", get_track(s, object, TRACK_ALLOC)); + print_track("Freed", get_track(s, object, TRACK_FREE)); +} + +static void print_page_info(struct page *page) +{ + printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", + page, page->inuse, page->freelist, page->flags); + +} + +static void slab_bug(struct kmem_cache *s, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "========================================" + "=====================================\n"); + printk(KERN_ERR "BUG %s: %s\n", s->name, buf); + printk(KERN_ERR "----------------------------------------" + "-------------------------------------\n\n"); } -static void print_trailer(struct kmem_cache *s, u8 *p) +static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
+{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "FIX %s: %s\n", s->name, buf); +} + +static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ + u8 *addr = page_address(page); + + print_tracking(s, p); + + print_page_info(page); + + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); + + if (p > addr + 16) + print_section("Bytes b4", p - 16, 16); + + print_section("Object", p, min(s->objsize, 128)); if (s->flags & SLAB_RED_ZONE) print_section("Redzone", p + s->objsize, s->inuse - s->objsize); - printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", - p + s->offset, - get_freepointer(s, p)); - if (s->offset) off = s->offset + sizeof(void *); else off = s->inuse; - if (s->flags & SLAB_STORE_USER) { - print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); - print_track("Last free ", get_track(s, p, TRACK_FREE)); + if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - } if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Filler", p + off, s->size - off); + print_section("Padding", p + off, s->size - off); + + dump_stack(); } static void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { - u8 *addr = page_address(page); - - printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", - s->name, reason, object, page); - printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", - object - addr, page->flags, page->inuse, page->freelist); - if (object > addr + 16) - print_section("Bytes b4", object - 16, 16); - print_section("Object", object, min(s->objsize, 128)); - print_trailer(s, object); - dump_stack(); + slab_bug(s, reason); + print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) 
+static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) { va_list args; char buf[100]; - va_start(args, reason); - vsnprintf(buf, sizeof(buf), reason, args); + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, - page); + slab_bug(s, fmt); + print_page_info(page); dump_stack(); } @@ -485,15 +537,46 @@ static void init_object(struct kmem_cache *s, void *object, int active) s->inuse - s->objsize); } -static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) { while (bytes) { if (*start != (u8)value) - return 0; + return start; start++; bytes--; } - return 1; + return NULL; +} + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); + memset(from, data, to - from); +} + +static int check_bytes_and_report(struct kmem_cache *s, struct page *page, + u8 *object, char *what, + u8* start, unsigned int value, unsigned int bytes) +{ + u8 *fault; + u8 *end; + + fault = check_bytes(start, value, bytes); + if (!fault) + return 1; + + end = start + bytes; + while (end > fault && end[-1] == value) + end--; + + slab_bug(s, "%s overwritten", what); + printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault[0], value); + print_trailer(s, page, object); + + restore_bytes(s, what, value, fault, end); + return 0; } /* @@ -534,14 +617,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) * may be used with merged slabcaches. 
*/ -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, - void *from, void *to) -{ - printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", - s->name, message, data, from, to - 1); - memset(from, data, to - from); -} - static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) { unsigned long off = s->inuse; /* The end of info */ @@ -557,39 +632,39 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) if (s->size == off) return 1; - if (check_bytes(p + off, POISON_INUSE, s->size - off)) - return 1; - - object_err(s, page, p, "Object padding check fails"); - - /* - * Restore padding - */ - restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); - return 0; + return check_bytes_and_report(s, page, p, "Object padding", + p + off, POISON_INUSE, s->size - off); } static int slab_pad_check(struct kmem_cache *s, struct page *page) { - u8 *p; - int length, remainder; + u8 *start; + u8 *fault; + u8 *end; + int length; + int remainder; if (!(s->flags & SLAB_POISON)) return 1; - p = page_address(page); + start = page_address(page); + end = start + (PAGE_SIZE << s->order); length = s->objects * s->size; - remainder = (PAGE_SIZE << s->order) - length; + remainder = end - (start + length); if (!remainder) return 1; - if (!check_bytes(p + length, POISON_INUSE, remainder)) { - slab_err(s, page, "Padding check failed"); - restore_bytes(s, "slab padding", POISON_INUSE, p + length, - p + length + remainder); - return 0; - } - return 1; + fault = check_bytes(start + length, POISON_INUSE, remainder); + if (!fault) + return 1; + while (end > fault && end[-1] == POISON_INUSE) + end--; + + slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); + print_section("Padding", start, length); + + restore_bytes(s, "slab padding", POISON_INUSE, start, end); + return 0; } static int check_object(struct kmem_cache *s, struct page *page, @@ -602,41 +677,22 @@ static int check_object(struct kmem_cache *s, struct page *page, unsigned int red = active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes(endobject, red, s->inuse - s->objsize)) { - object_err(s, page, object, - active ? "Redzone Active" : "Redzone Inactive"); - restore_bytes(s, "redzone", red, - endobject, object + s->inuse); + if (!check_bytes_and_report(s, page, object, "Redzone", + endobject, red, s->inuse - s->objsize)) return 0; - } } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && - !check_bytes(endobject, POISON_INUSE, - s->inuse - s->objsize)) { - object_err(s, page, p, "Alignment padding check fails"); - /* - * Fix it so that there will not be another report. - * - * Hmmm... We may be corrupting an object that now expects - * to be longer than allowed. - */ - restore_bytes(s, "alignment padding", POISON_INUSE, - endobject, object + s->inuse); - } + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) + check_bytes_and_report(s, page, p, "Alignment padding", endobject, + POISON_INUSE, s->inuse - s->objsize); } if (s->flags & SLAB_POISON) { if (!active && (s->flags & __OBJECT_POISON) && - (!check_bytes(p, POISON_FREE, s->objsize - 1) || - p[s->objsize - 1] != POISON_END)) { - - object_err(s, page, p, "Poison check failed"); - restore_bytes(s, "Poison", POISON_FREE, - p, p + s->objsize -1); - restore_bytes(s, "Poison", POISON_END, - p + s->objsize - 1, p + s->objsize); + (!check_bytes_and_report(s, page, p, "Poison", p, + POISON_FREE, s->objsize - 1) || + !check_bytes_and_report(s, page, p, "Poison", + p + s->objsize -1, POISON_END, 1))) return 0; - } /* * check_pad_bytes cleans up on its own. 
*/ @@ -669,25 +725,17 @@ static int check_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { - slab_err(s, page, "Not a valid slab page flags=%lx " - "mapping=0x%p count=%d", page->flags, page->mapping, - page_count(page)); + slab_err(s, page, "Not a valid slab page"); return 0; } if (page->offset * sizeof(void *) != s->offset) { - slab_err(s, page, "Corrupted offset %lu flags=0x%lx " - "mapping=0x%p count=%d", - (unsigned long)(page->offset * sizeof(void *)), - page->flags, - page->mapping, - page_count(page)); + slab_err(s, page, "Corrupted offset %lu", + (unsigned long)(page->offset * sizeof(void *))); return 0; } if (page->inuse > s->objects) { - slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " - "mapping=0x%p count=%d", - s->name, page->inuse, s->objects, page->flags, - page->mapping, page_count(page)); + slab_err(s, page, "inuse %u > max %u", + s->name, page->inuse, s->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -715,13 +763,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) set_freepointer(s, object, NULL); break; } else { - slab_err(s, page, "Freepointer 0x%p corrupt", - fp); + slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; page->inuse = s->objects; - printk(KERN_ERR "@@@ SLUB %s: Freelist " - "cleared. Slab 0x%p\n", - s->name, page); + slab_fix(s, "Freelist cleared"); return 0; } break; @@ -733,11 +778,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) if (page->inuse != s->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", s, page, page->inuse, - s->objects - nr); + "counted were %d", page->inuse, s->objects - nr); page->inuse = s->objects - nr; - printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. 
" - "Slab @0x%p\n", s->name, page); + slab_fix(s, "Object count adjusted."); } return search == NULL; } @@ -799,7 +842,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; if (object && !on_freelist(s, page, object)) { - slab_err(s, page, "Object 0x%p already allocated", object); + object_err(s, page, object, "Object already allocated"); goto bad; } @@ -825,8 +868,7 @@ bad: * to avoid issues in the future. Marking all objects * as used avoids touching the remaining objects. */ - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", - s->name, page); + slab_fix(s, "Marking all objects used"); page->inuse = s->objects; page->freelist = NULL; /* Fix up fields that may be corrupted */ @@ -847,7 +889,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, } if (on_freelist(s, page, object)) { - slab_err(s, page, "Object 0x%p already free", object); + object_err(s, page, object, "Object already free"); goto fail; } @@ -866,8 +908,8 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, dump_stack(); } else - slab_err(s, page, "object at 0x%p belongs " - "to slab %s", object, page->slab->name); + object_err(s, page, object, + "page slab pointer corrupt."); goto fail; } @@ -881,45 +923,63 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, return 1; fail: - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", - s->name, page, object); + slab_fix(s, "Object at 0x%p not freed", object); return 0; } static int __init setup_slub_debug(char *str) { - if (!str || *str != '=') - slub_debug = DEBUG_DEFAULT_FLAGS; - else { - str++; - if (*str == 0 || *str == ',') - slub_debug = DEBUG_DEFAULT_FLAGS; - else - for( ;*str && *str != ','; str++) - switch (*str) { - case 'f' : case 'F' : - slub_debug |= SLAB_DEBUG_FREE; - break; - case 'z' : case 'Z' : - slub_debug |= SLAB_RED_ZONE; - break; - case 'p' : case 'P' : - slub_debug |= SLAB_POISON; - 
break; - case 'u' : case 'U' : - slub_debug |= SLAB_STORE_USER; - break; - case 't' : case 'T' : - slub_debug |= SLAB_TRACE; - break; - default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n",*str); - } + slub_debug = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + if (*str == ',') + /* + * No options but restriction on slabs. This means full + * debugging for slabs matching a pattern. + */ + goto check_slabs; + + slub_debug = 0; + if (*str == '-') + /* + * Switch off all debugging measures. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for ( ;*str && *str != ','; str++) { + switch (tolower(*str)) { + case 'f': + slub_debug |= SLAB_DEBUG_FREE; + break; + case 'z': + slub_debug |= SLAB_RED_ZONE; + break; + case 'p': + slub_debug |= SLAB_POISON; + break; + case 'u': + slub_debug |= SLAB_STORE_USER; + break; + case 't': + slub_debug |= SLAB_TRACE; + break; + default: + printk(KERN_ERR "slub_debug option '%c' " + "unknown. skipped\n",*str); + } } +check_slabs: if (*str == ',') slub_debug_slabs = str + 1; +out: return 1; } @@ -1018,7 +1078,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *last; void *p; - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); + BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); if (flags & __GFP_WAIT) local_irq_enable(); @@ -1336,7 +1396,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) unfreeze_slab(s, page); } -static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) { slab_lock(page); deactivate_slab(s, page, cpu); @@ -1346,7 +1406,7 @@ static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) * Flush cpu slab. * Called from IPI handler with interrupts disabled. 
*/ -static void __flush_cpu_slab(struct kmem_cache *s, int cpu) +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct page *page = s->cpu_slab[cpu]; @@ -1481,7 +1541,7 @@ debug: * Otherwise we can simply pick the next object from the lockless free list. */ static void __always_inline *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, void *addr) { struct page *page; void **object; @@ -1499,6 +1559,10 @@ static void __always_inline *slab_alloc(struct kmem_cache *s, page->lockless_freelist = object[page->offset]; } local_irq_restore(flags); + + if (unlikely((gfpflags & __GFP_ZERO) && object)) + memset(object, 0, s->objsize); + return object; } @@ -1682,8 +1746,17 @@ static inline int slab_order(int size, int min_objects, { int order; int rem; + int min_order = slub_min_order; + + /* + * If we would create too many object per slab then reduce + * the slab order even if it goes below slub_min_order. + */ + while (min_order > 0 && + (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) + min_order--; - for (order = max(slub_min_order, + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -1697,6 +1770,9 @@ static inline int slab_order(int size, int min_objects, if (rem <= slab_size / fract_leftover) break; + /* If the next size is too high then exit now */ + if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) + break; } return order; @@ -1777,7 +1853,9 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->nr_slabs, 0); spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); +#ifdef CONFIG_SLUB_DEBUG INIT_LIST_HEAD(&n->full); +#endif } #ifdef CONFIG_NUMA @@ -1805,7 +1883,10 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag page->freelist = get_freepointer(kmalloc_caches, n); page->inuse++; kmalloc_caches->node[node] = n; - setup_object_debug(kmalloc_caches, page, n); +#ifdef 
CONFIG_SLUB_DEBUG + init_object(kmalloc_caches, n, 1); + init_tracking(kmalloc_caches, n); +#endif init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); add_partial(n, page); @@ -1983,7 +2064,7 @@ static int calculate_sizes(struct kmem_cache *s) * The page->inuse field is only 16 bit wide! So we cannot have * more than 64k objects per slab. */ - if (!s->objects || s->objects > 65535) + if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) return 0; return 1; @@ -2087,7 +2168,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, /* * Release all resources used by a slab cache. */ -static int kmem_cache_close(struct kmem_cache *s) +static inline int kmem_cache_close(struct kmem_cache *s) { int node; @@ -2115,12 +2196,13 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) WARN_ON(1); sysfs_slab_remove(s); kfree(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2193,47 +2275,92 @@ panic: panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); } -static struct kmem_cache *get_slab(size_t size, gfp_t flags) +#ifdef CONFIG_ZONE_DMA +static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) { - int index = kmalloc_index(size); + struct kmem_cache *s; + struct kmem_cache *x; + char *text; + size_t realsize; - if (!index) - return NULL; + s = kmalloc_caches_dma[index]; + if (s) + return s; - /* Allocation too large? 
*/ - BUG_ON(index < 0); + /* Dynamically create dma cache */ + x = kmalloc(kmem_size, flags & ~SLUB_DMA); + if (!x) + panic("Unable to allocate memory for dma cache\n"); -#ifdef CONFIG_ZONE_DMA - if ((flags & SLUB_DMA)) { - struct kmem_cache *s; - struct kmem_cache *x; - char *text; - size_t realsize; - - s = kmalloc_caches_dma[index]; - if (s) - return s; + realsize = kmalloc_caches[index].objsize; + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", + (unsigned int)realsize); + s = create_kmalloc_cache(x, text, realsize, flags); + down_write(&slub_lock); + if (!kmalloc_caches_dma[index]) { + kmalloc_caches_dma[index] = s; + up_write(&slub_lock); + return s; + } + up_write(&slub_lock); + kmem_cache_destroy(s); + return kmalloc_caches_dma[index]; +} +#endif - /* Dynamically create dma cache */ - x = kmalloc(kmem_size, flags & ~SLUB_DMA); - if (!x) - panic("Unable to allocate memory for dma cache\n"); +/* + * Conversion table for small slabs sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have non power + * of two cache sizes there. The size of larger slabs can be determined using + * fls. 
+ */ +static s8 size_index[24] = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ + 5, /* 32 */ + 6, /* 40 */ + 6, /* 48 */ + 6, /* 56 */ + 6, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 7, /* 104 */ + 7, /* 112 */ + 7, /* 120 */ + 7, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; - if (index <= KMALLOC_SHIFT_HIGH) - realsize = 1 << index; - else { - if (index == 1) - realsize = 96; - else - realsize = 192; - } +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = create_kmalloc_cache(x, text, realsize, flags); - kmalloc_caches_dma[index] = s; - return s; + if (size <= 192) { + if (!size) + return ZERO_SIZE_PTR; + + index = size_index[(size - 1) / 8]; + } else { + if (size > KMALLOC_MAX_SIZE) + return NULL; + + index = fls(size - 1); } + +#ifdef CONFIG_ZONE_DMA + if (unlikely((flags & SLUB_DMA))) + return dma_kmalloc_cache(index, flags); + #endif return &kmalloc_caches[index]; } @@ -2242,9 +2369,10 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s = get_slab(size, flags); - if (s) - return slab_alloc(s, flags, -1, __builtin_return_address(0)); - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; + + return slab_alloc(s, flags, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc); @@ -2253,9 +2381,10 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s = get_slab(size, flags); - if (s) - return slab_alloc(s, flags, node, __builtin_return_address(0)); - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; + + return slab_alloc(s, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2265,7 +2394,7 @@ size_t ksize(const void *object) struct page *page; struct kmem_cache *s; - if (object == ZERO_SIZE_PTR) + if (ZERO_OR_NULL_PTR(object)) return 0; page 
= get_object_page(object); @@ -2306,7 +2435,7 @@ void kfree(const void *x) * this comparison would be true for all "negative" pointers * (which would cover the whole upper half of the address space). */ - if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR) + if (ZERO_OR_NULL_PTR(x)) return; page = virt_to_head_page(x); @@ -2395,43 +2524,6 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - size_t ks; - - if (unlikely(!p || p == ZERO_SIZE_PTR)) - return kmalloc(new_size, flags); - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ks = ksize(p); - if (ks >= new_size) - return (void *)p; - - ret = kmalloc(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ks)); - kfree(p); - } - return ret; -} -EXPORT_SYMBOL(krealloc); - /******************************************************************** * Basic setup of slabs *******************************************************************/ @@ -2474,6 +2566,24 @@ void __init kmem_cache_init(void) caches++; } + + /* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * mips it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. 
+ * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || + (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) + size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ @@ -2519,7 +2629,7 @@ static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, void (*ctor)(void *, struct kmem_cache *, unsigned long)) { - struct list_head *h; + struct kmem_cache *s; if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) return NULL; @@ -2531,10 +2641,7 @@ static struct kmem_cache *find_mergeable(size_t size, align = calculate_alignment(flags, align, size); size = ALIGN(size, align); - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - + list_for_each_entry(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; @@ -2561,12 +2668,10 @@ static struct kmem_cache *find_mergeable(size_t size, struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - void (*dtor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(void *, struct kmem_cache *, unsigned long)) { struct kmem_cache *s; - BUG_ON(dtor); down_write(&slub_lock); s = find_mergeable(size, align, flags, ctor); if (s) { @@ -2577,25 +2682,26 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, */ s->objsize = max(s->objsize, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + up_write(&slub_lock); if (sysfs_slab_alias(s, name)) goto err; - } else { - s = kmalloc(kmem_size, GFP_KERNEL); - if (s && kmem_cache_open(s, GFP_KERNEL, name, + return s; + } + s = kmalloc(kmem_size, GFP_KERNEL); + if (s) { + if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { - if 
(sysfs_slab_add(s)) { - kfree(s); - goto err; - } list_add(&s->list, &slab_caches); - } else - kfree(s); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto err; + return s; + } + kfree(s); } up_write(&slub_lock); - return s; err: - up_write(&slub_lock); if (flags & SLAB_PANIC) panic("Cannot create slabcache %s\n", name); else @@ -2604,45 +2710,7 @@ err: } EXPORT_SYMBOL(kmem_cache_create); -void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) -{ - void *x; - - x = slab_alloc(s, flags, -1, __builtin_return_address(0)); - if (x) - memset(x, 0, s->objsize); - return x; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - #ifdef CONFIG_SMP -static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) -{ - struct list_head *h; - - down_read(&slub_lock); - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - - func(s, cpu); - } - up_read(&slub_lock); -} - -/* - * Version of __flush_cpu_slab for the case that interrupts - * are enabled. - */ -static void cpu_slab_flush(struct kmem_cache *s, int cpu) -{ - unsigned long flags; - - local_irq_save(flags); - __flush_cpu_slab(s, cpu); - local_irq_restore(flags); -} - /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. 
@@ -2651,13 +2719,21 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + struct kmem_cache *s; + unsigned long flags; switch (action) { case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - for_all_slabs(cpu_slab_flush, cpu); + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + local_irq_save(flags); + __flush_cpu_slab(s, cpu); + local_irq_restore(flags); + } + up_read(&slub_lock); break; default: break; @@ -2674,8 +2750,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) { struct kmem_cache *s = get_slab(size, gfpflags); - if (!s) - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; return slab_alloc(s, gfpflags, -1, caller); } @@ -2685,18 +2761,18 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s = get_slab(size, gfpflags); - if (!s) - return ZERO_SIZE_PTR; + if (ZERO_OR_NULL_PTR(s)) + return s; return slab_alloc(s, gfpflags, node, caller); } #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) -static int validate_slab(struct kmem_cache *s, struct page *page) +static int validate_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) { void *p; void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); if (!check_slab(s, page) || !on_freelist(s, page, NULL)) @@ -2718,10 +2794,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page) return 1; } -static void validate_slab_slab(struct kmem_cache *s, struct page *page) +static void validate_slab_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) { if (slab_trylock(page)) { - validate_slab(s, page); + validate_slab(s, page, map); slab_unlock(page); } else printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", @@ -2738,7 +2815,8 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page) } } -static int validate_slab_node(struct kmem_cache 
*s, struct kmem_cache_node *n) +static int validate_slab_node(struct kmem_cache *s, + struct kmem_cache_node *n, unsigned long *map) { unsigned long count = 0; struct page *page; @@ -2747,7 +2825,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) { - validate_slab_slab(s, page); + validate_slab_slab(s, page, map); count++; } if (count != n->nr_partial) @@ -2758,7 +2836,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) goto out; list_for_each_entry(page, &n->full, lru) { - validate_slab_slab(s, page); + validate_slab_slab(s, page, map); count++; } if (count != atomic_long_read(&n->nr_slabs)) @@ -2771,17 +2849,23 @@ out: return count; } -static unsigned long validate_slab_cache(struct kmem_cache *s) +static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; + unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + sizeof(unsigned long), GFP_KERNEL); + + if (!map) + return -ENOMEM; flush_all(s); for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); - count += validate_slab_node(s, n); + count += validate_slab_node(s, n, map); } + kfree(map); return count; } @@ -2870,18 +2954,14 @@ static void free_loc_track(struct loc_track *t) get_order(sizeof(struct location) * t->max)); } -static int alloc_loc_track(struct loc_track *t, unsigned long max) +static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) { struct location *l; int order; - if (!max) - max = PAGE_SIZE / sizeof(struct location); - order = get_order(sizeof(struct location) * max); - l = (void *)__get_free_pages(GFP_ATOMIC, order); - + l = (void *)__get_free_pages(flags, order); if (!l) return 0; @@ -2947,7 +3027,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, /* * Not found. Insert new tracking element. 
*/ - if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) return 0; l = t->loc + pos; @@ -2990,11 +3070,12 @@ static int list_locations(struct kmem_cache *s, char *buf, { int n = 0; unsigned long i; - struct loc_track t; + struct loc_track t = { 0, 0, NULL }; int node; - t.count = 0; - t.max = 0; + if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_KERNEL)) + return sprintf(buf, "Out of memory\n"); /* Push back cpu slabs */ flush_all(s); @@ -3398,11 +3479,14 @@ static ssize_t validate_show(struct kmem_cache *s, char *buf) static ssize_t validate_store(struct kmem_cache *s, const char *buf, size_t length) { - if (buf[0] == '1') - validate_slab_cache(s); - else - return -EINVAL; - return length; + int ret = -EINVAL; + + if (buf[0] == '1') { + ret = validate_slab_cache(s); + if (ret >= 0) + ret = length; + } + return ret; } SLAB_ATTR(validate); @@ -3556,7 +3640,7 @@ static struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; -decl_subsys(slab, &slab_ktype, &slab_uevent_ops); +static decl_subsys(slab, &slab_ktype, &slab_uevent_ops); #define ID_STR_LENGTH 64 @@ -3654,7 +3738,7 @@ struct saved_alias { struct saved_alias *next; }; -struct saved_alias *alias_list; +static struct saved_alias *alias_list; static int sysfs_slab_alias(struct kmem_cache *s, const char *name) { @@ -3682,7 +3766,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) static int __init slab_sysfs_init(void) { - struct list_head *h; + struct kmem_cache *s; int err; err = subsystem_register(&slab_subsys); @@ -3693,10 +3777,7 @@ static int __init slab_sysfs_init(void) slab_state = SYSFS; - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - + list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); BUG_ON(err); } diff --git a/mm/sparse.c b/mm/sparse.c index e03b39f3540f..3047bf06c1f3 100644 --- a/mm/sparse.c 
+++ b/mm/sparse.c @@ -209,7 +209,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, return 1; } -__attribute__((weak)) +__attribute__((weak)) __init void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) { return NULL; diff --git a/mm/swap_state.c b/mm/swap_state.c index 5f7cf2a4cb55..67daecb6031a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_list, to make sync_page look nicer, and to allow + * vmscan's shrink_page_list, to make sync_page look nicer, and to allow * future use of radix_tree tags in the swap cache. */ static const struct address_space_operations swap_aops = { @@ -334,7 +334,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, * Get a new page to read into from swap. */ if (!new_page) { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, addr); if (!new_page) break; /* Out of memory */ } diff --git a/mm/swapfile.c b/mm/swapfile.c index acc172cbe3aa..7ff0a81c7b01 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type) /* * So we could skip searching mms once swap count went * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so shrink_list will preserve it. + * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); unlock_page(page); diff --git a/mm/truncate.c b/mm/truncate.c index 4fbe1a2da5fb..5cdfbc1a59fd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL(cancel_dirty_page); /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes anonymous. It will be left on the LRU and may even be mapped into - * user pagetables if we're racing with filemap_nopage(). + * user pagetables if we're racing with filemap_fault(). 
* * We need to bale out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on @@ -100,9 +100,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page)) do_invalidatepage(page, 0); + remove_from_page_cache(page); ClearPageUptodate(page); ClearPageMappedToDisk(page); - remove_from_page_cache(page); page_cache_release(page); /* pagecache ref */ } @@ -192,6 +192,11 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page_index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } truncate_complete_page(mapping, page); unlock_page(page); } @@ -229,6 +234,11 @@ void truncate_inode_pages_range(struct address_space *mapping, break; lock_page(page); wait_on_page_writeback(page); + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index<<PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } if (page->index > next) next = page->index; next++; @@ -253,21 +263,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. 
- */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) +unsigned long __invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end, bool be_atomic) { struct pagevec pvec; pgoff_t next = start; @@ -308,17 +305,38 @@ unlock: break; } pagevec_release(&pvec); + if (likely(!be_atomic)) + cond_resched(); } return ret; } + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + return __invalidate_mapping_pages(mapping, start, end, false); +} EXPORT_SYMBOL(invalidate_mapping_pages); /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because - * shrink_list() has a temp ref on them, or because they're transiently sitting - * in the lru_cache_add() pagevecs. + * shrink_page_list() has a temp ref on them, or because they're transiently + * sitting in the lru_cache_add() pagevecs. */ static int invalidate_complete_page2(struct address_space *mapping, struct page *page) @@ -397,7 +415,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, break; } wait_on_page_writeback(page); - while (page_mapped(page)) { + if (page_mapped(page)) { if (!did_range_unmap) { /* * Zap the rest of the file in one hit. 
@@ -417,6 +435,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, PAGE_CACHE_SIZE, 0); } } + BUG_ON(page_mapped(page)); ret = do_launder_page(mapping, page); if (ret == 0 && !invalidate_complete_page2(mapping, page)) ret = -EIO; diff --git a/mm/util.c b/mm/util.c index ace2aea69f1a..bf340d806868 100644 --- a/mm/util.c +++ b/mm/util.c @@ -5,22 +5,7 @@ #include <asm/uaccess.h> /** - * __kzalloc - allocate memory. The memory is set to zero. - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - */ -void *__kzalloc(size_t size, gfp_t flags) -{ - void *ret = kmalloc_track_caller(size, flags); - if (ret) - memset(ret, 0, size); - return ret; -} -EXPORT_SYMBOL(__kzalloc); - -/* * kstrdup - allocate space for and copy an existing string - * * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory */ @@ -41,6 +26,30 @@ char *kstrdup(const char *s, gfp_t gfp) EXPORT_SYMBOL(kstrdup); /** + * kstrndup - allocate space for and copy an existing string + * @s: the string to duplicate + * @max: read at most @max chars from @s + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrndup(const char *s, size_t max, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strnlen(s, max); + buf = kmalloc_track_caller(len+1, gfp); + if (buf) { + memcpy(buf, s, len); + buf[len] = '\0'; + } + return buf; +} +EXPORT_SYMBOL(kstrndup); + +/** * kmemdup - duplicate region of memory * * @src: memory region to duplicate @@ -58,9 +67,42 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup); +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. 
If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + size_t ks; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ks = ksize(p); + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ks)); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + /* * strndup_user - duplicate an existing string from user space - * * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d3a9c5368257..3cee76a8c9f0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -68,12 +68,12 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } -void unmap_vm_area(struct vm_struct *area) +void unmap_kernel_range(unsigned long addr, unsigned long size) { pgd_t *pgd; unsigned long next; - unsigned long addr = (unsigned long) area->addr; - unsigned long end = addr + area->size; + unsigned long start = addr; + unsigned long end = addr + size; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -84,7 +84,12 @@ void unmap_vm_area(struct vm_struct *area) continue; vunmap_pud_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range((unsigned long) area->addr, end); + flush_tlb_kernel_range(start, end); +} + +static void unmap_vm_area(struct vm_struct *area) +{ + unmap_kernel_range((unsigned long)area->addr, area->size); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, @@ -159,6 +164,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) flush_cache_vmap((unsigned long) area->addr, end); return err; } +EXPORT_SYMBOL_GPL(map_vm_area); static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long
flags, unsigned long start, unsigned long end, @@ -237,6 +243,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, { return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); } +EXPORT_SYMBOL_GPL(__get_vm_area); /** * get_vm_area - reserve a contingous kernel virtual area @@ -427,11 +434,12 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); + pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, + PAGE_KERNEL, node); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, - (gfp_mask & GFP_LEVEL_MASK), + (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, node); } area->pages = pages; @@ -440,7 +448,6 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, kfree(area); return NULL; } - memset(area->pages, 0, array_size); for (i = 0; i < area->nr_pages; i++) { if (node < 0) @@ -578,9 +585,9 @@ void *vmalloc_exec(unsigned long size) } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) -#define GFP_VMALLOC32 GFP_DMA32 +#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) -#define GFP_VMALLOC32 GFP_DMA +#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL #else #define GFP_VMALLOC32 GFP_KERNEL #endif @@ -762,3 +769,56 @@ EXPORT_SYMBOL(remap_vmalloc_range); void __attribute__((weak)) vmalloc_sync_all(void) { } + + +static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + /* apply_to_page_range() does all the hard work. */ + return 0; +} + +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * @returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. 
If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size) +{ + struct vm_struct *area; + + area = get_vm_area(size, VM_IOREMAP); + if (area == NULL) + return NULL; + + /* + * This ensures that page tables are constructed for this region + * of kernel virtual address space and mapped into init_mm. + */ + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + area->size, f, NULL)) { + free_vm_area(area); + return NULL; + } + + /* Make sure the pagetables are constructed in process kernel + mappings */ + vmalloc_sync_all(); + + return area; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + struct vm_struct *ret; + ret = remove_vm_area(area->addr); + BUG_ON(ret != area); + kfree(area); +} +EXPORT_SYMBOL_GPL(free_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1be5a6376ef0..d419e10e3daa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -66,17 +66,8 @@ struct scan_control { int swappiness; int all_unreclaimable; -}; -/* - * The list of shrinker callbacks used by to apply pressure to - * ageable caches. 
- */ -struct shrinker { - shrinker_t shrinker; - struct list_head list; - int seeks; /* seeks to recreate an obj */ - long nr; /* objs pending delete */ + int order; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -121,34 +112,25 @@ static DECLARE_RWSEM(shrinker_rwsem); /* * Add a shrinker callback to be called from the vm */ -struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) +void register_shrinker(struct shrinker *shrinker) { - struct shrinker *shrinker; - - shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); - if (shrinker) { - shrinker->shrinker = theshrinker; - shrinker->seeks = seeks; - shrinker->nr = 0; - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - up_write(&shrinker_rwsem); - } - return shrinker; + shrinker->nr = 0; + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); } -EXPORT_SYMBOL(set_shrinker); +EXPORT_SYMBOL(register_shrinker); /* * Remove one */ -void remove_shrinker(struct shrinker *shrinker) +void unregister_shrinker(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); - kfree(shrinker); } -EXPORT_SYMBOL(remove_shrinker); +EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 /* @@ -185,7 +167,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); + unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; @@ -213,8 +195,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrinker)(0, gfp_mask); - shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(0, gfp_mask); + shrink_ret = 
(*shrinker->shrink)(this_scan, gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) @@ -481,7 +463,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ - if (referenced && page_mapping_inuse(page)) + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && + referenced && page_mapping_inuse(page)) goto activate_locked; #ifdef CONFIG_SWAP @@ -514,7 +497,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - if (referenced) + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) goto keep_locked; if (!may_enter_fs) goto keep_locked; @@ -598,6 +581,51 @@ keep: return nr_reclaimed; } +/* LRU Isolation modes. */ +#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ +#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ +#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ + +/* + * Attempt to remove the specified page from its LRU. Only take this page + * if it is of the appropriate PageActive status. Pages which are being + * freed elsewhere are also ignored. + * + * page: page to consider + * mode: one of the LRU isolation modes defined above + * + * returns 0 on success, -ve errno on failure. + */ +static int __isolate_lru_page(struct page *page, int mode) +{ + int ret = -EINVAL; + + /* Only take pages on the LRU. */ + if (!PageLRU(page)) + return ret; + + /* + * When checking the active state, we need to be sure we are + * dealing with comparable boolean values. Take the logical not + * of each. + */ + if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + return ret; + + ret = -EBUSY; + if (likely(get_page_unless_zero(page))) { + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + ClearPageLRU(page); + ret = 0; + } + + return ret; +} + /* * zone->lru_lock is heavily contended.
Some of the functions that * shrink the lists perform better by taking out a batch of pages @@ -612,38 +640,90 @@ keep: * @src: The LRU list to pull pages off. * @dst: The temp list to put pages on to. * @scanned: The number of pages that were scanned. + * @order: The caller's attempted allocation order + * @mode: One of the LRU isolation modes * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned) + unsigned long *scanned, int order, int mode) { unsigned long nr_taken = 0; - struct page *page; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { - struct list_head *target; + struct page *page; + unsigned long pfn; + unsigned long end_pfn; + unsigned long page_pfn; + int zone_id; + page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); VM_BUG_ON(!PageLRU(page)); - list_del(&page->lru); - target = src; - if (likely(get_page_unless_zero(page))) { - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - ClearPageLRU(page); - target = dst; + switch (__isolate_lru_page(page, mode)) { + case 0: + list_move(&page->lru, dst); nr_taken++; - } /* else it is being freed elsewhere */ + break; + + case -EBUSY: + /* else it is being freed elsewhere */ + list_move(&page->lru, src); + continue; + + default: + BUG(); + } + + if (!order) + continue; - list_add(&page->lru, target); + /* + * Attempt to take all pages in the order aligned region + * surrounding the tag page. Only take those pages of + * the same active state as that tag page. We may safely + * round the target page pfn down to the requested order + * as the mem_map is guaranteed valid out to MAX_ORDER, + * where that page is in a different zone we will detect + * it from its zone id and abort this block scan.
+ */ + zone_id = page_zone_id(page); + page_pfn = page_to_pfn(page); + pfn = page_pfn & ~((1 << order) - 1); + end_pfn = pfn + (1 << order); + for (; pfn < end_pfn; pfn++) { + struct page *cursor_page; + + /* The target page is in the block, ignore it. */ + if (unlikely(pfn == page_pfn)) + continue; + + /* Avoid holes within the zone. */ + if (unlikely(!pfn_valid_within(pfn))) + break; + + cursor_page = pfn_to_page(pfn); + /* Check that we have not crossed a zone boundary. */ + if (unlikely(page_zone_id(cursor_page) != zone_id)) + continue; + switch (__isolate_lru_page(cursor_page, mode)) { + case 0: + list_move(&cursor_page->lru, dst); + nr_taken++; + scan++; + break; + + case -EBUSY: + /* else it is being freed elsewhere */ + list_move(&cursor_page->lru, src); + default: + break; + } + } } *scanned = scan; @@ -651,6 +731,24 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } /* + * clear_active_flags() is a helper for shrink_active_list(), clearing + * any active bits from the pages in the list. + */ +static unsigned long clear_active_flags(struct list_head *page_list) +{ + int nr_active = 0; + struct page *page; + + list_for_each_entry(page, page_list, lru) + if (PageActive(page)) { + ClearPageActive(page); + nr_active++; + } + + return nr_active; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -671,11 +769,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_taken; unsigned long nr_scan; unsigned long nr_freed; + unsigned long nr_active; nr_taken = isolate_lru_pages(sc->swap_cluster_max, - &zone->inactive_list, - &page_list, &nr_scan); - __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken); + &zone->inactive_list, + &page_list, &nr_scan, sc->order, + (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 
+ ISOLATE_BOTH : ISOLATE_INACTIVE); + nr_active = clear_active_flags(&page_list); + + __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); + __mod_zone_page_state(zone, NR_INACTIVE, + -(nr_taken - nr_active)); zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); @@ -820,7 +925,7 @@ force_reclaim_mapped: lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, - &l_hold, &pgscanned); + &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); zone->pages_scanned += pgscanned; __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); spin_unlock_irq(&zone->lru_lock); @@ -1011,7 +1116,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) { int priority; int ret = 0; @@ -1026,6 +1131,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, .swappiness = vm_swappiness, + .order = order, }; count_vm_event(ALLOCSTALL); @@ -1131,6 +1237,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, + .order = order, }; /* * temp_priority is used to remember the scanning priority at which @@ -1314,6 +1421,7 @@ static int kswapd(void *p) * trying to free the first piece of memory in the first place). 
*/ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + set_freezable(); order = 0; for ( ; ; ) { diff --git a/mm/vmstat.c b/mm/vmstat.c index eceaf496210f..fadf791cd7e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -472,7 +472,7 @@ const struct seq_operations fragmentation_op = { #endif #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) + TEXT_FOR_HIGHMEM(xx) xx "_movable", static const char * const vmstat_text[] = { /* Zoned VM counters */ |