summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile4
-rw-r--r--mm/allocpercpu.c9
-rw-r--r--mm/backing-dev.c16
-rw-r--r--mm/filemap.c98
-rw-r--r--mm/filemap_xip.c23
-rw-r--r--mm/highmem.c7
-rw-r--r--mm/hugetlb.c47
-rw-r--r--mm/madvise.c7
-rw-r--r--mm/memory.c34
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c61
-rw-r--r--mm/mempool.c6
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mlock.c16
-rw-r--r--mm/mmap.c38
-rw-r--r--mm/mremap.c13
-rw-r--r--mm/msync.c1
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/page-writeback.c11
-rw-r--r--mm/page_alloc.c675
-rw-r--r--mm/pdflush.c1
-rw-r--r--mm/rmap.c24
-rw-r--r--mm/shmem.c52
-rw-r--r--mm/slab.c155
-rw-r--r--mm/slob.c586
-rw-r--r--mm/slub.c792
-rw-r--r--mm/sparse.c55
-rw-r--r--mm/swap_state.c5
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c44
-rw-r--r--mm/util.c48
-rw-r--r--mm/vmalloc.c19
-rw-r--r--mm/vmscan.c212
-rw-r--r--mm/vmstat.c5
35 files changed, 2033 insertions, 1055 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ac412b45f18..86187221e78f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -117,7 +117,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC64)
+ depends on (IA64 || X86 || PPC64 || SUPERH)
comment "Memory hotplug is currently incompatible with Software Suspend"
depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
@@ -163,8 +163,16 @@ config ZONE_DMA_FLAG
default "0" if !ZONE_DMA
default "1"
+config BOUNCE
+ def_bool y
+ depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+
config NR_QUICK
int
depends on QUICKLIST
default "2" if (SUPERH && !SUPERH64)
default "1"
+
+config VIRT_TO_BUS
+ def_bool y
+ depends on !ARCH_NO_VIRT_TO_BUS
diff --git a/mm/Makefile b/mm/Makefile
index a9148ea329aa..245e33ab00c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,9 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
$(mmu-y)
-ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
-obj-y += bounce.o
-endif
+obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index b2486cf887a0..00b02623f008 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -53,12 +53,9 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
int node = cpu_to_node(cpu);
BUG_ON(pdata->ptrs[cpu]);
- if (node_online(node)) {
- /* FIXME: kzalloc_node(size, gfp, node) */
- pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
- if (pdata->ptrs[cpu])
- memset(pdata->ptrs[cpu], 0, size);
- } else
+ if (node_online(node))
+ pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
+ else
pdata->ptrs[cpu] = kzalloc(size, gfp);
return pdata->ptrs[cpu];
}
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e5de3781d3fe..f50a2811f9dc 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout)
}
EXPORT_SYMBOL(congestion_wait);
-long congestion_wait_interruptible(int rw, long timeout)
-{
- long ret;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[rw];
-
- prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
- if (signal_pending(current))
- ret = -ERESTARTSYS;
- else
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
- return ret;
-}
-EXPORT_SYMBOL(congestion_wait_interruptible);
-
/**
* congestion_end - wake up sleepers on a congested backing_dev_info
* @rw: READ or WRITE
diff --git a/mm/filemap.c b/mm/filemap.c
index edb1b0b5cc8d..5d5449f3d41c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page)
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
+ BUG_ON(page_mapped(page));
}
void remove_from_page_cache(struct page *page)
@@ -866,13 +867,11 @@ void do_generic_mapping_read(struct address_space *mapping,
{
struct inode *inode = mapping->host;
unsigned long index;
- unsigned long end_index;
unsigned long offset;
unsigned long last_index;
unsigned long next_index;
unsigned long prev_index;
unsigned int prev_offset;
- loff_t isize;
struct page *cached_page;
int error;
struct file_ra_state ra = *_ra;
@@ -885,27 +884,12 @@ void do_generic_mapping_read(struct address_space *mapping,
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
+ unsigned long end_index;
+ loff_t isize;
unsigned long nr, ret;
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
- if (index >= end_index) {
- if (index > end_index)
- goto out;
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
- if (nr <= offset) {
- goto out;
- }
- }
- nr = nr - offset;
-
cond_resched();
if (index == next_index)
next_index = page_cache_readahead(mapping, &ra, filp,
@@ -920,6 +904,32 @@ find_page:
if (!PageUptodate(page))
goto page_not_up_to_date;
page_ok:
+ /*
+ * i_size must be checked after we know the page is Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ page_cache_release(page);
+ goto out;
+ }
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_CACHE_SIZE;
+ if (index == end_index) {
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ page_cache_release(page);
+ goto out;
+ }
+ }
+ nr = nr - offset;
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
@@ -1006,31 +1016,6 @@ readpage:
unlock_page(page);
}
- /*
- * i_size must be checked after we have done ->readpage.
- *
- * Checking i_size after the readpage allows us to calculate
- * the correct value for "nr", which means the zero-filled
- * part of the page is not copied back to userspace (unless
- * another truncate extends the file - this is desired though).
- */
- isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(!isize || index > end_index)) {
- page_cache_release(page);
- goto out;
- }
-
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
- if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
- if (nr <= offset) {
- page_cache_release(page);
- goto out;
- }
- }
- nr = nr - offset;
goto page_ok;
readpage_error:
@@ -1218,6 +1203,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
retval = retval ?: desc.error;
break;
}
+ if (desc.count > 0)
+ break;
}
}
out:
@@ -1245,26 +1232,6 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o
return written;
}
-ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void *target)
-{
- read_descriptor_t desc;
-
- if (!count)
- return 0;
-
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
-
- do_generic_file_read(in_file, ppos, &desc, actor);
- if (desc.written)
- return desc.written;
- return desc.error;
-}
-EXPORT_SYMBOL(generic_file_sendfile);
-
static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
unsigned long index, unsigned long nr)
@@ -1786,7 +1753,6 @@ retry:
page = __read_cache_page(mapping, index, filler, data);
if (IS_ERR(page))
return page;
- mark_page_accessed(page);
if (PageUptodate(page))
goto out;
@@ -1985,7 +1951,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (unlikely(*pos + *count > MAX_NON_LFS &&
!(file->f_flags & O_LARGEFILE))) {
if (*pos >= MAX_NON_LFS) {
- send_sig(SIGXFSZ, current, 0);
return -EFBIG;
}
if (*count > MAX_NON_LFS - (unsigned long)*pos) {
@@ -2003,7 +1968,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (likely(!isblk)) {
if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
if (*count || *pos > inode->i_sb->s_maxbytes) {
- send_sig(SIGXFSZ, current, 0);
return -EFBIG;
}
/* zero-length writes at ->s_maxbytes are OK */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1b49dab9b25d..65ffc321f0c0 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
+#include <linux/sched.h>
#include <asm/tlbflush.h>
#include "filemap.h"
@@ -158,28 +159,6 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
}
EXPORT_SYMBOL_GPL(xip_file_read);
-ssize_t
-xip_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void *target)
-{
- read_descriptor_t desc;
-
- if (!count)
- return 0;
-
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
-
- do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
- ppos, &desc, actor);
- if (desc.written)
- return desc.written;
- return desc.error;
-}
-EXPORT_SYMBOL_GPL(xip_file_sendfile);
-
/*
* __xip_unmap is invoked from xip_unmap and
* xip_write
diff --git a/mm/highmem.c b/mm/highmem.c
index be8f8d36a8b9..7a967bc35152 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,9 +46,14 @@ unsigned int nr_free_highpages (void)
pg_data_t *pgdat;
unsigned int pages = 0;
- for_each_online_pgdat(pgdat)
+ for_each_online_pgdat(pgdat) {
pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
NR_FREE_PAGES);
+ if (zone_movable_is_highmem())
+ pages += zone_page_state(
+ &pgdat->node_zones[ZONE_MOVABLE],
+ NR_FREE_PAGES);
+ }
return pages;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eb7180db3033..6912bbf33faa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,9 @@ unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
+unsigned long hugepages_treat_as_movable;
+
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
@@ -66,14 +69,15 @@ static void enqueue_huge_page(struct page *page)
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
- int nid = numa_node_id();
+ int nid;
struct page *page = NULL;
- struct zonelist *zonelist = huge_zonelist(vma, address);
+ struct zonelist *zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask);
struct zone **z;
for (z = zonelist->zones; *z; z++) {
nid = zone_to_nid(*z);
- if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
+ if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
!list_empty(&hugepage_freelists[nid]))
break;
}
@@ -101,13 +105,20 @@ static void free_huge_page(struct page *page)
static int alloc_fresh_huge_page(void)
{
- static int nid = 0;
+ static int prev_nid;
struct page *page;
- page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
- nid = next_node(nid, node_online_map);
+ static DEFINE_SPINLOCK(nid_lock);
+ int nid;
+
+ spin_lock(&nid_lock);
+ nid = next_node(prev_nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
+ prev_nid = nid;
+ spin_unlock(&nid_lock);
+
+ page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
+ HUGETLB_PAGE_ORDER);
if (page) {
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
@@ -256,6 +267,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
max_huge_pages = set_max_huge_pages(max_huge_pages);
return 0;
}
+
+int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
+ struct file *file, void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ if (hugepages_treat_as_movable)
+ htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
+ else
+ htlb_alloc_mask = GFP_HIGHUSER;
+ return 0;
+}
+
#endif /* CONFIG_SYSCTL */
int hugetlb_report_meminfo(char *buf)
@@ -326,9 +350,10 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
pte_t entry;
entry = pte_mkwrite(pte_mkdirty(*ptep));
- ptep_set_access_flags(vma, address, ptep, entry, 1);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+ }
}
@@ -473,7 +498,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_MINOR;
}
-int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, int write_access)
{
int ret = VM_FAULT_SIGBUS;
diff --git a/mm/madvise.c b/mm/madvise.c
index e75096b5a6d3..93ee375b38e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
+#include <linux/sched.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -286,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
struct vm_area_struct * vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
+ int write;
size_t len;
- if (madvise_need_mmap_write(behavior))
+ write = madvise_need_mmap_write(behavior);
+ if (write)
down_write(&current->mm->mmap_sem);
else
down_read(&current->mm->mmap_sem);
@@ -353,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
vma = find_vma(current->mm, start);
}
out:
- if (madvise_need_mmap_write(behavior))
+ if (write)
up_write(&current->mm->mmap_sem);
else
up_read(&current->mm->mmap_sem);
diff --git a/mm/memory.c b/mm/memory.c
index cb94488ab96d..9c6ff7fffdc8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
* and ZONE_HIGHMEM.
*/
void * high_memory;
-unsigned long vmalloc_earlyreserve;
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
int randomize_va_space __read_mostly = 1;
@@ -1055,6 +1053,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
do {
struct page *page;
+ /*
+ * If tsk is ooming, cut off its access to large memory
+ * allocations. It has a pending SIGKILL, but it can't
+ * be processed until returning to user space.
+ */
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+ return -ENOMEM;
+
if (write)
foll_flags |= FOLL_WRITE;
@@ -1691,9 +1697,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- ptep_set_access_flags(vma, address, page_table, entry, 1);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ if (ptep_set_access_flags(vma, address, page_table, entry,1)) {
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+ }
ret |= VM_FAULT_WRITE;
goto unlock;
}
@@ -1708,11 +1715,11 @@ gotten:
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (old_page == ZERO_PAGE(address)) {
- new_page = alloc_zeroed_user_highpage(vma, address);
+ new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
goto oom;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
@@ -2230,7 +2237,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage(vma, address);
+ page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
@@ -2333,7 +2340,8 @@ retry:
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+ vma, address);
if (!page)
goto oom;
copy_user_highpage(page, new_page, address, vma);
@@ -2525,10 +2533,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
pte_t *pte, pmd_t *pmd, int write_access)
{
pte_t entry;
- pte_t old_entry;
spinlock_t *ptl;
- old_entry = entry = *pte;
+ entry = *pte;
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
@@ -2561,8 +2568,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (!pte_same(old_entry, entry)) {
- ptep_set_access_flags(vma, address, pte, entry, write_access);
+ if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
} else {
@@ -2674,7 +2680,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
write = (vma->vm_flags & VM_WRITE) != 0;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
- len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+ len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
if (ret < 0)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 84279127fcd3..df9d554bea30 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -65,7 +65,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
int zone_type;
zone_type = zone - pgdat->node_zones;
- if (!populated_zone(zone)) {
+ if (!zone->wait_table) {
int ret = 0;
ret = init_currently_empty_zone(zone, phys_start_pfn,
nr_pages, MEMMAP_HOTPLUG);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d76e8eb342d0..9f4e9b95e8f2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -101,8 +101,6 @@
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
-#define PDprintk(fmt...)
-
/* Highest zone. An specific allocation for a zone below that is not
policied. */
enum zone_type policy_zone = 0;
@@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
struct mempolicy *policy;
- PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+ pr_debug("setting mode %d nodes[0] %lx\n",
+ mode, nodes ? nodes_addr(*nodes)[0] : -1);
+
if (mode == MPOL_DEFAULT)
return NULL;
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
int err = 0;
struct mempolicy *old = vma->vm_policy;
- PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
@@ -594,7 +594,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- return alloc_pages_node(node, GFP_HIGHUSER, 0);
+ return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -710,7 +710,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
{
struct vm_area_struct *vma = (struct vm_area_struct *)private;
- return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ page_address_in_vma(page, vma));
}
#else
@@ -776,8 +777,8 @@ long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
- PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes_addr(nodes)[0]);
+ pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ mode, nmask ? nodes_addr(*nmask)[0] : -1);
down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask,
@@ -1202,7 +1203,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ gfp_t gfp_flags)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
@@ -1210,7 +1212,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
unsigned nid;
nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+ return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
@@ -1434,7 +1436,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
- PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+ pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
new->policy ? new->policy->policy : 0);
}
@@ -1459,7 +1461,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
- PDprintk("deleting %lx-l%x\n", n->start, n->end);
+ pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
mpol_free(n->policy);
kmem_cache_free(sn_cache, n);
@@ -1558,10 +1560,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
- PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+ pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
vma->vm_pgoff,
sz, npol? npol->policy : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : -1);
+ npol ? nodes_addr(npol->v.nodes)[0] : -1);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1597,6 +1599,10 @@ void mpol_free_shared_policy(struct shared_policy *p)
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
+ nodemask_t interleave_nodes;
+ unsigned long largest = 0;
+ int nid, prefer = 0;
+
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL, NULL);
@@ -1605,10 +1611,31 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL, NULL);
- /* Set interleaving policy for system init. This way not all
- the data structures allocated at system boot end up in node zero. */
+ /*
+ * Set interleaving policy for system init. Interleaving is only
+ * enabled across suitably sized nodes (default is >= 16MB), or
+ * fall back to the largest node if they're all smaller.
+ */
+ nodes_clear(interleave_nodes);
+ for_each_online_node(nid) {
+ unsigned long total_pages = node_present_pages(nid);
+
+ /* Preserve the largest node */
+ if (largest < total_pages) {
+ largest = total_pages;
+ prefer = nid;
+ }
+
+ /* Interleave this node? */
+ if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+ node_set(nid, interleave_nodes);
+ }
+
+ /* All too small, use the largest */
+ if (unlikely(nodes_empty(interleave_nodes)))
+ node_set(prefer, interleave_nodes);
- if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+ if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
}
diff --git a/mm/mempool.c b/mm/mempool.c
index cc1ca86dfc24..02d5ec3feabc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -62,10 +62,9 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data, int node_id)
{
mempool_t *pool;
- pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
+ pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id);
if (!pool)
return NULL;
- memset(pool, 0, sizeof(*pool));
pool->elements = kmalloc_node(min_nr * sizeof(void *),
GFP_KERNEL, node_id);
if (!pool->elements) {
@@ -263,6 +262,9 @@ void mempool_free(void *element, mempool_t *pool)
{
unsigned long flags;
+ if (unlikely(element == NULL))
+ return;
+
smp_mb();
if (pool->curr_nr < pool->min_nr) {
spin_lock_irqsave(&pool->lock, flags);
diff --git a/mm/migrate.c b/mm/migrate.c
index a91ca00abebe..34d8ada053e4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -761,7 +761,8 @@ static struct page *new_page_node(struct page *p, unsigned long private,
*result = &pm->status;
- return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
+ return alloc_pages_node(pm->node,
+ GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 3446b7ef731e..7b2656055d6a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -10,7 +10,18 @@
#include <linux/mm.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+int can_do_mlock(void)
+{
+ if (capable(CAP_IPC_LOCK))
+ return 1;
+ if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(can_do_mlock);
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, unsigned int newflags)
@@ -233,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user)
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ if (lock_limit == RLIM_INFINITY)
+ allowed = 1;
lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
- if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+ if (!allowed &&
+ locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
goto out;
get_uid(user);
user->locked_shm += locked;
diff --git a/mm/mmap.c b/mm/mmap.c
index 68b9ad2ef1d6..144b4a290f2c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
struct inode *inode;
unsigned int vm_flags;
- int correct_wcount = 0;
int error;
- struct rb_node ** rb_link, * rb_parent;
int accountable = 1;
- unsigned long charged = 0, reqprot = prot;
+ unsigned long reqprot = prot;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1023,10 +1020,28 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
}
}
- error = security_file_mmap(file, reqprot, prot, flags);
+ error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (error)
return error;
-
+
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+ accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long flags,
+ unsigned int vm_flags, unsigned long pgoff,
+ int accountable)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ int correct_wcount = 0;
+ int error;
+ struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = 0;
+ struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
@@ -1175,8 +1190,6 @@ unacct_error:
return error;
}
-EXPORT_SYMBOL(do_mmap_pgoff);
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1536,9 +1549,14 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
+ * Also guard against wrapping around to address 0.
*/
- address += 4 + PAGE_SIZE - 1;
- address &= PAGE_MASK;
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else {
+ anon_vma_unlock(vma);
+ return -ENOMEM;
+ }
error = 0;
/* Somebody else might have raced and expanded it already */
diff --git a/mm/mremap.c b/mm/mremap.c
index 5d4bd4f95b8e..bc7c52efc71b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr,
if ((addr <= new_addr) && (addr+old_len) > new_addr)
goto out;
+ ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
+
ret = do_munmap(mm, new_addr, new_len);
if (ret)
goto out;
@@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr,
new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
vma->vm_pgoff, map_flags);
- ret = new_addr;
- if (new_addr & ~PAGE_MASK)
+ if (new_addr & ~PAGE_MASK) {
+ ret = new_addr;
+ goto out;
+ }
+
+ ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+ if (ret)
goto out;
}
ret = move_vma(vma, addr, old_len, new_len, new_addr);
diff --git a/mm/msync.c b/mm/msync.c
index 358d73cf7b78..144a7570535d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -12,6 +12,7 @@
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/syscalls.h>
+#include <linux/sched.h>
/*
* MS_SYNC syncs the entire file - including mappings.
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b16b00a5b11..8bbbf147a794 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -367,6 +367,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
return find_vma(mm, addr);
}
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return -ENOMEM;
+}
+
/*
* look up the first VMA exactly that exactly matches addr
* - should be called with mm->mmap_sem at least held readlocked
@@ -639,7 +644,7 @@ static int validate_mmap_request(struct file *file,
}
/* allow the security API to have its say */
- ret = security_file_mmap(file, reqprot, prot, flags);
+ ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
if (ret < 0)
return ret;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eec1481ba44f..886ea0d5a136 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg)
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval) {
- mod_timer(&wb_timer,
- jiffies + dirty_writeback_interval);
- } else {
+ if (dirty_writeback_interval)
+ mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+ else
del_timer(&wb_timer);
- }
return 0;
}
@@ -826,6 +824,7 @@ int __set_page_dirty_nobuffers(struct page *page)
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
task_io_account_write(PAGE_CACHE_SIZE);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae96dd844432..e2a10b957f23 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -80,8 +80,9 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
256,
#endif
#ifdef CONFIG_HIGHMEM
- 32
+ 32,
#endif
+ 32,
};
EXPORT_SYMBOL(totalram_pages);
@@ -95,8 +96,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
"Normal",
#ifdef CONFIG_HIGHMEM
- "HighMem"
+ "HighMem",
#endif
+ "Movable",
};
int min_free_kbytes = 1024;
@@ -126,16 +128,28 @@ static unsigned long __meminitdata dma_reserve;
#endif
#endif
- struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
- int __meminitdata nr_nodemap_entries;
- unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
- unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+ static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
+ static int __meminitdata nr_nodemap_entries;
+ static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
- unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
- unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+ static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
+ static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+ unsigned long __initdata required_kernelcore;
+ unsigned long __initdata required_movablecore;
+ unsigned long __initdata zone_movable_pfn[MAX_NUMNODES];
+
+ /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+ int movable_zone;
+ EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#if MAX_NUMNODES > 1
+int nr_node_ids __read_mostly = MAX_NUMNODES;
+EXPORT_SYMBOL(nr_node_ids);
+#endif
+
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
@@ -669,26 +683,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return i;
}
-#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly = MAX_NUMNODES;
-EXPORT_SYMBOL(nr_node_ids);
-
-/*
- * Figure out the number of possible node ids.
- */
-static void __init setup_nr_node_ids(void)
-{
- unsigned int node;
- unsigned int highest = 0;
-
- for_each_node_mask(node, node_possible_map)
- highest = node;
- nr_node_ids = highest + 1;
-}
-#else
-static void __init setup_nr_node_ids(void) {}
-#endif
-
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
@@ -915,11 +909,13 @@ static struct fail_page_alloc_attr {
u32 ignore_gfp_highmem;
u32 ignore_gfp_wait;
+ u32 min_order;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
struct dentry *ignore_gfp_highmem_file;
struct dentry *ignore_gfp_wait_file;
+ struct dentry *min_order_file;
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
@@ -927,6 +923,7 @@ static struct fail_page_alloc_attr {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
.ignore_gfp_highmem = 1,
+ .min_order = 1,
};
static int __init setup_fail_page_alloc(char *str)
@@ -937,6 +934,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc);
static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
+ if (order < fail_page_alloc.min_order)
+ return 0;
if (gfp_mask & __GFP_NOFAIL)
return 0;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
@@ -968,12 +967,17 @@ static int __init fail_page_alloc_debugfs(void)
fail_page_alloc.ignore_gfp_highmem_file =
debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem);
+ fail_page_alloc.min_order_file =
+ debugfs_create_u32("min-order", mode, dir,
+ &fail_page_alloc.min_order);
if (!fail_page_alloc.ignore_gfp_wait_file ||
- !fail_page_alloc.ignore_gfp_highmem_file) {
+ !fail_page_alloc.ignore_gfp_highmem_file ||
+ !fail_page_alloc.min_order_file) {
err = -ENOMEM;
debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+ debugfs_remove(fail_page_alloc.min_order_file);
cleanup_fault_attr_dentries(&fail_page_alloc.attr);
}
@@ -1329,7 +1333,7 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -1366,7 +1370,8 @@ nofail_alloc:
*/
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
- if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+ if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
+ (gfp_mask & __GFP_REPEAT))
do_retry = 1;
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
@@ -1479,13 +1484,14 @@ unsigned int nr_free_buffer_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_USER));
}
+EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
/*
* Amount of free RAM allocatable within all zones
*/
unsigned int nr_free_pagecache_pages(void)
{
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
+ return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}
static inline void show_node(struct zone *zone)
@@ -1636,8 +1642,8 @@ void show_free_areas(void)
*
* Add all populated zones of a node to the zonelist.
*/
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
- struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+ int nr_zones, enum zone_type zone_type)
{
struct zone *zone;
@@ -1656,9 +1662,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
return nr_zones;
}
+
+/*
+ * zonelist_order:
+ * 0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT 0
+#define ZONELIST_ORDER_NODE 1
+#define ZONELIST_ORDER_ZONE 2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
#ifdef CONFIG_NUMA
+/* The value user specified ....changed by config */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN 16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * interface for configure zonelist ordering.
+ * command line option "numa_zonelist_order"
+ * = "[dD]efault - default, automatic configuration.
+ * = "[nN]ode - order by node locality, then by zone within node
+ * = "[zZ]one - order by zone, then by locality within zone
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+ if (*s == 'd' || *s == 'D') {
+ user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+ } else if (*s == 'n' || *s == 'N') {
+ user_zonelist_order = ZONELIST_ORDER_NODE;
+ } else if (*s == 'z' || *s == 'Z') {
+ user_zonelist_order = ZONELIST_ORDER_ZONE;
+ } else {
+ printk(KERN_WARNING
+ "Ignoring invalid numa_zonelist_order value: "
+ "%s\n", s);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+ if (s)
+ return __parse_numa_zonelist_order(s);
+ return 0;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length,
+ loff_t *ppos)
+{
+ char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ int ret;
+
+ if (write)
+ strncpy(saved_string, (char*)table->data,
+ NUMA_ZONELIST_ORDER_LEN);
+ ret = proc_dostring(table, write, file, buffer, length, ppos);
+ if (ret)
+ return ret;
+ if (write) {
+ int oldval = user_zonelist_order;
+ if (__parse_numa_zonelist_order((char*)table->data)) {
+ /*
+ * bogus value. restore saved string
+ */
+ strncpy((char*)table->data, saved_string,
+ NUMA_ZONELIST_ORDER_LEN);
+ user_zonelist_order = oldval;
+ } else if (oldval != user_zonelist_order)
+ build_all_zonelists();
+ }
+ return 0;
+}
+
+
#define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
+
/**
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
@@ -1673,7 +1772,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
* on them otherwise.
* It returns -1 if no node is found.
*/
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
@@ -1719,13 +1818,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
return best_node;
}
-static void __meminit build_zonelists(pg_data_t *pgdat)
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
- int j, node, local_node;
enum zone_type i;
- int prev_node, load;
+ int j;
struct zonelist *zonelist;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ for (j = 0; zonelist->zones[j] != NULL; j++)
+ ;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
+ zonelist->zones[j] = NULL;
+ }
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+ enum zone_type i;
+ int pos, j, node;
+ int zone_type; /* needs to be signed */
+ struct zone *z;
+ struct zonelist *zonelist;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ pos = 0;
+ for (zone_type = i; zone_type >= 0; zone_type--) {
+ for (j = 0; j < nr_nodes; j++) {
+ node = node_order[j];
+ z = &NODE_DATA(node)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ zonelist->zones[pos++] = z;
+ check_highest_zone(zone_type);
+ }
+ }
+ }
+ zonelist->zones[pos] = NULL;
+ }
+}
+
+static int default_zonelist_order(void)
+{
+ int nid, zone_type;
+ unsigned long low_kmem_size,total_size;
+ struct zone *z;
+ int average_size;
+ /*
+ * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+ * If they are really small and used heavily, the system can fall
+ * into OOM very easily.
+ * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+ */
+ /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+ low_kmem_size = 0;
+ total_size = 0;
+ for_each_online_node(nid) {
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ z = &NODE_DATA(nid)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ if (zone_type < ZONE_NORMAL)
+ low_kmem_size += z->present_pages;
+ total_size += z->present_pages;
+ }
+ }
+ }
+ if (!low_kmem_size || /* there are no DMA area. */
+ low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
+ return ZONELIST_ORDER_NODE;
+ /*
+ * look into each node's config.
+ * If there is a node whose DMA/DMA32 memory is very big area on
+ * local memory, NODE_ORDER may be suitable.
+ */
+ average_size = total_size / (num_online_nodes() + 1);
+ for_each_online_node(nid) {
+ low_kmem_size = 0;
+ total_size = 0;
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ z = &NODE_DATA(nid)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ if (zone_type < ZONE_NORMAL)
+ low_kmem_size += z->present_pages;
+ total_size += z->present_pages;
+ }
+ }
+ if (low_kmem_size &&
+ total_size > average_size && /* ignore small node */
+ low_kmem_size > total_size * 70/100)
+ return ZONELIST_ORDER_NODE;
+ }
+ return ZONELIST_ORDER_ZONE;
+}
+
+static void set_zonelist_order(void)
+{
+ if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+ current_zonelist_order = default_zonelist_order();
+ else
+ current_zonelist_order = user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+ int j, node, load;
+ enum zone_type i;
nodemask_t used_mask;
+ int local_node, prev_node;
+ struct zonelist *zonelist;
+ int order = current_zonelist_order;
/* initialize zonelists */
for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1738,6 +1953,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
load = num_online_nodes();
prev_node = local_node;
nodes_clear(used_mask);
+
+ memset(node_load, 0, sizeof(node_load));
+ memset(node_order, 0, sizeof(node_order));
+ j = 0;
+
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
int distance = node_distance(local_node, node);
@@ -1753,23 +1973,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
-
if (distance != node_distance(local_node, prev_node))
- node_load[node] += load;
+ node_load[node] = load;
+
prev_node = node;
load--;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + i;
- for (j = 0; zonelist->zones[j] != NULL; j++);
+ if (order == ZONELIST_ORDER_NODE)
+ build_zonelists_in_node_order(pgdat, node);
+ else
+ node_order[j++] = node; /* remember order */
+ }
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- zonelist->zones[j] = NULL;
- }
+ if (order == ZONELIST_ORDER_ZONE) {
+ /* calculate node order -- i.e., DMA last! */
+ build_zonelists_in_zone_order(pgdat, j);
}
}
/* Construct the zonelist performance cache - see further mmzone.h */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
{
int i;
@@ -1786,9 +2008,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
}
}
+
#else /* CONFIG_NUMA */
-static void __meminit build_zonelists(pg_data_t *pgdat)
+static void set_zonelist_order(void)
+{
+ current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
enum zone_type i,j;
@@ -1824,7 +2052,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
}
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
{
int i;
@@ -1835,7 +2063,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
#endif /* CONFIG_NUMA */
/* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
{
int nid;
@@ -1846,8 +2074,10 @@ static int __meminit __build_all_zonelists(void *dummy)
return 0;
}
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
{
+ set_zonelist_order();
+
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
cpuset_init_current_mems_allowed();
@@ -1858,8 +2088,13 @@ void __meminit build_all_zonelists(void)
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
- printk("Built %i zonelists. Total pages: %ld\n",
- num_online_nodes(), vm_total_pages);
+ printk("Built %i zonelists in %s order. Total pages: %ld\n",
+ num_online_nodes(),
+ zonelist_order_name[current_zonelist_order],
+ vm_total_pages);
+#ifdef CONFIG_NUMA
+ printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
}
/*
@@ -1968,8 +2203,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
}
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
- unsigned long size)
+static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
+ struct zone *zone, unsigned long size)
{
int order;
for (order = 0; order < MAX_ORDER ; order++) {
@@ -1983,7 +2218,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __cpuinit zone_batchsize(struct zone *zone)
+static int __devinit zone_batchsize(struct zone *zone)
{
int batch;
@@ -2165,7 +2400,7 @@ void __init setup_per_cpu_pageset(void)
#endif
-static __meminit noinline
+static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
@@ -2385,7 +2620,7 @@ void __init push_node_boundaries(unsigned int nid,
}
/* If necessary, push the node boundary out for reserve hotadd */
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
@@ -2405,7 +2640,7 @@ static void __init account_node_boundary(unsigned int nid,
void __init push_node_boundaries(unsigned int nid,
unsigned long start_pfn, unsigned long end_pfn) {}
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn) {}
#endif
@@ -2443,10 +2678,67 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
}
/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered in monotonic
+ * increasing memory addresses so that the "highest" populated zone is used
+ */
+void __init find_usable_zone_for_movable(void)
+{
+ int zone_index;
+ for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+ if (zone_index == ZONE_MOVABLE)
+ continue;
+
+ if (arch_zone_highest_possible_pfn[zone_index] >
+ arch_zone_lowest_possible_pfn[zone_index])
+ break;
+ }
+
+ VM_BUG_ON(zone_index == -1);
+ movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independant of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonic increases memory addresses
+ */
+void __meminit adjust_zone_range_for_zone_movable(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ /* Only adjust if ZONE_MOVABLE is on this node */
+ if (zone_movable_pfn[nid]) {
+ /* Size ZONE_MOVABLE */
+ if (zone_type == ZONE_MOVABLE) {
+ *zone_start_pfn = zone_movable_pfn[nid];
+ *zone_end_pfn = min(node_end_pfn,
+ arch_zone_highest_possible_pfn[movable_zone]);
+
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
+ /* Check if this whole range is within ZONE_MOVABLE */
+ } else if (*zone_start_pfn >= zone_movable_pfn[nid])
+ *zone_start_pfn = *zone_end_pfn;
+ }
+}
+
+/*
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
-unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *ignored)
{
@@ -2457,6 +2749,9 @@ unsigned long __meminit zone_spanned_pages_in_node(int nid,
get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
/* Check that this node has pages within the zone's required range */
if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
@@ -2534,7 +2829,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
}
/* Return the number of page frames in holes in a zone on a node */
-unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *ignored)
{
@@ -2547,18 +2842,21 @@ unsigned long __meminit zone_absent_pages_in_node(int nid,
zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
node_end_pfn);
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
#else
-static inline unsigned long zone_spanned_pages_in_node(int nid,
+static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *zones_size)
{
return zones_size[zone_type];
}
-static inline unsigned long zone_absent_pages_in_node(int nid,
+static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *zholes_size)
{
@@ -2678,7 +2976,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
}
}
-static void __meminit alloc_node_mem_map(struct pglist_data *pgdat)
+static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
@@ -2704,7 +3002,7 @@ static void __meminit alloc_node_mem_map(struct pglist_data *pgdat)
map = alloc_bootmem_node(pgdat, size);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
-#ifdef CONFIG_FLATMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
@@ -2733,6 +3031,26 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
}
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+
+#if MAX_NUMNODES > 1
+/*
+ * Figure out the number of possible node ids.
+ */
+static void __init setup_nr_node_ids(void)
+{
+ unsigned int node;
+ unsigned int highest = 0;
+
+ for_each_node_mask(node, node_possible_map)
+ highest = node;
+ nr_node_ids = highest + 1;
+}
+#else
+static inline void setup_nr_node_ids(void)
+{
+}
+#endif
+
/**
* add_active_range - Register a range of PFNs backed by physical memory
* @nid: The node ID the range resides on
@@ -2904,6 +3222,157 @@ unsigned long __init find_max_pfn_with_active_regions(void)
return max_pfn;
}
+unsigned long __init early_calculate_totalpages(void)
+{
+ int i;
+ unsigned long totalpages = 0;
+
+ for (i = 0; i < nr_nodemap_entries; i++)
+ totalpages += early_node_map[i].end_pfn -
+ early_node_map[i].start_pfn;
+
+ return totalpages;
+}
+
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ */
+void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+{
+ int i, nid;
+ unsigned long usable_startpfn;
+ unsigned long kernelcore_node, kernelcore_remaining;
+ int usable_nodes = num_online_nodes();
+
+ /*
+ * If movablecore was specified, calculate what size of
+ * kernelcore that corresponds so that memory usable for
+ * any allocation type is evenly spread. If both kernelcore
+ * and movablecore are specified, then the value of kernelcore
+ * will be used for required_kernelcore if it's greater than
+ * what movablecore would have allowed.
+ */
+ if (required_movablecore) {
+ unsigned long totalpages = early_calculate_totalpages();
+ unsigned long corepages;
+
+ /*
+ * Round-up so that ZONE_MOVABLE is at least as large as what
+ * was requested by the user
+ */
+ required_movablecore =
+ roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ corepages = totalpages - required_movablecore;
+
+ required_kernelcore = max(required_kernelcore, corepages);
+ }
+
+ /* If kernelcore was not specified, there is no ZONE_MOVABLE */
+ if (!required_kernelcore)
+ return;
+
+ /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+ find_usable_zone_for_movable();
+ usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+ /* Spread kernelcore memory as evenly as possible throughout nodes */
+ kernelcore_node = required_kernelcore / usable_nodes;
+ for_each_online_node(nid) {
+ /*
+ * Recalculate kernelcore_node if the division per node
+ * now exceeds what is necessary to satisfy the requested
+ * amount of memory for the kernel
+ */
+ if (required_kernelcore < kernelcore_node)
+ kernelcore_node = required_kernelcore / usable_nodes;
+
+ /*
+ * As the map is walked, we track how much memory is usable
+ * by the kernel using kernelcore_remaining. When it is
+ * 0, the rest of the node is usable by ZONE_MOVABLE
+ */
+ kernelcore_remaining = kernelcore_node;
+
+ /* Go through each range of PFNs within this node */
+ for_each_active_range_index_in_nid(i, nid) {
+ unsigned long start_pfn, end_pfn;
+ unsigned long size_pages;
+
+ start_pfn = max(early_node_map[i].start_pfn,
+ zone_movable_pfn[nid]);
+ end_pfn = early_node_map[i].end_pfn;
+ if (start_pfn >= end_pfn)
+ continue;
+
+ /* Account for what is only usable for kernelcore */
+ if (start_pfn < usable_startpfn) {
+ unsigned long kernel_pages;
+ kernel_pages = min(end_pfn, usable_startpfn)
+ - start_pfn;
+
+ kernelcore_remaining -= min(kernel_pages,
+ kernelcore_remaining);
+ required_kernelcore -= min(kernel_pages,
+ required_kernelcore);
+
+ /* Continue if range is now fully accounted */
+ if (end_pfn <= usable_startpfn) {
+
+ /*
+ * Push zone_movable_pfn to the end so
+ * that if we have to rebalance
+ * kernelcore across nodes, we will
+ * not double account here
+ */
+ zone_movable_pfn[nid] = end_pfn;
+ continue;
+ }
+ start_pfn = usable_startpfn;
+ }
+
+ /*
+ * The usable PFN range for ZONE_MOVABLE is from
+ * start_pfn->end_pfn. Calculate size_pages as the
+ * number of pages used as kernelcore
+ */
+ size_pages = end_pfn - start_pfn;
+ if (size_pages > kernelcore_remaining)
+ size_pages = kernelcore_remaining;
+ zone_movable_pfn[nid] = start_pfn + size_pages;
+
+ /*
+ * Some kernelcore has been met, update counts and
+ * break if the kernelcore for this node has been
+ * satisified
+ */
+ required_kernelcore -= min(required_kernelcore,
+ size_pages);
+ kernelcore_remaining -= size_pages;
+ if (!kernelcore_remaining)
+ break;
+ }
+ }
+
+ /*
+ * If there is still required_kernelcore, we do another pass with one
+ * less node in the count. This will push zone_movable_pfn[nid] further
+ * along on the nodes that still have memory until kernelcore is
+ * satisified
+ */
+ usable_nodes--;
+ if (usable_nodes && required_kernelcore > usable_nodes)
+ goto restart;
+
+ /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ zone_movable_pfn[nid] =
+ roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+}
+
/**
* free_area_init_nodes - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
@@ -2933,19 +3402,37 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
+ arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
+ arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
+
+ /* Find the PFNs that ZONE_MOVABLE begins at in each node */
+ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ find_zone_movable_pfns_for_nodes(zone_movable_pfn);
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
- for (i = 0; i < MAX_NR_ZONES; i++)
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
printk(" %-8s %8lu -> %8lu\n",
zone_names[i],
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
+ }
+
+ /* Print out the PFNs ZONE_MOVABLE begins at in each node */
+ printk("Movable zone start PFN for each node\n");
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (zone_movable_pfn[i])
+ printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
+ }
/* Print out the early_node_map[] */
printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
@@ -2962,6 +3449,43 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
find_min_pfn_for_node(nid), NULL);
}
}
+
+static int __init cmdline_parse_core(char *p, unsigned long *core)
+{
+ unsigned long long coremem;
+ if (!p)
+ return -EINVAL;
+
+ coremem = memparse(p, &p);
+ *core = coremem >> PAGE_SHIFT;
+
+ /* Paranoid check that UL is enough for the coremem value */
+ WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+ return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+ return cmdline_parse_core(p, &required_kernelcore);
+}
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+ return cmdline_parse_core(p, &required_movablecore);
+}
+
+early_param("kernelcore", cmdline_parse_kernelcore);
+early_param("movablecore", cmdline_parse_movablecore);
+
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
/**
@@ -3350,13 +3874,28 @@ void *__init alloc_large_system_hash(const char *tablename,
for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
;
table = (void*) __get_free_pages(GFP_ATOMIC, order);
+ /*
+ * If bucketsize is not a power-of-two, we may free
+ * some pages at the end of hash table.
+ */
+ if (table) {
+ unsigned long alloc_end = (unsigned long)table +
+ (PAGE_SIZE << order);
+ unsigned long used = (unsigned long)table +
+ PAGE_ALIGN(size);
+ split_page(virt_to_page(table), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+ printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
tablename,
(1U << log2qty),
ilog2(size) - PAGE_SHIFT,
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 8ce0900dc95c..8f6ee073c0e3 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -92,6 +92,7 @@ struct pdflush_work {
static int __pdflush(struct pdflush_work *my_work)
{
current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
my_work->fn = NULL;
my_work->who = current;
INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/rmap.c b/mm/rmap.c
index 850165d32b7a..61e492597a0b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,24 +53,6 @@
struct kmem_cache *anon_vma_cachep;
-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
-{
-#ifdef CONFIG_DEBUG_VM
- struct anon_vma *anon_vma = find_vma->anon_vma;
- struct vm_area_struct *vma;
- unsigned int mapcount = 0;
- int found = 0;
-
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- mapcount++;
- BUG_ON(mapcount > 100000);
- if (vma == find_vma)
- found = 1;
- }
- BUG_ON(!found);
-#endif
-}
-
/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma) {
+ if (anon_vma)
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
- }
}
void anon_vma_link(struct vm_area_struct *vma)
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma)
if (anon_vma) {
spin_lock(&anon_vma->lock);
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
spin_unlock(&anon_vma->lock);
}
}
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma)
return;
spin_lock(&anon_vma->lock);
- validate_anon_vma(vma);
list_del(&vma->anon_vma_node);
/* We must garbage collect the anon_vma if it's empty */
diff --git a/mm/shmem.c b/mm/shmem.c
index e537317bec4d..96fa79fb6ad3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -27,6 +27,7 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/xattr.h>
+#include <linux/exportfs.h>
#include <linux/generic_acl.h>
#include <linux/mm.h>
#include <linux/mman.h>
@@ -93,8 +94,11 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
* The above definition of ENTRIES_PER_PAGE, and the use of
* BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
* might be reconsidered if it ever diverges from PAGE_SIZE.
+ *
+ * __GFP_MOVABLE is masked out as swap vectors cannot move
*/
- return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+ return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
}
static inline void shmem_dir_free(struct page *page)
@@ -372,7 +376,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
}
spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
if (page)
set_page_private(page, 0);
spin_lock(&info->lock);
@@ -967,6 +971,8 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
*nodelist++ = '\0';
if (nodelist_parse(nodelist, *policy_nodes))
goto out;
+ if (!nodes_subset(*policy_nodes, node_online_map))
+ goto out;
}
if (!strcmp(value, "default")) {
*policy = MPOL_DEFAULT;
@@ -1098,9 +1104,9 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
* Normally, filepage is NULL on entry, and either found
* uptodate immediately, or allocated and zeroed, or read
* in under swappage, which is then assigned to filepage.
- * But shmem_prepare_write passes in a locked filepage,
- * which may be found not uptodate by other callers too,
- * and may need to be copied from the swappage read in.
+ * But shmem_readpage and shmem_prepare_write pass in a locked
+ * filepage, which may be found not uptodate by other callers
+ * too, and may need to be copied from the swappage read in.
*/
repeat:
if (!filepage)
@@ -1483,9 +1489,18 @@ static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_symlink_inline_operations;
/*
- * Normally tmpfs makes no use of shmem_prepare_write, but it
- * lets a tmpfs file be used read-write below the loop driver.
+ * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write;
+ * but providing them allows a tmpfs file to be used for splice, sendfile, and
+ * below the loop driver, in the generic fashion that many filesystems support.
*/
+static int shmem_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
+ unlock_page(page);
+ return error;
+}
+
static int
shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
@@ -1709,25 +1724,6 @@ static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count
return desc.error;
}
-static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void *target)
-{
- read_descriptor_t desc;
-
- if (!count)
- return 0;
-
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
-
- do_shmem_file_read(in_file, ppos, &desc, actor);
- if (desc.written)
- return desc.written;
- return desc.error;
-}
-
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2384,6 +2380,7 @@ static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
+ .readpage = shmem_readpage,
.prepare_write = shmem_prepare_write,
.commit_write = simple_commit_write,
#endif
@@ -2397,7 +2394,8 @@ static const struct file_operations shmem_file_operations = {
.read = shmem_file_read,
.write = shmem_file_write,
.fsync = simple_sync_file,
- .sendfile = shmem_file_sendfile,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
#endif
};
diff --git a/mm/slab.c b/mm/slab.c
index 528243e15cc8..96d30ee256ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -137,6 +137,7 @@
/* Shouldn't this be in a header file somewhere? */
#define BYTES_PER_WORD sizeof(void *)
+#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
#ifndef cache_line_size
#define cache_line_size() L1_CACHE_BYTES
@@ -547,7 +548,7 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
if (cachep->flags & SLAB_STORE_USER)
return (unsigned long long *)(objp + cachep->buffer_size -
sizeof(unsigned long long) -
- BYTES_PER_WORD);
+ REDZONE_ALIGN);
return (unsigned long long *) (objp + cachep->buffer_size -
sizeof(unsigned long long));
}
@@ -774,7 +775,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
*/
BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
- WARN_ON_ONCE(size == 0);
+ if (!size)
+ return ZERO_SIZE_PTR;
+
while (size > csizep->cs_size)
csizep++;
@@ -929,7 +932,7 @@ static void next_reap_node(void)
* the CPUs getting into lockstep and contending for the global cache chain
* lock.
*/
-static void __devinit start_cpu_timer(int cpu)
+static void __cpuinit start_cpu_timer(int cpu)
{
struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
@@ -2037,7 +2040,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
return left_over;
}
-static int setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
{
if (g_cpucache_up == FULL)
return enable_cpucache(cachep);
@@ -2179,7 +2182,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* above the next power of two: caches with object sizes just above a
* power of two have a significant amount of internal fragmentation.
*/
- if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
+ if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
+ 2 * sizeof(unsigned long long)))
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
@@ -2220,12 +2224,20 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
/*
- * Redzoning and user store require word alignment. Note this will be
- * overridden by architecture or caller mandated alignment if either
- * is greater than BYTES_PER_WORD.
+ * Redzoning and user store require word alignment or possibly larger.
+ * Note this will be overridden by architecture or caller mandated
+ * alignment if either is greater than BYTES_PER_WORD.
*/
- if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
- ralign = __alignof__(unsigned long long);
+ if (flags & SLAB_STORE_USER)
+ ralign = BYTES_PER_WORD;
+
+ if (flags & SLAB_RED_ZONE) {
+ ralign = REDZONE_ALIGN;
+ /* If redzoning, ensure that the second redzone is suitably
+ * aligned, by adjusting the object size accordingly. */
+ size += REDZONE_ALIGN - 1;
+ size &= ~(REDZONE_ALIGN - 1);
+ }
/* 2) arch mandated alignment */
if (ralign < ARCH_SLAB_MINALIGN) {
@@ -2262,9 +2274,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
- * the real object.
+ * the real object. But if the second red zone needs to be
+ * aligned to 64 bits, we must allow that much space.
*/
- size += BYTES_PER_WORD;
+ if (flags & SLAB_RED_ZONE)
+ size += REDZONE_ALIGN;
+ else
+ size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
@@ -2338,7 +2354,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
- BUG_ON(!cachep->slabp_cache);
+ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
cachep->ctor = ctor;
cachep->name = name;
@@ -2730,7 +2746,7 @@ static int cache_grow(struct kmem_cache *cachep,
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
+ BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK));
local_flags = (flags & GFP_LEVEL_MASK);
/* Take the l3 list lock to change the colour_next on this node */
@@ -3376,6 +3392,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+ if (unlikely((flags & __GFP_ZERO) && ptr))
+ memset(ptr, 0, obj_size(cachep));
+
return ptr;
}
@@ -3427,6 +3446,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
+ if (unlikely((flags & __GFP_ZERO) && objp))
+ memset(objp, 0, obj_size(cachep));
+
return objp;
}
@@ -3539,7 +3561,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
check_irq_off();
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
- if (use_alien_caches && cache_free_alien(cachep, objp))
+ if (cache_free_alien(cachep, objp))
return;
if (likely(ac->avail < ac->limit)) {
@@ -3568,23 +3590,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
/**
- * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
- * @cache: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache and set the allocated memory to zero.
- * The flags are only relevant if the cache has no available objects.
- */
-void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
-{
- void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
- if (ret)
- memset(ret, 0, obj_size(cache));
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
-/**
* kmem_ptr_validate - check if an untrusted pointer might
* be a slab entry.
* @cachep: the cache we're checking against
@@ -3640,8 +3645,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
struct kmem_cache *cachep;
cachep = kmem_find_general_cachep(size, flags);
- if (unlikely(cachep == NULL))
- return NULL;
+ if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+ return cachep;
return kmem_cache_alloc_node(cachep, flags, node);
}
@@ -3713,52 +3718,6 @@ EXPORT_SYMBOL(__kmalloc);
#endif
/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
- */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- struct kmem_cache *cache, *new_cache;
- void *ret;
-
- if (unlikely(!p))
- return kmalloc_track_caller(new_size, flags);
-
- if (unlikely(!new_size)) {
- kfree(p);
- return NULL;
- }
-
- cache = virt_to_cache(p);
- new_cache = __find_general_cachep(new_size, flags);
-
- /*
- * If new size fits in the current cache, bail out.
- */
- if (likely(cache == new_cache))
- return (void *)p;
-
- /*
- * We are on the slow-path here so do not use __cache_alloc
- * because it bloats kernel text.
- */
- ret = kmalloc_track_caller(new_size, flags);
- if (ret) {
- memcpy(ret, p, min(new_size, ksize(p)));
- kfree(p);
- }
- return ret;
-}
-EXPORT_SYMBOL(krealloc);
-
-/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
* @objp: The previously allocated object.
@@ -3793,7 +3752,7 @@ void kfree(const void *objp)
struct kmem_cache *c;
unsigned long flags;
- if (unlikely(!objp))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)))
return;
local_irq_save(flags);
kfree_debugcheck(objp);
@@ -4144,26 +4103,17 @@ static void print_slabinfo_header(struct seq_file *m)
static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
- struct list_head *p;
mutex_lock(&cache_chain_mutex);
if (!n)
print_slabinfo_header(m);
- p = cache_chain.next;
- while (n--) {
- p = p->next;
- if (p == &cache_chain)
- return NULL;
- }
- return list_entry(p, struct kmem_cache, next);
+
+ return seq_list_start(&cache_chain, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct kmem_cache *cachep = p;
- ++*pos;
- return cachep->next.next == &cache_chain ?
- NULL : list_entry(cachep->next.next, struct kmem_cache, next);
+ return seq_list_next(p, &cache_chain, pos);
}
static void s_stop(struct seq_file *m, void *p)
@@ -4173,7 +4123,7 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = p;
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
@@ -4342,17 +4292,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
static void *leaks_start(struct seq_file *m, loff_t *pos)
{
- loff_t n = *pos;
- struct list_head *p;
-
mutex_lock(&cache_chain_mutex);
- p = cache_chain.next;
- while (n--) {
- p = p->next;
- if (p == &cache_chain)
- return NULL;
- }
- return list_entry(p, struct kmem_cache, next);
+ return seq_list_start(&cache_chain, *pos);
}
static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4403,7 +4344,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
unsigned long offset, size;
- char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1];
+ char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
@@ -4417,7 +4358,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = p;
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
struct slab *slabp;
struct kmem_list3 *l3;
const char *name;
@@ -4498,7 +4439,7 @@ const struct seq_operations slabstats_op = {
*/
size_t ksize(const void *objp)
{
- if (unlikely(objp == NULL))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)))
return 0;
return obj_size(virt_to_cache(objp));
diff --git a/mm/slob.c b/mm/slob.c
index 71976c5d40d3..c89ef116d7aa 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,57 +3,159 @@
*
* Matt Mackall <mpm@selenic.com> 12/30/03
*
+ * NUMA support by Paul Mundt, 2007.
+ *
* How SLOB works:
*
* The core of SLOB is a traditional K&R style heap allocator, with
* support for returning aligned objects. The granularity of this
- * allocator is 8 bytes on x86, though it's perhaps possible to reduce
- * this to 4 if it's deemed worth the effort. The slob heap is a
- * singly-linked list of pages from __get_free_page, grown on demand
- * and allocation from the heap is currently first-fit.
+ * allocator is as little as 2 bytes, however typically most architectures
+ * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
+ *
+ * The slob heap is a linked list of pages from alloc_pages(), and
+ * within each page, there is a singly-linked list of free blocks (slob_t).
+ * The heap is grown on demand and allocation from the heap is currently
+ * first-fit.
*
* Above this is an implementation of kmalloc/kfree. Blocks returned
- * from kmalloc are 8-byte aligned and prepended with a 8-byte header.
+ * from kmalloc are prepended with a 4-byte header with the kmalloc size.
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
- * __get_free_pages directly so that it can return page-aligned blocks
- * and keeps a linked list of such pages and their orders. These
- * objects are detected in kfree() by their page alignment.
+ * alloc_pages() directly, allocating compound pages so the page order
+ * does not have to be separately tracked, and also stores the exact
+ * allocation size in page->private so that it can be used to accurately
+ * provide ksize(). These objects are detected in kfree() because slob_page()
+ * is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
- * destructors for every SLAB allocation. Objects are returned with
- * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is
- * set, in which case the low-level allocator will fragment blocks to
- * create the proper alignment. Again, objects of page-size or greater
- * are allocated by calling __get_free_pages. As SLAB objects know
- * their size, no separate size bookkeeping is necessary and there is
- * essentially no allocation space overhead.
+ * destructors for every SLAB allocation. Objects are returned with the
+ * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
+ * case the low-level allocator will fragment blocks to create the proper
+ * alignment. Again, objects of page-size or greater are allocated by
+ * calling alloc_pages(). As SLAB objects know their size, no separate
+ * size bookkeeping is necessary and there is essentially no allocation
+ * space overhead, and compound pages aren't needed for multi-page
+ * allocations.
+ *
+ * NUMA support in SLOB is fairly simplistic, pushing most of the real
+ * logic down to the page allocator, and simply doing the node accounting
+ * on the upper levels. In the event that a node id is explicitly
+ * provided, alloc_pages_node() with the specified node id is used
+ * instead. The common case (or when the node id isn't explicitly provided)
+ * will default to the current node, as per numa_node_id().
+ *
+ * Node aware pages are still inserted in to the global freelist, and
+ * these are scanned for by matching against the node id encoded in the
+ * page flags. As a result, block allocations that can be satisfied from
+ * the freelist will only be done so on pages residing on the same node,
+ * in order to prevent random node placement.
*/
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/timer.h>
#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+/*
+ * slob_block has a field 'units', which indicates size of block if +ve,
+ * or offset of next block if -ve (in SLOB_UNITs).
+ *
+ * Free blocks of size 1 unit simply contain the offset of the next block.
+ * Those with larger size contain their size in the first SLOB_UNIT of
+ * memory, and the offset of the next free block in the second SLOB_UNIT.
+ */
+#if PAGE_SIZE <= (32767 * 2)
+typedef s16 slobidx_t;
+#else
+typedef s32 slobidx_t;
+#endif
struct slob_block {
- int units;
- struct slob_block *next;
+ slobidx_t units;
};
typedef struct slob_block slob_t;
+/*
+ * We use struct page fields to manage some slob allocation aspects,
+ * however to avoid the horrible mess in include/linux/mm_types.h, we'll
+ * just define our own struct page type variant here.
+ */
+struct slob_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ slobidx_t units; /* free units left in page */
+ unsigned long pad[2];
+ slob_t *free; /* first free slob_t in page */
+ struct list_head list; /* linked list of free pages */
+ };
+ struct page page;
+ };
+};
+static inline void struct_slob_page_wrong_size(void)
+{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
+
+/*
+ * free_slob_page: call before a slob_page is returned to the page allocator.
+ */
+static inline void free_slob_page(struct slob_page *sp)
+{
+ reset_page_mapcount(&sp->page);
+ sp->page.mapping = NULL;
+}
+
+/*
+ * All (partially) free slob pages go on this list.
+ */
+static LIST_HEAD(free_slob_pages);
+
+/*
+ * slob_page: True for all slob pages (false for bigblock pages)
+ */
+static inline int slob_page(struct slob_page *sp)
+{
+ return test_bit(PG_active, &sp->flags);
+}
+
+static inline void set_slob_page(struct slob_page *sp)
+{
+ __set_bit(PG_active, &sp->flags);
+}
+
+static inline void clear_slob_page(struct slob_page *sp)
+{
+ __clear_bit(PG_active, &sp->flags);
+}
+
+/*
+ * slob_page_free: true for pages on free_slob_pages list.
+ */
+static inline int slob_page_free(struct slob_page *sp)
+{
+ return test_bit(PG_private, &sp->flags);
+}
+
+static inline void set_slob_page_free(struct slob_page *sp)
+{
+ list_add(&sp->list, &free_slob_pages);
+ __set_bit(PG_private, &sp->flags);
+}
+
+static inline void clear_slob_page_free(struct slob_page *sp)
+{
+ list_del(&sp->list);
+ __clear_bit(PG_private, &sp->flags);
+}
+
#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
#define SLOB_ALIGN L1_CACHE_BYTES
-struct bigblock {
- int order;
- void *pages;
- struct bigblock *next;
-};
-typedef struct bigblock bigblock_t;
-
/*
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
* were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
@@ -64,215 +166,321 @@ struct slob_rcu {
int size;
};
-static slob_t arena = { .next = &arena, .units = 1 };
-static slob_t *slobfree = &arena;
-static bigblock_t *bigblocks;
+/*
+ * slob_lock protects all slob allocator structures.
+ */
static DEFINE_SPINLOCK(slob_lock);
-static DEFINE_SPINLOCK(block_lock);
-static void slob_free(void *b, int size);
-static void slob_timer_cbk(void);
+/*
+ * Encode the given size and next info into a free slob block s.
+ */
+static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t offset = next - base;
+
+ if (size > 1) {
+ s[0].units = size;
+ s[1].units = offset;
+ } else
+ s[0].units = -offset;
+}
+
+/*
+ * Return the size of a slob block.
+ */
+static slobidx_t slob_units(slob_t *s)
+{
+ if (s->units > 0)
+ return s->units;
+ return 1;
+}
+
+/*
+ * Return the next free slob block pointer after this one.
+ */
+static slob_t *slob_next(slob_t *s)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t next;
+
+ if (s[0].units < 0)
+ next = -s[0].units;
+ else
+ next = s[1].units;
+ return base+next;
+}
+/*
+ * Returns true if s is the last free block in its page.
+ */
+static int slob_last(slob_t *s)
+{
+ return !((unsigned long)slob_next(s) & ~PAGE_MASK);
+}
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_new_page(gfp_t gfp, int order, int node)
+{
+ void *page;
+
+#ifdef CONFIG_NUMA
+ if (node != -1)
+ page = alloc_pages_node(node, gfp, order);
+ else
+#endif
+ page = alloc_pages(gfp, order);
+
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+/*
+ * Allocate a slob block within a given slob_page sp.
+ */
+static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
{
slob_t *prev, *cur, *aligned = 0;
int delta = 0, units = SLOB_UNITS(size);
- unsigned long flags;
- spin_lock_irqsave(&slob_lock, flags);
- prev = slobfree;
- for (cur = prev->next; ; prev = cur, cur = cur->next) {
+ for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
+ slobidx_t avail = slob_units(cur);
+
if (align) {
aligned = (slob_t *)ALIGN((unsigned long)cur, align);
delta = aligned - cur;
}
- if (cur->units >= units + delta) { /* room enough? */
+ if (avail >= units + delta) { /* room enough? */
+ slob_t *next;
+
if (delta) { /* need to fragment head to align? */
- aligned->units = cur->units - delta;
- aligned->next = cur->next;
- cur->next = aligned;
- cur->units = delta;
+ next = slob_next(cur);
+ set_slob(aligned, avail - delta, next);
+ set_slob(cur, delta, aligned);
prev = cur;
cur = aligned;
+ avail = slob_units(cur);
}
- if (cur->units == units) /* exact fit? */
- prev->next = cur->next; /* unlink */
- else { /* fragment */
- prev->next = cur + units;
- prev->next->units = cur->units - units;
- prev->next->next = cur->next;
- cur->units = units;
+ next = slob_next(cur);
+ if (avail == units) { /* exact fit? unlink. */
+ if (prev)
+ set_slob(prev, slob_units(prev), next);
+ else
+ sp->free = next;
+ } else { /* fragment */
+ if (prev)
+ set_slob(prev, slob_units(prev), cur + units);
+ else
+ sp->free = cur + units;
+ set_slob(cur + units, avail - units, next);
}
- slobfree = prev;
- spin_unlock_irqrestore(&slob_lock, flags);
+ sp->units -= units;
+ if (!sp->units)
+ clear_slob_page_free(sp);
return cur;
}
- if (cur == slobfree) {
- spin_unlock_irqrestore(&slob_lock, flags);
-
- if (size == PAGE_SIZE) /* trying to shrink arena? */
- return 0;
+ if (slob_last(cur))
+ return NULL;
+ }
+}
- cur = (slob_t *)__get_free_page(gfp);
- if (!cur)
- return 0;
+/*
+ * slob_alloc: entry point into the slob allocator.
+ */
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+{
+ struct slob_page *sp;
+ slob_t *b = NULL;
+ unsigned long flags;
- slob_free(cur, PAGE_SIZE);
- spin_lock_irqsave(&slob_lock, flags);
- cur = slobfree;
+ spin_lock_irqsave(&slob_lock, flags);
+ /* Iterate through each partially free page, try to find room */
+ list_for_each_entry(sp, &free_slob_pages, list) {
+#ifdef CONFIG_NUMA
+ /*
+ * If there's a node specification, search for a partial
+ * page with a matching node id in the freelist.
+ */
+ if (node != -1 && page_to_nid(&sp->page) != node)
+ continue;
+#endif
+
+ if (sp->units >= SLOB_UNITS(size)) {
+ b = slob_page_alloc(sp, size, align);
+ if (b)
+ break;
}
}
+ spin_unlock_irqrestore(&slob_lock, flags);
+
+ /* Not enough space: must allocate a new page */
+ if (!b) {
+ b = slob_new_page(gfp, 0, node);
+ if (!b)
+ return 0;
+ sp = (struct slob_page *)virt_to_page(b);
+ set_slob_page(sp);
+
+ spin_lock_irqsave(&slob_lock, flags);
+ sp->units = SLOB_UNITS(PAGE_SIZE);
+ sp->free = b;
+ INIT_LIST_HEAD(&sp->list);
+ set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
+ set_slob_page_free(sp);
+ b = slob_page_alloc(sp, size, align);
+ BUG_ON(!b);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ }
+ if (unlikely((gfp & __GFP_ZERO) && b))
+ memset(b, 0, size);
+ return b;
}
+/*
+ * slob_free: entry point into the slob allocator.
+ */
static void slob_free(void *block, int size)
{
- slob_t *cur, *b = (slob_t *)block;
+ struct slob_page *sp;
+ slob_t *prev, *next, *b = (slob_t *)block;
+ slobidx_t units;
unsigned long flags;
- if (!block)
+ if (ZERO_OR_NULL_PTR(block))
return;
+ BUG_ON(!size);
- if (size)
- b->units = SLOB_UNITS(size);
+ sp = (struct slob_page *)virt_to_page(block);
+ units = SLOB_UNITS(size);
- /* Find reinsertion point */
spin_lock_irqsave(&slob_lock, flags);
- for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
- if (cur >= cur->next && (b > cur || b < cur->next))
- break;
-
- if (b + b->units == cur->next) {
- b->units += cur->next->units;
- b->next = cur->next->next;
- } else
- b->next = cur->next;
-
- if (cur + cur->units == b) {
- cur->units += b->units;
- cur->next = b->next;
- } else
- cur->next = b;
- slobfree = cur;
-
- spin_unlock_irqrestore(&slob_lock, flags);
-}
-
-void *__kmalloc(size_t size, gfp_t gfp)
-{
- slob_t *m;
- bigblock_t *bb;
- unsigned long flags;
+ if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
+ /* Go directly to page allocator. Do not pass slob allocator */
+ if (slob_page_free(sp))
+ clear_slob_page_free(sp);
+ clear_slob_page(sp);
+ free_slob_page(sp);
+ free_page((unsigned long)b);
+ goto out;
+ }
- if (size < PAGE_SIZE - SLOB_UNIT) {
- m = slob_alloc(size + SLOB_UNIT, gfp, 0);
- return m ? (void *)(m + 1) : 0;
+ if (!slob_page_free(sp)) {
+ /* This slob page is about to become partially free. Easy! */
+ sp->units = units;
+ sp->free = b;
+ set_slob(b, units,
+ (void *)((unsigned long)(b +
+ SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
+ set_slob_page_free(sp);
+ goto out;
}
- bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
- if (!bb)
- return 0;
+ /*
+ * Otherwise the page is already partially free, so find reinsertion
+ * point.
+ */
+ sp->units += units;
- bb->order = get_order(size);
- bb->pages = (void *)__get_free_pages(gfp, bb->order);
+ if (b < sp->free) {
+ set_slob(b, units, sp->free);
+ sp->free = b;
+ } else {
+ prev = sp->free;
+ next = slob_next(prev);
+ while (b > next) {
+ prev = next;
+ next = slob_next(prev);
+ }
- if (bb->pages) {
- spin_lock_irqsave(&block_lock, flags);
- bb->next = bigblocks;
- bigblocks = bb;
- spin_unlock_irqrestore(&block_lock, flags);
- return bb->pages;
+ if (!slob_last(prev) && b + units == next) {
+ units += slob_units(next);
+ set_slob(b, units, slob_next(next));
+ } else
+ set_slob(b, units, next);
+
+ if (prev + slob_units(prev) == b) {
+ units = slob_units(b) + slob_units(prev);
+ set_slob(prev, units, slob_next(b));
+ } else
+ set_slob(prev, slob_units(prev), b);
}
-
- slob_free(bb, sizeof(bigblock_t));
- return 0;
+out:
+ spin_unlock_irqrestore(&slob_lock, flags);
}
-EXPORT_SYMBOL(__kmalloc);
-/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- *
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
+/*
+ * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
*/
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
+
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
+#endif
+
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
- void *ret;
+ unsigned int *m;
+ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
- if (unlikely(!p))
- return kmalloc_track_caller(new_size, flags);
+ if (size < PAGE_SIZE - align) {
+ if (!size)
+ return ZERO_SIZE_PTR;
- if (unlikely(!new_size)) {
- kfree(p);
- return NULL;
- }
+ m = slob_alloc(size + align, gfp, align, node);
+ if (m)
+ *m = size;
+ return (void *)m + align;
+ } else {
+ void *ret;
- ret = kmalloc_track_caller(new_size, flags);
- if (ret) {
- memcpy(ret, p, min(new_size, ksize(p)));
- kfree(p);
+ ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
+ if (ret) {
+ struct page *page;
+ page = virt_to_page(ret);
+ page->private = size;
+ }
+ return ret;
}
- return ret;
}
-EXPORT_SYMBOL(krealloc);
+EXPORT_SYMBOL(__kmalloc_node);
void kfree(const void *block)
{
- bigblock_t *bb, **last = &bigblocks;
- unsigned long flags;
+ struct slob_page *sp;
- if (!block)
+ if (ZERO_OR_NULL_PTR(block))
return;
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- /* might be on the big block list */
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
- if (bb->pages == block) {
- *last = bb->next;
- spin_unlock_irqrestore(&block_lock, flags);
- free_pages((unsigned long)block, bb->order);
- slob_free(bb, sizeof(bigblock_t));
- return;
- }
- }
- spin_unlock_irqrestore(&block_lock, flags);
- }
-
- slob_free((slob_t *)block - 1, 0);
- return;
+ sp = (struct slob_page *)virt_to_page(block);
+ if (slob_page(sp)) {
+ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int *m = (unsigned int *)(block - align);
+ slob_free(m, *m + align);
+ } else
+ put_page(&sp->page);
}
-
EXPORT_SYMBOL(kfree);
+/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t ksize(const void *block)
{
- bigblock_t *bb;
- unsigned long flags;
+ struct slob_page *sp;
- if (!block)
+ if (ZERO_OR_NULL_PTR(block))
return 0;
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; bb = bb->next)
- if (bb->pages == block) {
- spin_unlock_irqrestore(&slob_lock, flags);
- return PAGE_SIZE << bb->order;
- }
- spin_unlock_irqrestore(&block_lock, flags);
- }
-
- return ((slob_t *)block - 1)->units * SLOB_UNIT;
+ sp = (struct slob_page *)virt_to_page(block);
+ if (slob_page(sp))
+ return ((slob_t *)block - 1)->units + SLOB_UNIT;
+ else
+ return sp->page.private;
}
struct kmem_cache {
@@ -289,7 +497,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
{
struct kmem_cache *c;
- c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+ c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
if (c) {
c->name = name;
@@ -302,6 +510,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
c->ctor = ctor;
/* ignore alignment unless it's forced */
c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+ if (c->align < ARCH_SLAB_MINALIGN)
+ c->align = ARCH_SLAB_MINALIGN;
if (c->align < align)
c->align = align;
} else if (flags & SLAB_PANIC)
@@ -317,31 +527,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
}
EXPORT_SYMBOL(kmem_cache_destroy);
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
if (c->size < PAGE_SIZE)
- b = slob_alloc(c->size, flags, c->align);
+ b = slob_alloc(c->size, flags, c->align, node);
else
- b = (void *)__get_free_pages(flags, get_order(c->size));
+ b = slob_new_page(flags, get_order(c->size), node);
if (c->ctor)
c->ctor(b, c, 0);
return b;
}
-EXPORT_SYMBOL(kmem_cache_alloc);
-
-void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
-{
- void *ret = kmem_cache_alloc(c, flags);
- if (ret)
- memset(ret, 0, c->size);
-
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
+EXPORT_SYMBOL(kmem_cache_alloc_node);
static void __kmem_cache_free(void *b, int size)
{
@@ -385,9 +585,6 @@ const char *kmem_cache_name(struct kmem_cache *c)
}
EXPORT_SYMBOL(kmem_cache_name);
-static struct timer_list slob_timer = TIMER_INITIALIZER(
- (void (*)(unsigned long))slob_timer_cbk, 0, 0);
-
int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
@@ -399,17 +596,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b)
return 0;
}
-void __init kmem_cache_init(void)
+static unsigned int slob_ready __read_mostly;
+
+int slab_is_available(void)
{
- slob_timer_cbk();
+ return slob_ready;
}
-static void slob_timer_cbk(void)
+void __init kmem_cache_init(void)
{
- void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
-
- if (p)
- free_page((unsigned long)p);
-
- mod_timer(&slob_timer, jiffies + HZ);
+ slob_ready = 1;
}
diff --git a/mm/slub.c b/mm/slub.c
index 98801d404d69..52a4f44be394 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -205,6 +205,11 @@ static inline void ClearSlabDebug(struct page *page)
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
+/*
+ * The page->inuse field is 16 bit thus we have this limitation
+ */
+#define MAX_OBJECTS_PER_SLAB 65535
+
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
@@ -228,7 +233,7 @@ static enum {
/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
-LIST_HEAD(slab_caches);
+static LIST_HEAD(slab_caches);
/*
* Tracking user of a slab.
@@ -247,9 +252,10 @@ static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
#else
-static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
-static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
-static void sysfs_slab_remove(struct kmem_cache *s) {}
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
+ { return 0; }
+static inline void sysfs_slab_remove(struct kmem_cache *s) {}
#endif
/********************************************************************
@@ -323,7 +329,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
/*
* Debug settings:
*/
+#ifdef CONFIG_SLUB_DEBUG_ON
+static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#else
static int slub_debug;
+#endif
static char *slub_debug_slabs;
@@ -340,7 +350,7 @@ static void print_section(char *text, u8 *addr, unsigned int length)
for (i = 0; i < length; i++) {
if (newline) {
- printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
+ printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
newline = 0;
}
printk(" %02x", addr[i]);
@@ -397,10 +407,11 @@ static void set_track(struct kmem_cache *s, void *object,
static void init_tracking(struct kmem_cache *s, void *object)
{
- if (s->flags & SLAB_STORE_USER) {
- set_track(s, object, TRACK_FREE, NULL);
- set_track(s, object, TRACK_ALLOC, NULL);
- }
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ set_track(s, object, TRACK_FREE, NULL);
+ set_track(s, object, TRACK_ALLOC, NULL);
}
static void print_track(const char *s, struct track *t)
@@ -408,65 +419,106 @@ static void print_track(const char *s, struct track *t)
if (!t->addr)
return;
- printk(KERN_ERR "%s: ", s);
+ printk(KERN_ERR "INFO: %s in ", s);
__print_symbol("%s", (unsigned long)t->addr);
- printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+ printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
+}
+
+static void print_tracking(struct kmem_cache *s, void *object)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ print_track("Allocated", get_track(s, object, TRACK_ALLOC));
+ print_track("Freed", get_track(s, object, TRACK_FREE));
+}
+
+static void print_page_info(struct page *page)
+{
+ printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->inuse, page->freelist, page->flags);
+
+}
+
+static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+{
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "========================================"
+ "=====================================\n");
+ printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+ printk(KERN_ERR "----------------------------------------"
+ "-------------------------------------\n\n");
}
-static void print_trailer(struct kmem_cache *s, u8 *p)
+static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+{
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
+}
+
+static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
+ u8 *addr = page_address(page);
+
+ print_tracking(s, p);
+
+ print_page_info(page);
+
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ p, p - addr, get_freepointer(s, p));
+
+ if (p > addr + 16)
+ print_section("Bytes b4", p - 16, 16);
+
+ print_section("Object", p, min(s->objsize, 128));
if (s->flags & SLAB_RED_ZONE)
print_section("Redzone", p + s->objsize,
s->inuse - s->objsize);
- printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
- p + s->offset,
- get_freepointer(s, p));
-
if (s->offset)
off = s->offset + sizeof(void *);
else
off = s->inuse;
- if (s->flags & SLAB_STORE_USER) {
- print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
- print_track("Last free ", get_track(s, p, TRACK_FREE));
+ if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- }
if (off != s->size)
/* Beginning of the filler is the free pointer */
- print_section("Filler", p + off, s->size - off);
+ print_section("Padding", p + off, s->size - off);
+
+ dump_stack();
}
static void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
- u8 *addr = page_address(page);
-
- printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
- s->name, reason, object, page);
- printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
- object - addr, page->flags, page->inuse, page->freelist);
- if (object > addr + 16)
- print_section("Bytes b4", object - 16, 16);
- print_section("Object", object, min(s->objsize, 128));
- print_trailer(s, object);
- dump_stack();
+ slab_bug(s, reason);
+ print_trailer(s, page, object);
}
-static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...)
+static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
{
va_list args;
char buf[100];
- va_start(args, reason);
- vsnprintf(buf, sizeof(buf), reason, args);
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
- page);
+ slab_bug(s, fmt);
+ print_page_info(page);
dump_stack();
}
@@ -485,15 +537,46 @@ static void init_object(struct kmem_cache *s, void *object, int active)
s->inuse - s->objsize);
}
-static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
{
while (bytes) {
if (*start != (u8)value)
- return 0;
+ return start;
start++;
bytes--;
}
- return 1;
+ return NULL;
+}
+
+static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+ void *from, void *to)
+{
+ slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
+ memset(from, data, to - from);
+}
+
+static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
+ u8 *object, char *what,
+ u8* start, unsigned int value, unsigned int bytes)
+{
+ u8 *fault;
+ u8 *end;
+
+ fault = check_bytes(start, value, bytes);
+ if (!fault)
+ return 1;
+
+ end = start + bytes;
+ while (end > fault && end[-1] == value)
+ end--;
+
+ slab_bug(s, "%s overwritten", what);
+ printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault[0], value);
+ print_trailer(s, page, object);
+
+ restore_bytes(s, what, value, fault, end);
+ return 0;
}
/*
@@ -534,14 +617,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
* may be used with merged slabcaches.
*/
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
- void *from, void *to)
-{
- printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
- s->name, message, data, from, to - 1);
- memset(from, data, to - from);
-}
-
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned long off = s->inuse; /* The end of info */
@@ -557,39 +632,39 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
if (s->size == off)
return 1;
- if (check_bytes(p + off, POISON_INUSE, s->size - off))
- return 1;
-
- object_err(s, page, p, "Object padding check fails");
-
- /*
- * Restore padding
- */
- restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
- return 0;
+ return check_bytes_and_report(s, page, p, "Object padding",
+ p + off, POISON_INUSE, s->size - off);
}
static int slab_pad_check(struct kmem_cache *s, struct page *page)
{
- u8 *p;
- int length, remainder;
+ u8 *start;
+ u8 *fault;
+ u8 *end;
+ int length;
+ int remainder;
if (!(s->flags & SLAB_POISON))
return 1;
- p = page_address(page);
+ start = page_address(page);
+ end = start + (PAGE_SIZE << s->order);
length = s->objects * s->size;
- remainder = (PAGE_SIZE << s->order) - length;
+ remainder = end - (start + length);
if (!remainder)
return 1;
- if (!check_bytes(p + length, POISON_INUSE, remainder)) {
- slab_err(s, page, "Padding check failed");
- restore_bytes(s, "slab padding", POISON_INUSE, p + length,
- p + length + remainder);
- return 0;
- }
- return 1;
+ fault = check_bytes(start + length, POISON_INUSE, remainder);
+ if (!fault)
+ return 1;
+ while (end > fault && end[-1] == POISON_INUSE)
+ end--;
+
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+ print_section("Padding", start, length);
+
+ restore_bytes(s, "slab padding", POISON_INUSE, start, end);
+ return 0;
}
static int check_object(struct kmem_cache *s, struct page *page,
@@ -602,41 +677,22 @@ static int check_object(struct kmem_cache *s, struct page *page,
unsigned int red =
active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
- if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
- object_err(s, page, object,
- active ? "Redzone Active" : "Redzone Inactive");
- restore_bytes(s, "redzone", red,
- endobject, object + s->inuse);
+ if (!check_bytes_and_report(s, page, object, "Redzone",
+ endobject, red, s->inuse - s->objsize))
return 0;
- }
} else {
- if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
- !check_bytes(endobject, POISON_INUSE,
- s->inuse - s->objsize)) {
- object_err(s, page, p, "Alignment padding check fails");
- /*
- * Fix it so that there will not be another report.
- *
- * Hmmm... We may be corrupting an object that now expects
- * to be longer than allowed.
- */
- restore_bytes(s, "alignment padding", POISON_INUSE,
- endobject, object + s->inuse);
- }
+ if ((s->flags & SLAB_POISON) && s->objsize < s->inuse)
+ check_bytes_and_report(s, page, p, "Alignment padding", endobject,
+ POISON_INUSE, s->inuse - s->objsize);
}
if (s->flags & SLAB_POISON) {
if (!active && (s->flags & __OBJECT_POISON) &&
- (!check_bytes(p, POISON_FREE, s->objsize - 1) ||
- p[s->objsize - 1] != POISON_END)) {
-
- object_err(s, page, p, "Poison check failed");
- restore_bytes(s, "Poison", POISON_FREE,
- p, p + s->objsize -1);
- restore_bytes(s, "Poison", POISON_END,
- p + s->objsize - 1, p + s->objsize);
+ (!check_bytes_and_report(s, page, p, "Poison", p,
+ POISON_FREE, s->objsize - 1) ||
+ !check_bytes_and_report(s, page, p, "Poison",
+ p + s->objsize -1, POISON_END, 1)))
return 0;
- }
/*
* check_pad_bytes cleans up on its own.
*/
@@ -669,25 +725,17 @@ static int check_slab(struct kmem_cache *s, struct page *page)
VM_BUG_ON(!irqs_disabled());
if (!PageSlab(page)) {
- slab_err(s, page, "Not a valid slab page flags=%lx "
- "mapping=0x%p count=%d", page->flags, page->mapping,
- page_count(page));
+ slab_err(s, page, "Not a valid slab page");
return 0;
}
if (page->offset * sizeof(void *) != s->offset) {
- slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
- "mapping=0x%p count=%d",
- (unsigned long)(page->offset * sizeof(void *)),
- page->flags,
- page->mapping,
- page_count(page));
+ slab_err(s, page, "Corrupted offset %lu",
+ (unsigned long)(page->offset * sizeof(void *)));
return 0;
}
if (page->inuse > s->objects) {
- slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
- "mapping=0x%p count=%d",
- s->name, page->inuse, s->objects, page->flags,
- page->mapping, page_count(page));
+ slab_err(s, page, "inuse %u > max %u",
+ s->name, page->inuse, s->objects);
return 0;
}
/* Slab_pad_check fixes things up after itself */
@@ -715,13 +763,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
set_freepointer(s, object, NULL);
break;
} else {
- slab_err(s, page, "Freepointer 0x%p corrupt",
- fp);
+ slab_err(s, page, "Freepointer corrupt");
page->freelist = NULL;
page->inuse = s->objects;
- printk(KERN_ERR "@@@ SLUB %s: Freelist "
- "cleared. Slab 0x%p\n",
- s->name, page);
+ slab_fix(s, "Freelist cleared");
return 0;
}
break;
@@ -733,11 +778,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
if (page->inuse != s->objects - nr) {
slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", s, page, page->inuse,
- s->objects - nr);
+ "counted were %d", page->inuse, s->objects - nr);
page->inuse = s->objects - nr;
- printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
- "Slab @0x%p\n", s->name, page);
+ slab_fix(s, "Object count adjusted.");
}
return search == NULL;
}
@@ -799,7 +842,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
goto bad;
if (object && !on_freelist(s, page, object)) {
- slab_err(s, page, "Object 0x%p already allocated", object);
+ object_err(s, page, object, "Object already allocated");
goto bad;
}
@@ -825,8 +868,7 @@ bad:
* to avoid issues in the future. Marking all objects
* as used avoids touching the remaining objects.
*/
- printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
- s->name, page);
+ slab_fix(s, "Marking all objects used");
page->inuse = s->objects;
page->freelist = NULL;
/* Fix up fields that may be corrupted */
@@ -847,7 +889,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
}
if (on_freelist(s, page, object)) {
- slab_err(s, page, "Object 0x%p already free", object);
+ object_err(s, page, object, "Object already free");
goto fail;
}
@@ -866,8 +908,8 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
dump_stack();
}
else
- slab_err(s, page, "object at 0x%p belongs "
- "to slab %s", object, page->slab->name);
+ object_err(s, page, object,
+ "page slab pointer corrupt.");
goto fail;
}
@@ -881,45 +923,63 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
return 1;
fail:
- printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
- s->name, page, object);
+ slab_fix(s, "Object at 0x%p not freed", object);
return 0;
}
static int __init setup_slub_debug(char *str)
{
- if (!str || *str != '=')
- slub_debug = DEBUG_DEFAULT_FLAGS;
- else {
- str++;
- if (*str == 0 || *str == ',')
- slub_debug = DEBUG_DEFAULT_FLAGS;
- else
- for( ;*str && *str != ','; str++)
- switch (*str) {
- case 'f' : case 'F' :
- slub_debug |= SLAB_DEBUG_FREE;
- break;
- case 'z' : case 'Z' :
- slub_debug |= SLAB_RED_ZONE;
- break;
- case 'p' : case 'P' :
- slub_debug |= SLAB_POISON;
- break;
- case 'u' : case 'U' :
- slub_debug |= SLAB_STORE_USER;
- break;
- case 't' : case 'T' :
- slub_debug |= SLAB_TRACE;
- break;
- default:
- printk(KERN_ERR "slub_debug option '%c' "
- "unknown. skipped\n",*str);
- }
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ if (*str == ',')
+ /*
+ * No options but restriction on slabs. This means full
+ * debugging for slabs matching a pattern.
+ */
+ goto check_slabs;
+
+ slub_debug = 0;
+ if (*str == '-')
+ /*
+ * Switch off all debugging measures.
+ */
+ goto out;
+
+ /*
+ * Determine which debug features should be switched on
+ */
+ for ( ;*str && *str != ','; str++) {
+ switch (tolower(*str)) {
+ case 'f':
+ slub_debug |= SLAB_DEBUG_FREE;
+ break;
+ case 'z':
+ slub_debug |= SLAB_RED_ZONE;
+ break;
+ case 'p':
+ slub_debug |= SLAB_POISON;
+ break;
+ case 'u':
+ slub_debug |= SLAB_STORE_USER;
+ break;
+ case 't':
+ slub_debug |= SLAB_TRACE;
+ break;
+ default:
+ printk(KERN_ERR "slub_debug option '%c' "
+ "unknown. skipped\n",*str);
+ }
}
+check_slabs:
if (*str == ',')
slub_debug_slabs = str + 1;
+out:
return 1;
}
@@ -939,7 +999,7 @@ static void kmem_cache_open_debug_check(struct kmem_cache *s)
* Debugging or ctor may create a need to move the free
* pointer. Fail if this happens.
*/
- if (s->size >= 65535 * sizeof(void *)) {
+ if (s->objsize >= 65535 * sizeof(void *)) {
BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON |
SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
BUG_ON(s->ctor);
@@ -1018,7 +1078,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
void *last;
void *p;
- BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
+ BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK));
if (flags & __GFP_WAIT)
local_irq_enable();
@@ -1336,7 +1396,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
unfreeze_slab(s, page);
}
-static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
+static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
{
slab_lock(page);
deactivate_slab(s, page, cpu);
@@ -1346,7 +1406,7 @@ static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
* Flush cpu slab.
* Called from IPI handler with interrupts disabled.
*/
-static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct page *page = s->cpu_slab[cpu];
@@ -1481,7 +1541,7 @@ debug:
* Otherwise we can simply pick the next object from the lockless free list.
*/
static void __always_inline *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr)
+ gfp_t gfpflags, int node, void *addr)
{
struct page *page;
void **object;
@@ -1499,6 +1559,10 @@ static void __always_inline *slab_alloc(struct kmem_cache *s,
page->lockless_freelist = object[page->offset];
}
local_irq_restore(flags);
+
+ if (unlikely((gfpflags & __GFP_ZERO) && object))
+ memset(object, 0, s->objsize);
+
return object;
}
@@ -1682,8 +1746,17 @@ static inline int slab_order(int size, int min_objects,
{
int order;
int rem;
+ int min_order = slub_min_order;
- for (order = max(slub_min_order,
+ /*
+ * If we would create too many object per slab then reduce
+ * the slab order even if it goes below slub_min_order.
+ */
+ while (min_order > 0 &&
+ (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size)
+ min_order--;
+
+ for (order = max(min_order,
fls(min_objects * size - 1) - PAGE_SHIFT);
order <= max_order; order++) {
@@ -1697,6 +1770,9 @@ static inline int slab_order(int size, int min_objects,
if (rem <= slab_size / fract_leftover)
break;
+ /* If the next size is too high then exit now */
+ if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size)
+ break;
}
return order;
@@ -1777,7 +1853,9 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
atomic_long_set(&n->nr_slabs, 0);
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
+#ifdef CONFIG_SLUB_DEBUG
INIT_LIST_HEAD(&n->full);
+#endif
}
#ifdef CONFIG_NUMA
@@ -1798,8 +1876,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
- /* new_slab() disables interupts */
- local_irq_enable();
BUG_ON(!page);
n = page->freelist;
@@ -1807,10 +1883,19 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
page->freelist = get_freepointer(kmalloc_caches, n);
page->inuse++;
kmalloc_caches->node[node] = n;
- setup_object_debug(kmalloc_caches, page, n);
+#ifdef CONFIG_SLUB_DEBUG
+ init_object(kmalloc_caches, n, 1);
+ init_tracking(kmalloc_caches, n);
+#endif
init_kmem_cache_node(n);
atomic_long_inc(&n->nr_slabs);
add_partial(n, page);
+
+ /*
+ * new_slab() disables interupts. If we do not reenable interrupts here
+ * then bootup would continue with interrupts disabled.
+ */
+ local_irq_enable();
return n;
}
@@ -1917,7 +2002,6 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->inuse = size;
-#ifdef CONFIG_SLUB_DEBUG
if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
s->ctor)) {
/*
@@ -1932,6 +2016,7 @@ static int calculate_sizes(struct kmem_cache *s)
size += sizeof(void *);
}
+#ifdef CONFIG_SLUB_DEBUG
if (flags & SLAB_STORE_USER)
/*
* Need to store information about allocs and frees after
@@ -1979,7 +2064,7 @@ static int calculate_sizes(struct kmem_cache *s)
* The page->inuse field is only 16 bit wide! So we cannot have
* more than 64k objects per slab.
*/
- if (!s->objects || s->objects > 65535)
+ if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB)
return 0;
return 1;
@@ -2016,7 +2101,6 @@ error:
s->offset, flags);
return 0;
}
-EXPORT_SYMBOL(kmem_cache_open);
/*
* Check if a given pointer is valid
@@ -2084,7 +2168,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Release all resources used by a slab cache.
*/
-static int kmem_cache_close(struct kmem_cache *s)
+static inline int kmem_cache_close(struct kmem_cache *s)
{
int node;
@@ -2112,12 +2196,13 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ up_write(&slub_lock);
if (kmem_cache_close(s))
WARN_ON(1);
sysfs_slab_remove(s);
kfree(s);
- }
- up_write(&slub_lock);
+ } else
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2190,47 +2275,92 @@ panic:
panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
}
-static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+#ifdef CONFIG_ZONE_DMA
+static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
{
- int index = kmalloc_index(size);
+ struct kmem_cache *s;
+ struct kmem_cache *x;
+ char *text;
+ size_t realsize;
- if (!index)
- return NULL;
+ s = kmalloc_caches_dma[index];
+ if (s)
+ return s;
- /* Allocation too large? */
- BUG_ON(index < 0);
+ /* Dynamically create dma cache */
+ x = kmalloc(kmem_size, flags & ~SLUB_DMA);
+ if (!x)
+ panic("Unable to allocate memory for dma cache\n");
-#ifdef CONFIG_ZONE_DMA
- if ((flags & SLUB_DMA)) {
- struct kmem_cache *s;
- struct kmem_cache *x;
- char *text;
- size_t realsize;
-
- s = kmalloc_caches_dma[index];
- if (s)
- return s;
+ realsize = kmalloc_caches[index].objsize;
+ text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
+ (unsigned int)realsize);
+ s = create_kmalloc_cache(x, text, realsize, flags);
+ down_write(&slub_lock);
+ if (!kmalloc_caches_dma[index]) {
+ kmalloc_caches_dma[index] = s;
+ up_write(&slub_lock);
+ return s;
+ }
+ up_write(&slub_lock);
+ kmem_cache_destroy(s);
+ return kmalloc_caches_dma[index];
+}
+#endif
- /* Dynamically create dma cache */
- x = kmalloc(kmem_size, flags & ~SLUB_DMA);
- if (!x)
- panic("Unable to allocate memory for dma cache\n");
+/*
+ * Conversion table for small slabs sizes / 8 to the index in the
+ * kmalloc array. This is necessary for slabs < 192 since we have non power
+ * of two cache sizes there. The size of larger slabs can be determined using
+ * fls.
+ */
+static s8 size_index[24] = {
+ 3, /* 8 */
+ 4, /* 16 */
+ 5, /* 24 */
+ 5, /* 32 */
+ 6, /* 40 */
+ 6, /* 48 */
+ 6, /* 56 */
+ 6, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 7, /* 104 */
+ 7, /* 112 */
+ 7, /* 120 */
+ 7, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
- if (index <= KMALLOC_SHIFT_HIGH)
- realsize = 1 << index;
- else {
- if (index == 1)
- realsize = 96;
- else
- realsize = 192;
- }
+static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+{
+ int index;
- text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
- (unsigned int)realsize);
- s = create_kmalloc_cache(x, text, realsize, flags);
- kmalloc_caches_dma[index] = s;
- return s;
+ if (size <= 192) {
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ index = size_index[(size - 1) / 8];
+ } else {
+ if (size > KMALLOC_MAX_SIZE)
+ return NULL;
+
+ index = fls(size - 1);
}
+
+#ifdef CONFIG_ZONE_DMA
+ if (unlikely((flags & SLUB_DMA)))
+ return dma_kmalloc_cache(index, flags);
+
#endif
return &kmalloc_caches[index];
}
@@ -2239,9 +2369,10 @@ void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s = get_slab(size, flags);
- if (s)
- return slab_alloc(s, flags, -1, __builtin_return_address(0));
- return NULL;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
+
+ return slab_alloc(s, flags, -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc);
@@ -2250,18 +2381,23 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s = get_slab(size, flags);
- if (s)
- return slab_alloc(s, flags, node, __builtin_return_address(0));
- return NULL;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
+
+ return slab_alloc(s, flags, node, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
size_t ksize(const void *object)
{
- struct page *page = get_object_page(object);
+ struct page *page;
struct kmem_cache *s;
+ if (object == ZERO_SIZE_PTR)
+ return 0;
+
+ page = get_object_page(object);
BUG_ON(!page);
s = page->slab;
BUG_ON(!s);
@@ -2293,7 +2429,13 @@ void kfree(const void *x)
struct kmem_cache *s;
struct page *page;
- if (!x)
+ /*
+ * This has to be an unsigned comparison. According to Linus
+ * some gcc version treat a pointer as a signed entity. Then
+ * this comparison would be true for all "negative" pointers
+ * (which would cover the whole upper half of the address space).
+ */
+ if (ZERO_OR_NULL_PTR(x))
return;
page = virt_to_head_page(x);
@@ -2382,43 +2524,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
- */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- void *ret;
- size_t ks;
-
- if (unlikely(!p))
- return kmalloc(new_size, flags);
-
- if (unlikely(!new_size)) {
- kfree(p);
- return NULL;
- }
-
- ks = ksize(p);
- if (ks >= new_size)
- return (void *)p;
-
- ret = kmalloc(new_size, flags);
- if (ret) {
- memcpy(ret, p, min(new_size, ks));
- kfree(p);
- }
- return ret;
-}
-EXPORT_SYMBOL(krealloc);
-
/********************************************************************
* Basic setup of slabs
*******************************************************************/
@@ -2426,6 +2531,7 @@ EXPORT_SYMBOL(krealloc);
void __init kmem_cache_init(void)
{
int i;
+ int caches = 0;
#ifdef CONFIG_NUMA
/*
@@ -2435,20 +2541,48 @@ void __init kmem_cache_init(void)
*/
create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
sizeof(struct kmem_cache_node), GFP_KERNEL);
+ kmalloc_caches[0].refcount = -1;
+ caches++;
#endif
/* Able to allocate the per node structures */
slab_state = PARTIAL;
/* Caches that are not of the two-to-the-power-of size */
- create_kmalloc_cache(&kmalloc_caches[1],
+ if (KMALLOC_MIN_SIZE <= 64) {
+ create_kmalloc_cache(&kmalloc_caches[1],
"kmalloc-96", 96, GFP_KERNEL);
- create_kmalloc_cache(&kmalloc_caches[2],
+ caches++;
+ }
+ if (KMALLOC_MIN_SIZE <= 128) {
+ create_kmalloc_cache(&kmalloc_caches[2],
"kmalloc-192", 192, GFP_KERNEL);
+ caches++;
+ }
- for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
create_kmalloc_cache(&kmalloc_caches[i],
"kmalloc", 1 << i, GFP_KERNEL);
+ caches++;
+ }
+
+
+ /*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * mips it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+ (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
+ size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
slab_state = UP;
@@ -2465,8 +2599,8 @@ void __init kmem_cache_init(void)
nr_cpu_ids * sizeof(struct page *);
printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
- " Processors=%d, Nodes=%d\n",
- KMALLOC_SHIFT_HIGH, cache_line_size(),
+ " CPUs=%d, Nodes=%d\n",
+ caches, cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
}
@@ -2482,6 +2616,12 @@ static int slab_unmergeable(struct kmem_cache *s)
if (s->ctor)
return 1;
+ /*
+ * We may have set a slab to be unmergeable during bootstrap.
+ */
+ if (s->refcount < 0)
+ return 1;
+
return 0;
}
@@ -2489,7 +2629,7 @@ static struct kmem_cache *find_mergeable(size_t size,
size_t align, unsigned long flags,
void (*ctor)(void *, struct kmem_cache *, unsigned long))
{
- struct list_head *h;
+ struct kmem_cache *s;
if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
return NULL;
@@ -2501,10 +2641,7 @@ static struct kmem_cache *find_mergeable(size_t size,
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
-
+ list_for_each_entry(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
@@ -2547,25 +2684,26 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ up_write(&slub_lock);
if (sysfs_slab_alias(s, name))
goto err;
- } else {
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s && kmem_cache_open(s, GFP_KERNEL, name,
+ return s;
+ }
+ s = kmalloc(kmem_size, GFP_KERNEL);
+ if (s) {
+ if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
- if (sysfs_slab_add(s)) {
- kfree(s);
- goto err;
- }
list_add(&s->list, &slab_caches);
- } else
- kfree(s);
+ up_write(&slub_lock);
+ if (sysfs_slab_add(s))
+ goto err;
+ return s;
+ }
+ kfree(s);
}
up_write(&slub_lock);
- return s;
err:
- up_write(&slub_lock);
if (flags & SLAB_PANIC)
panic("Cannot create slabcache %s\n", name);
else
@@ -2574,32 +2712,7 @@ err:
}
EXPORT_SYMBOL(kmem_cache_create);
-void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
-{
- void *x;
-
- x = slab_alloc(s, flags, -1, __builtin_return_address(0));
- if (x)
- memset(x, 0, s->objsize);
- return x;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
#ifdef CONFIG_SMP
-static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
-{
- struct list_head *h;
-
- down_read(&slub_lock);
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
-
- func(s, cpu);
- }
- up_read(&slub_lock);
-}
-
/*
* Use the cpu notifier to insure that the cpu slabs are flushed when
* necessary.
@@ -2608,13 +2721,21 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
+ struct kmem_cache *s;
+ unsigned long flags;
switch (action) {
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- for_all_slabs(__flush_cpu_slab, cpu);
+ down_read(&slub_lock);
+ list_for_each_entry(s, &slab_caches, list) {
+ local_irq_save(flags);
+ __flush_cpu_slab(s, cpu);
+ local_irq_restore(flags);
+ }
+ up_read(&slub_lock);
break;
default:
break;
@@ -2631,8 +2752,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
{
struct kmem_cache *s = get_slab(size, gfpflags);
- if (!s)
- return NULL;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
return slab_alloc(s, gfpflags, -1, caller);
}
@@ -2642,18 +2763,18 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
{
struct kmem_cache *s = get_slab(size, gfpflags);
- if (!s)
- return NULL;
+ if (ZERO_OR_NULL_PTR(s))
+ return s;
return slab_alloc(s, gfpflags, node, caller);
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
-static int validate_slab(struct kmem_cache *s, struct page *page)
+static int validate_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *map)
{
void *p;
void *addr = page_address(page);
- DECLARE_BITMAP(map, s->objects);
if (!check_slab(s, page) ||
!on_freelist(s, page, NULL))
@@ -2675,10 +2796,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page)
return 1;
}
-static void validate_slab_slab(struct kmem_cache *s, struct page *page)
+static void validate_slab_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *map)
{
if (slab_trylock(page)) {
- validate_slab(s, page);
+ validate_slab(s, page, map);
slab_unlock(page);
} else
printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
@@ -2695,7 +2817,8 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page)
}
}
-static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
+static int validate_slab_node(struct kmem_cache *s,
+ struct kmem_cache_node *n, unsigned long *map)
{
unsigned long count = 0;
struct page *page;
@@ -2704,7 +2827,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru) {
- validate_slab_slab(s, page);
+ validate_slab_slab(s, page, map);
count++;
}
if (count != n->nr_partial)
@@ -2715,7 +2838,7 @@ static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
goto out;
list_for_each_entry(page, &n->full, lru) {
- validate_slab_slab(s, page);
+ validate_slab_slab(s, page, map);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -2728,17 +2851,23 @@ out:
return count;
}
-static unsigned long validate_slab_cache(struct kmem_cache *s)
+static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
+ unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
+ sizeof(unsigned long), GFP_KERNEL);
+
+ if (!map)
+ return -ENOMEM;
flush_all(s);
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
- count += validate_slab_node(s, n);
+ count += validate_slab_node(s, n, map);
}
+ kfree(map);
return count;
}
@@ -2827,18 +2956,14 @@ static void free_loc_track(struct loc_track *t)
get_order(sizeof(struct location) * t->max));
}
-static int alloc_loc_track(struct loc_track *t, unsigned long max)
+static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
{
struct location *l;
int order;
- if (!max)
- max = PAGE_SIZE / sizeof(struct location);
-
order = get_order(sizeof(struct location) * max);
- l = (void *)__get_free_pages(GFP_KERNEL, order);
-
+ l = (void *)__get_free_pages(flags, order);
if (!l)
return 0;
@@ -2904,7 +3029,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
/*
* Not found. Insert new tracking element.
*/
- if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
+ if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
return 0;
l = t->loc + pos;
@@ -2947,11 +3072,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
{
int n = 0;
unsigned long i;
- struct loc_track t;
+ struct loc_track t = { 0, 0, NULL };
int node;
- t.count = 0;
- t.max = 0;
+ if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_KERNEL))
+ return sprintf(buf, "Out of memory\n");
/* Push back cpu slabs */
flush_all(s);
@@ -3002,13 +3128,15 @@ static int list_locations(struct kmem_cache *s, char *buf,
n += sprintf(buf + n, " pid=%ld",
l->min_pid);
- if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) {
+ if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
+ n < PAGE_SIZE - 60) {
n += sprintf(buf + n, " cpus=");
n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
l->cpus);
}
- if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) {
+ if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
+ n < PAGE_SIZE - 60) {
n += sprintf(buf + n, " nodes=");
n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
l->nodes);
@@ -3353,11 +3481,14 @@ static ssize_t validate_show(struct kmem_cache *s, char *buf)
static ssize_t validate_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- if (buf[0] == '1')
- validate_slab_cache(s);
- else
- return -EINVAL;
- return length;
+ int ret = -EINVAL;
+
+ if (buf[0] == '1') {
+ ret = validate_slab_cache(s);
+ if (ret >= 0)
+ ret = length;
+ }
+ return ret;
}
SLAB_ATTR(validate);
@@ -3511,7 +3642,7 @@ static struct kset_uevent_ops slab_uevent_ops = {
.filter = uevent_filter,
};
-decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
+static decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
#define ID_STR_LENGTH 64
@@ -3609,7 +3740,7 @@ struct saved_alias {
struct saved_alias *next;
};
-struct saved_alias *alias_list;
+static struct saved_alias *alias_list;
static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
@@ -3637,7 +3768,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
static int __init slab_sysfs_init(void)
{
- struct list_head *h;
+ struct kmem_cache *s;
int err;
err = subsystem_register(&slab_subsys);
@@ -3648,10 +3779,7 @@ static int __init slab_sysfs_init(void)
slab_state = SYSFS;
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
-
+ list_for_each_entry(s, &slab_caches, list) {
err = sysfs_slab_add(s);
BUG_ON(err);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 6f3fff907bc2..e03b39f3540f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(page_to_nid);
#endif
#ifdef CONFIG_SPARSEMEM_EXTREME
-static struct mem_section noinline *sparse_index_alloc(int nid)
+static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
{
struct mem_section *section = NULL;
unsigned long array_size = SECTIONS_PER_ROOT *
@@ -209,6 +209,12 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
return 1;
}
+__attribute__((weak))
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+ return NULL;
+}
+
static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -219,6 +225,11 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
if (map)
return map;
+ map = alloc_bootmem_high_node(NODE_DATA(nid),
+ sizeof(struct page) * PAGES_PER_SECTION);
+ if (map)
+ return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)
@@ -229,6 +240,27 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
return NULL;
}
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void __init sparse_init(void)
+{
+ unsigned long pnum;
+ struct page *map;
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ if (!valid_section_nr(pnum))
+ continue;
+
+ map = sparse_early_mem_map_alloc(pnum);
+ if (!map)
+ continue;
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map);
+ }
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
struct page *page, *ret;
@@ -269,27 +301,6 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
}
/*
- * Allocate the accumulated non-linear sections, allocate a mem_map
- * for each and record the physical to section mapping.
- */
-void __init sparse_init(void)
-{
- unsigned long pnum;
- struct page *map;
-
- for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
- if (!valid_section_nr(pnum))
- continue;
-
- map = sparse_early_mem_map_alloc(pnum);
- if (!map)
- continue;
- sparse_init_one_section(__nr_to_section(pnum), pnum, map);
- }
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-/*
* returns the number of sections whose mem_maps were properly
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5f7cf2a4cb55..67daecb6031a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,7 +21,7 @@
/*
* swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_list, to make sync_page look nicer, and to allow
+ * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
* future use of radix_tree tags in the swap cache.
*/
static const struct address_space_operations swap_aops = {
@@ -334,7 +334,8 @@ struct page *read_swap_cache_async(swp_entry_t entry,
* Get a new page to read into from swap.
*/
if (!new_page) {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+ vma, addr);
if (!new_page)
break; /* Out of memory */
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index acc172cbe3aa..7ff0a81c7b01 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type)
/*
* So we could skip searching mms once swap count went
* to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_list will preserve it.
+ * mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
unlock_page(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 4fbe1a2da5fb..f47e46d1be3b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -100,9 +100,9 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (PagePrivate(page))
do_invalidatepage(page, 0);
+ remove_from_page_cache(page);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
- remove_from_page_cache(page);
page_cache_release(page); /* pagecache ref */
}
@@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
}
EXPORT_SYMBOL(truncate_inode_pages);
-/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
- * @start: the offset 'from' which to invalidate
- * @end: the offset 'to' which to invalidate (inclusive)
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- *
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
- */
-unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, bool be_atomic)
{
struct pagevec pvec;
pgoff_t next = start;
@@ -308,17 +295,38 @@ unlock:
break;
}
pagevec_release(&pvec);
+ if (likely(!be_atomic))
+ cond_resched();
}
return ret;
}
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ return __invalidate_mapping_pages(mapping, start, end, false);
+}
EXPORT_SYMBOL(invalidate_mapping_pages);
/*
* This is like invalidate_complete_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
- * shrink_list() has a temp ref on them, or because they're transiently sitting
- * in the lru_cache_add() pagevecs.
+ * shrink_page_list() has a temp ref on them, or because they're transiently
+ * sitting in the lru_cache_add() pagevecs.
*/
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
diff --git a/mm/util.c b/mm/util.c
index ace2aea69f1a..78f3783bdcc8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,20 +5,6 @@
#include <asm/uaccess.h>
/**
- * __kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- */
-void *__kzalloc(size_t size, gfp_t flags)
-{
- void *ret = kmalloc_track_caller(size, flags);
- if (ret)
- memset(ret, 0, size);
- return ret;
-}
-EXPORT_SYMBOL(__kzalloc);
-
-/*
* kstrdup - allocate space for and copy an existing string
*
* @s: the string to duplicate
@@ -58,6 +44,40 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
}
EXPORT_SYMBOL(kmemdup);
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+ void *ret;
+ size_t ks;
+
+ if (unlikely(!new_size)) {
+ kfree(p);
+ return ZERO_SIZE_PTR;
+ }
+
+ ks = ksize(p);
+ if (ks >= new_size)
+ return (void *)p;
+
+ ret = kmalloc_track_caller(new_size, flags);
+ if (ret) {
+ memcpy(ret, p, min(new_size, ks));
+ kfree(p);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(krealloc);
+
/*
* strndup_user - duplicate an existing string from user space
*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d3a9c5368257..8e05a11155c9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -68,12 +68,12 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
} while (pud++, addr = next, addr != end);
}
-void unmap_vm_area(struct vm_struct *area)
+void unmap_kernel_range(unsigned long addr, unsigned long size)
{
pgd_t *pgd;
unsigned long next;
- unsigned long addr = (unsigned long) area->addr;
- unsigned long end = addr + area->size;
+ unsigned long start = addr;
+ unsigned long end = addr + size;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
@@ -84,7 +84,12 @@ void unmap_vm_area(struct vm_struct *area)
continue;
vunmap_pud_range(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
- flush_tlb_kernel_range((unsigned long) area->addr, end);
+ flush_tlb_kernel_range(start, end);
+}
+
+static void unmap_vm_area(struct vm_struct *area)
+{
+ unmap_kernel_range((unsigned long)area->addr, area->size);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
@@ -427,11 +432,12 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
+ pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
+ PAGE_KERNEL, node);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size,
- (gfp_mask & GFP_LEVEL_MASK),
+ (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO,
node);
}
area->pages = pages;
@@ -440,7 +446,6 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
kfree(area);
return NULL;
}
- memset(area->pages, 0, array_size);
for (i = 0; i < area->nr_pages; i++) {
if (node < 0)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1be5a6376ef0..d419e10e3daa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,17 +66,8 @@ struct scan_control {
int swappiness;
int all_unreclaimable;
-};
-/*
- * The list of shrinker callbacks used by to apply pressure to
- * ageable caches.
- */
-struct shrinker {
- shrinker_t shrinker;
- struct list_head list;
- int seeks; /* seeks to recreate an obj */
- long nr; /* objs pending delete */
+ int order;
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -121,34 +112,25 @@ static DECLARE_RWSEM(shrinker_rwsem);
/*
* Add a shrinker callback to be called from the vm
*/
-struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+void register_shrinker(struct shrinker *shrinker)
{
- struct shrinker *shrinker;
-
- shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
- if (shrinker) {
- shrinker->shrinker = theshrinker;
- shrinker->seeks = seeks;
- shrinker->nr = 0;
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
- up_write(&shrinker_rwsem);
- }
- return shrinker;
+ shrinker->nr = 0;
+ down_write(&shrinker_rwsem);
+ list_add_tail(&shrinker->list, &shrinker_list);
+ up_write(&shrinker_rwsem);
}
-EXPORT_SYMBOL(set_shrinker);
+EXPORT_SYMBOL(register_shrinker);
/*
* Remove one
*/
-void remove_shrinker(struct shrinker *shrinker)
+void unregister_shrinker(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
- kfree(shrinker);
}
-EXPORT_SYMBOL(remove_shrinker);
+EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
/*
@@ -185,7 +167,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
- unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
+ unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
delta *= max_pass;
@@ -213,8 +195,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
int shrink_ret;
int nr_before;
- nr_before = (*shrinker->shrinker)(0, gfp_mask);
- shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+ nr_before = (*shrinker->shrink)(0, gfp_mask);
+ shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
@@ -481,7 +463,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
- if (referenced && page_mapping_inuse(page))
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+ referenced && page_mapping_inuse(page))
goto activate_locked;
#ifdef CONFIG_SWAP
@@ -514,7 +497,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
- if (referenced)
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
@@ -598,6 +581,51 @@ keep:
return nr_reclaimed;
}
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
+#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU. Only take this page
+ * if it is of the appropriate PageActive status. Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page: page to consider
+ * mode: one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -ve errno on failure.
+ */
+static int __isolate_lru_page(struct page *page, int mode)
+{
+ int ret = -EINVAL;
+
+ /* Only take pages on the LRU. */
+ if (!PageLRU(page))
+ return ret;
+
+ /*
+ * When checking the active state, we need to be sure we are
+ * dealing with comparible boolean values. Take the logical not
+ * of each.
+ */
+ if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ return ret;
+
+ ret = -EBUSY;
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ ret = 0;
+ }
+
+ return ret;
+}
+
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
@@ -612,38 +640,90 @@ keep:
* @src: The LRU list to pull pages off.
* @dst: The temp list to put pages on to.
* @scanned: The number of pages that were scanned.
+ * @order: The caller's attempted allocation order
+ * @mode: One of the LRU isolation modes
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned)
+ unsigned long *scanned, int order, int mode)
{
unsigned long nr_taken = 0;
- struct page *page;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
- struct list_head *target;
+ struct page *page;
+ unsigned long pfn;
+ unsigned long end_pfn;
+ unsigned long page_pfn;
+ int zone_id;
+
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
VM_BUG_ON(!PageLRU(page));
- list_del(&page->lru);
- target = src;
- if (likely(get_page_unless_zero(page))) {
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- ClearPageLRU(page);
- target = dst;
+ switch (__isolate_lru_page(page, mode)) {
+ case 0:
+ list_move(&page->lru, dst);
nr_taken++;
- } /* else it is being freed elsewhere */
+ break;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&page->lru, src);
+ continue;
+
+ default:
+ BUG();
+ }
+
+ if (!order)
+ continue;
- list_add(&page->lru, target);
+ /*
+ * Attempt to take all pages in the order aligned region
+ * surrounding the tag page. Only take those pages of
+ * the same active state as that tag page. We may safely
+ * round the target page pfn down to the requested order
+ * as the mem_map is guarenteed valid out to MAX_ORDER,
+ * where that page is in a different zone we will detect
+ * it from its zone id and abort this block scan.
+ */
+ zone_id = page_zone_id(page);
+ page_pfn = page_to_pfn(page);
+ pfn = page_pfn & ~((1 << order) - 1);
+ end_pfn = pfn + (1 << order);
+ for (; pfn < end_pfn; pfn++) {
+ struct page *cursor_page;
+
+ /* The target page is in the block, ignore it. */
+ if (unlikely(pfn == page_pfn))
+ continue;
+
+ /* Avoid holes within the zone. */
+ if (unlikely(!pfn_valid_within(pfn)))
+ break;
+
+ cursor_page = pfn_to_page(pfn);
+ /* Check that we have not crossed a zone boundary. */
+ if (unlikely(page_zone_id(cursor_page) != zone_id))
+ continue;
+ switch (__isolate_lru_page(cursor_page, mode)) {
+ case 0:
+ list_move(&cursor_page->lru, dst);
+ nr_taken++;
+ scan++;
+ break;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&cursor_page->lru, src);
+ default:
+ break;
+ }
+ }
}
*scanned = scan;
@@ -651,6 +731,24 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
}
/*
+ * clear_active_flags() is a helper for shrink_active_list(), clearing
+ * any active bits from the pages in the list.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list)
+{
+ int nr_active = 0;
+ struct page *page;
+
+ list_for_each_entry(page, page_list, lru)
+ if (PageActive(page)) {
+ ClearPageActive(page);
+ nr_active++;
+ }
+
+ return nr_active;
+}
+
+/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
@@ -671,11 +769,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
unsigned long nr_taken;
unsigned long nr_scan;
unsigned long nr_freed;
+ unsigned long nr_active;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
- &zone->inactive_list,
- &page_list, &nr_scan);
- __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
+ &zone->inactive_list,
+ &page_list, &nr_scan, sc->order,
+ (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
+ ISOLATE_BOTH : ISOLATE_INACTIVE);
+ nr_active = clear_active_flags(&page_list);
+
+ __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
+ __mod_zone_page_state(zone, NR_INACTIVE,
+ -(nr_taken - nr_active));
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
@@ -820,7 +925,7 @@ force_reclaim_mapped:
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
+ &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
zone->pages_scanned += pgscanned;
__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
spin_unlock_irq(&zone->lru_lock);
@@ -1011,7 +1116,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
{
int priority;
int ret = 0;
@@ -1026,6 +1131,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
.swappiness = vm_swappiness,
+ .order = order,
};
count_vm_event(ALLOCSTALL);
@@ -1131,6 +1237,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
+ .order = order,
};
/*
* temp_priority is used to remember the scanning priority at which
@@ -1314,6 +1421,7 @@ static int kswapd(void *p)
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ set_freezable();
order = 0;
for ( ; ; ) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8faf27e5aa98..fadf791cd7e6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>
+#include <linux/sched.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -471,13 +472,13 @@ const struct seq_operations fragmentation_op = {
#endif
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx)
+ TEXT_FOR_HIGHMEM(xx) xx "_movable",
static const char * const vmstat_text[] = {
/* Zoned VM counters */
"nr_free_pages",
- "nr_active",
"nr_inactive",
+ "nr_active",
"nr_anon_pages",
"nr_mapped",
"nr_file_pages",