Diffstat (limited to 'mm')
-rw-r--r--   mm/Makefile           |   4
-rw-r--r--   mm/allocpercpu.c      |  18
-rw-r--r--   mm/bootmem.c          |  25
-rw-r--r--   mm/filemap.c          |  43
-rw-r--r--   mm/fremap.c           |   2
-rw-r--r--   mm/highmem.c          |  30
-rw-r--r--   mm/hugetlb.c          |  75
-rw-r--r--   mm/internal.h         |  13
-rw-r--r--   mm/maccess.c          |  55
-rw-r--r--   mm/memcontrol.c       | 398
-rw-r--r--   mm/memory.c           |  13
-rw-r--r--   mm/memory_hotplug.c   |   2
-rw-r--r--   mm/mempolicy.c        |   7
-rw-r--r--   mm/migrate.c          |  19
-rw-r--r--   mm/oom_kill.c         |  15
-rw-r--r--   mm/page_alloc.c       |  31
-rw-r--r--   mm/pagewalk.c         |  10
-rw-r--r--   mm/pdflush.c          |   8
-rw-r--r--   mm/readahead.c        |   5
-rw-r--r--   mm/rmap.c             |  17
-rw-r--r--   mm/shmem.c            |  34
-rw-r--r--   mm/slab.c             |  23
-rw-r--r--   mm/slub.c             | 414
-rw-r--r--   mm/sparse-vmemmap.c   |   8
-rw-r--r--   mm/sparse.c           |  10
-rw-r--r--   mm/swap.c             |   7
-rw-r--r--   mm/swap_state.c       |   2
-rw-r--r--   mm/tiny-shmem.c       |  10
-rw-r--r--   mm/truncate.c         |   3
-rw-r--r--   mm/vmalloc.c          |   6
-rw-r--r--   mm/vmscan.c           |  54
-rw-r--r--   mm/vmstat.c           |   1
32 files changed, 709 insertions, 653 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 9f117bab5322..18c143b3c46c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- page_alloc.o page-writeback.o pdflush.o \
+ maccess.o page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o $(mmu-y)
@@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 7e58322b7134..f4026bae6eed 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -6,6 +6,10 @@
#include <linux/mm.h>
#include <linux/module.h>
+#ifndef cache_line_size
+#define cache_line_size() L1_CACHE_BYTES
+#endif
+
/**
* percpu_depopulate - depopulate per-cpu data for given cpu
* @__pdata: per-cpu data to depopulate
@@ -52,6 +56,11 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
struct percpu_data *pdata = __percpu_disguise(__pdata);
int node = cpu_to_node(cpu);
+ /*
+ * We should make sure each CPU gets private memory.
+ */
+ size = roundup(size, cache_line_size());
+
BUG_ON(pdata->ptrs[cpu]);
if (node_online(node))
pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
@@ -73,9 +82,10 @@ EXPORT_SYMBOL_GPL(percpu_populate);
int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
cpumask_t *mask)
{
- cpumask_t populated = CPU_MASK_NONE;
+ cpumask_t populated;
int cpu;
+ cpus_clear(populated);
for_each_cpu_mask(cpu, *mask)
if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
__percpu_depopulate_mask(__pdata, &populated);
@@ -98,7 +108,11 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask);
*/
void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
{
- void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
+ /*
+ * We allocate whole cache lines to avoid false sharing
+ */
+ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
+ void *pdata = kzalloc(sz, gfp);
void *__pdata = __percpu_disguise(pdata);
if (unlikely(!pdata))
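Both allocpercpu.c hunks above round allocation sizes up to a whole cache line so that no two CPUs ever share a line and bounce it between their caches (false sharing). A standalone sketch of just the rounding step, with an assumed 64-byte line size and a made-up per-CPU struct (neither value is taken from the patch):

#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE_SIZE 64		/* assumed L1 line size for the example */

/* Round size up to the next multiple of align (align is a power of two). */
static size_t roundup_line(size_t size, size_t align)
{
	return (size + align - 1) & ~(align - 1);
}

struct counters {			/* hypothetical per-CPU object */
	unsigned long hits;
	unsigned long misses;
};

int main(void)
{
	size_t raw = sizeof(struct counters);
	size_t padded = roundup_line(raw, CACHE_LINE_SIZE);

	/* Each CPU gets `padded` bytes, so neighbouring CPUs never
	 * write to the same cache line. */
	printf("raw %zu bytes -> padded %zu bytes per CPU\n", raw, padded);
	return 0;
}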
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f6ff4337b424..2ccea700968f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -125,6 +125,7 @@ static int __init reserve_bootmem_core(bootmem_data_t *bdata,
BUG_ON(!size);
BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
+ BUG_ON(addr < bdata->node_boot_start);
sidx = PFN_DOWN(addr - bdata->node_boot_start);
eidx = PFN_UP(addr + size - bdata->node_boot_start);
@@ -156,21 +157,31 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
unsigned long sidx, eidx;
unsigned long i;
+ BUG_ON(!size);
+
+	/* out of range */
+ if (addr + size < bdata->node_boot_start ||
+ PFN_DOWN(addr) > bdata->node_low_pfn)
+ return;
/*
* round down end of usable mem, partially free pages are
* considered reserved.
*/
- BUG_ON(!size);
- BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
- if (addr < bdata->last_success)
+ if (addr >= bdata->node_boot_start && addr < bdata->last_success)
bdata->last_success = addr;
/*
- * Round up the beginning of the address.
+	 * Round up the index to the range.
*/
- sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
+ if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
+ sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
+ else
+ sidx = 0;
+
eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
+ if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
+ eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
for (i = sidx; i < eidx; i++) {
if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
@@ -421,7 +432,9 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
void __init free_bootmem(unsigned long addr, unsigned long size)
{
- free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+ bootmem_data_t *bdata;
+ list_for_each_entry(bdata, &bdata_list, list)
+ free_bootmem_core(bdata, addr, size);
}
unsigned long __init free_all_bootmem(void)
diff --git a/mm/filemap.c b/mm/filemap.c
index 5c74b68935ac..07e9d9258b48 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,7 +28,6 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
@@ -344,7 +343,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
EXPORT_SYMBOL(sync_page_range);
/**
- * sync_page_range_nolock
+ * sync_page_range_nolock - write & wait on all pages in the passed range without locking
* @inode: target inode
* @mapping: target address_space
* @pos: beginning offset in pages to write
@@ -612,7 +611,10 @@ int __lock_page_killable(struct page *page)
sync_page_killable, TASK_KILLABLE);
}
-/*
+/**
+ * __lock_page_nosync - get a lock on the page, without calling sync_page()
+ * @page: the page to lock
+ *
* Variant of lock_page that does not require the caller to hold a reference
* on the page's mapping.
*/
@@ -1539,9 +1541,20 @@ repeat:
return page;
}
-/*
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: destination for read data
+ *
* Same as read_cache_page, but don't wait for page to become unlocked
* after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
*/
struct page *read_cache_page_async(struct address_space *mapping,
pgoff_t index,
@@ -1743,21 +1756,27 @@ size_t iov_iter_copy_from_user(struct page *page,
}
EXPORT_SYMBOL(iov_iter_copy_from_user);
-static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
{
+ BUG_ON(i->count < bytes);
+
if (likely(i->nr_segs == 1)) {
i->iov_offset += bytes;
+ i->count -= bytes;
} else {
const struct iovec *iov = i->iov;
size_t base = i->iov_offset;
/*
* The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments.
+		 * zero-length segments (without overrunning the iovec).
*/
- while (bytes || !iov->iov_len) {
- int copy = min(bytes, iov->iov_len - base);
+ while (bytes || unlikely(!iov->iov_len && i->count)) {
+ int copy;
+ copy = min(bytes, iov->iov_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
bytes -= copy;
base += copy;
if (iov->iov_len == base) {
@@ -1769,14 +1788,6 @@ static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
i->iov_offset = base;
}
}
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- __iov_iter_advance_iov(i, bytes);
- i->count -= bytes;
-}
EXPORT_SYMBOL(iov_iter_advance);
/*
diff --git a/mm/fremap.c b/mm/fremap.c
index 69a37c2bdf81..07a9c82ce1a3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -113,7 +113,7 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
* mmap()/mremap() it does not create any new vmas. The new mappings are
* also safe across swapout.
*
- * NOTE: the 'prot' parameter right now is ignored (but must be zero),
+ * NOTE: the @prot parameter right now is ignored (but must be zero),
* and the vma's default protection is used. Arbitrary protections
* might be implemented in the future.
*/
diff --git a/mm/highmem.c b/mm/highmem.c
index 35d47733cde4..7da4a7b6af11 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -104,8 +104,9 @@ static void flush_all_zero_pkmaps(void)
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
-/* Flush all unused kmap mappings in order to remove stray
- mappings. */
+/**
+ * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
+ */
void kmap_flush_unused(void)
{
spin_lock(&kmap_lock);
@@ -163,6 +164,14 @@ start:
return vaddr;
}
+/**
+ * kmap_high - map a highmem page into memory
+ * @page: &struct page to map
+ *
+ * Returns the page's virtual memory address.
+ *
+ * We cannot call this from interrupts, as it may block.
+ */
void *kmap_high(struct page *page)
{
unsigned long vaddr;
@@ -170,8 +179,6 @@ void *kmap_high(struct page *page)
/*
* For highmem pages, we can't trust "virtual" until
* after we have the lock.
- *
- * We cannot call this from interrupts, as it may block
*/
spin_lock(&kmap_lock);
vaddr = (unsigned long)page_address(page);
@@ -185,6 +192,10 @@ void *kmap_high(struct page *page)
EXPORT_SYMBOL(kmap_high);
+/**
+ * kunmap_high - unmap a highmem page
+ * @page: &struct page to unmap
+ */
void kunmap_high(struct page *page)
{
unsigned long vaddr;
@@ -259,6 +270,12 @@ static struct page_address_slot *page_slot(struct page *page)
return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}
+/**
+ * page_address - get the mapped virtual address of a page
+ * @page: &struct page to get the virtual address of
+ *
+ * Returns the page's virtual address.
+ */
void *page_address(struct page *page)
{
unsigned long flags;
@@ -288,6 +305,11 @@ done:
EXPORT_SYMBOL(page_address);
+/**
+ * set_page_address - set a page's virtual address
+ * @page: &struct page to set
+ * @virtual: virtual address to use
+ */
void set_page_address(struct page *page, void *virtual)
{
unsigned long flags;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cb1b3a7ecdfc..51c9e2c01640 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -71,7 +71,25 @@ static void enqueue_huge_page(struct page *page)
free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+static struct page *dequeue_huge_page(void)
+{
+ int nid;
+ struct page *page = NULL;
+
+ for (nid = 0; nid < MAX_NUMNODES; ++nid) {
+ if (!list_empty(&hugepage_freelists[nid])) {
+ page = list_entry(hugepage_freelists[nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ break;
+ }
+ }
+ return page;
+}
+
+static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
unsigned long address)
{
int nid;
@@ -120,6 +138,7 @@ static void free_huge_page(struct page *page)
struct address_space *mapping;
mapping = (struct address_space *) page_private(page);
+ set_page_private(page, 0);
BUG_ON(page_count(page));
INIT_LIST_HEAD(&page->lru);
@@ -134,7 +153,6 @@ static void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
if (mapping)
hugetlb_put_quota(mapping, 1);
- set_page_private(page, 0);
}
/*
@@ -268,6 +286,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
spin_lock(&hugetlb_lock);
if (page) {
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON(page_count(page));
nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
/*
@@ -296,8 +320,10 @@ static int gather_surplus_pages(int delta)
int needed, allocated;
needed = (resv_huge_pages + delta) - free_huge_pages;
- if (needed <= 0)
+ if (needed <= 0) {
+ resv_huge_pages += delta;
return 0;
+ }
allocated = 0;
INIT_LIST_HEAD(&surplus_list);
@@ -335,9 +361,12 @@ retry:
* The surplus_list now contains _at_least_ the number of extra pages
 * needed to accommodate the reservation. Add the appropriate number
* of pages to the hugetlb pool and free the extras back to the buddy
- * allocator.
+ * allocator. Commit the entire reservation here to prevent another
+ * process from stealing the pages as they are added to the pool but
+ * before they are reserved.
*/
needed += allocated;
+ resv_huge_pages += delta;
ret = 0;
free:
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
@@ -346,13 +375,14 @@ free:
enqueue_huge_page(page);
else {
/*
- * Decrement the refcount and free the page using its
- * destructor. This must be done with hugetlb_lock
+ * The page has a reference count of zero already, so
+ * call free_huge_page directly instead of using
+ * put_page. This must be done with hugetlb_lock
* unlocked which is safe because free_huge_page takes
* hugetlb_lock before deciding how to free the page.
*/
spin_unlock(&hugetlb_lock);
- put_page(page);
+ free_huge_page(page);
spin_lock(&hugetlb_lock);
}
}
@@ -371,9 +401,20 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
struct page *page;
unsigned long nr_pages;
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes. Iterate across all nodes until we
+ * can no longer free unreserved surplus pages. This occurs when
+ * the nodes with surplus pages have no free pages.
+ */
+ unsigned long remaining_iterations = num_online_nodes();
+
+ /* Uncommit the reservation */
+ resv_huge_pages -= unused_resv_pages;
+
nr_pages = min(unused_resv_pages, surplus_huge_pages);
- while (nr_pages) {
+ while (remaining_iterations-- && nr_pages) {
nid = next_node(nid, node_online_map);
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
@@ -391,6 +432,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
surplus_huge_pages--;
surplus_huge_pages_node[nid]--;
nr_pages--;
+ remaining_iterations = num_online_nodes();
}
}
}
@@ -402,7 +444,7 @@ static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
struct page *page;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page(vma, addr);
+ page = dequeue_huge_page_vma(vma, addr);
spin_unlock(&hugetlb_lock);
return page ? page : ERR_PTR(-VM_FAULT_OOM);
}
@@ -417,7 +459,7 @@ static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
spin_lock(&hugetlb_lock);
if (free_huge_pages > resv_huge_pages)
- page = dequeue_huge_page(vma, addr);
+ page = dequeue_huge_page_vma(vma, addr);
spin_unlock(&hugetlb_lock);
if (!page) {
page = alloc_buddy_huge_page(vma, addr);
@@ -570,7 +612,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
min_count = max(count, min_count);
try_to_free_low(min_count);
while (min_count < persistent_huge_pages) {
- struct page *page = dequeue_huge_page(NULL, 0);
+ struct page *page = dequeue_huge_page();
if (!page)
break;
update_and_free_page(page);
@@ -638,9 +680,11 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
{
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
- "Node %d HugePages_Free: %5u\n",
+ "Node %d HugePages_Free: %5u\n"
+ "Node %d HugePages_Surp: %5u\n",
nid, nr_huge_pages_node[nid],
- nid, free_huge_pages_node[nid]);
+ nid, free_huge_pages_node[nid],
+ nid, surplus_huge_pages_node[nid]);
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
@@ -1205,12 +1249,13 @@ static int hugetlb_acct_memory(long delta)
if (gather_surplus_pages(delta) < 0)
goto out;
- if (delta > cpuset_mems_nr(free_huge_pages_node))
+ if (delta > cpuset_mems_nr(free_huge_pages_node)) {
+ return_unused_surplus_pages(delta);
goto out;
+ }
}
ret = 0;
- resv_huge_pages += delta;
if (delta < 0)
return_unused_surplus_pages((unsigned long) -delta);
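The return_unused_surplus_pages() change above bounds the release loop: walk the online nodes round-robin, reset the countdown every time a surplus page is actually freed, and give up after one full pass with no progress (which happens when the nodes holding surplus pages have no free pages left). A small userspace model of that loop shape only; the node count and bookkeeping here are invented for the example, not the kernel's:

#include <stdio.h>

#define NR_NODES 4

/* Modelled on return_unused_surplus_pages(): cycle over the nodes,
 * reset the countdown whenever we make progress, and stop after one
 * full fruitless pass. */
static int release_round_robin(int free_per_node[NR_NODES], int to_release)
{
	int nid = 0;
	int remaining_iterations = NR_NODES;
	int released = 0;

	while (remaining_iterations-- && to_release) {
		nid = (nid + 1) % NR_NODES;
		if (free_per_node[nid] > 0) {
			free_per_node[nid]--;
			to_release--;
			released++;
			remaining_iterations = NR_NODES;	/* made progress: reset */
		}
	}
	return released;
}

int main(void)
{
	int free_per_node[NR_NODES] = { 3, 0, 1, 0 };

	/* Only 4 pages are actually releasable, so asking for 10 frees 4. */
	printf("released %d pages\n", release_round_robin(free_per_node, 10));
	return 0;
}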
diff --git a/mm/internal.h b/mm/internal.h
index 5a9a6200e034..789727309f4d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,4 +47,17 @@ static inline unsigned long page_order(struct page *page)
VM_BUG_ON(!PageBuddy(page));
return page_private(page);
}
+
+/*
+ * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
+ * so all functions starting at paging_init should be marked __init
+ * in those cases. SPARSEMEM, however, allows for memory hotplug,
+ * and alloc_bootmem_node is not used.
+ */
+#ifdef CONFIG_SPARSEMEM
+#define __paginginit __meminit
+#else
+#define __paginginit __init
+#endif
+
#endif
diff --git a/mm/maccess.c b/mm/maccess.c
new file mode 100644
index 000000000000..ac40796cfb15
--- /dev/null
+++ b/mm/maccess.c
@@ -0,0 +1,55 @@
+/*
+ * Access kernel memory without faulting.
+ */
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+/**
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_read(void *dst, void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst,
+ (__force const void __user *)src, size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_read);
+
+/**
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_kernel_write(void *dst, void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_write);
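The two helpers in the new file let a caller peek or poke at an address that may not be mapped without taking a fatal fault; they return 0 on success and -EFAULT otherwise. A minimal sketch of a caller, say a debugging aid that dumps one word from an arbitrary address (dump_word() is illustrative and not part of the patch):

#include <linux/uaccess.h>
#include <linux/kernel.h>

/* Illustrative: print one word at addr, which may be unmapped. */
static void dump_word(const void *addr)
{
	unsigned long val;

	if (probe_kernel_read(&val, (void *)addr, sizeof(val)))
		printk(KERN_INFO "%p: <unreadable>\n", addr);	/* faulted: -EFAULT */
	else
		printk(KERN_INFO "%p: %08lx\n", addr, val);
}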
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6bded84c20c8..2e0bfc93484b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -137,14 +137,21 @@ struct mem_cgroup {
*/
struct mem_cgroup_stat stat;
};
+static struct mem_cgroup init_mem_cgroup;
/*
* We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock. We need to ensure that page->page_cgroup is atleast two
- * byte aligned (based on comments from Nick Piggin)
+ * lock. We need to ensure that page->page_cgroup is at least two
+ * byte aligned (based on comments from Nick Piggin). But since
+ * bit_spin_lock doesn't actually set that lock bit in a non-debug
+ * uniprocessor kernel, we should avoid setting it here too.
*/
#define PAGE_CGROUP_LOCK_BIT 0x0
-#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
+#else
+#define PAGE_CGROUP_LOCK 0x0
+#endif
/*
* A page_cgroup page is associated with every page descriptor. The
@@ -154,37 +161,27 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct page *page;
struct mem_cgroup *mem_cgroup;
- atomic_t ref_cnt; /* Helpful when pages move b/w */
- /* mapped and cached states */
- int flags;
+ int ref_cnt; /* cached, mapped, migrating */
+ int flags;
};
#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
-static inline int page_cgroup_nid(struct page_cgroup *pc)
+static int page_cgroup_nid(struct page_cgroup *pc)
{
return page_to_nid(pc->page);
}
-static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
+static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
{
return page_zonenum(pc->page);
}
-enum {
- MEM_CGROUP_TYPE_UNSPEC = 0,
- MEM_CGROUP_TYPE_MAPPED,
- MEM_CGROUP_TYPE_CACHED,
- MEM_CGROUP_TYPE_ALL,
- MEM_CGROUP_TYPE_MAX,
-};
-
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
};
-
/*
* Always modified under lru lock. Then, not necessary to preempt_disable()
*/
@@ -193,23 +190,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
{
int val = (charge)? 1 : -1;
struct mem_cgroup_stat *stat = &mem->stat;
- VM_BUG_ON(!irqs_disabled());
+ VM_BUG_ON(!irqs_disabled());
if (flags & PAGE_CGROUP_FLAG_CACHE)
- __mem_cgroup_stat_add_safe(stat,
- MEM_CGROUP_STAT_CACHE, val);
+ __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
else
__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
}
-static inline struct mem_cgroup_per_zone *
+static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
- BUG_ON(!mem->info.nodeinfo[nid]);
return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}
-static inline struct mem_cgroup_per_zone *
+static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
struct mem_cgroup *mem = pc->mem_cgroup;
@@ -234,18 +229,14 @@ static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
return total;
}
-static struct mem_cgroup init_mem_cgroup;
-
-static inline
-struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
+static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
return container_of(cgroup_subsys_state(cont,
mem_cgroup_subsys_id), struct mem_cgroup,
css);
}
-static inline
-struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
@@ -267,81 +258,33 @@ void mm_free_cgroup(struct mm_struct *mm)
static inline int page_cgroup_locked(struct page *page)
{
- return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
- &page->page_cgroup);
+ return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
-void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
+static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
- int locked;
-
- /*
- * While resetting the page_cgroup we might not hold the
- * page_cgroup lock. free_hot_cold_page() is an example
- * of such a scenario
- */
- if (pc)
- VM_BUG_ON(!page_cgroup_locked(page));
- locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
- page->page_cgroup = ((unsigned long)pc | locked);
+ VM_BUG_ON(!page_cgroup_locked(page));
+ page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
}
struct page_cgroup *page_get_page_cgroup(struct page *page)
{
- return (struct page_cgroup *)
- (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+ return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}
-static void __always_inline lock_page_cgroup(struct page *page)
+static void lock_page_cgroup(struct page *page)
{
bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
- VM_BUG_ON(!page_cgroup_locked(page));
-}
-
-static void __always_inline unlock_page_cgroup(struct page *page)
-{
- bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
-/*
- * Tie new page_cgroup to struct page under lock_page_cgroup()
- * This can fail if the page has been tied to a page_cgroup.
- * If success, returns 0.
- */
-static int page_cgroup_assign_new_page_cgroup(struct page *page,
- struct page_cgroup *pc)
+static int try_lock_page_cgroup(struct page *page)
{
- int ret = 0;
-
- lock_page_cgroup(page);
- if (!page_get_page_cgroup(page))
- page_assign_page_cgroup(page, pc);
- else /* A page is tied to other pc. */
- ret = 1;
- unlock_page_cgroup(page);
- return ret;
+ return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
-/*
- * Clear page->page_cgroup member under lock_page_cgroup().
- * If given "pc" value is different from one page->page_cgroup,
- * page->cgroup is not cleared.
- * Returns a value of page->page_cgroup at lock taken.
- * A can can detect failure of clearing by following
- * clear_page_cgroup(page, pc) == pc
- */
-
-static struct page_cgroup *clear_page_cgroup(struct page *page,
- struct page_cgroup *pc)
+static void unlock_page_cgroup(struct page *page)
{
- struct page_cgroup *ret;
- /* lock and clear */
- lock_page_cgroup(page);
- ret = page_get_page_cgroup(page);
- if (likely(ret == pc))
- page_assign_page_cgroup(page, NULL);
- unlock_page_cgroup(page);
- return ret;
+ bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
static void __mem_cgroup_remove_list(struct page_cgroup *pc)
@@ -399,7 +342,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
int ret;
task_lock(task);
- ret = task->mm && vm_match_cgroup(task->mm, mem);
+ ret = task->mm && mm_match_cgroup(task->mm, mem);
task_unlock(task);
return ret;
}
@@ -407,18 +350,30 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
/*
* This routine assumes that the appropriate zone's lru lock is already held
*/
-void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+void mem_cgroup_move_lists(struct page *page, bool active)
{
+ struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
unsigned long flags;
- if (!pc)
+ /*
+ * We cannot lock_page_cgroup while holding zone's lru_lock,
+ * because other holders of lock_page_cgroup can be interrupted
+ * with an attempt to rotate_reclaimable_page. But we cannot
+ * safely get to page_cgroup without it, so just try_lock it:
+ * mem_cgroup_isolate_pages allows for page left on wrong list.
+ */
+ if (!try_lock_page_cgroup(page))
return;
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_move_lists(pc, active);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
+ pc = page_get_page_cgroup(page);
+ if (pc) {
+ mz = page_cgroup_zoneinfo(pc);
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_move_lists(pc, active);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ }
+ unlock_page_cgroup(page);
}
/*
@@ -437,6 +392,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
return (int)((rss * 100L) / total);
}
+
/*
* This function is called from vmscan.c. In page reclaiming loop. balance
* between active and inactive list is calculated. For memory controller
@@ -500,7 +456,6 @@ long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
-
return (nr_inactive >> priority);
}
@@ -534,7 +489,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
if (scan >= nr_to_scan)
break;
page = pc->page;
- VM_BUG_ON(!pc);
if (unlikely(!PageLRU(page)))
continue;
@@ -579,6 +533,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup_per_zone *mz;
+ if (mem_cgroup_subsys.disabled)
+ return 0;
+
/*
* Should page_cgroup's go to their own slab?
* One could optimize the performance of the charging routine
@@ -587,26 +544,21 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
* with it
*/
retry:
- if (page) {
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
- /*
- * The page_cgroup exists and
- * the page has already been accounted.
- */
- if (pc) {
- if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
- /* this page is under being uncharged ? */
- unlock_page_cgroup(page);
- cpu_relax();
- goto retry;
- } else {
- unlock_page_cgroup(page);
- goto done;
- }
- }
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ /*
+ * The page_cgroup exists and
+ * the page has already been accounted.
+ */
+ if (pc) {
+ VM_BUG_ON(pc->page != page);
+ VM_BUG_ON(pc->ref_cnt <= 0);
+
+ pc->ref_cnt++;
unlock_page_cgroup(page);
+ goto done;
}
+ unlock_page_cgroup(page);
pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
if (pc == NULL)
@@ -624,16 +576,11 @@ retry:
rcu_read_lock();
mem = rcu_dereference(mm->mem_cgroup);
/*
- * For every charge from the cgroup, increment reference
- * count
+ * For every charge from the cgroup, increment reference count
*/
css_get(&mem->css);
rcu_read_unlock();
- /*
- * If we created the page_cgroup, we should free it on exceeding
- * the cgroup limit.
- */
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
if (!(gfp_mask & __GFP_WAIT))
goto out;
@@ -642,12 +589,12 @@ retry:
continue;
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- */
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
if (res_counter_check_under_limit(&mem->res))
continue;
@@ -658,14 +605,16 @@ retry:
congestion_wait(WRITE, HZ/10);
}
- atomic_set(&pc->ref_cnt, 1);
+ pc->ref_cnt = 1;
pc->mem_cgroup = mem;
pc->page = page;
pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
pc->flags |= PAGE_CGROUP_FLAG_CACHE;
- if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
+ lock_page_cgroup(page);
+ if (page_get_page_cgroup(page)) {
+ unlock_page_cgroup(page);
/*
* Another charge has been added to this page already.
* We take lock_page_cgroup(page) again and read
@@ -674,17 +623,16 @@ retry:
res_counter_uncharge(&mem->res, PAGE_SIZE);
css_put(&mem->css);
kfree(pc);
- if (!page)
- goto done;
goto retry;
}
+ page_assign_page_cgroup(page, pc);
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
- /* Update statistics vector */
__mem_cgroup_add_list(pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
+ unlock_page_cgroup(page);
done:
return 0;
out:
@@ -694,70 +642,64 @@ err:
return -ENOMEM;
}
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask)
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
-/*
- * See if the cached pages should be charged at all?
- */
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- int ret = 0;
if (!mm)
mm = &init_mm;
-
- ret = mem_cgroup_charge_common(page, mm, gfp_mask,
+ return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_CACHE);
- return ret;
}
/*
* Uncharging is always a welcome operation, we never complain, simply
- * uncharge. This routine should be called with lock_page_cgroup held
+ * uncharge.
*/
-void mem_cgroup_uncharge(struct page_cgroup *pc)
+void mem_cgroup_uncharge_page(struct page *page)
{
+ struct page_cgroup *pc;
struct mem_cgroup *mem;
struct mem_cgroup_per_zone *mz;
- struct page *page;
unsigned long flags;
+ if (mem_cgroup_subsys.disabled)
+ return;
+
/*
* Check if our page_cgroup is valid
*/
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
if (!pc)
- return;
+ goto unlock;
- if (atomic_dec_and_test(&pc->ref_cnt)) {
- page = pc->page;
+ VM_BUG_ON(pc->page != page);
+ VM_BUG_ON(pc->ref_cnt <= 0);
+
+ if (--(pc->ref_cnt) == 0) {
mz = page_cgroup_zoneinfo(pc);
- /*
- * get page->cgroup and clear it under lock.
- * force_empty can drop page->cgroup without checking refcnt.
- */
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_remove_list(pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+
+ page_assign_page_cgroup(page, NULL);
unlock_page_cgroup(page);
- if (clear_page_cgroup(page, pc) == pc) {
- mem = pc->mem_cgroup;
- css_put(&mem->css);
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
- kfree(pc);
- }
- lock_page_cgroup(page);
+
+ mem = pc->mem_cgroup;
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+
+ kfree(pc);
+ return;
}
-}
-void mem_cgroup_uncharge_page(struct page *page)
-{
- lock_page_cgroup(page);
- mem_cgroup_uncharge(page_get_page_cgroup(page));
+unlock:
unlock_page_cgroup(page);
}
@@ -765,63 +707,62 @@ void mem_cgroup_uncharge_page(struct page *page)
* Returns non-zero if a page (under migration) has valid page_cgroup member.
* Refcnt of page_cgroup is incremented.
*/
-
int mem_cgroup_prepare_migration(struct page *page)
{
struct page_cgroup *pc;
- int ret = 0;
+
+ if (mem_cgroup_subsys.disabled)
+ return 0;
+
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
- if (pc && atomic_inc_not_zero(&pc->ref_cnt))
- ret = 1;
+ if (pc)
+ pc->ref_cnt++;
unlock_page_cgroup(page);
- return ret;
+ return pc != NULL;
}
void mem_cgroup_end_migration(struct page *page)
{
- struct page_cgroup *pc;
-
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
- mem_cgroup_uncharge(pc);
- unlock_page_cgroup(page);
+ mem_cgroup_uncharge_page(page);
}
+
/*
- * We know both *page* and *newpage* are now not-on-LRU and Pg_locked.
+ * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
* And no race with uncharge() routines because page_cgroup for *page*
* has extra one reference by mem_cgroup_prepare_migration.
*/
-
void mem_cgroup_page_migration(struct page *page, struct page *newpage)
{
struct page_cgroup *pc;
- struct mem_cgroup *mem;
- unsigned long flags;
struct mem_cgroup_per_zone *mz;
-retry:
+ unsigned long flags;
+
+ lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
- if (!pc)
+ if (!pc) {
+ unlock_page_cgroup(page);
return;
- mem = pc->mem_cgroup;
+ }
+
mz = page_cgroup_zoneinfo(pc);
- if (clear_page_cgroup(page, pc) != pc)
- goto retry;
spin_lock_irqsave(&mz->lru_lock, flags);
-
__mem_cgroup_remove_list(pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
+ page_assign_page_cgroup(page, NULL);
+ unlock_page_cgroup(page);
+
pc->page = newpage;
lock_page_cgroup(newpage);
page_assign_page_cgroup(newpage, pc);
- unlock_page_cgroup(newpage);
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
__mem_cgroup_add_list(pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
- return;
+
+ unlock_page_cgroup(newpage);
}
/*
@@ -830,14 +771,13 @@ retry:
* *And* this routine doesn't reclaim page itself, just removes page_cgroup.
*/
#define FORCE_UNCHARGE_BATCH (128)
-static void
-mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
int active)
{
struct page_cgroup *pc;
struct page *page;
- int count;
+ int count = FORCE_UNCHARGE_BATCH;
unsigned long flags;
struct list_head *list;
@@ -846,46 +786,39 @@ mem_cgroup_force_empty_list(struct mem_cgroup *mem,
else
list = &mz->inactive_list;
- if (list_empty(list))
- return;
-retry:
- count = FORCE_UNCHARGE_BATCH;
spin_lock_irqsave(&mz->lru_lock, flags);
-
- while (--count && !list_empty(list)) {
+ while (!list_empty(list)) {
pc = list_entry(list->prev, struct page_cgroup, lru);
page = pc->page;
- /* Avoid race with charge */
- atomic_set(&pc->ref_cnt, 0);
- if (clear_page_cgroup(page, pc) == pc) {
- css_put(&mem->css);
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- __mem_cgroup_remove_list(pc);
- kfree(pc);
- } else /* being uncharged ? ...do relax */
- break;
+ get_page(page);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ if (--count <= 0) {
+ count = FORCE_UNCHARGE_BATCH;
+ cond_resched();
+ }
+ spin_lock_irqsave(&mz->lru_lock, flags);
}
spin_unlock_irqrestore(&mz->lru_lock, flags);
- if (!list_empty(list)) {
- cond_resched();
- goto retry;
- }
- return;
}
/*
* make mem_cgroup's charge to be 0 if there is no task.
* This enables deleting this mem_cgroup.
*/
-
-int mem_cgroup_force_empty(struct mem_cgroup *mem)
+static int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
int ret = -EBUSY;
int node, zid;
+
+ if (mem_cgroup_subsys.disabled)
+ return 0;
+
css_get(&mem->css);
/*
* page reclaim code (kswapd etc..) will move pages between
-` * active_list <-> inactive_list while we don't take a lock.
+ * active_list <-> inactive_list while we don't take a lock.
* So, we have to do loop here until all lists are empty.
*/
while (mem->res.usage > 0) {
@@ -907,9 +840,7 @@ out:
return ret;
}
-
-
-int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
*tmp = memparse(buf, &buf);
if (*buf != '\0')
@@ -946,8 +877,7 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
size_t nbytes, loff_t *ppos)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
- int ret;
- ret = mem_cgroup_force_empty(mem);
+ int ret = mem_cgroup_force_empty(mem);
if (!ret)
ret = nbytes;
return ret;
@@ -956,7 +886,6 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
/*
* Note: This should be removed if cgroup supports write-only file.
*/
-
static ssize_t mem_force_empty_read(struct cgroup *cont,
struct cftype *cft,
struct file *file, char __user *userbuf,
@@ -965,7 +894,6 @@ static ssize_t mem_force_empty_read(struct cgroup *cont,
return -EINVAL;
}
-
static const struct mem_cgroup_stat_desc {
const char *msg;
u64 unit;
@@ -1018,8 +946,6 @@ static int mem_control_stat_open(struct inode *unused, struct file *file)
return single_open(file, mem_control_stat_show, cont);
}
-
-
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -1052,7 +978,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
- int zone;
+ int zone, tmp = node;
/*
* This routine is called against possible nodes.
* But it's BUG to call kmalloc() against offline node.
@@ -1061,10 +987,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
* never be onlined. It's better to use memory hotplug callback
* function.
*/
- if (node_state(node, N_HIGH_MEMORY))
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
- else
- pn = kmalloc(sizeof(*pn), GFP_KERNEL);
+ if (!node_state(node, N_NORMAL_MEMORY))
+ tmp = -1;
+ pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
@@ -1085,9 +1010,6 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
kfree(mem->info.nodeinfo[node]);
}
-
-static struct mem_cgroup init_mem_cgroup;
-
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
@@ -1101,7 +1023,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
if (mem == NULL)
- return NULL;
+ return ERR_PTR(-ENOMEM);
res_counter_init(&mem->res);
@@ -1117,7 +1039,7 @@ free_out:
free_mem_cgroup_per_zone_info(mem, node);
if (cont->parent != NULL)
kfree(mem);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
@@ -1142,6 +1064,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
static int mem_cgroup_populate(struct cgroup_subsys *ss,
struct cgroup *cont)
{
+ if (mem_cgroup_subsys.disabled)
+ return 0;
return cgroup_add_files(cont, ss, mem_cgroup_files,
ARRAY_SIZE(mem_cgroup_files));
}
@@ -1154,6 +1078,9 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct mm_struct *mm;
struct mem_cgroup *mem, *old_mem;
+ if (mem_cgroup_subsys.disabled)
+ return;
+
mm = get_task_mm(p);
if (mm == NULL)
return;
@@ -1168,7 +1095,7 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
* Only thread group leaders are allowed to migrate, the mm_struct is
* in effect owned by the leader
*/
- if (p->tgid != p->pid)
+ if (!thread_group_leader(p))
goto out;
css_get(&mem->css);
@@ -1177,7 +1104,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
out:
mmput(mm);
- return;
}
struct cgroup_subsys mem_cgroup_subsys = {
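The rewritten mem_cgroup_move_lists() earlier in this file is called with the zone's lru_lock already held, so taking the page_cgroup bit-lock unconditionally could deadlock against paths that acquire the two locks in the opposite order; it therefore only try-locks and quietly skips the move if it loses the race. A small pthread model of that pattern (the lock names and function are stand-ins, not the kernel's):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lru_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with lru_lock held: must not block on page_lock, or we could
 * deadlock with code that takes page_lock first and lru_lock second. */
static void move_lists(void)
{
	if (pthread_mutex_trylock(&page_lock) != 0)
		return;			/* lost the race: skip, stay safe */
	/* ... move the page between the active and inactive lists ... */
	pthread_mutex_unlock(&page_lock);
}

int main(void)
{
	pthread_mutex_lock(&lru_lock);
	move_lists();
	pthread_mutex_unlock(&lru_lock);
	return 0;
}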
diff --git a/mm/memory.c b/mm/memory.c
index ce3c9e4492d8..0d14d1e58a5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1711,7 +1711,7 @@ unlock:
}
return ret;
oom_free_new:
- __free_page(new_page);
+ page_cache_release(new_page);
oom:
if (old_page)
page_cache_release(old_page);
@@ -2093,12 +2093,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(page);
if (write_access) {
- /* XXX: We could OR the do_wp_page code with this one? */
- if (do_wp_page(mm, vma, address,
- page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
- mem_cgroup_uncharge_page(page);
- ret = VM_FAULT_OOM;
- }
+ ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+ if (ret & VM_FAULT_ERROR)
+ ret &= VM_FAULT_ERROR;
goto out;
}
@@ -2163,7 +2160,7 @@ release:
page_cache_release(page);
goto unlock;
oom_free_page:
- __free_page(page);
+ page_cache_release(page);
oom:
return VM_FAULT_OOM;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7469c503580d..0fb330271271 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -208,7 +208,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
- * memory_block->state_sem.
+ * memory_block->state_mutex.
*/
zone = page_zone(pfn_to_page(pfn));
pgdat_resize_lock(zone->zone_pgdat, &flags);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6c7ba1a63d23..3c3601121509 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1296,7 +1296,9 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
unsigned nid;
nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- __mpol_free(pol); /* finished with pol */
+ if (unlikely(pol != &default_policy &&
+ pol != current->mempolicy))
+ __mpol_free(pol); /* finished with pol */
return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
}
@@ -1360,6 +1362,9 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+ if (unlikely(pol != &default_policy &&
+ pol != current->mempolicy))
+ __mpol_free(pol); /* finished with pol */
return alloc_page_interleave(gfp, 0, nid);
}
zl = zonelist_policy(gfp, pol);
diff --git a/mm/migrate.c b/mm/migrate.c
index a73504ff5ab9..4e0eccca5e26 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -153,11 +153,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
return;
}
- if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
- pte_unmap(ptep);
- return;
- }
-
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
@@ -169,6 +164,20 @@ static void remove_migration_pte(struct vm_area_struct *vma,
if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
goto out;
+ /*
+ * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
+ * Failure is not an option here: we're now expected to remove every
+ * migration pte, and will cause crashes otherwise. Normally this
+ * is not an issue: mem_cgroup_prepare_migration bumped up the old
+ * page_cgroup count for safety, that's now attached to the new page,
+ * so this charge should just be another incrementation of the count,
+ * to keep in balance with rmap.c's mem_cgroup_uncharging. But if
+ * there's been a force_empty, those reference counts may no longer
+ * be reliable, and this charge can actually fail: oh well, we don't
+ * make the situation any worse by proceeding as if it had succeeded.
+ */
+ mem_cgroup_charge(new, mm, GFP_ATOMIC);
+
get_page(new);
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry))
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4194b9db0104..beb592fe9389 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -37,6 +37,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
* @uptime: current uptime in seconds
+ * @mem: target memory controller
*
* The formula used is relatively simple and documented inline in the
* function. The main rationale is that we want to select a good task
@@ -264,6 +265,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
}
/**
+ * dump_tasks - dump current memory state of all system tasks
+ * @mem: target memory controller
+ *
* Dumps the current memory state of all system tasks, excluding kernel threads.
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
* score, and name.
@@ -298,7 +302,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
} while_each_thread(g, p);
}
-/**
+/*
* Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
* set.
@@ -412,14 +416,14 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
return oom_kill_task(p);
}
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
unsigned long points = 0;
struct task_struct *p;
cgroup_lock();
- rcu_read_lock();
+ read_lock(&tasklist_lock);
retry:
p = select_bad_process(&points, mem);
if (PTR_ERR(p) == -1UL)
@@ -432,7 +436,7 @@ retry:
"Memory cgroup out of memory"))
goto retry;
out:
- rcu_read_unlock();
+ read_unlock(&tasklist_lock);
cgroup_unlock();
}
#endif
@@ -504,6 +508,9 @@ void clear_zonelist_oom(struct zonelist *zonelist)
/**
* out_of_memory - kill the "best" process when we run out of memory
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 75b979313346..32e796af12a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -19,6 +19,7 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
+#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
@@ -221,13 +222,19 @@ static inline int bad_range(struct zone *zone, struct page *page)
static void bad_page(struct page *page)
{
- printk(KERN_EMERG "Bad page state in process '%s'\n"
- KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
- KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
- KERN_EMERG "Backtrace:\n",
+ void *pc = page_get_page_cgroup(page);
+
+ printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
+ "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
current->comm, page, (int)(2*sizeof(unsigned long)),
(unsigned long)page->flags, page->mapping,
page_mapcount(page), page_count(page));
+ if (pc) {
+ printk(KERN_EMERG "cgroup:%p\n", pc);
+ page_reset_bad_cgroup(page);
+ }
+ printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+ KERN_EMERG "Backtrace:\n");
dump_stack();
page->flags &= ~(1 << PG_lru |
1 << PG_private |
@@ -453,6 +460,7 @@ static inline int free_pages_check(struct page *page)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
+ (page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
@@ -602,6 +610,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
+ (page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
@@ -988,7 +997,6 @@ static void free_hot_cold_page(struct page *page, int cold)
if (!PageHighMem(page))
debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
- VM_BUG_ON(page_get_page_cgroup(page));
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
@@ -1276,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
if (!zlc)
return NULL;
- if (jiffies - zlc->last_full_zap > 1 * HZ) {
+ if (time_after(jiffies, zlc->last_full_zap + HZ)) {
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
zlc->last_full_zap = jiffies;
}
@@ -2021,6 +2029,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
int min_val = INT_MAX;
int best_node = -1;
+ node_to_cpumask_ptr(tmp, 0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -2029,7 +2038,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
}
for_each_node_state(n, N_HIGH_MEMORY) {
- cpumask_t tmp;
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -2042,8 +2050,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = node_to_cpumask(n);
- if (!cpus_empty(tmp))
+ node_to_cpumask_ptr_next(tmp, n);
+ if (!cpus_empty(*tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -2527,7 +2535,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
- page_assign_page_cgroup(page, NULL);
SetPageReserved(page);
/*
@@ -3314,7 +3321,7 @@ static inline int pageblock_default_order(unsigned int order)
* - mark all memory queues empty
* - clear the memory bitmaps
*/
-static void __meminit free_area_init_core(struct pglist_data *pgdat,
+static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
@@ -3438,7 +3445,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
-void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
+void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
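One of the page_alloc.c hunks above replaces the open-coded "jiffies - zlc->last_full_zap > 1 * HZ" test in zlc_setup() with time_after(), the standard wrap-safe idiom for comparing jiffies values. The macro subtracts first and looks at the sign of the wrapping difference; a rough userspace model of the idea (the kernel's real definition lives in include/linux/jiffies.h):

#include <stdio.h>

/* Rough model of time_after(a, b): true when counter a is later than b,
 * judged by the sign of their wrapping difference. */
#define time_after(a, b)  ((int)((unsigned int)(b) - (unsigned int)(a)) < 0)

int main(void)
{
	unsigned int last = 0xfffffff0u;	/* just before the counter wraps */
	unsigned int now  = 0x00000010u;	/* shortly after it wrapped */

	printf("naive now > last : %d\n", now > last);		/* 0: wrong across a wrap */
	printf("time_after       : %d\n", time_after(now, last));	/* 1: still correct */
	return 0;
}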
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b4f27d22da91..1cf1417ef8b7 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -77,11 +77,11 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
/**
* walk_page_range - walk a memory map's page tables with a callback
- * @mm - memory map to walk
- * @addr - starting address
- * @end - ending address
- * @walk - set of callbacks to invoke for each level of the tree
- * @private - private data passed to the callback function
+ * @mm: memory map to walk
+ * @addr: starting address
+ * @end: ending address
+ * @walk: set of callbacks to invoke for each level of the tree
+ * @private: private data passed to the callback function
*
* Recursively walk the page table for the memory area in a VMA,
* calling supplied callbacks. Callbacks are called in-order (first
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 8f6ee073c0e3..1c96cfc9e040 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -17,8 +17,8 @@
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/fs.h> // Needed by writeback.h
-#include <linux/writeback.h> // Prototypes pdflush_operation()
+#include <linux/fs.h> /* Needed by writeback.h */
+#include <linux/writeback.h> /* Prototypes pdflush_operation() */
#include <linux/kthread.h>
#include <linux/cpuset.h>
#include <linux/freezer.h>
@@ -187,8 +187,8 @@ static int pdflush(void *dummy)
* This is needed as pdflush's are dynamically created and destroyed.
* The boottime pdflush's are easily placed w/o these 2 lines.
*/
- cpus_allowed = cpuset_cpus_allowed(current);
- set_cpus_allowed(current, cpus_allowed);
+ cpuset_cpus_allowed(current, &cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
return __pdflush(&my_work);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index c9c50ca1ec38..8762e8988972 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -443,9 +443,10 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
* pagecache pages
*
* page_cache_async_ondemand() should be called when a page is used which
- * has the PG_readahead flag: this is a marker to suggest that the application
+ * has the PG_readahead flag; this is a marker to suggest that the application
* has used up enough of the readahead window that we should start pulling in
- * more pages. */
+ * more pages.
+ */
void
page_cache_async_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
diff --git a/mm/rmap.c b/mm/rmap.c
index 8fd527c4e2bf..997f06907b6d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -321,7 +321,7 @@ static int page_referenced_anon(struct page *page,
* counting on behalf of references from different
* cgroups
*/
- if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont))
+ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
@@ -335,6 +335,7 @@ static int page_referenced_anon(struct page *page,
/**
* page_referenced_file - referenced check for object-based rmap
* @page: the page we're checking references on.
+ * @mem_cont: target memory controller
*
* For an object-based mapped page, find all the places it is mapped and
* check/clear the referenced flag. This is done by following the page->mapping
@@ -382,7 +383,7 @@ static int page_referenced_file(struct page *page,
* counting on behalf of references from different
* cgroups
*/
- if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont))
+ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
continue;
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
== (VM_LOCKED|VM_MAYSHARE)) {
@@ -402,6 +403,7 @@ static int page_referenced_file(struct page *page,
* page_referenced - test if the page was referenced
* @page: the page to test
* @is_locked: caller holds lock on the page
+ * @mem_cont: target memory controller
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
@@ -506,7 +508,7 @@ int page_mkclean(struct page *page)
EXPORT_SYMBOL_GPL(page_mkclean);
/**
- * page_set_anon_rmap - setup new anonymous rmap
+ * __page_set_anon_rmap - setup new anonymous rmap
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
@@ -530,7 +532,7 @@ static void __page_set_anon_rmap(struct page *page,
}
/**
- * page_set_anon_rmap - sanity check anonymous rmap addition
+ * __page_check_anon_rmap - sanity check anonymous rmap addition
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
@@ -583,7 +585,7 @@ void page_add_anon_rmap(struct page *page,
}
}
-/*
+/**
* page_add_new_anon_rmap - add pte mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
@@ -623,6 +625,8 @@ void page_add_file_rmap(struct page *page)
/**
* page_dup_rmap - duplicate pte mapping to a page
* @page: the page to add the mapping to
+ * @vma: the vm area being duplicated
+ * @address: the user virtual address mapped
*
* For copy_page_range only: minimal extract from page_add_file_rmap /
* page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
@@ -642,6 +646,7 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
+ * @vma: the vm area in which the mapping is removed
*
* The caller needs to hold the pte lock.
*/
@@ -890,6 +895,7 @@ static int try_to_unmap_anon(struct page *page, int migration)
/**
* try_to_unmap_file - unmap file page using the object-based rmap method
* @page: the page to unmap
+ * @migration: migration flag
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
@@ -986,6 +992,7 @@ out:
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
+ * @migration: migration flag
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
diff --git a/mm/shmem.c b/mm/shmem.c
index 90b576cbc06e..f514dd392cd9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -244,9 +244,8 @@ static void shmem_free_inode(struct super_block *sb)
}
}
-/*
+/**
* shmem_recalc_inode - recalculate the size of an inode
- *
* @inode: inode to recalc
*
* We have to calculate the free blocks since the mm can drop
@@ -270,9 +269,8 @@ static void shmem_recalc_inode(struct inode *inode)
}
}
-/*
+/**
* shmem_swp_entry - find the swap vector position in the info structure
- *
* @info: info structure for the inode
* @index: index of the page to find
* @page: optional page to add to the structure. Has to be preset to
@@ -374,13 +372,13 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
}
}
-/*
+/**
* shmem_swp_alloc - get the position of the swap entry for the page.
- * If it does not exist allocate the entry.
- *
* @info: info structure for the inode
* @index: index of the page to find
* @sgp: check and recheck i_size? skip allocation?
+ *
+ * If the entry does not exist, allocate it.
*/
static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
{
@@ -440,9 +438,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
return entry;
}
-/*
+/**
* shmem_free_swp - free some swap entries in a directory
- *
* @dir: pointer to the directory
* @edir: pointer after last entry of the directory
* @punch_lock: pointer to spinlock when needed for the holepunch case
@@ -1370,14 +1367,17 @@ repeat:
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
unlock_page(swappage);
- page_cache_release(swappage);
if (error == -ENOMEM) {
/* allow reclaim from this memory cgroup */
- error = mem_cgroup_cache_charge(NULL,
+ error = mem_cgroup_cache_charge(swappage,
current->mm, gfp & ~__GFP_HIGHMEM);
- if (error)
+ if (error) {
+ page_cache_release(swappage);
goto failed;
+ }
+ mem_cgroup_uncharge_page(swappage);
}
+ page_cache_release(swappage);
goto repeat;
}
} else if (sgp == SGP_READ && !filepage) {
@@ -2019,7 +2019,7 @@ static const struct inode_operations shmem_symlink_inode_operations = {
};
#ifdef CONFIG_TMPFS_POSIX_ACL
-/**
+/*
* Superblocks without xattr inode operations will get security.* xattr
* support from the VFS "for free". As soon as we have any other xattrs
* like ACLs, we also need to implement the security.* handlers at
@@ -2558,12 +2558,11 @@ out4:
}
module_init(init_tmpfs)
-/*
+/**
* shmem_file_setup - get an unlinked file living in tmpfs
- *
* @name: name for dentry (to be seen in /proc/<pid>/maps
* @size: size to be set for the file
- *
+ * @flags: vm_flags
*/
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
{
@@ -2618,9 +2617,8 @@ put_memory:
return ERR_PTR(error);
}
-/*
+/**
* shmem_zero_setup - setup a shared anonymous mapping
- *
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
*/
int shmem_zero_setup(struct vm_area_struct *vma)
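
In the shmem.c hunk around the repeat: label above, the -ENOMEM path now charges swappage to the current memory cgroup before dropping the page reference, and only releases the page once the temporary charge (whose sole purpose is to trigger cgroup reclaim) has been undone. The toy program below models just that ordering with a refcounted page; get_page(), charge() and the other helpers are invented stand-ins, not the kernel API.

#include <stdio.h>

struct page { int refcount; int charged; };

static void get_page(struct page *p) { p->refcount++; }
static void put_page(struct page *p) { p->refcount--; }

/* pretend cgroup charge: only meaningful while a reference is still held */
static int charge(struct page *p)
{
	if (p->refcount <= 0)
		return -1;		/* page could already be gone */
	p->charged = 1;
	return 0;
}

static void uncharge(struct page *p) { p->charged = 0; }

int main(void)
{
	struct page pg = { 0, 0 };

	get_page(&pg);			/* reference taken when the page was looked up */
	if (charge(&pg) == 0)		/* charge while we still hold the reference */
		uncharge(&pg);		/* the charge only existed to allow reclaim */
	put_page(&pg);			/* drop the reference last, then retry */
	printf("refcount=%d charged=%d\n", pg.refcount, pg.charged);
	return 0;
}
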
diff --git a/mm/slab.c b/mm/slab.c
index 473e6c2eaefb..03927cb5ec9e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -333,7 +333,7 @@ static __always_inline int index_of(const size_t size)
return i; \
else \
i++;
-#include "linux/kmalloc_sizes.h"
+#include <linux/kmalloc_sizes.h>
#undef CACHE
__bad_size();
} else
@@ -1160,14 +1160,13 @@ static void __cpuinit cpuup_canceled(long cpu)
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
int node = cpu_to_node(cpu);
+ node_to_cpumask_ptr(mask, node);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
struct array_cache **alien;
- cpumask_t mask;
- mask = node_to_cpumask(node);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
@@ -1183,7 +1182,7 @@ static void __cpuinit cpuup_canceled(long cpu)
if (nc)
free_block(cachep, nc->entry, nc->avail, node);
- if (!cpus_empty(mask)) {
+ if (!cpus_empty(*mask)) {
spin_unlock_irq(&l3->list_lock);
goto free_array_cache;
}
@@ -1481,7 +1480,7 @@ void __init kmem_cache_init(void)
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
- cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
+ cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
* struct kmem_cache size depends on nr_node_ids, which
@@ -1602,7 +1601,7 @@ void __init kmem_cache_init(void)
int nid;
for_each_online_node(nid) {
- init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid);
+ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC + nid], nid);
@@ -2964,11 +2963,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
struct array_cache *ac;
int node;
- node = numa_node_id();
-
+retry:
check_irq_off();
+ node = numa_node_id();
ac = cpu_cache_get(cachep);
-retry:
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
@@ -3280,7 +3278,7 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, flags, -1);
+ obj = kmem_getpages(cache, local_flags, -1);
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
@@ -3625,12 +3623,11 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
/**
- * kmem_ptr_validate - check if an untrusted pointer might
- * be a slab entry.
+ * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
* @cachep: the cache we're checking against
* @ptr: pointer to validate
*
- * This verifies that the untrusted pointer looks sane:
+ * This verifies that the untrusted pointer looks sane;
* it is _not_ a guarantee that the pointer is actually
* part of the slab cache in question, but it at least
* validates that the pointer can be dereferenced and
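
The last slab.c hunk rewords the kmem_ptr_validate() comment: the check is a plausibility test, not proof that the pointer belongs to the cache. Below is a userspace sketch of the kind of range-and-stride test such validation can perform; ptr_looks_like_slab_object() and its parameters are illustrative only, not the kernel routine.

#include <stdint.h>
#include <stdio.h>

static int ptr_looks_like_slab_object(const void *base, size_t obj_size,
				      unsigned int objects, const void *ptr)
{
	uintptr_t b = (uintptr_t)base, p = (uintptr_t)ptr;

	if (p < b || p >= b + (uintptr_t)objects * obj_size)
		return 0;		/* outside the slab's payload */
	if ((p - b) % obj_size)
		return 0;		/* not on an object boundary */
	return 1;			/* plausible, but not a guarantee */
}

int main(void)
{
	char slab[8 * 64];

	printf("%d %d\n",
	       ptr_looks_like_slab_object(slab, 64, 8, slab + 128),
	       ptr_looks_like_slab_object(slab, 64, 8, slab + 130));
	return 0;			/* prints "1 0" */
}
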
diff --git a/mm/slub.c b/mm/slub.c
index 4b3895cb90ee..7f8aaa291a4e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -149,13 +149,6 @@ static inline void ClearSlabDebug(struct page *page)
/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST
-/*
- * Currently fastpath is not supported if preemption is enabled.
- */
-#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT)
-#define SLUB_FASTPATH
-#endif
-
#if PAGE_SHIFT <= 12
/*
@@ -298,32 +291,16 @@ static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
#endif
}
-/*
- * The end pointer in a slab is special. It points to the first object in the
- * slab but has bit 0 set to mark it.
- *
- * Note that SLUB relies on page_mapping returning NULL for pages with bit 0
- * in the mapping set.
- */
-static inline int is_end(void *addr)
-{
- return (unsigned long)addr & PAGE_MAPPING_ANON;
-}
-
-static void *slab_address(struct page *page)
-{
- return page->end - PAGE_MAPPING_ANON;
-}
-
+/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
struct page *page, const void *object)
{
void *base;
- if (object == page->end)
+ if (!object)
return 1;
- base = slab_address(page);
+ base = page_address(page);
if (object < base || object >= base + s->objects * s->size ||
(object - base) % s->size) {
return 0;
@@ -356,8 +333,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
/* Scan freelist */
#define for_each_free_object(__p, __s, __free) \
- for (__p = (__free); (__p) != page->end; __p = get_freepointer((__s),\
- __p))
+ for (__p = (__free); __p; __p = get_freepointer((__s), __p))
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
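
The hunks above drop the tagged page->end sentinel: a freelist now simply ends in NULL, so check_valid_pointer() accepts NULL and for_each_free_object() becomes an ordinary NULL-terminated walk. Below is a self-contained toy version of that walk; in the real allocator the free pointer is stored inside the object at an offset rather than in a named next_free field.

#include <stdio.h>

struct object { struct object *next_free; };

#define for_each_free_object(pos, head) \
	for ((pos) = (head); (pos); (pos) = (pos)->next_free)

int main(void)
{
	struct object o1, o2, o3;
	struct object *freelist = &o1, *p;
	int n = 0;

	o1.next_free = &o2;
	o2.next_free = &o3;
	o3.next_free = NULL;		/* NULL now marks the end of the freelist */

	for_each_free_object(p, freelist)
		n++;
	printf("%d free objects\n", n);	/* 3 */
	return 0;
}
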
@@ -509,7 +485,7 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
- u8 *addr = slab_address(page);
+ u8 *addr = page_address(page);
print_tracking(s, p);
@@ -644,7 +620,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
* C. Padding to reach required alignment boundary or at mininum
- * one word if debuggin is on to be able to detect writes
+ * one word if debugging is on to be able to detect writes
* before the word boundary.
*
* Padding is done using 0x5a (POISON_INUSE)
@@ -687,7 +663,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!(s->flags & SLAB_POISON))
return 1;
- start = slab_address(page);
+ start = page_address(page);
end = start + (PAGE_SIZE << s->order);
length = s->objects * s->size;
remainder = end - (start + length);
@@ -755,7 +731,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
* of the free objects in this slab. May cause
* another error because the object count is now wrong.
*/
- set_freepointer(s, p, page->end);
+ set_freepointer(s, p, NULL);
return 0;
}
return 1;
@@ -789,18 +765,18 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
void *fp = page->freelist;
void *object = NULL;
- while (fp != page->end && nr <= s->objects) {
+ while (fp && nr <= s->objects) {
if (fp == search)
return 1;
if (!check_valid_pointer(s, page, fp)) {
if (object) {
object_err(s, page, object,
"Freechain corrupt");
- set_freepointer(s, object, page->end);
+ set_freepointer(s, object, NULL);
break;
} else {
slab_err(s, page, "Freepointer corrupt");
- page->freelist = page->end;
+ page->freelist = NULL;
page->inuse = s->objects;
slab_fix(s, "Freelist cleared");
return 0;
@@ -861,6 +837,35 @@ static void remove_full(struct kmem_cache *s, struct page *page)
spin_unlock(&n->list_lock);
}
+/* Tracking of the number of slabs for debugging purposes */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ return atomic_long_read(&n->nr_slabs);
+}
+
+static inline void inc_slabs_node(struct kmem_cache *s, int node)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ /*
+ * May be called early in order to allocate a slab for the
+ * kmem_cache_node structure. Solve the chicken-egg
+ * dilemma by deferring the increment of the count during
+ * bootstrap (see early_kmem_cache_node_alloc).
+ */
+ if (!NUMA_BUILD || n)
+ atomic_long_inc(&n->nr_slabs);
+}
+static inline void dec_slabs_node(struct kmem_cache *s, int node)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ atomic_long_dec(&n->nr_slabs);
+}
+
+/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
@@ -877,7 +882,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
if (!check_slab(s, page))
goto bad;
- if (object && !on_freelist(s, page, object)) {
+ if (!on_freelist(s, page, object)) {
object_err(s, page, object, "Object already allocated");
goto bad;
}
@@ -887,7 +892,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
goto bad;
}
- if (object && !check_object(s, page, object, 0))
+ if (!check_object(s, page, object, 0))
goto bad;
/* Success perform special debug activities for allocs */
@@ -906,7 +911,7 @@ bad:
*/
slab_fix(s, "Marking all objects used");
page->inuse = s->objects;
- page->freelist = page->end;
+ page->freelist = NULL;
}
return 0;
}
@@ -946,7 +951,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
}
/* Special debug activities for freeing objects */
- if (!SlabFrozen(page) && page->freelist == page->end)
+ if (!SlabFrozen(page) && !page->freelist)
remove_full(s, page);
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
@@ -1022,30 +1027,11 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
void (*ctor)(struct kmem_cache *, void *))
{
/*
- * The page->offset field is only 16 bit wide. This is an offset
- * in units of words from the beginning of an object. If the slab
- * size is bigger then we cannot move the free pointer behind the
- * object anymore.
- *
- * On 32 bit platforms the limit is 256k. On 64bit platforms
- * the limit is 512k.
- *
- * Debugging or ctor may create a need to move the free
- * pointer. Fail if this happens.
+ * Enable debugging if selected on the kernel commandline.
*/
- if (objsize >= 65535 * sizeof(void *)) {
- BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
- SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
- BUG_ON(ctor);
- } else {
- /*
- * Enable debugging if selected on the kernel commandline.
- */
- if (slub_debug && (!slub_debug_slabs ||
- strncmp(slub_debug_slabs, name,
- strlen(slub_debug_slabs)) == 0))
- flags |= slub_debug;
- }
+ if (slub_debug && (!slub_debug_slabs ||
+ strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0))
+ flags |= slub_debug;
return flags;
}
@@ -1071,6 +1057,11 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
return flags;
}
#define slub_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+ { return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node) {}
#endif
/*
* Slab allocation and freeing
@@ -1109,7 +1100,6 @@ static void setup_object(struct kmem_cache *s, struct page *page,
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- struct kmem_cache_node *n;
void *start;
void *last;
void *p;
@@ -1121,9 +1111,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
goto out;
- n = get_node(s, page_to_nid(page));
- if (n)
- atomic_long_inc(&n->nr_slabs);
+ inc_slabs_node(s, page_to_nid(page));
page->slab = s;
page->flags |= 1 << PG_slab;
if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1131,7 +1119,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
SetSlabDebug(page);
start = page_address(page);
- page->end = start + 1;
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << s->order);
@@ -1143,7 +1130,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
last = p;
}
setup_object(s, page, last);
- set_freepointer(s, last, page->end);
+ set_freepointer(s, last, NULL);
page->freelist = start;
page->inuse = 0;
@@ -1159,7 +1146,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
void *p;
slab_pad_check(s, page);
- for_each_object(p, s, slab_address(page))
+ for_each_object(p, s, page_address(page))
check_object(s, page, p, 0);
ClearSlabDebug(page);
}
@@ -1169,7 +1156,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
- page->mapping = NULL;
+ __ClearPageSlab(page);
+ reset_page_mapcount(page);
__free_pages(page, s->order);
}
@@ -1196,11 +1184,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)
static void discard_slab(struct kmem_cache *s, struct page *page)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
-
- atomic_long_dec(&n->nr_slabs);
- reset_page_mapcount(page);
- __ClearPageSlab(page);
+ dec_slabs_node(s, page_to_nid(page));
free_slab(s, page);
}
@@ -1314,7 +1298,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
* may return off node objects because partial slabs are obtained
* from other nodes and filled up.
*
- * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
+ * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
* defrag_ratio = 1000) then every (well almost) allocation will
* first attempt to defrag slab caches on other nodes. This means
* scanning over all nodes to look for partial slabs which may be
@@ -1373,7 +1357,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
ClearSlabFrozen(page);
if (page->inuse) {
- if (page->freelist != page->end) {
+ if (page->freelist) {
add_partial(n, page, tail);
stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
} else {
@@ -1389,9 +1373,11 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
* Adding an empty slab to the partial slabs in order
* to avoid page allocator overhead. This slab needs
* to come after the other slabs with objects in
- * order to fill them up. That way the size of the
- * partial list stays small. kmem_cache_shrink can
- * reclaim empty slabs from the partial list.
+ * so that the others get filled first. That way the
+ * size of the partial list stays small.
+ *
+ * kmem_cache_shrink can reclaim any empty slabs from the
+ * partial list.
*/
add_partial(n, page, 1);
slab_unlock(page);
@@ -1411,18 +1397,14 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
struct page *page = c->page;
int tail = 1;
- if (c->freelist)
+ if (page->freelist)
stat(c, DEACTIVATE_REMOTE_FREES);
/*
- * Merge cpu freelist into freelist. Typically we get here
+ * Merge cpu freelist into slab freelist. Typically we get here
* because both freelists are empty. So this is unlikely
* to occur.
- *
- * We need to use _is_end here because deactivate slab may
- * be called for a debug slab. Then c->freelist may contain
- * a dummy pointer.
*/
- while (unlikely(!is_end(c->freelist))) {
+ while (unlikely(c->freelist)) {
void **object;
tail = 0; /* Hot objects. Put the slab first */
@@ -1449,6 +1431,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
/*
* Flush cpu slab.
+ *
* Called from IPI handler with interrupts disabled.
*/
static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
@@ -1507,43 +1490,41 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
* rest of the freelist to the lockless freelist.
*
* And if we were unable to get a new slab from the partial slab lists then
- * we need to allocate a new slab. This is slowest path since we may sleep.
+ * we need to allocate a new slab. This is the slowest path since it involves
+ * a call to the page allocator and the setup of a new slab.
*/
static void *__slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
{
void **object;
struct page *new;
-#ifdef SLUB_FASTPATH
- unsigned long flags;
- local_irq_save(flags);
-#endif
+ /* We handle __GFP_ZERO in the caller */
+ gfpflags &= ~__GFP_ZERO;
+
if (!c->page)
goto new_slab;
slab_lock(c->page);
if (unlikely(!node_match(c, node)))
goto another_slab;
+
stat(c, ALLOC_REFILL);
+
load_freelist:
object = c->page->freelist;
- if (unlikely(object == c->page->end))
+ if (unlikely(!object))
goto another_slab;
if (unlikely(SlabDebug(c->page)))
goto debug;
- object = c->page->freelist;
c->freelist = object[c->offset];
c->page->inuse = s->objects;
- c->page->freelist = c->page->end;
+ c->page->freelist = NULL;
c->node = page_to_nid(c->page);
unlock_out:
slab_unlock(c->page);
stat(c, ALLOC_SLOWPATH);
-#ifdef SLUB_FASTPATH
- local_irq_restore(flags);
-#endif
return object;
another_slab:
@@ -1575,9 +1556,7 @@ new_slab:
c->page = new;
goto load_freelist;
}
-#ifdef SLUB_FASTPATH
- local_irq_restore(flags);
-#endif
+
/*
* No memory available.
*
@@ -1589,12 +1568,17 @@ new_slab:
* That is only possible if certain conditions are met that are being
* checked when a slab is created.
*/
- if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK))
- return kmalloc_large(s->objsize, gfpflags);
-
+ if (!(gfpflags & __GFP_NORETRY) &&
+ (s->flags & __PAGE_ALLOC_FALLBACK)) {
+ if (gfpflags & __GFP_WAIT)
+ local_irq_enable();
+ object = kmalloc_large(s->objsize, gfpflags);
+ if (gfpflags & __GFP_WAIT)
+ local_irq_disable();
+ return object;
+ }
return NULL;
debug:
- object = c->page->freelist;
if (!alloc_debug_processing(s, c->page, object, addr))
goto another_slab;
@@ -1619,39 +1603,11 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
{
void **object;
struct kmem_cache_cpu *c;
-
-/*
- * The SLUB_FASTPATH path is provisional and is currently disabled if the
- * kernel is compiled with preemption or if the arch does not support
- * fast cmpxchg operations. There are a couple of coming changes that will
- * simplify matters and allow preemption. Ultimately we may end up making
- * SLUB_FASTPATH the default.
- *
- * 1. The introduction of the per cpu allocator will avoid array lookups
- * through get_cpu_slab(). A special register can be used instead.
- *
- * 2. The introduction of per cpu atomic operations (cpu_ops) means that
- * we can realize the logic here entirely with per cpu atomics. The
- * per cpu atomic ops will take care of the preemption issues.
- */
-
-#ifdef SLUB_FASTPATH
- c = get_cpu_slab(s, raw_smp_processor_id());
- do {
- object = c->freelist;
- if (unlikely(is_end(object) || !node_match(c, node))) {
- object = __slab_alloc(s, gfpflags, node, addr, c);
- break;
- }
- stat(c, ALLOC_FASTPATH);
- } while (cmpxchg_local(&c->freelist, object, object[c->offset])
- != object);
-#else
unsigned long flags;
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
- if (unlikely(is_end(c->freelist) || !node_match(c, node)))
+ if (unlikely(!c->freelist || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
@@ -1661,7 +1617,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
stat(c, ALLOC_FASTPATH);
}
local_irq_restore(flags);
-#endif
if (unlikely((gfpflags & __GFP_ZERO) && object))
memset(object, 0, c->objsize);
@@ -1698,17 +1653,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
void **object = (void *)x;
struct kmem_cache_cpu *c;
-#ifdef SLUB_FASTPATH
- unsigned long flags;
-
- local_irq_save(flags);
-#endif
c = get_cpu_slab(s, raw_smp_processor_id());
stat(c, FREE_SLOWPATH);
slab_lock(page);
if (unlikely(SlabDebug(page)))
goto debug;
+
checks_ok:
prior = object[offset] = page->freelist;
page->freelist = object;
@@ -1723,24 +1674,20 @@ checks_ok:
goto slab_empty;
/*
- * Objects left in the slab. If it
- * was not on the partial list before
+ * Objects left in the slab. If it was not on the partial list before
* then add it.
*/
- if (unlikely(prior == page->end)) {
+ if (unlikely(!prior)) {
add_partial(get_node(s, page_to_nid(page)), page, 1);
stat(c, FREE_ADD_PARTIAL);
}
out_unlock:
slab_unlock(page);
-#ifdef SLUB_FASTPATH
- local_irq_restore(flags);
-#endif
return;
slab_empty:
- if (prior != page->end) {
+ if (prior) {
/*
* Slab still on the partial list.
*/
@@ -1749,9 +1696,6 @@ slab_empty:
}
slab_unlock(page);
stat(c, FREE_SLAB);
-#ifdef SLUB_FASTPATH
- local_irq_restore(flags);
-#endif
discard_slab(s, page);
return;
@@ -1777,39 +1721,11 @@ static __always_inline void slab_free(struct kmem_cache *s,
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
-
-#ifdef SLUB_FASTPATH
- void **freelist;
-
- c = get_cpu_slab(s, raw_smp_processor_id());
- debug_check_no_locks_freed(object, s->objsize);
- do {
- freelist = c->freelist;
- barrier();
- /*
- * If the compiler would reorder the retrieval of c->page to
- * come before c->freelist then an interrupt could
- * change the cpu slab before we retrieve c->freelist. We
- * could be matching on a page no longer active and put the
- * object onto the freelist of the wrong slab.
- *
- * On the other hand: If we already have the freelist pointer
- * then any change of cpu_slab will cause the cmpxchg to fail
- * since the freelist pointers are unique per slab.
- */
- if (unlikely(page != c->page || c->node < 0)) {
- __slab_free(s, page, x, addr, c->offset);
- break;
- }
- object[c->offset] = freelist;
- stat(c, FREE_FASTPATH);
- } while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
-#else
unsigned long flags;
local_irq_save(flags);
- debug_check_no_locks_freed(object, s->objsize);
c = get_cpu_slab(s, smp_processor_id());
+ debug_check_no_locks_freed(object, c->objsize);
if (likely(page == c->page && c->node >= 0)) {
object[c->offset] = c->freelist;
c->freelist = object;
@@ -1818,7 +1734,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
__slab_free(s, page, x, addr, c->offset);
local_irq_restore(flags);
-#endif
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1973,20 +1888,21 @@ static unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size)
{
/*
- * If the user wants hardware cache aligned objects then
- * follow that suggestion if the object is sufficiently
- * large.
+ * If the user wants hardware cache aligned objects then follow that
+ * suggestion if the object is sufficiently large.
*
- * The hardware cache alignment cannot override the
- * specified alignment though. If that is greater
- * then use it.
+ * The hardware cache alignment cannot override the specified
+ * alignment though. If that is greater then use it.
*/
- if ((flags & SLAB_HWCACHE_ALIGN) &&
- size > cache_line_size() / 2)
- return max_t(unsigned long, align, cache_line_size());
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
if (align < ARCH_SLAB_MINALIGN)
- return ARCH_SLAB_MINALIGN;
+ align = ARCH_SLAB_MINALIGN;
return ALIGN(align, sizeof(void *));
}
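
The rewritten calculate_alignment() above no longer forces a full cache line on every SLAB_HWCACHE_ALIGN cache: it halves the line size until the object would occupy more than half of it, then honours any larger explicit alignment. A standalone recomputation, assuming a 64-byte cache line and using sizeof(void *) in place of ARCH_SLAB_MINALIGN, gives a feel for the resulting numbers:

#include <stdio.h>

#define CACHE_LINE 64UL

static unsigned long calc_align(int hwcache_align, unsigned long align,
				unsigned long size)
{
	if (hwcache_align) {
		unsigned long ralign = CACHE_LINE;

		while (size <= ralign / 2)	/* shrink until size > half */
			ralign /= 2;
		if (ralign > align)
			align = ralign;
	}
	if (align < sizeof(void *))
		align = sizeof(void *);
	/* round up to a multiple of the pointer size */
	return (align + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
}

int main(void)
{
	/* a 24-byte object gets 32-byte alignment, not a whole 64-byte line */
	printf("%lu\n", calc_align(1, 8, 24));	/* 32 */
	printf("%lu\n", calc_align(1, 8, 100));	/* 64 */
	return 0;
}
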
@@ -1995,19 +1911,22 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
struct kmem_cache_cpu *c)
{
c->page = NULL;
- c->freelist = (void *)PAGE_MAPPING_ANON;
+ c->freelist = NULL;
c->node = 0;
c->offset = s->offset / sizeof(void *);
c->objsize = s->objsize;
+#ifdef CONFIG_SLUB_STATS
+ memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
+#endif
}
static void init_kmem_cache_node(struct kmem_cache_node *n)
{
n->nr_partial = 0;
- atomic_long_set(&n->nr_slabs, 0);
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
+ atomic_long_set(&n->nr_slabs, 0);
INIT_LIST_HEAD(&n->full);
#endif
}
@@ -2176,7 +2095,8 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
init_tracking(kmalloc_caches, n);
#endif
init_kmem_cache_node(n);
- atomic_long_inc(&n->nr_slabs);
+ inc_slabs_node(kmalloc_caches, node);
+
/*
* lockdep requires consistent irq usage for each lock
* so even though there cannot be a race this early in
@@ -2258,6 +2178,14 @@ static int calculate_sizes(struct kmem_cache *s)
unsigned long align = s->align;
/*
+ * Round up object size to the next word boundary. We can only
+ * place the free pointer at word boundaries and this determines
+ * the possible location of the free pointer.
+ */
+ size = ALIGN(size, sizeof(void *));
+
+#ifdef CONFIG_SLUB_DEBUG
+ /*
* Determine if we can poison the object itself. If the user of
* the slab may touch the object after free or before allocation
* then we should never poison the object itself.
@@ -2268,14 +2196,7 @@ static int calculate_sizes(struct kmem_cache *s)
else
s->flags &= ~__OBJECT_POISON;
- /*
- * Round up object size to the next word boundary. We can only
- * place the free pointer at word boundaries and this determines
- * the possible location of the free pointer.
- */
- size = ALIGN(size, sizeof(void *));
-#ifdef CONFIG_SLUB_DEBUG
/*
* If we are Redzoning then check if there is some space between the
* end of the object and the free pointer. If not then add an
@@ -2428,7 +2349,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
/*
* We could also check if the object is on the slabs freelist.
* But this would be too expensive and it seems that the main
- * purpose of kmem_ptr_valid is to check if the object belongs
+ * purpose of kmem_ptr_valid() is to check if the object belongs
* to a certain slab.
*/
return 1;
@@ -2487,7 +2408,7 @@ static inline int kmem_cache_close(struct kmem_cache *s)
struct kmem_cache_node *n = get_node(s, node);
n->nr_partial -= free_list(s, n, &n->partial);
- if (atomic_long_read(&n->nr_slabs))
+ if (slabs_node(s, node))
return 1;
}
free_kmem_cache_nodes(s);
@@ -2520,10 +2441,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
EXPORT_SYMBOL(kmalloc_caches);
-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
-#endif
-
static int __init setup_slub_min_order(char *str)
{
get_option(&str, &slub_min_order);
@@ -2583,6 +2500,7 @@ panic:
}
#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
static void sysfs_add_func(struct work_struct *w)
{
@@ -2715,13 +2633,24 @@ void *__kmalloc(size_t size, gfp_t flags)
}
EXPORT_SYMBOL(__kmalloc);
+static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+{
+ struct page *page = alloc_pages_node(node, flags | __GFP_COMP,
+ get_order(size));
+
+ if (page)
+ return page_address(page);
+ else
+ return NULL;
+}
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s;
if (unlikely(size > PAGE_SIZE))
- return kmalloc_large(size, flags);
+ return kmalloc_large_node(size, flags, node);
s = get_slab(size, flags);
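
kmalloc_large_node(), added above, routes allocations larger than a page straight to the page allocator at the order returned by get_order(). The snippet below re-derives that order calculation in plain C for an assumed 4 KB page, purely to show how request sizes map to page counts; it is not the kernel's implementation.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* smallest order (power-of-two number of pages) that covers size bytes */
static int get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	printf("%d\n", get_order(PAGE_SIZE + 1));	/* 1: two pages */
	printf("%d\n", get_order(5 * PAGE_SIZE));	/* 3: eight pages */
	return 0;
}
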
@@ -2738,19 +2667,17 @@ size_t ksize(const void *object)
struct page *page;
struct kmem_cache *s;
- BUG_ON(!object);
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
page = virt_to_head_page(object);
- BUG_ON(!page);
if (unlikely(!PageSlab(page)))
return PAGE_SIZE << compound_order(page);
s = page->slab;
- BUG_ON(!s);
+#ifdef CONFIG_SLUB_DEBUG
/*
* Debugging requires use of the padding between object
* and whatever may come after it.
@@ -2758,6 +2685,7 @@ size_t ksize(const void *object)
if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
return s->objsize;
+#endif
/*
* If we have the need to store the freelist pointer
* back there or track user information then we can
@@ -2765,7 +2693,6 @@ size_t ksize(const void *object)
*/
if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
return s->inuse;
-
/*
* Else we can use all the padding etc for the allocation
*/
@@ -2790,19 +2717,6 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
-static unsigned long count_partial(struct kmem_cache_node *n)
-{
- unsigned long flags;
- unsigned long x = 0;
- struct page *page;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
- x += page->inuse;
- spin_unlock_irqrestore(&n->list_lock, flags);
- return x;
-}
-
/*
* kmem_cache_shrink removes empty slabs from the partial lists and sorts
* the remaining slabs by the number of items in use. The slabs with the
@@ -2916,7 +2830,7 @@ static void slab_mem_offline_callback(void *arg)
* and offline_pages() function shoudn't call this
* callback. So, we must fail.
*/
- BUG_ON(atomic_long_read(&n->nr_slabs));
+ BUG_ON(slabs_node(s, offline_node));
s->node[offline_node] = NULL;
kmem_cache_free(kmalloc_caches, n);
@@ -3042,7 +2956,7 @@ void __init kmem_cache_init(void)
/*
* Patch up the size_index table if we have strange large alignment
* requirements for the kmalloc array. This is only the case for
- * mips it seems. The standard arches will not generate any code here.
+ * MIPS it seems. The standard arches will not generate any code here.
*
* Largest permitted alignment is 256 bytes due to the way we
* handle the index determination for the smaller caches.
@@ -3071,7 +2985,6 @@ void __init kmem_cache_init(void)
kmem_size = sizeof(struct kmem_cache);
#endif
-
printk(KERN_INFO
"SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
" CPUs=%d, Nodes=%d\n",
@@ -3168,12 +3081,15 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
*/
for_each_online_cpu(cpu)
get_cpu_slab(s, cpu)->objsize = s->objsize;
+
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
up_write(&slub_lock);
+
if (sysfs_slab_alias(s, name))
goto err;
return s;
}
+
s = kmalloc(kmem_size, GFP_KERNEL);
if (s) {
if (kmem_cache_open(s, GFP_KERNEL, name,
@@ -3269,7 +3185,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
struct kmem_cache *s;
if (unlikely(size > PAGE_SIZE))
- return kmalloc_large(size, gfpflags);
+ return kmalloc_large_node(size, gfpflags, node);
s = get_slab(size, gfpflags);
@@ -3279,12 +3195,27 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return slab_alloc(s, gfpflags, node, caller);
}
+#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO)
+static unsigned long count_partial(struct kmem_cache_node *n)
+{
+ unsigned long flags;
+ unsigned long x = 0;
+ struct page *page;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, lru)
+ x += page->inuse;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return x;
+}
+#endif
+
#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
void *p;
- void *addr = slab_address(page);
+ void *addr = page_address(page);
if (!check_slab(s, page) ||
!on_freelist(s, page, NULL))
@@ -3567,7 +3498,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc)
{
- void *addr = slab_address(page);
+ void *addr = page_address(page);
DECLARE_BITMAP(map, s->objects);
void *p;
@@ -3676,8 +3607,8 @@ enum slab_stat_type {
#define SO_CPU (1 << SL_CPU)
#define SO_OBJECTS (1 << SL_OBJECTS)
-static unsigned long slab_objects(struct kmem_cache *s,
- char *buf, unsigned long flags)
+static ssize_t show_slab_objects(struct kmem_cache *s,
+ char *buf, unsigned long flags)
{
unsigned long total = 0;
int cpu;
@@ -3687,6 +3618,8 @@ static unsigned long slab_objects(struct kmem_cache *s,
unsigned long *per_cpu;
nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
+ if (!nodes)
+ return -ENOMEM;
per_cpu = nodes + nr_node_ids;
for_each_possible_cpu(cpu) {
@@ -3839,25 +3772,25 @@ SLAB_ATTR_RO(aliases);
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
- return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
+ return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
}
SLAB_ATTR_RO(slabs);
static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
- return slab_objects(s, buf, SO_PARTIAL);
+ return show_slab_objects(s, buf, SO_PARTIAL);
}
SLAB_ATTR_RO(partial);
static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
{
- return slab_objects(s, buf, SO_CPU);
+ return show_slab_objects(s, buf, SO_CPU);
}
SLAB_ATTR_RO(cpu_slabs);
static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
- return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
+ return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
}
SLAB_ATTR_RO(objects);
@@ -4056,7 +3989,6 @@ SLAB_ATTR(remote_node_defrag_ratio);
#endif
#ifdef CONFIG_SLUB_STATS
-
static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
unsigned long sum = 0;
@@ -4076,10 +4008,12 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
len = sprintf(buf, "%lu", sum);
+#ifdef CONFIG_SMP
for_each_online_cpu(cpu) {
if (data[cpu] && len < PAGE_SIZE - 20)
- len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
+ len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
}
+#endif
kfree(data);
return len + sprintf(buf + len, "\n");
}
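
The show_stat() hunk above switches the per-cpu annotation to an upper-case "C" prefix and only emits the per-cpu breakdown on SMP builds. The userspace sketch below shows the formatting pattern (the total first, then one field per cpu with a non-zero count); the counter values and buffer size are made up.

#include <stdio.h>

int main(void)
{
	unsigned int data[] = { 5, 0, 12, 3 };	/* pretend per-cpu event counts */
	char buf[128];
	unsigned long sum = 0;
	int len, cpu;

	for (cpu = 0; cpu < 4; cpu++)
		sum += data[cpu];

	len = sprintf(buf, "%lu", sum);
	for (cpu = 0; cpu < 4; cpu++) {
		if (data[cpu] && len < (int)sizeof(buf) - 20)
			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
	}
	printf("%s\n", buf);	/* "20 C0=5 C2=12 C3=3" */
	return 0;
}
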
@@ -4240,8 +4174,8 @@ static struct kset *slab_kset;
#define ID_STR_LENGTH 64
/* Create a unique string id for a slab cache:
- * format
- * :[flags-]size:[memory address of kmemcache]
+ *
+ * Format :[flags-]size
*/
static char *create_unique_id(struct kmem_cache *s)
{
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index cd75b21dd4c3..99c4f36eb8a3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -76,7 +76,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
pte_t entry;
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
- return 0;
+ return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
}
@@ -89,7 +89,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
if (pmd_none(*pmd)) {
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
- return 0;
+ return NULL;
pmd_populate_kernel(&init_mm, pmd, p);
}
return pmd;
@@ -101,7 +101,7 @@ pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
if (pud_none(*pud)) {
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
- return 0;
+ return NULL;
pud_populate(&init_mm, pud, p);
}
return pud;
@@ -113,7 +113,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
if (pgd_none(*pgd)) {
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
- return 0;
+ return NULL;
pgd_populate(&init_mm, pgd, p);
}
return pgd;
diff --git a/mm/sparse.c b/mm/sparse.c
index f6a43c09c322..98d6b39c3472 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -149,8 +149,18 @@ static inline int sparse_early_nid(struct mem_section *section)
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
+ unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
unsigned long pfn;
+ /*
+ * Sanity checks - do not allow an architecture to pass
+ * in larger pfns than the maximum scope of sparsemem:
+ */
+ if (start >= max_arch_pfn)
+ return;
+ if (end >= max_arch_pfn)
+ end = max_arch_pfn;
+
start &= PAGE_SECTION_MASK;
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
unsigned long section = pfn_to_section_nr(pfn);
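
memory_present() now refuses pfn ranges that start beyond what sparsemem can address and clips ranges that straddle the limit. Here is the same clamping logic as a standalone function, with example values of 36 physical address bits and a 4 KB page (MAX_PHYSMEM_BITS is architecture specific):

#include <stdio.h>

#define MAX_PHYSMEM_BITS 36
#define PAGE_SHIFT       12

static int clamp_range(unsigned long *start, unsigned long *end)
{
	unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT);

	if (*start >= max_arch_pfn)
		return -1;		/* whole range out of scope: ignore it */
	if (*end >= max_arch_pfn)
		*end = max_arch_pfn;	/* clip the tail */
	return 0;
}

int main(void)
{
	unsigned long start = 0x100000, end = 0x2000000;

	if (!clamp_range(&start, &end))
		printf("present: %#lx-%#lx\n", start, end); /* end clipped to 0x1000000 */
	return 0;
}
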
diff --git a/mm/swap.c b/mm/swap.c
index 710a20bb9749..aa1139ccf3a7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,12 +78,11 @@ void put_page(struct page *page)
EXPORT_SYMBOL(put_page);
/**
- * put_pages_list(): release a list of pages
+ * put_pages_list() - release a list of pages
+ * @pages: list of pages threaded on page->lru
*
* Release a list of pages which are strung together on page.lru. Currently
* used by read_cache_pages() and related error recovery code.
- *
- * @pages: list of pages threaded on page->lru
*/
void put_pages_list(struct list_head *pages)
{
@@ -176,7 +175,7 @@ void activate_page(struct page *page)
SetPageActive(page);
add_page_to_active_list(zone, page);
__count_vm_event(PGACTIVATE);
- mem_cgroup_move_lists(page_get_page_cgroup(page), true);
+ mem_cgroup_move_lists(page, true);
}
spin_unlock_irq(&zone->lru_lock);
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ec42f01a8d02..50757ee3f9f3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -115,6 +115,7 @@ void __delete_from_swap_cache(struct page *page)
/**
* add_to_swap - allocate swap space for a page
* @page: page we want to move to swap
+ * @gfp_mask: memory allocation flags
*
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
@@ -315,6 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/**
* swapin_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
* @vma: user vma this address belongs to
* @addr: target address for mempolicy
*
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 702083638c16..ae532f501943 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -39,12 +39,11 @@ static int __init init_tmpfs(void)
}
module_init(init_tmpfs)
-/*
+/**
* shmem_file_setup - get an unlinked file living in tmpfs
- *
* @name: name for dentry (to be seen in /proc/<pid>/maps
* @size: size to be set for the file
- *
+ * @flags: vm_flags
*/
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
{
@@ -89,15 +88,16 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
close_file:
put_filp(file);
+ return ERR_PTR(error);
+
put_dentry:
dput(dentry);
put_memory:
return ERR_PTR(error);
}
-/*
+/**
* shmem_zero_setup - setup a shared anonymous mapping
- *
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
*/
int shmem_zero_setup(struct vm_area_struct *vma)
diff --git a/mm/truncate.c b/mm/truncate.c
index c35c49e54fb6..7d20ce41ecf5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -134,8 +134,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
}
/**
- * truncate_inode_pages - truncate range of pages specified by start and
- * end byte offsets
+ * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
* @lend: offset to which to truncate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 950c0be9ca81..ecf91f8034bf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -757,7 +757,8 @@ finished:
* @vma: vma to cover (map full range of vma)
* @addr: vmalloc memory
* @pgoff: number of pages into addr before first page to map
- * @returns: 0 for success, -Exxx on failure
+ *
+ * Returns: 0 for success, -Exxx on failure
*
* This function checks that addr is a valid vmalloc'ed area, and
* that it is big enough to cover the vma. Will return failure if
@@ -829,7 +830,8 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
/**
* alloc_vm_area - allocate a range of kernel address space
* @size: size of the area
- * @returns: NULL on failure, vm_struct on success
+ *
+ * Returns: NULL on failure, vm_struct on success
*
* This function reserves a range of kernel address space, and
* allocates pagetables to map that range. No actual mappings
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a26dabd62fed..f80a5b7c057f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -70,13 +70,6 @@ struct scan_control {
int order;
- /*
- * Pages that have (or should have) IO pending. If we run into
- * a lot of these, we're better off waiting a little for IO to
- * finish rather than scanning more pages in the VM.
- */
- int nr_io_pages;
-
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -126,7 +119,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scan_global_lru(sc) (!(sc)->mem_cgroup)
#else
#define scan_global_lru(sc) (1)
@@ -512,10 +505,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
wait_on_page_writeback(page);
- else {
- sc->nr_io_pages++;
+ else
goto keep_locked;
- }
}
referenced = page_referenced(page, 1, sc->mem_cgroup);
@@ -554,10 +545,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageDirty(page)) {
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
goto keep_locked;
- if (!may_enter_fs) {
- sc->nr_io_pages++;
+ if (!may_enter_fs)
goto keep_locked;
- }
if (!sc->may_writepage)
goto keep_locked;
@@ -568,10 +557,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page)) {
- sc->nr_io_pages++;
+ if (PageWriteback(page) || PageDirty(page))
goto keep;
- }
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
@@ -1128,7 +1115,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
ClearPageActive(page);
list_move(&page->lru, &zone->inactive_list);
- mem_cgroup_move_lists(page_get_page_cgroup(page), false);
+ mem_cgroup_move_lists(page, false);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1156,8 +1143,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
VM_BUG_ON(!PageActive(page));
+
list_move(&page->lru, &zone->active_list);
- mem_cgroup_move_lists(page_get_page_cgroup(page), true);
+ mem_cgroup_move_lists(page, true);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1343,7 +1331,6 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
- sc->nr_io_pages = 0;
if (!priority)
disable_swap_token();
nr_reclaimed += shrink_zones(priority, zones, sc);
@@ -1378,8 +1365,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
}
/* Take a nap, wait for some writeback to complete */
- if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
- sc->nr_io_pages > sc->swap_cluster_max)
+ if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
congestion_wait(WRITE, HZ/10);
}
/* top priority shrink_caches still had more to do? don't OOM, then */
@@ -1427,7 +1413,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
return do_try_to_free_pages(zones, gfp_mask, &sc);
}
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask)
@@ -1513,7 +1499,6 @@ loop_again:
if (!priority)
disable_swap_token();
- sc.nr_io_pages = 0;
all_zones_ok = 1;
/*
@@ -1606,8 +1591,7 @@ loop_again:
* OK, kswapd is getting into trouble. Take a nap, then take
* another pass across the zones.
*/
- if (total_scanned && priority < DEF_PRIORITY - 2 &&
- sc.nr_io_pages > sc.swap_cluster_max)
+ if (total_scanned && priority < DEF_PRIORITY - 2)
congestion_wait(WRITE, HZ/10);
/*
@@ -1663,11 +1647,10 @@ static int kswapd(void *p)
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
- cpumask_t cpumask;
+ node_to_cpumask_ptr(cpumask, pgdat->node_id);
- cpumask = node_to_cpumask(pgdat->node_id);
- if (!cpus_empty(cpumask))
- set_cpus_allowed(tsk, cpumask);
+ if (!cpus_empty(*cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
/*
@@ -1896,17 +1879,16 @@ out:
static int __devinit cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- pg_data_t *pgdat;
- cpumask_t mask;
int nid;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
for_each_node_state(nid, N_HIGH_MEMORY) {
- pgdat = NODE_DATA(nid);
- mask = node_to_cpumask(pgdat->node_id);
- if (any_online_cpu(mask) != NR_CPUS)
+ pg_data_t *pgdat = NODE_DATA(nid);
+ node_to_cpumask_ptr(mask, pgdat->node_id);
+
+ if (any_online_cpu(*mask) < nr_cpu_ids)
/* One of our CPUs online: restore mask */
- set_cpus_allowed(pgdat->kswapd, mask);
+ set_cpus_allowed_ptr(pgdat->kswapd, mask);
}
}
return NOTIFY_OK;
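
The kswapd and hotplug-callback hunks above switch to node_to_cpumask_ptr()/set_cpus_allowed_ptr(), so the node's cpumask is used by reference and the daemon is only bound when the node actually has online CPUs. As a loose userspace analogue only, a process can restrict itself to a hand-picked CPU set with sched_setaffinity(); the CPU list here is invented and this is not the kernel-side interface.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	int node_cpus[] = { 0, 1 };	/* pretend these cpus belong to the node */
	cpu_set_t mask;
	size_t i;

	CPU_ZERO(&mask);
	for (i = 0; i < sizeof(node_cpus) / sizeof(node_cpus[0]); i++)
		CPU_SET(node_cpus[i], &mask);

	/* only bind if the node has cpus, mirroring the !cpus_empty() check */
	if (CPU_COUNT(&mask) &&
	    sched_setaffinity(0, sizeof(mask), &mask) == 0)
		printf("bound to %d cpu(s) of the node\n", CPU_COUNT(&mask));
	return 0;
}
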
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 422d960ffcd8..7c7286e9506d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -388,6 +388,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
"Reclaimable",
"Movable",
"Reserve",
+ "Isolate",
};
static void *frag_start(struct seq_file *m, loff_t *pos)